<h1 id="processing-pdf-files">Processing PDF files</h1>
<p>Node.js built-in modules make it possible to parse PDF documents in a
memory-efficient way using streams:</p>
<pre><code class="language-javascript">import * as fs from &#39;fs&#39;;
import { promisify } from &#39;util&#39;;
import * as zlib from &#39;zlib&#39;;

// promise-returning wrapper around the callback-style zlib.unzip, so that it
// can be used with await
const unzipAsync = promisify(zlib.unzip);

// this regular expression finds the zipped contents: everything between a
// "&gt;&gt;stream" line and the following "endstream" line, with the markers
// themselves left out of the match (lookbehind / lookahead);
// the s flag lets "." match newlines as well, and the lazy ".+?" stops at the
// first "endstream" that follows
const pageRegExp = /(?&lt;=&gt;&gt;stream\n).+?(?=\nendstream)/gsu;

const parsePDF = async (path) =&gt; {
  // we need to cache data across chunks in case the search area finds itself
  // across the boundary of two chunks
  let data = &#39;&#39;;

  for await (const chunk of fs.createReadStream(path)) {
    // convert chunk to string for regular expression checking
    data += chunk.toString(&#39;binary&#39;);
    let sliceIndex = 0;
    for (const match of data.matchAll(pageRegExp)) {
      sliceIndex = match.index + match[0].length;
      // unzip the data from the match and convert to correct encoding
      console.log(
        (await unzipAsync(Buffer.from(match[0], &#39;binary&#39;))).toString(&#39;utf8&#39;)
      );
    }
    // remove the data that we&#39;ve already seen in preparation for the next chunk
    // matchAll will otherwise match data we&#39;ve already seen
    if (sliceIndex) data = data.slice(sliceIndex);
  }
};
</code></pre>
<p>This example was inspired by the original
<a href="https://gist.github.com/averagesecurityguy/ba8d9ed3c59c1deffbd1390dafa5a3c2">gist</a>
(written in Python).</p>