intial code

nielswh · Mar 10, 2018 · 333dc64 · 333dc64
1 parent f706172
commit 333dc64
Show file tree

Hide file tree

Showing 18 changed files with 646 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -4,25 +4,27 @@ node module that will do OCR on PDFs that do not contain searchable text.
 ## Inspired from pdf-extract
 [https://www.npmjs.com/package/pdf-extract] by Noah Isaacson.  Many of the ideas and the initial design are from this project.
 
-## Difference from pdf-extract
-- Uses ES6 javascript syntax.
-- Uses Promises instead of callbacks
-- Option to OCR just the first page of the PDF. (primary reason this was written for my own selfish reason!)
-- Currently does not OCR searchable PDFs.  Plenty of options out there that does this.
-- If you need to OCR searchable PDFs, I recommend using pdf-extract instead.
+## Differences between pdf-ocr and pdf-extract
+- The pdf-extract instructions were outdated when it came to installing dependent binaries. I ran into a couple of pitfalls and wanted to make sure others would not if they used this version.
+- Removed the instructions to update the trained data for tesseract, since version 3.05.01 was newer than the instructions on pdf-extract.
+- Updated code to use ES6 javascript syntax.
+- I needed an option to OCR just the first page of the PDF.
+- This version currently does not OCR searchable PDFs.  There are plenty of options out there that do this.
+- If you need to OCR searchable PDFs, I recommend using pdf-extract instead. (However, use the instructions below to get the dependent binaries.)
 
 ## Installation
 
 `npm install pdf-ocr --save` 
 
-After installing, the following binaries list below will need to be on your system as well as in the paths in your environment settings.
+After installing pdf-ocr, the following binaries listed below will need to be on your system, as well as in the paths in your environment settings.
 
 ### OSX
 
 **pdftk**
-[http://www.pdflabs.com/docs/install-pdftk/](http://www.pdflabs.com/docs/install-pdftk/)
+- Grab the package installer at [http://www.pdflabs.com/docs/install-pdftk/](http://www.pdflabs.com/docs/install-pdftk/)
+
+- If you're installing on OSX Sierra or High Sierra, you'll need to make sure you use the package installer pdftk_server-2.02-mac_osx-10.11-setup.pkg
 
-- If you're installing on OSX Sierra or High Sierra, you'll need to make sure you use pdftk_server-2.02-mac_osx-10.11-setup.pkg
 - Other versions seemed to hang the process.  If the tests fail, this could be the main reason why.
 
 **ghostscript**
@@ -34,12 +36,10 @@ brew install gs
 
 `brew install tesseract`
 
-After tesseract is installed you need to install the alphanumeric config and an updated trained data file
+After tesseract is installed you need to install the alphanumeric config
 ``` bash
 cd <root of this module>
-cp "./share/eng.traineddata" "/usr/local/Cellar/tesseract/3.02.02_3/share/tessdata/eng.traineddata"
-cp "./share/dia.traineddata" "/usr/local/Cellar/tesseract/3.02.02_3/share/tessdata/dia.traineddata"
-cp "./share/configs/alphanumeric" "/usr/local/Cellar/tesseract/3.02.02_3/share/tessdata/configs/alphanumeric"
+cp "./share/configs/alphanumeric" "/usr/local/Cellar/tesseract/3.05.01/share/tessdata/configs/alphanumeric"
 ```
 
 ### Ubuntu
@@ -63,11 +63,10 @@ apt-get install ghostscript
 apt-get install tesseract-ocr
 ```
 
-For the OCR to work, you need to have the tesseract-ocr binaries available on your path. If you only need to handle ASCII characters, the accuracy of the OCR process can be increased by limiting the tesseract output. To do this copy the *alphanumeric* file included with this module into the *tess-data* folder on your system. Also the eng.traineddata included with the standard tesseract-ocr package is out of date. This module provides an up-to-date version which you should copy into the appropriate location on your system.
+For the OCR to work, you need to have the tesseract-ocr binaries available on your path. If you only need to handle ASCII characters, the accuracy of the OCR process can be increased by limiting the tesseract output. To do this copy the *alphanumeric* file included with this module into the *tess-data* folder on your system.
 
 ``` bash
 cd <root of this module>
-cp "./share/eng.traineddata" "/usr/share/tesseract-ocr/tessdata/eng.traineddata"
 cp "./share/configs/alphanumeric" "/usr/share/tesseract-ocr/tessdata/configs/alphanumeric"
 ```
 

diff --git a/index.js b/index.js
@@ -0,0 +1,26 @@
+const path = require('path');
+const fs = require('fs');
+const PdfOcr = require('./lib');
+
+module.exports = function (pdfPath, options, cb) {
+    options = options || {};
+
+    if (!pdfPath) {
+        return cb('you must supply a pdf path as the first parameter and options for the second parameter.');
+    }
+
+    options.scanFirstPageOnly = options.scanFirstPageOnly || false;
+
+    const processor = new PdfOcr(options);
+
+    fs.exists(pdfPath, (exists) => {
+        if (!exists) {
+            return cb('no file exists at the path specified.');
+        }
+
+        processor.process(pdfPath, options);
+        cb();
+    });
+
+    return processor;
+}
diff --git a/lib/convertToTiff.js b/lib/convertToTiff.js
@@ -0,0 +1,37 @@
+const temp = require('temp');
+const path = require('path');
+const exec = require('child_process').exec
+const fs = require('fs');
+let pdfConvertQuality = 400; // default to density 400 for the convert command
+
+module.exports = (inputPath, quality, callback) => {
+    if (!callback || typeof callback !== 'function') {
+        callback = quality;   // callback must be the second parameter
+        quality = undefined;  // no option passed
+    }
+
+    fs.exists(inputPath, (exists) => {
+        if (!exists) { 
+            return callback(`error, no file exists at the path: ${inputPath}`); 
+        }
+
+        const outputPath = temp.path({ prefix: 'tif_output', suffix: '.tif' });
+
+        if (quality) {
+            if (typeof (quality) !== 'string' && typeof (quality) !== 'number') {
+                return callback(`error, pdf quality option must be a string, currently set as: ${typeof (quality)}`);
+            }
+
+            pdfConvertQuality = quality;
+        }
+
+        const cmd = 'gs -sDEVICE=tiffgray -r720x720 -g6120x7920 -sCompression=lzw -o "' + outputPath + '" "' + inputPath + '"';
+        const child = exec(cmd, (err, stderr, stdout) => {
+            if (err) {
+                return callback(err);
+            }
+
+            return callback(null, outputPath);
+        });
+    });
+}
diff --git a/lib/index.js b/lib/index.js
@@ -0,0 +1,110 @@
+const events = require('events');
+const pathHash = require('pathhash');
+const fs = require('fs');
+const async = require('async');
+const splitPdf = require('./splitPdf.js');
+const convertToTiff = require('./convertToTiff.js');
+const ocr = require('./ocr.js');
+
+module.exports = class PdfOcr extends events.EventEmitter {
+    constructor(options) {
+        super();
+        this.scanOptions = options || this.scanOptions;
+    }
+
+    process(pdfPath, options) {
+        const self = this;
+        const textPages = [];
+        let splitOutput;
+
+        options = options || { scanFirstPageOnly : false };
+        options.clean = options.clean || true;
+
+        fs.exists(pdfPath, (exists) => {
+            if (!exists) {
+                var error = `file does not exist at path: ${pdfPath}`;
+                self.emit('error', { error, pdfPath });
+                return;
+            }
+
+            pathHash(pdfPath, (error, hash) => {
+                if (error) {
+                    error = `error hashing file at path: ${pdfPath}. ${error}`;
+                    self.emit('error', { error, pdfPath });
+                    return;
+                }
+
+                splitPdf(pdfPath, this.scanOptions, (error, output) => {
+                    if (error) {
+                        self.emit('error', { error, pdfPath });
+                        return;
+                    }
+
+                    if (!output) {
+                        error = 'no files returned from split';
+                        self.emit('error', { error, pdfPath });
+                        return;
+                    }
+
+                    self.emit('log', `finished splitting pages for file at path: ${pdfPath}`);
+                    splitOutput = output;
+
+                    const pdfFiles = output.files;
+
+                    if (!pdfFiles || !pdfFiles.length) {
+                        error = 'error, no pages where found in pdf document';
+                        self.emit('error', { error, pdfPath });
+                        return;
+                    }
+
+                    let index = 0;
+                    let numPages = pdfFiles.length;
+                    const singlePagePdfFilePaths = [];
+
+                    async.forEachSeries(pdfFiles, (pdfFile, cb) => {
+                        const quality = options.quality ? options.quality : 300;
+
+                        convertToTiff(pdfFile.filePath, quality, (err, tiffPath) => {
+                            const zeroBasedNumPages = numPages - 1;
+
+                            self.emit('log', `converted page to tiff file, page ${index} of ${zeroBasedNumPages}`);
+
+                            if (err) { 
+                                return cb(err); 
+                            }
+
+                            const ocrFlags = options.ocrFlags ? options.ocrFlags : ['-psm 6'];
+
+                            ocr(tiffPath, ocrFlags, (err, text) => {
+                                fs.unlink(tiffPath, (tiffError, reply) => {
+                                    if (tiffError) {
+                                        err += `, error removing tif file: ${tiffError}`;
+                                    }
+
+                                    if (err) { 
+                                        return cb(err); 
+                                    }
+
+                                    self.emit('log', `raw ocr: page ${index} of ${zeroBasedNumPages} complete`);
+
+                                    singlePagePdfFilePaths.push(pdfFile.filePath);
+                                    self.emit('page', { hash, text, index, numPages, pdfPath, singlePagePdfPath: pdfFile.filePath });
+                                    textPages.push(text);
+                                    index++;
+                                    cb();
+                                });
+                            });
+                        });
+                    }, (err) => {
+                        if (err) {
+                            self.emit('error', err);
+                            return;
+                        }
+
+                        self.emit('complete', { hash, textPages, pdfPath, singlePagePdfFilePaths });
+                    });
+                });
+            });
+        });
+    }
+}
diff --git a/lib/ocr.js b/lib/ocr.js
@@ -0,0 +1,40 @@
+const temp = require('temp');
+const path = require('path');
+const exec = require('child_process').exec;
+const fs = require('fs');
+
+module.exports = function (inputPath, options, callback) {
+    if (!callback || typeof callback !== 'function') {
+        callback = options;
+        options = [];
+    }
+
+    fs.exists(inputPath, (exists) => {
+        if (!exists) {
+            return callback(`error, no file exists at the path you specified: ${inputPath}`);
+        }
+
+        const outputPath = temp.path({ prefix: 'ocr_output' });
+        const cmd = 'tesseract "' + inputPath + '" "' + outputPath + '" ' + options.join(' ');
+        const child = exec(cmd, (err) => {
+            if (err) {
+                return callback(err);
+            }
+
+            const textOutputPath = `${outputPath}.txt`;
+            fs.readFile(textOutputPath, 'utf8', (err, output) => {
+                if (err) { 
+                    return callback(err); 
+                }
+
+                fs.unlink(textOutputPath, (err) => {
+                    if (err) { 
+                        return callback(err);
+                    }
+
+                    callback(null, output);
+                });
+            });
+        });
+    });
+}
diff --git a/lib/splitPdf.js b/lib/splitPdf.js
@@ -0,0 +1,116 @@
+var path = require('path');
+var temp = require('temp');
+var exec = require('child_process').exec;
+var fs = require('fs');
+var walk = require('walk');
+var async = require('async');
+
+module.exports = (pdfPath, options, callback) => {
+
+    options.scanFirstPageOnly = options.scanFirstPageOnly || false;
+
+    function fileExist(filePath, callback) {
+        fs.exists(filePath, (exists) => {
+            if (!exists) {
+                return callback(`no file at path: ${filePath}`);
+            }
+
+            return callback();
+        });
+    }
+
+    function deleteDocData(callback) {
+        const folder = path.join(__dirname, '..');
+        const docDataPath = path.join(folder, 'doc_data.txt');
+
+        fs.exists(docDataPath, (exists) => {
+            if (!exists) {
+                return callback();
+            }
+
+            fs.unlink(docDataPath, callback);
+        });
+    }
+
+    function getPdfs(directoryPath, onlyFirstPage, callback) {
+        const filePaths = [];
+        const walker = walk.walk(directoryPath, { followLinks: false });
+        let files = null;
+
+        walker.on('file', (root, stat, next) => {
+            if (onlyFirstPage) {
+                if (stat.name.toLowerCase().endsWith('0001.pdf')) {
+                    const filePath = path.join(directoryPath, stat.name);
+                    filePaths.push({ filePath, fileName: stat.name });
+                }
+            } else {
+                if (stat.name.match(/\.pdf$/i)) {
+                    const filePath = path.join(directoryPath, stat.name);
+                    filePaths.push({ filePath, fileName: stat.name });
+                }
+            }
+
+            next();
+        });
+
+        walker.on('end', () => {
+            filePaths.sort((a, b) => {
+                if (a.fileName < b.fileName) {
+                    return -1;
+                }
+
+                if (a.fileName == b.fileName) {
+                    return 0;
+                }
+
+                return 1;
+            });
+
+            const output = {
+                folder: directoryPath,
+                files: filePaths
+            }
+
+            callback(null, output);
+
+            return;
+        });
+    }
+
+    fileExist(pdfPath, (err) => {
+        if (err) {
+            return callback(err);
+        }
+
+        var outputDir = temp.path({}, 'pdfPages');
+
+        fs.mkdir(outputDir, (err) => {
+            if (err) {
+                return callback(err, null);
+            }
+
+            const outputName = 'page%05d.pdf';
+            const outputPath = path.join(outputDir, outputName);
+            const cmd = `pdftk "${pdfPath}" burst output "${outputPath}"`;
+
+            const child = exec(cmd, (err, stdout, stderr) => {
+                if (err) {
+                    const outputErr = {
+                        message: 'an error occurred while splitting pdf into single pages with the pdftk burst command',
+                        error: err
+                    }
+
+                    return callback(outputErr, null);
+                }
+
+                deleteDocData((err, reply) => {
+                    if (err) { 
+                        return callback(err);
+                    }
+
+                    return getPdfs(outputDir, options.scanFirstPageOnly, callback);
+                });
+            });
+        });
+    });
+}
diff --git a/share/.DS_Store b/share/.DS_Store