-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
646 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
const path = require('path'); | ||
const fs = require('fs'); | ||
const PdfOcr = require('./lib'); | ||
|
||
module.exports = function (pdfPath, options, cb) { | ||
options = options || {}; | ||
|
||
if (!pdfPath) { | ||
return cb('you must supply a pdf path as the first parameter and options for the second parameter.'); | ||
} | ||
|
||
options.scanFirstPageOnly = options.scanFirstPageOnly || false; | ||
|
||
const processor = new PdfOcr(options); | ||
|
||
fs.exists(pdfPath, (exists) => { | ||
if (!exists) { | ||
return cb('no file exists at the path specified.'); | ||
} | ||
|
||
processor.process(pdfPath, options); | ||
cb(); | ||
}); | ||
|
||
return processor; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
const temp = require('temp'); | ||
const path = require('path'); | ||
const exec = require('child_process').exec | ||
const fs = require('fs'); | ||
let pdfConvertQuality = 400; // default to density 400 for the convert command | ||
|
||
module.exports = (inputPath, quality, callback) => { | ||
if (!callback || typeof callback !== 'function') { | ||
callback = quality; // callback must be the second parameter | ||
quality = undefined; // no option passed | ||
} | ||
|
||
fs.exists(inputPath, (exists) => { | ||
if (!exists) { | ||
return callback(`error, no file exists at the path: ${inputPath}`); | ||
} | ||
|
||
const outputPath = temp.path({ prefix: 'tif_output', suffix: '.tif' }); | ||
|
||
if (quality) { | ||
if (typeof (quality) !== 'string' && typeof (quality) !== 'number') { | ||
return callback(`error, pdf quality option must be a string, currently set as: ${typeof (quality)}`); | ||
} | ||
|
||
pdfConvertQuality = quality; | ||
} | ||
|
||
const cmd = 'gs -sDEVICE=tiffgray -r720x720 -g6120x7920 -sCompression=lzw -o "' + outputPath + '" "' + inputPath + '"'; | ||
const child = exec(cmd, (err, stderr, stdout) => { | ||
if (err) { | ||
return callback(err); | ||
} | ||
|
||
return callback(null, outputPath); | ||
}); | ||
}); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
const events = require('events'); | ||
const pathHash = require('pathhash'); | ||
const fs = require('fs'); | ||
const async = require('async'); | ||
const splitPdf = require('./splitPdf.js'); | ||
const convertToTiff = require('./convertToTiff.js'); | ||
const ocr = require('./ocr.js'); | ||
|
||
module.exports = class PdfOcr extends events.EventEmitter { | ||
constructor(options) { | ||
super(); | ||
this.scanOptions = options || this.scanOptions; | ||
} | ||
|
||
process(pdfPath, options) { | ||
const self = this; | ||
const textPages = []; | ||
let splitOutput; | ||
|
||
options = options || { scanFirstPageOnly : false }; | ||
options.clean = options.clean || true; | ||
|
||
fs.exists(pdfPath, (exists) => { | ||
if (!exists) { | ||
var error = `file does not exist at path: ${pdfPath}`; | ||
self.emit('error', { error, pdfPath }); | ||
return; | ||
} | ||
|
||
pathHash(pdfPath, (error, hash) => { | ||
if (error) { | ||
error = `error hashing file at path: ${pdfPath}. ${error}`; | ||
self.emit('error', { error, pdfPath }); | ||
return; | ||
} | ||
|
||
splitPdf(pdfPath, this.scanOptions, (error, output) => { | ||
if (error) { | ||
self.emit('error', { error, pdfPath }); | ||
return; | ||
} | ||
|
||
if (!output) { | ||
error = 'no files returned from split'; | ||
self.emit('error', { error, pdfPath }); | ||
return; | ||
} | ||
|
||
self.emit('log', `finished splitting pages for file at path: ${pdfPath}`); | ||
splitOutput = output; | ||
|
||
const pdfFiles = output.files; | ||
|
||
if (!pdfFiles || !pdfFiles.length) { | ||
error = 'error, no pages where found in pdf document'; | ||
self.emit('error', { error, pdfPath }); | ||
return; | ||
} | ||
|
||
let index = 0; | ||
let numPages = pdfFiles.length; | ||
const singlePagePdfFilePaths = []; | ||
|
||
async.forEachSeries(pdfFiles, (pdfFile, cb) => { | ||
const quality = options.quality ? options.quality : 300; | ||
|
||
convertToTiff(pdfFile.filePath, quality, (err, tiffPath) => { | ||
const zeroBasedNumPages = numPages - 1; | ||
|
||
self.emit('log', `converted page to tiff file, page ${index} of ${zeroBasedNumPages}`); | ||
|
||
if (err) { | ||
return cb(err); | ||
} | ||
|
||
const ocrFlags = options.ocrFlags ? options.ocrFlags : ['-psm 6']; | ||
|
||
ocr(tiffPath, ocrFlags, (err, text) => { | ||
fs.unlink(tiffPath, (tiffError, reply) => { | ||
if (tiffError) { | ||
err += `, error removing tif file: ${tiffError}`; | ||
} | ||
|
||
if (err) { | ||
return cb(err); | ||
} | ||
|
||
self.emit('log', `raw ocr: page ${index} of ${zeroBasedNumPages} complete`); | ||
|
||
singlePagePdfFilePaths.push(pdfFile.filePath); | ||
self.emit('page', { hash, text, index, numPages, pdfPath, singlePagePdfPath: pdfFile.filePath }); | ||
textPages.push(text); | ||
index++; | ||
cb(); | ||
}); | ||
}); | ||
}); | ||
}, (err) => { | ||
if (err) { | ||
self.emit('error', err); | ||
return; | ||
} | ||
|
||
self.emit('complete', { hash, textPages, pdfPath, singlePagePdfFilePaths }); | ||
}); | ||
}); | ||
}); | ||
}); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
const temp = require('temp'); | ||
const path = require('path'); | ||
const exec = require('child_process').exec; | ||
const fs = require('fs'); | ||
|
||
module.exports = function (inputPath, options, callback) { | ||
if (!callback || typeof callback !== 'function') { | ||
callback = options; | ||
options = []; | ||
} | ||
|
||
fs.exists(inputPath, (exists) => { | ||
if (!exists) { | ||
return callback(`error, no file exists at the path you specified: ${inputPath}`); | ||
} | ||
|
||
const outputPath = temp.path({ prefix: 'ocr_output' }); | ||
const cmd = 'tesseract "' + inputPath + '" "' + outputPath + '" ' + options.join(' '); | ||
const child = exec(cmd, (err) => { | ||
if (err) { | ||
return callback(err); | ||
} | ||
|
||
const textOutputPath = `${outputPath}.txt`; | ||
fs.readFile(textOutputPath, 'utf8', (err, output) => { | ||
if (err) { | ||
return callback(err); | ||
} | ||
|
||
fs.unlink(textOutputPath, (err) => { | ||
if (err) { | ||
return callback(err); | ||
} | ||
|
||
callback(null, output); | ||
}); | ||
}); | ||
}); | ||
}); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
var path = require('path'); | ||
var temp = require('temp'); | ||
var exec = require('child_process').exec; | ||
var fs = require('fs'); | ||
var walk = require('walk'); | ||
var async = require('async'); | ||
|
||
module.exports = (pdfPath, options, callback) => { | ||
|
||
options.scanFirstPageOnly = options.scanFirstPageOnly || false; | ||
|
||
function fileExist(filePath, callback) { | ||
fs.exists(filePath, (exists) => { | ||
if (!exists) { | ||
return callback(`no file at path: ${filePath}`); | ||
} | ||
|
||
return callback(); | ||
}); | ||
} | ||
|
||
function deleteDocData(callback) { | ||
const folder = path.join(__dirname, '..'); | ||
const docDataPath = path.join(folder, 'doc_data.txt'); | ||
|
||
fs.exists(docDataPath, (exists) => { | ||
if (!exists) { | ||
return callback(); | ||
} | ||
|
||
fs.unlink(docDataPath, callback); | ||
}); | ||
} | ||
|
||
function getPdfs(directoryPath, onlyFirstPage, callback) { | ||
const filePaths = []; | ||
const walker = walk.walk(directoryPath, { followLinks: false }); | ||
let files = null; | ||
|
||
walker.on('file', (root, stat, next) => { | ||
if (onlyFirstPage) { | ||
if (stat.name.toLowerCase().endsWith('0001.pdf')) { | ||
const filePath = path.join(directoryPath, stat.name); | ||
filePaths.push({ filePath, fileName: stat.name }); | ||
} | ||
} else { | ||
if (stat.name.match(/\.pdf$/i)) { | ||
const filePath = path.join(directoryPath, stat.name); | ||
filePaths.push({ filePath, fileName: stat.name }); | ||
} | ||
} | ||
|
||
next(); | ||
}); | ||
|
||
walker.on('end', () => { | ||
filePaths.sort((a, b) => { | ||
if (a.fileName < b.fileName) { | ||
return -1; | ||
} | ||
|
||
if (a.fileName == b.fileName) { | ||
return 0; | ||
} | ||
|
||
return 1; | ||
}); | ||
|
||
const output = { | ||
folder: directoryPath, | ||
files: filePaths | ||
} | ||
|
||
callback(null, output); | ||
|
||
return; | ||
}); | ||
} | ||
|
||
fileExist(pdfPath, (err) => { | ||
if (err) { | ||
return callback(err); | ||
} | ||
|
||
var outputDir = temp.path({}, 'pdfPages'); | ||
|
||
fs.mkdir(outputDir, (err) => { | ||
if (err) { | ||
return callback(err, null); | ||
} | ||
|
||
const outputName = 'page%05d.pdf'; | ||
const outputPath = path.join(outputDir, outputName); | ||
const cmd = `pdftk "${pdfPath}" burst output "${outputPath}"`; | ||
|
||
const child = exec(cmd, (err, stdout, stderr) => { | ||
if (err) { | ||
const outputErr = { | ||
message: 'an error occurred while splitting pdf into single pages with the pdftk burst command', | ||
error: err | ||
} | ||
|
||
return callback(outputErr, null); | ||
} | ||
|
||
deleteDocData((err, reply) => { | ||
if (err) { | ||
return callback(err); | ||
} | ||
|
||
return getPdfs(outputDir, options.scanFirstPageOnly, callback); | ||
}); | ||
}); | ||
}); | ||
}); | ||
} |
Binary file not shown.
Oops, something went wrong.