Skip to content

Commit

Permalink
v0.6.8: added '-c'('--content') command line argument to output raw t…
Browse files Browse the repository at this point in the history
…ext content as separate output file
  • Loading branch information
modesty committed Jul 20, 2014
1 parent ec960ba commit 4d35ce9
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 35 deletions.
58 changes: 44 additions & 14 deletions lib/p2jcmd.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
var fs = require('fs'),
path = require('path'),
_ = require('underscore'),
PFParser = require("../pdfparser"),
PDFParser = require("../pdfparser"),
pkInfo = require('../package.json'),
nodeUtil = require("util");
nodeUtil = require("util"),
async = require("async");

var optimist = require('optimist')
.usage('\nUsage: $0 -f|--file [-o|output_dir]')
Expand All @@ -20,7 +21,9 @@ var optimist = require('optimist')
.alias('s', 'silent')
.describe('s', '(optional) when specified, will only log errors, otherwise verbose.\n')
.alias('t', 'fieldTypes')
.describe('t', '(optional) when specified, will generate .fields.json that includes fields ids and types.\n');
.describe('t', '(optional) when specified, will generate .fields.json that includes fields ids and types.\n')
.alias('c', 'content')
.describe('c', '(optional) when specified, will generate .content.txt that includes text content from PDF (Experimental).\n');

var argv = optimist.argv;

Expand All @@ -44,38 +47,65 @@ var PDF2JSONUtil = (function () {
} else {
nodeUtil.p2jinfo(this.inputFile + " => " + fieldTypesFile + " [" + this.outputDir + "] OK");
}
_continue.call(this, callback, err);
callback(err, fieldTypesFile);
}.bind(this));
};

/**
 * Writes the raw text content held by the PDF parser to a ".content.txt" file
 * whose name is derived from the JSON output name.
 *
 * Must be called with `this` bound to the object carrying inputFile, outputFile,
 * outputPath, outputDir and pdfParser.
 *
 * @param {Object} data - parsed PDF data; unused here, kept so the signature matches the other output writers
 * @param {Function} callback - invoked as callback(err, contentFile) once the write has finished
 */
var _generateRawTextContentFile = function(data, callback) {
    // Swap only the LAST ".json" in the name. A plain String.replace() with a string
    // pattern rewrites the first match, which corrupts the target when a directory
    // or the file stem itself contains ".json" (e.g. "out.json/foo.json").
    var toContentName = function(name) {
        var jsonExt = ".json";
        var extPos = name.lastIndexOf(jsonExt);
        if (extPos < 0) {
            return name; // nothing to swap, same result as the previous replace() call
        }
        return name.slice(0, extPos) + ".content.txt" + name.slice(extPos + jsonExt.length);
    };

    var contentPath = toContentName(this.outputPath);
    var contentFile = toContentName(this.outputFile);

    fs.writeFile(contentPath, this.pdfParser.getRawTextContent(), function(err) {
        if (err) {
            nodeUtil.p2jwarn(this.inputFile + " => " + contentFile + " Exception: " + err);
        } else {
            nodeUtil.p2jinfo(this.inputFile + " => " + contentFile + " [" + this.outputDir + "] OK");
        }
        // Always report back, success or failure, so the caller's task chain can continue.
        callback(err, contentFile);
    }.bind(this));
};


var _writeOneJSON = function(data, callback) {
var pJSON = JSON.stringify({"formImage":data});

fs.writeFile(this.outputPath, pJSON, function(err) {
if(err) {
nodeUtil.p2jwarn(this.inputFile + " => " + this.outputFile + " Exception: " + err);
this.curProcessor.failedCount++;
_continue.call(this, callback, err);
} else {
nodeUtil.p2jinfo(this.inputFile + " => " + this.outputFile + " [" + this.outputDir + "] OK");
this.curProcessor.successCount++;

if (_.has(argv, 't')) {//needs to generate fields.json file
_generateFieldsTypesFile.call(this, data, callback);
}
else {
_continue.call(this, callback);
}
}
callback(err, this.outputFile);
}.bind(this));
};

var _parseOnePDF = function(callback) {
this.pdfParser = new PFParser();
var processRawTextContent = _.has(argv, 'c');
var self = this;
this.pdfParser = new PDFParser(null, processRawTextContent);

this.pdfParser.on("pdfParser_dataReady", function (evtData) {
if ((!!evtData) && (!!evtData.data)) {
_writeOneJSON.call(this, evtData.data, callback);

var outputTasks = [function(cbFunc) { _writeOneJSON.call(self, evtData.data, cbFunc);}];
if (_.has(argv, 't')) {//needs to generate fields.json file
outputTasks.push(function(cbFunc) {_generateFieldsTypesFile.call(self, evtData.data, cbFunc);});
}
if (processRawTextContent) {//needs to generate content.txt file
outputTasks.push(function(cbFunc) {_generateRawTextContentFile.call(self, evtData.data, cbFunc);});
}

async.series(outputTasks, function(err, results){
if (err) {
nodeUtil.p2jwarn("Error: " + err);
} else {
nodeUtil.p2jinfo("Output files OK", results);
}

_continue.call(self, callback);
});
}
else {
this.curProcessor.failedCount++;
Expand Down
58 changes: 44 additions & 14 deletions lib/pdf.js
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ var PDFJSClass = (function () {
};

// constructor
var cls = function () {
var cls = function (needRawText) {
nodeEvents.EventEmitter.call(this);
// private
var _id = _nextId++;
Expand All @@ -225,14 +225,17 @@ var PDFJSClass = (function () {

// public, this instance copies
this.pdfDocument = null;
this.formImage = null;
this.pages = [];
this.pageWidth = 0;
this.rawTextContents = [];

this.needRawText = needRawText;
};
// inherit from event emitter
nodeUtil.inherits(cls, nodeEvents.EventEmitter);

cls.prototype.parsePDFData = function(arrayBuffer) {
this.pdfDocument = null;
this.formImage = null;

var parameters = {password: '', data: arrayBuffer};
var self = this;
Expand Down Expand Up @@ -274,9 +277,6 @@ var PDFJSClass = (function () {
cls.prototype.load = function(pdfDocument, scale) {
this.pdfDocument = pdfDocument;

this.pages = [];
this.pageWidth = 0;

var pagesCount = pdfDocument.numPages;
var pagePromises = [];
for (var i = 1; i <= pagesCount; i++)
Expand Down Expand Up @@ -333,6 +333,18 @@ var PDFJSClass = (function () {
var pdfPage = promisedPages[id];
var pageParser = new PDFPageParser(pdfPage, id, scale, this.ptiParser);

// Logs that the current page is done, then either emits the final
// "pdfjs_parseDataReady" event (last page) or schedules parsing of the next page.
function continueOnNextPage() {
    nodeUtil.p2jinfo("complete parsing page:" + (id+1));

    var lastPageIndex = self.pdfDocument.numPages - 1;
    if (id === lastPageIndex) {
        self.emit("pdfjs_parseDataReady", {Pages:self.pages, Width: self.pageWidth});
        return;
    }

    // Defer to the next tick rather than recursing synchronously.
    process.nextTick(function(){
        self.parsePage(promisedPages, ++id, scale);
    });
}

pageParser.parsePage(function() {
if (!self.pageWidth) //get PDF width
self.pageWidth = pageParser.width;
Expand All @@ -341,36 +353,54 @@ var PDFJSClass = (function () {
HLines: pageParser.HLines,
VLines: pageParser.VLines,
Fills:pageParser.Fills,
Content:pdfPage.getTextContent(),
//needs to keep current default output format, text content will output to a separate file if '-c' command line argument is set
// Content:pdfPage.getTextContent(),
Texts: pageParser.Texts,
Fields: pageParser.Fields,
Boxsets: pageParser.Boxsets
};

self.pages.push(page);

if (id === (self.pdfDocument.numPages - 1) ) {
nodeUtil.p2jinfo("complete parsing page:" + (id+1));
self.emit("pdfjs_parseDataReady", {Pages:self.pages, Width: self.pageWidth});
if (self.needRawText) {
pdfPage.getTextContent().then(function(textContent){
self.rawTextContents.push(textContent);
nodeUtil.p2jinfo("complete parsing raw text content:" + (id+1));
continueOnNextPage();
});
}
else {
process.nextTick(function(){
self.parsePage(promisedPages, ++id, scale);
});
continueOnNextPage();
}
}, function(errMsg) {
self.emit("pdfjs_parseDataError", errMsg);
});
};

// Builds one string from the collected per-page text content: every text item's
// `str` on its own CRLF-terminated line, followed by a page-break marker line
// carrying the page's index in rawTextContents. Returns "" when raw text was not requested.
cls.prototype.getRawTextContent = function() {
    if (!this.needRawText) {
        return "";
    }

    var chunks = [];
    _.each(this.rawTextContents, function(pageText, pageIdx) {
        _.each(pageText.bidiTexts, function(textItem) {
            chunks.push(textItem.str + "\r\n");
        });
        chunks.push("----------------Page (" + pageIdx + ") Break----------------\r\n");
    });

    return chunks.join("");
};

// Releases everything this instance holds: event listeners, the loaded PDF
// document and the parsed per-page data.
cls.prototype.destroy = function() {
// detach every listener registered on this emitter
this.removeAllListeners();

// the document is only set once loading has started, so guard before destroying it
if (this.pdfDocument)
this.pdfDocument.destroy();
this.pdfDocument = null;

// drop references to parsed output
this.formImage = null;
this.pages = null;
this.rawTextContents = null;
};

return cls;
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "pdf2json",
"_id": "[email protected].7",
"version": "0.6.7",
"_id": "[email protected].8",
"version": "0.6.8",
"description": "A PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js",
"keywords": [
"pdf",
Expand Down
10 changes: 8 additions & 2 deletions pdfparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ var PDFParser = (function () {
var _maxBinBufferCount = 10;

// constructor
var cls = function (context) {
var cls = function (context, needRawText) {
//call constructor for super class
nodeEvents.EventEmitter.call(this);

Expand All @@ -26,10 +26,12 @@ var PDFParser = (function () {
this.get_id = function() { return _id; };
this.get_name = function() { return _name + _id; };

// service context object, only used in Web Service project; null in command line
this.context = context;

this.pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
this.data = null; //if file read success, data is PDF content; if failed, data is "err" object
this.PDFJS = new PDFJS();
this.PDFJS = new PDFJS(needRawText);
this.parsePropCount = 0;
this.processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging
};
Expand Down Expand Up @@ -123,6 +125,10 @@ var PDFParser = (function () {
startParsingPDF.call(this, pdfBuffer);
};

// Returns the raw text content by delegating to the underlying PDFJS instance.
cls.prototype.getRawTextContent = function() {
    var rawText = this.PDFJS.getRawTextContent();
    return rawText;
};

cls.prototype.destroy = function() {
this.removeAllListeners();

Expand Down
12 changes: 9 additions & 3 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ load PDF file from specified file path asynchronously.
If failed, event "pdfParser_dataError" will be raised with error object;
If success, event "pdfParser_dataReady" will be raised with output data object, which can be saved as json file (in command line) or serialized to json when running in web service.

## Output Text File

Please refer to the "-c" command line argument (v0.6.8) at the bottom of this document.

## Output format Reference

Expand Down Expand Up @@ -670,7 +673,7 @@ Then run it in command line:
or
pdf2json -f [input directory or pdf file] -o [output directory]

v0.5.4 added "-s" or "--silent" commandline parameter to suppress informative logging output. When using pdf2json as a commandline tool, the default verbosity is 5 (INFOS). While when running as a web service, default verbosity is 9 (ERRORS).
v0.5.4 added "-s" or "--silent" command line argument to suppress informative logging output. When using pdf2json as a command line tool, the default verbosity is 5 (INFOS), whereas when running as a web service the default verbosity is 9 (ERRORS).
Examples to suppress logging info from commandline:

pdf2json -f [input directory or pdf file] -o [output directory] -s
Expand All @@ -685,7 +688,7 @@ Examples to turn on logging info in web service:

v0.5.7 added the capability to skip input PDF files if filename begins with any one of "!@#$%^&*()+=[]\\\';,/{}|\":<>?~`.-_ ", usually these files are created by PDF authoring tools as backup files.

v0.6.2 added "-t" command line switch to generate fields json file in addition to parsed json. The fields json file will contain one Array which contains fieldInfo object for each field, and each fieldInfo object will have 4 fields:
v0.6.2 added "-t" command line argument to generate fields json file in addition to parsed json. The fields json file will contain one Array which contains fieldInfo object for each field, and each fieldInfo object will have 4 fields:
* id: field ID
* type: string name of field type, like radio, alpha, etc
* calc: true if read only, otherwise false
Expand All @@ -700,7 +703,10 @@ Example of fields.json content:
...
]

The fields.json output can be used to validate fields IDs with other data source, automation data and/or to extract data value from user submitted PDFs.
The fields.json output can be used to validate fields IDs with other data source, and/or to extract data value from user submitted PDFs.

v0.6.8 added "-c" or "--content" command line argument to generate raw text content from PDF. The text is written to a separate output file named <pdf_file_name>.content.txt.
This feature was added in response to inquiries about retrieving raw text content from PDF. It is experimental at this point and needs more testing.

## Run in a RESTful Web Service

Expand Down

0 comments on commit 4d35ce9

Please sign in to comment.