v0.6.8: added '-c' ('--content') command line argument to output raw text content as a separate output file
modesty · Jul 20, 2014 · 4d35ce9 · 4d35ce9
1 parent ec960ba
commit 4d35ce9
Show file tree

Hide file tree

Showing 5 changed files with 107 additions and 35 deletions.
diff --git a/lib/p2jcmd.js b/lib/p2jcmd.js
@@ -3,9 +3,10 @@
 var fs = require('fs'),
     path = require('path'),
     _ = require('underscore'),
-    PFParser = require("../pdfparser"),
+    PDFParser = require("../pdfparser"),
     pkInfo = require('../package.json'),
-    nodeUtil = require("util");
+    nodeUtil = require("util"),
+    async = require("async");
 
 var optimist = require('optimist')
     .usage('\nUsage: $0 -f|--file [-o|output_dir]')
@@ -20,7 +21,9 @@ var optimist = require('optimist')
     .alias('s', 'silent')
     .describe('s', '(optional) when specified, will only log errors, otherwise verbose.\n')
     .alias('t', 'fieldTypes')
-    .describe('t', '(optional) when specified, will generate .fields.json that includes fields ids and types.\n');
+    .describe('t', '(optional) when specified, will generate .fields.json that includes fields ids and types.\n')
+    .alias('c', 'content')
+    .describe('c', '(optional) when specified, will generate .content.txt that includes text content from PDF (Experimental).\n');
 
 var argv = optimist.argv;
 
@@ -44,38 +47,65 @@ var PDF2JSONUtil = (function () {
             } else {
                 nodeUtil.p2jinfo(this.inputFile + " => " + fieldTypesFile + " [" + this.outputDir + "] OK");
             }
-            _continue.call(this, callback, err);
+            callback(err, fieldTypesFile);
         }.bind(this));
     };
 
+    var _generateRawTextContentFile = function(data, callback) {
+        var contentPath = this.outputPath.replace(".json", ".content.txt");
+        var contentFile = this.outputFile.replace(".json", ".content.txt");
+
+        fs.writeFile(contentPath, this.pdfParser.getRawTextContent(), function(err) {
+            if (err) {
+                nodeUtil.p2jwarn(this.inputFile + " => " + contentFile + " Exception: " + err);
+            } else {
+                nodeUtil.p2jinfo(this.inputFile + " => " + contentFile + " [" + this.outputDir + "] OK");
+            }
+            callback(err, contentFile);
+        }.bind(this));
+    };
+
+
     var _writeOneJSON = function(data, callback) {
         var pJSON = JSON.stringify({"formImage":data});
 
         fs.writeFile(this.outputPath, pJSON, function(err) {
             if(err) {
                 nodeUtil.p2jwarn(this.inputFile + " => " + this.outputFile + " Exception: " + err);
                 this.curProcessor.failedCount++;
-                _continue.call(this, callback, err);
             } else {
                 nodeUtil.p2jinfo(this.inputFile + " => " + this.outputFile + " [" + this.outputDir + "] OK");
                 this.curProcessor.successCount++;
-
-                if (_.has(argv, 't')) {//needs to generate fields.json file
-                    _generateFieldsTypesFile.call(this, data, callback);
-                }
-                else {
-                    _continue.call(this, callback);
-                }
             }
+            callback(err, this.outputFile);
         }.bind(this));
     };
 
     var _parseOnePDF = function(callback) {
-        this.pdfParser = new PFParser();
+        var processRawTextContent = _.has(argv, 'c');
+        var self = this;
+        this.pdfParser = new PDFParser(null, processRawTextContent);
 
         this.pdfParser.on("pdfParser_dataReady", function (evtData) {
             if ((!!evtData) && (!!evtData.data)) {
-                _writeOneJSON.call(this, evtData.data, callback);
+
+                var outputTasks = [function(cbFunc) { _writeOneJSON.call(self, evtData.data, cbFunc);}];
+                if (_.has(argv, 't')) {//needs to generate fields.json file
+                    outputTasks.push(function(cbFunc) {_generateFieldsTypesFile.call(self, evtData.data, cbFunc);});
+                }
+                if (processRawTextContent) {//needs to generate content.txt file
+                    outputTasks.push(function(cbFunc) {_generateRawTextContentFile.call(self, evtData.data, cbFunc);});
+                }
+
+                async.series(outputTasks, function(err, results){
+                    if (err) {
+                        nodeUtil.p2jwarn("Error: " + err);
+                    } else {
+                        nodeUtil.p2jinfo("Output files OK", results);
+                    }
+
+                    _continue.call(self, callback);
+                });
             }
             else {
                 this.curProcessor.failedCount++;

diff --git a/lib/pdf.js b/lib/pdf.js
@@ -214,7 +214,7 @@ var PDFJSClass = (function () {
     };
 
     // constructor
-    var cls = function () {
+    var cls = function (needRawText) {
         nodeEvents.EventEmitter.call(this);
         // private
         var _id = _nextId++;
@@ -225,14 +225,17 @@ var PDFJSClass = (function () {
 
         // public, this instance copies
         this.pdfDocument = null;
-        this.formImage = null;
+        this.pages = [];
+        this.pageWidth = 0;
+        this.rawTextContents = [];
+
+        this.needRawText = needRawText;
     };
     // inherit from event emitter
 	nodeUtil.inherits(cls, nodeEvents.EventEmitter);
 
     cls.prototype.parsePDFData = function(arrayBuffer) {
         this.pdfDocument = null;
-        this.formImage = null;
 
         var parameters = {password: '', data: arrayBuffer};
         var self = this;
@@ -274,9 +277,6 @@ var PDFJSClass = (function () {
     cls.prototype.load = function(pdfDocument, scale) {
         this.pdfDocument = pdfDocument;
 
-        this.pages = [];
-        this.pageWidth = 0;
-
         var pagesCount = pdfDocument.numPages;
         var pagePromises = [];
         for (var i = 1; i <= pagesCount; i++)
@@ -333,6 +333,18 @@ var PDFJSClass = (function () {
         var pdfPage = promisedPages[id];
         var pageParser = new PDFPageParser(pdfPage, id, scale, this.ptiParser);
 
+        function continueOnNextPage() {
+            nodeUtil.p2jinfo("complete parsing page:" + (id+1));
+            if (id === (self.pdfDocument.numPages - 1) ) {
+                self.emit("pdfjs_parseDataReady", {Pages:self.pages, Width: self.pageWidth});
+            }
+            else {
+                process.nextTick(function(){
+                    self.parsePage(promisedPages, ++id, scale);
+                });
+            }
+        };
+
         pageParser.parsePage(function() {
             if (!self.pageWidth)  //get PDF width
                 self.pageWidth = pageParser.width;
@@ -341,36 +353,54 @@ var PDFJSClass = (function () {
                 HLines: pageParser.HLines,
                 VLines: pageParser.VLines,
                 Fills:pageParser.Fills,
-                Content:pdfPage.getTextContent(),
+//needs to keep current default output format, text content will output to a separate file if '-c' command line argument is set
+//                Content:pdfPage.getTextContent(),
                 Texts: pageParser.Texts,
                 Fields: pageParser.Fields,
                 Boxsets: pageParser.Boxsets
             };
 
             self.pages.push(page);
 
-            if (id === (self.pdfDocument.numPages - 1) ) {
-                nodeUtil.p2jinfo("complete parsing page:" + (id+1));
-                self.emit("pdfjs_parseDataReady", {Pages:self.pages, Width: self.pageWidth});
+            if (self.needRawText) {
+                pdfPage.getTextContent().then(function(textContent){
+                    self.rawTextContents.push(textContent);
+                    nodeUtil.p2jinfo("complete parsing raw text content:" + (id+1));
+                    continueOnNextPage();
+                });
             }
             else {
-                process.nextTick(function(){
-                    self.parsePage(promisedPages, ++id, scale);
-                });
+                continueOnNextPage();
             }
         }, function(errMsg) {
             self.emit("pdfjs_parseDataError", errMsg);
         });
     };
 
+    cls.prototype.getRawTextContent = function() {
+        var retVal = "";
+        if (!this.needRawText)
+            return retVal;
+
+        _.each(this.rawTextContents, function(textContent, index) {
+            _.each(textContent.bidiTexts, function(textObj, idx) {
+                retVal += textObj.str + "\r\n";
+            });
+            retVal += "----------------Page (" + index + ") Break----------------\r\n";
+        });
+
+        return retVal;
+    };
+
     cls.prototype.destroy = function() {
         this.removeAllListeners();
 
         if (this.pdfDocument)
             this.pdfDocument.destroy();
         this.pdfDocument = null;
 
-        this.formImage = null;
+        this.pages = null;
+        this.rawTextContents = null;
     };
 
     return cls;

diff --git a/package.json b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "pdf2json",
-  "_id": "[email protected].7",
-  "version": "0.6.7",
+  "_id": "[email protected].8",
+  "version": "0.6.8",
   "description": "A PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js",
   "keywords": [
     "pdf",

diff --git a/pdfparser.js b/pdfparser.js
@@ -15,7 +15,7 @@ var PDFParser = (function () {
     var _maxBinBufferCount = 10;
 
     // constructor
-    var cls = function (context) {
+    var cls = function (context, needRawText) {
 		//call constructor for super class
 		nodeEvents.EventEmitter.call(this);
 
@@ -26,10 +26,12 @@ var PDFParser = (function () {
         this.get_id = function() { return _id; };
         this.get_name = function() { return _name + _id; };
 
+        // service context object, only used in Web Service project; null in command line
         this.context = context;
+
         this.pdfFilePath = null; //current PDF file to load and parse, null means loading/parsing not started
         this.data = null; //if file read success, data is PDF content; if failed, data is "err" object
-        this.PDFJS = new PDFJS();
+        this.PDFJS = new PDFJS(needRawText);
         this.parsePropCount = 0;
         this.processFieldInfoXML = false;//disable additional _fieldInfo.xml parsing and merging
     };
@@ -123,6 +125,10 @@ var PDFParser = (function () {
         startParsingPDF.call(this, pdfBuffer);
     };
 
+    cls.prototype.getRawTextContent = function() {
+        return this.PDFJS.getRawTextContent();
+    };
+
     cls.prototype.destroy = function() {
         this.removeAllListeners();
 

diff --git a/readme.md b/readme.md
@@ -115,6 +115,9 @@ load PDF file from specified file path asynchronously.
 If failed, event "pdfParser_dataError" will be raised with error object;
 If success, event "pdfParser_dataReady" will be raised with output data object, which can be saved as json file (in command line) or serialized to json when running in web service.
 
+## Output Text File
+
+Please refer to the "-c" command line argument (v0.6.8) at the bottom of this document.
 
 ## Output format Reference
 
@@ -670,7 +673,7 @@ Then run it in command line:
             or
             pdf2json -f [input directory or pdf file] -o [output directory]
 
-v0.5.4 added "-s" or "--silent" commandline parameter to suppress informative logging output. When using pdf2json as a commandline tool, the default verbosity is 5 (INFOS). While when running as a web service, default verbosity is 9 (ERRORS).
+v0.5.4 added the "-s" or "--silent" command line argument to suppress informative logging output. When pdf2json is used as a command line tool, the default verbosity is 5 (INFOS); when it runs as a web service, the default verbosity is 9 (ERRORS).
 Examples to suppress logging info from commandline:
 
             pdf2json -f [input directory or pdf file] -o [output directory] -s
@@ -685,7 +688,7 @@ Examples to turn on logging info in web service:
 
 v0.5.7 added the capability to skip input PDF files if filename begins with any one of "!@#$%^&*()+=[]\\\';,/{}|\":<>?~`.-_  ", usually these files are created by PDF authoring tools as backup files.
 
-v0.6.2 added "-t" command line switch to generate fields json file in addition to parsed json. The fields json file will contain one Array which contains fieldInfo object for each field, and each fieldInfo object will have 4 fields:
+v0.6.2 added "-t" command line argument to generate fields json file in addition to parsed json. The fields json file will contain one Array which contains fieldInfo object for each field, and each fieldInfo object will have 4 fields:
 * id: field ID
 * type: string name of field type, like radio, alpha, etc
 * calc: true if read only, otherwise false
@@ -700,7 +703,10 @@ Example of fields.json content:
             ...
             ]
 
-The fields.json output can be used to validate fields IDs with other data source, automation data and/or to extract data value from user submitted PDFs.
+The fields.json output can be used to validate field IDs against other data sources and/or to extract data values from user-submitted PDFs.
+
+v0.6.8 added the "-c" or "--content" command line argument to generate raw text content from a PDF. The text is written to a separate output file named <pdf_file_name>.content.txt.
+This feature was added in response to inquiries about retrieving raw text content from PDFs; it is experimental at this point and needs more testing.
 
 ## Run in a RESTful Web Service