diff --git a/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.cpp b/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.cpp index 541be59..e427775 100644 --- a/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.cpp +++ b/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.cpp @@ -12,6 +12,7 @@ #include "IPDFRecursiveInterpreterHandler.h" #include +#include using namespace std; using namespace PDFHummus; @@ -109,7 +110,7 @@ PDFObjectParser* InterpreterContext::GetObjectParser() { PDFRecursiveInterpreter::PDFRecursiveInterpreter(void) { - + mNestingContext = NULL; } PDFRecursiveInterpreter::~PDFRecursiveInterpreter(void) { @@ -235,6 +236,18 @@ bool PDFRecursiveInterpreter::InterpretContentStream( LongFilePositionType currentPosition = inParser->GetParserStream()->GetCurrentPosition(); PDFObjectCastPtr xobjectRef = inContext->FindResource(formName, "XObject"); ObjectIDType formObjectID = !xobjectRef ? 0 : xobjectRef->mObjectID; + if(!!mNestingContext) { + ObjectIDTypeList::iterator itFindInStack = find(mNestingContext->nestedXObjects.begin(), mNestingContext->nestedXObjects.end(), formObjectID); + if(itFindInStack != mNestingContext->nestedXObjects.end()) { + // orcish mischief! looping. halt + shouldContinue = false; + break; + } + + // add this form to the nesting stack + mNestingContext->nestedXObjects.push_back(formObjectID); + } + PDFObjectCastPtr formObject(inParser->ParseNewObject(formObjectID)); if(!!formObject && IsForm(formObject.GetPtr())) { bool shouldRecurse = inHandler->OnXObjectDoStart(formName, formObjectID, formObject.GetPtr(), inParser); @@ -248,6 +261,11 @@ bool PDFRecursiveInterpreter::InterpretContentStream( } inHandler->OnXObjectDoEnd(formName, formObjectID, formObject.GetPtr(), inParser); } + + if(!!mNestingContext) { + mNestingContext->nestedXObjects.pop_back(); + } + // restore stream position (hopefully this is enough to continue from where we were...) 
inParser->GetParserStream()->SetPosition(currentPosition); @@ -294,13 +312,48 @@ bool PDFRecursiveInterpreter::InterpretPageContents( } bool PDFRecursiveInterpreter::InterpretXObjectContents( + PDFParser* inParser, + PDFStreamInput* inXObject, + IPDFRecursiveInterpreterHandler* inHandler) { + // root levels xobject content interpretation, context created here + PDFNestingContext rootNestingContext; + + mNestingContext = &rootNestingContext; + bool result = InterpretXObjectContentsInternal( + inParser, + inXObject, + inHandler + ); + mNestingContext = NULL; + return result; +} + +bool PDFRecursiveInterpreter::InterpretXObjectContents( + PDFParser* inParser, + PDFStreamInput* inXObject, + IPDFRecursiveInterpreterHandler* inHandler, + PDFNestingContext* inNestingContext) { + + // lower levels xobject content interpretation, context coming from higher levels + mNestingContext = inNestingContext; + bool result = InterpretXObjectContentsInternal( + inParser, + inXObject, + inHandler + ); + mNestingContext = NULL; + return result; +} + +bool PDFRecursiveInterpreter::InterpretXObjectContentsInternal( PDFParser* inParser, PDFStreamInput* inXObject, IPDFRecursiveInterpreterHandler* inHandler) { RefCountPtr xobjectDict(inXObject->QueryStreamDictionary()); - + InterpreterContext context(inParser, xobjectDict.GetPtr()); inHandler->OnResourcesRead(&context); return InterpretContentStream(inParser, xobjectDict.GetPtr(), inParser->StartReadingObjectsFromStream(inXObject),&context, inHandler); -} \ No newline at end of file +} + diff --git a/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.h b/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.h index 202a094..71878e7 100644 --- a/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.h +++ b/TextExtraction/lib/interpreter/PDFRecursiveInterpreter.h @@ -1,6 +1,11 @@ #pragma once +#include + +#include "IOBasicTypes.h" #include "IPDFRecursiveInterpreterHandler.h" +typedef std::list ObjectIDTypeList; + class PDFParser; class 
PDFDictionary; class PDFStreamInput; @@ -23,6 +28,25 @@ class PDFRecursiveInterpreter { IPDFRecursiveInterpreterHandler* inHandler); private: + struct PDFNestingContext { + ObjectIDTypeList nestedXObjects; + }; + + PDFNestingContext* mNestingContext; + + // internal method used by higher level interpreters to call lower level xobject interpreters with nesting context + bool InterpretXObjectContents( + PDFParser* inParser, + PDFStreamInput* inXObject, + IPDFRecursiveInterpreterHandler* inHandler, + PDFNestingContext* inNestingContext); + + // internal method for interpreting xobjects + bool InterpretXObjectContentsInternal( + PDFParser* inParser, + PDFStreamInput* inXObject, + IPDFRecursiveInterpreterHandler* inHandler); + bool InterpretContentStream( PDFParser* inParser, PDFDictionary* inContentParent,