Skip to content

Commit

Permalink
feat: adding nesting check for xobjects
Browse files Browse the repository at this point in the history
It should be possible to tailor a PDF that causes an endless loop in this code by using nested xobjects that call each other. This MR deals with that by checking a nesting stack.
  • Loading branch information
galkahana committed Jan 6, 2024
1 parent 380efa1 commit 14fb9bf
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 3 deletions.
59 changes: 56 additions & 3 deletions TextExtraction/lib/interpreter/PDFRecursiveInterpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "IPDFRecursiveInterpreterHandler.h"

#include <string>
#include <algorithm>

using namespace std;
using namespace PDFHummus;
Expand Down Expand Up @@ -109,7 +110,7 @@ PDFObjectParser* InterpreterContext::GetObjectParser() {


// A newly constructed interpreter has no nesting context attached; one is
// installed only while an xobject interpretation is running.
PDFRecursiveInterpreter::PDFRecursiveInterpreter(void)
    : mNestingContext(NULL) {
}

PDFRecursiveInterpreter::~PDFRecursiveInterpreter(void) {
Expand Down Expand Up @@ -235,6 +236,18 @@ bool PDFRecursiveInterpreter::InterpretContentStream(
LongFilePositionType currentPosition = inParser->GetParserStream()->GetCurrentPosition();
PDFObjectCastPtr<PDFIndirectObjectReference> xobjectRef = inContext->FindResource(formName, "XObject");
ObjectIDType formObjectID = !xobjectRef ? 0 : xobjectRef->mObjectID;
if(!!mNestingContext) {
ObjectIDTypeList::iterator itFindInStack = find(mNestingContext->nestedXObjects.begin(), mNestingContext->nestedXObjects.end(), formObjectID);
if(itFindInStack != mNestingContext->nestedXObjects.end()) {
// orcish mischief! looping. halt
shouldContinue = false;
break;
}

// add this form to the nesting stack
mNestingContext->nestedXObjects.push_back(formObjectID);
}

PDFObjectCastPtr<PDFStreamInput> formObject(inParser->ParseNewObject(formObjectID));
if(!!formObject && IsForm(formObject.GetPtr())) {
bool shouldRecurse = inHandler->OnXObjectDoStart(formName, formObjectID, formObject.GetPtr(), inParser);
Expand All @@ -248,6 +261,11 @@ bool PDFRecursiveInterpreter::InterpretContentStream(
}
inHandler->OnXObjectDoEnd(formName, formObjectID, formObject.GetPtr(), inParser);
}

if(!!mNestingContext) {
mNestingContext->nestedXObjects.pop_back();
}


// restore stream position (hopefully this is enough to continue from where we were...)
inParser->GetParserStream()->SetPosition(currentPosition);
Expand Down Expand Up @@ -294,13 +312,48 @@ bool PDFRecursiveInterpreter::InterpretPageContents(
}

// Root level entry point for interpreting the content of an xobject.
//
// Creates a fresh, empty nesting context that lives for the duration of this call.
// While it is installed, content stream interpretation records the IDs of the
// xobjects it descends into and halts when an xobject is found to invoke itself
// (directly or through other xobjects), so a crafted PDF cannot cause endless
// recursion.
//
// inParser  - parser used to read the xobject stream
// inXObject - the xobject stream whose content is interpreted
// inHandler - handler that receives the interpretation callbacks
//
// Returns the result of interpreting the xobject content.
bool PDFRecursiveInterpreter::InterpretXObjectContents(
    PDFParser* inParser,
    PDFStreamInput* inXObject,
    IPDFRecursiveInterpreterHandler* inHandler) {
    // root level xobject content interpretation, context created here
    PDFNestingContext rootNestingContext;

    // Keep whatever context was active before this call and put it back afterwards,
    // instead of unconditionally clearing it. If a handler callback re-enters this
    // method on the same interpreter, the outer interpretation keeps its nesting
    // stack (and its loop detection) once the inner call returns.
    PDFNestingContext* previousNestingContext = mNestingContext;

    mNestingContext = &rootNestingContext;
    bool result = InterpretXObjectContentsInternal(
        inParser,
        inXObject,
        inHandler
    );
    // rootNestingContext is about to go out of scope; never leave the member pointing at it
    mNestingContext = previousNestingContext;
    return result;
}

// Lower level entry point for interpreting the content of an xobject, used when
// the nesting context is supplied by a higher level interpretation.
//
// inNestingContext is installed as this interpreter's nesting context for the
// duration of the call, so the nested interpretation shares the caller's stack of
// xobject IDs and looping xobjects can be detected across levels.
//
// inParser         - parser used to read the xobject stream
// inXObject        - the xobject stream whose content is interpreted
// inHandler        - handler that receives the interpretation callbacks
// inNestingContext - nesting context coming from the higher level; when NULL the
//                    nesting check in content stream interpretation is skipped
//
// Returns the result of interpreting the xobject content.
bool PDFRecursiveInterpreter::InterpretXObjectContents(
    PDFParser* inParser,
    PDFStreamInput* inXObject,
    IPDFRecursiveInterpreterHandler* inHandler,
    PDFNestingContext* inNestingContext) {

    // Remember the context that was active before this call and restore it on the
    // way out, rather than always resetting to NULL. On a fresh interpreter the
    // previous value is NULL, so behaviour is unchanged; on an interpreter that is
    // already in the middle of an interpretation, the outer context survives.
    PDFNestingContext* previousNestingContext = mNestingContext;

    // lower level xobject content interpretation, context coming from higher levels
    mNestingContext = inNestingContext;
    bool result = InterpretXObjectContentsInternal(
        inParser,
        inXObject,
        inHandler
    );
    mNestingContext = previousNestingContext;
    return result;
}

// Common implementation behind the InterpretXObjectContents overloads.
// Builds an interpreter context from the xobject's stream dictionary, lets the
// handler know about it through OnResourcesRead, and then interprets the
// xobject's content stream. Returns what the content stream interpretation returns.
bool PDFRecursiveInterpreter::InterpretXObjectContentsInternal(
    PDFParser* inParser,
    PDFStreamInput* inXObject,
    IPDFRecursiveInterpreterHandler* inHandler) {
    // the ref-counted holder keeps the stream dictionary alive until this function returns
    RefCountPtr<PDFDictionary> streamDictionary(inXObject->QueryStreamDictionary());
    PDFDictionary* streamDictionaryPtr = streamDictionary.GetPtr();

    InterpreterContext xobjectContext(inParser, streamDictionaryPtr);
    inHandler->OnResourcesRead(&xobjectContext);

    // reading from the xobject stream starts only after the handler was notified
    return InterpretContentStream(
        inParser,
        streamDictionaryPtr,
        inParser->StartReadingObjectsFromStream(inXObject),
        &xobjectContext,
        inHandler);
}
}

24 changes: 24 additions & 0 deletions TextExtraction/lib/interpreter/PDFRecursiveInterpreter.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
#pragma once
#include <list>

#include "IOBasicTypes.h"
#include "IPDFRecursiveInterpreterHandler.h"

typedef std::list<ObjectIDType> ObjectIDTypeList;

class PDFParser;
class PDFDictionary;
class PDFStreamInput;
Expand All @@ -23,6 +28,25 @@ class PDFRecursiveInterpreter {
IPDFRecursiveInterpreterHandler* inHandler);

private:
// State used to detect xobjects that end up invoking themselves while being interpreted.
struct PDFNestingContext {
// object IDs of the xobjects currently being interpreted, outermost first;
// an ID is appended before descending into an xobject and removed afterwards
ObjectIDTypeList nestedXObjects;
};

// nesting context of the interpretation in progress; NULL when there is none,
// in which case the nesting check is skipped
PDFNestingContext* mNestingContext;

// Internal overload used by higher level interpreters to run a lower level xobject
// interpretation with their nesting context.
// inNestingContext is installed as this interpreter's nesting context for the
// duration of the call, so the nested xobject is checked against the same stack
// of xobject IDs as the caller.
bool InterpretXObjectContents(
PDFParser* inParser,
PDFStreamInput* inXObject,
IPDFRecursiveInterpreterHandler* inHandler,
PDFNestingContext* inNestingContext);

// Internal method for interpreting xobjects, shared by the InterpretXObjectContents
// overloads. Reads the xobject's stream dictionary, notifies the handler through
// OnResourcesRead, and interprets the xobject's content stream using whatever
// nesting context is currently installed.
bool InterpretXObjectContentsInternal(
PDFParser* inParser,
PDFStreamInput* inXObject,
IPDFRecursiveInterpreterHandler* inHandler);

bool InterpretContentStream(
PDFParser* inParser,
PDFDictionary* inContentParent,
Expand Down

0 comments on commit 14fb9bf

Please sign in to comment.