From 6c4bb13c7cbe041973737902e81b226f8796cc73 Mon Sep 17 00:00:00 2001 From: Valery Buchinsky Date: Fri, 1 Dec 2023 14:45:13 +0200 Subject: [PATCH] Use PdfDictionary to try and improve parsing --- .../__snapshots__/readRefTable.test.js.snap | 130 ++++++++---------- packages/placeholder-plain/src/findObject.js | 9 +- packages/placeholder-plain/src/getValue.js | 31 ----- .../placeholder-plain/src/getValue.test.js | 22 --- packages/placeholder-plain/src/readPdf.js | 39 +++--- .../placeholder-plain/src/readPdf.test.js | 36 +++-- .../placeholder-plain/src/readRefTable.js | 44 ++---- .../src/readRefTable.test.js | 42 +++--- 8 files changed, 140 insertions(+), 213 deletions(-) delete mode 100644 packages/placeholder-plain/src/getValue.js delete mode 100644 packages/placeholder-plain/src/getValue.test.js diff --git a/packages/placeholder-plain/src/__snapshots__/readRefTable.test.js.snap b/packages/placeholder-plain/src/__snapshots__/readRefTable.test.js.snap index 73d421c9..cfe9ebed 100644 --- a/packages/placeholder-plain/src/__snapshots__/readRefTable.test.js.snap +++ b/packages/placeholder-plain/src/__snapshots__/readRefTable.test.js.snap @@ -1,70 +1,6 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP -exports[`getXref Throws an error when size has unexpected value 1`] = `"Failed to parse size of xref table."`; - -exports[`getXref Throws an error when size is not found 1`] = `"Size not found in xref table."`; - -exports[`getXref Throws an error when xref is not at its expected position 1`] = `"Cross-Reference Streams not yet implemented."`; - -exports[`getXref Throws an error when xref is not found at position 1`] = `"Cross-Reference Streams not yet implemented."`; - -exports[`readRefTable Expects to merge correctly the refTable of resources 1`] = ` -Object { - "maxIndex": 19, - "offsets": Map { - 1 => 19012, - 2 => 19, - 3 => 224, - 4 => 12330, - 5 => 244, - 6 => 11154, - 7 => 11176, - 8 => 11368, - 9 => 11709, - 10 => 11910, - 11 => 11943, - 12 => 12140, - 13 => 12196, - 14 => 18928, - 15 => 12494, - 17 => 13264, - 18 => 18742, - 19 => 18860, - }, - "startingIndex": 0, -} -`; - -exports[`readRefTable Expects to merge correctly the refTable of resources 2`] = ` -Object { - "maxIndex": 21, - "offsets": Map { - 1 => 25091, - 2 => 19, - 3 => 224, - 4 => 12330, - 5 => 244, - 6 => 11154, - 7 => 11176, - 8 => 11368, - 9 => 11709, - 10 => 11910, - 11 => 11943, - 12 => 12140, - 13 => 12196, - 14 => 18948, - 15 => 12494, - 17 => 13264, - 18 => 18742, - 19 => 25016, - 20 => 19431, - 21 => 24878, - }, - "startingIndex": 0, -} -`; - -exports[`readRefTable Expects to merge correctly the refTable of resources 3`] = ` +exports[`readRefTable Expects to merge correctly the refTable of contributing.pdf: contributing.pdf 1`] = ` Object { "maxIndex": 24, "offsets": Map { @@ -97,7 +33,7 @@ Object { } `; -exports[`readRefTable Expects to merge correctly the refTable of resources 4`] = ` +exports[`readRefTable Expects to merge correctly the refTable of formexample.pdf: formexample.pdf 1`] = ` Object { "maxIndex": 62, "offsets": Map { @@ -168,7 +104,7 @@ Object { } `; -exports[`readRefTable Expects to merge correctly the refTable of resources 5`] = ` +exports[`readRefTable Expects to merge correctly the refTable of incrementally_signed.pdf: incrementally_signed.pdf 1`] = ` Object { "maxIndex": 18, "offsets": Map { @@ -195,7 +131,7 @@ Object { } `; -exports[`readRefTable Expects to merge correctly the refTable of resources 6`] = ` +exports[`readRefTable Expects to merge correctly the refTable of signed.pdf: signed.pdf 1`] = ` Object { "maxIndex": 13, "offsets": Map { @@ -217,7 +153,63 @@ Object { } `; -exports[`readRefTable Expects to merge correctly the refTable of resources 7`] = ` +exports[`readRefTable Expects to merge correctly the refTable of signed-once.pdf: signed-once.pdf 1`] = ` +Object { + "maxIndex": 19, + "offsets": Map { + 1 => 19012, + 2 => 19, + 3 => 224, + 4 => 12330, + 5 => 244, + 6 => 11154, + 7 => 11176, + 8 => 11368, + 9 => 11709, + 10 => 11910, + 11 => 11943, + 12 => 12140, + 13 => 12196, + 14 => 18928, + 15 => 12494, + 17 => 13264, + 18 => 18742, + 19 => 18860, + }, + "startingIndex": 0, +} +`; + +exports[`readRefTable Expects to merge correctly the refTable of signed-twice.pdf: signed-twice.pdf 1`] = ` +Object { + "maxIndex": 21, + "offsets": Map { + 1 => 25091, + 2 => 19, + 3 => 224, + 4 => 12330, + 5 => 244, + 6 => 11154, + 7 => 11176, + 8 => 11368, + 9 => 11709, + 10 => 11910, + 11 => 11943, + 12 => 12140, + 13 => 12196, + 14 => 18948, + 15 => 12494, + 17 => 13264, + 18 => 18742, + 19 => 25016, + 20 => 19431, + 21 => 24878, + }, + "startingIndex": 0, +} +`; + +exports[`readRefTable Expects to merge correctly the refTable of w3dummy.pdf: w3dummy.pdf 1`] = ` Object { "maxIndex": 15, "offsets": Map { diff --git a/packages/placeholder-plain/src/findObject.js b/packages/placeholder-plain/src/findObject.js index 0d564b2b..372ae614 100644 --- a/packages/placeholder-plain/src/findObject.js +++ b/packages/placeholder-plain/src/findObject.js @@ -1,4 +1,5 @@ import getIndexFromRef from './getIndexFromRef'; +import PdfDictionary from './PdfDictionary'; /** * @typedef {object} FindObjectAtReturnType @@ -13,12 +14,12 @@ import getIndexFromRef from './getIndexFromRef'; */ export const findObjectAt = (pdf, position) => { let slice = pdf.subarray(position); - slice = slice.subarray(0, slice.indexOf('endobj', 'utf8') - 1); - // ^ Buffer from the start position until the first endobj (included). + slice = slice.subarray(slice.indexOf('obj') + 3, slice.indexOf('endobj', 'utf8') - 1); + // ^ Buffer from the start position until the first endobj. const dictionary = slice.subarray( slice.indexOf('<<', 'utf8') + 2, - slice.indexOf('>>', 'utf8') - 1, + slice.lastIndexOf('>>', 'utf8'), ); const stream = slice.subarray( slice.indexOf('stream', 'utf8') + 6, @@ -26,7 +27,7 @@ export const findObjectAt = (pdf, position) => { ); return { - dictionary, + dictionary: new PdfDictionary(dictionary), stream, }; }; diff --git a/packages/placeholder-plain/src/getValue.js b/packages/placeholder-plain/src/getValue.js deleted file mode 100644 index 2c637281..00000000 --- a/packages/placeholder-plain/src/getValue.js +++ /dev/null @@ -1,31 +0,0 @@ -/** - * @param {Buffer} trailer - * @param {string} key - * @returns {string} - * - * FIXME: - * 0 00 000 Null (NUL) - * 9 09 011 Tab (HT) - * 10 0A 012 Line feed (LF) - * 12 0C 014 Form feed (FF) - * 13 0D 015 Carriage return (CR) - * 32 20 040 Space (SP) - * - * The delimiter characters (, ), <, >, [, ], {, }, /, and % - * - * ^ All are terminators - */ -export const getValue = (trailer, key) => { - let index = trailer.indexOf(key); - - if (index === -1) { - return undefined; - } - - const slice = trailer.subarray(index); - index = slice.indexOf('/', 1); - if (index === -1) { - index = slice.indexOf('>', 1); - } - return slice.subarray(key.length + 1, index).toString().trim(); // key + at least one space -}; diff --git a/packages/placeholder-plain/src/getValue.test.js b/packages/placeholder-plain/src/getValue.test.js deleted file mode 100644 index 3689a3c0..00000000 --- a/packages/placeholder-plain/src/getValue.test.js +++ /dev/null @@ -1,22 +0,0 @@ -import {getValue} from './getValue'; - -describe(getValue, () => { - it('matches snapshots', () => { - const trailer = Buffer.from(`trailer - << - /Size 14 - /Root 2 0 R - /Info 10 0 R - /ID [<455364dbb1253da25540322def2a672b> <455364dbb1253da25540322def2a672b>] - /AtTheEnd 14 - >> - startxref - 4220 - %%EOF`); - - expect(getValue(trailer, '/Root')).toBe('2 0 R'); - expect(getValue(trailer, '/Info')).toBe('10 0 R'); - expect(getValue(trailer, '/Unknown')).toBe(undefined); - expect(getValue(trailer, '/AtTheEnd')).toBe('14'); - }); -}); diff --git a/packages/placeholder-plain/src/readPdf.js b/packages/placeholder-plain/src/readPdf.js index b76d974e..de534028 100644 --- a/packages/placeholder-plain/src/readPdf.js +++ b/packages/placeholder-plain/src/readPdf.js @@ -1,6 +1,6 @@ -import readRefTable from './readRefTable'; -import findObject from './findObject'; -import {getValue} from './getValue'; +import {SignPdfError} from '@signpdf/utils'; +import readRefTable, {getLastXrefPosition} from './readRefTable'; +import findObject, {findObjectAt} from './findObject'; /** * @typedef {object} ReadPdfReturnType @@ -23,26 +23,33 @@ import {getValue} from './getValue'; */ const readPdf = (pdfBuffer) => { // Extract the trailer dictionary. - const trailerStart = pdfBuffer.lastIndexOf('trailer'); - // The trailer is followed by xref. Then an EOF. EOF's length is 6 characters. - const trailer = pdfBuffer.slice(trailerStart, pdfBuffer.length - 6); - - let xRefPosition = trailer.slice(trailer.lastIndexOf('startxref') + 10).toString(); - - xRefPosition = parseInt(xRefPosition); - const refTable = readRefTable(pdfBuffer); - - const rootRef = getValue(trailer, '/Root'); + const xRefPosition = getLastXrefPosition(pdfBuffer); + + let refTable; + + const trailerObject = findObjectAt(pdfBuffer, xRefPosition); + if (trailerObject.stream.indexOf('trailer') !== -1) { + // assuming trailer + refTable = readRefTable(pdfBuffer, xRefPosition); + } else { + // assuming stream + if (!trailerObject.dictionary.has('/Filter')) { + throw new Error('Expected /Filter in trailer with streams.'); + } + throw new SignPdfError( + '/Filter is not implemented.', + SignPdfError.TYPE_PARSE, + ); + } + const rootRef = trailerObject.dictionary.get('/Root'); const root = findObject(pdfBuffer, refTable, rootRef).toString(); - - const infoRef = getValue(trailer, '/Info'); + const infoRef = trailerObject.dictionary.get('/Info'); return { xref: refTable, rootRef, root, infoRef, - trailerStart, xRefPosition, }; }; diff --git a/packages/placeholder-plain/src/readPdf.test.js b/packages/placeholder-plain/src/readPdf.test.js index 4811494f..cd48956a 100644 --- a/packages/placeholder-plain/src/readPdf.test.js +++ b/packages/placeholder-plain/src/readPdf.test.js @@ -2,20 +2,30 @@ import {readTestResource} from '@signpdf/internal-utils'; import readPdf from './readPdf'; describe(readPdf, () => { - it('reads contributing.pdf', () => { - const pdfBuffer = readTestResource('contributing.pdf'); + it.each([ + { + resource: 'signed-once.pdf', + xRefPosition: 19174, + root: 14, + rootByteOffset: 18928, + info: 15, + }, + { + resource: 'contributing.pdf', + xRefPosition: 72203, + root: 12, + rootByteOffset: 4394, + info: 1, + }, + ])('reads $resource', ({ + resource, root, info, xRefPosition, rootByteOffset, + }) => { + const pdfBuffer = readTestResource(resource); const result = readPdf(pdfBuffer); - expect(result.xRefPosition).toBe(72203); - expect(result.rootRef).toBe('12 0 R'); - expect(result.infoRef).toBe('1 0 R'); - }); - it('reads issue-79-test.pdf', () => { - const pdfBuffer = readTestResource('issue-79-test.pdf'); - const result = readPdf(pdfBuffer); - - expect(result.xRefPosition).toBe(1542); - expect(result.rootRef).toBe('2 0 R'); - expect(result.infoRef).toBe('3 0 R'); + expect(result.xRefPosition).toBe(xRefPosition); + expect(result.rootRef).toBe(`${root} 0 R`); + expect(result.infoRef).toBe(`${info} 0 R`); + expect(result.xref.offsets.get(root)).toBe(rootByteOffset); }); }); diff --git a/packages/placeholder-plain/src/readRefTable.js b/packages/placeholder-plain/src/readRefTable.js index 577a21bc..d3cc7c8d 100644 --- a/packages/placeholder-plain/src/readRefTable.js +++ b/packages/placeholder-plain/src/readRefTable.js @@ -1,7 +1,5 @@ import {SignPdfError} from '@signpdf/utils'; import xrefToRefMap from './xrefToRefMap'; -import {findObjectAt} from './findObject'; -// import {getValue} from './getValue'; /** * @param {Buffer} pdf @@ -86,31 +84,6 @@ const readXrefTableAt = (pdfSlice, position) => { }; }; -/** - * @param {Buffer} _pdfSlice - * @param {number} _position - * @returns {GetXRefReturnType | null} - */ -const readXrefStreamAt = (pdfSlice, position) => { - const {dictionary: _d, stream: _s} = findObjectAt(pdfSlice, position); - - // const parsed = { - // size: getValue(_d, '/Size'), - // root: getValue(_d, '/Root'), - // info: getValue(_d, '/Info'), - // filter: getValue(_d, '/Filter'), - // length: getValue(_d, '/Length'), - // }; - - // console.log(_d.toString(), parsed); - // console.log(zlib.deflateSync(stream)); - - throw new SignPdfError( - 'Cross-Reference Streams not yet implemented.', - SignPdfError.TYPE_PARSE, - ); -}; - /** * @typedef {object} GetXRefReturnType * // TODO @@ -123,8 +96,7 @@ const readXrefStreamAt = (pdfSlice, position) => { * @throws {SignPdfError} */ export const getXref = (pdf, position) => { - const table = readXrefTableAt(pdf, position) - || readXrefStreamAt(pdf, position); + const table = readXrefTableAt(pdf, position); if (!table) { throw new SignPdfError( `Could not find xref anywhere at or after startxref position ${position}.`, @@ -140,17 +112,16 @@ export const getXref = (pdf, position) => { /** * @param {Buffer} pdf + * @param {number} xRefPosition * @returns {GetFullXrefTableReturnType} */ -export const getFullXrefTable = (pdf) => { - const lastTrailerPosition = getLastXrefPosition(pdf); - const lastXrefTable = getXref(pdf, lastTrailerPosition); +export const getFullXrefTable = (pdf, xRefPosition) => { + const lastXrefTable = getXref(pdf, xRefPosition); if (lastXrefTable.prev === undefined) { return lastXrefTable.xRefContent; } - const pdfWithoutLastTrailer = pdf.slice(0, lastTrailerPosition); - const partOfXrefTable = getFullXrefTable(pdfWithoutLastTrailer); + const partOfXrefTable = getFullXrefTable(pdf, lastXrefTable.prev); const mergedXrefTable = new Map([ ...partOfXrefTable, @@ -169,10 +140,11 @@ export const getFullXrefTable = (pdf) => { /** * @param {Buffer} pdfBuffer + * @param {number} xRefPosition * @returns {ReadRefTableReturnType} */ -const readRefTable = (pdf) => { - const fullXrefTable = getFullXrefTable(pdf); +const readRefTable = (pdf, xRefPosition) => { + const fullXrefTable = getFullXrefTable(pdf, xRefPosition); const startingIndex = 0; const maxIndex = Math.max(...fullXrefTable.keys()); diff --git a/packages/placeholder-plain/src/readRefTable.test.js b/packages/placeholder-plain/src/readRefTable.test.js index a473f995..0c17081f 100644 --- a/packages/placeholder-plain/src/readRefTable.test.js +++ b/packages/placeholder-plain/src/readRefTable.test.js @@ -3,9 +3,6 @@ import {SignPdfError} from '@signpdf/utils'; import readRefTable, {getFullXrefTable, getXref} from './readRefTable'; describe(getFullXrefTable, () => { - it('works for #79', () => { - expect(getFullXrefTable(readTestResource('issue-79-test.pdf'))).toBeTruthy(); - }); it('skips unreferenced xref tables', () => { const pdf = Buffer.from(`xref 0 3 @@ -48,7 +45,7 @@ describe(getXref, () => { } catch (e) { expect(e instanceof SignPdfError).toBe(true); expect(e.type).toBe(SignPdfError.TYPE_PARSE); - expect(e.message).toMatchSnapshot(); + expect(e.message).toMatchInlineSnapshot('"Could not find xref anywhere at or after startxref position 0."'); } }); it('Throws an error when xref is not at its expected position', () => { @@ -60,7 +57,7 @@ describe(getXref, () => { } catch (e) { expect(e instanceof SignPdfError).toBe(true); expect(e.type).toBe(SignPdfError.TYPE_PARSE); - expect(e.message).toMatchSnapshot(); + expect(e.message).toMatchInlineSnapshot('"Could not find xref anywhere at or after startxref position 2."'); } }); it('Throws an error when size is not found', () => { @@ -72,7 +69,7 @@ describe(getXref, () => { } catch (e) { expect(e instanceof SignPdfError).toBe(true); expect(e.type).toBe(SignPdfError.TYPE_PARSE); - expect(e.message).toMatchSnapshot(); + expect(e.message).toMatchInlineSnapshot('"Size not found in xref table."'); } }); it('Throws an error when size has unexpected value', () => { @@ -84,25 +81,26 @@ describe(getXref, () => { } catch (e) { expect(e instanceof SignPdfError).toBe(true); expect(e.type).toBe(SignPdfError.TYPE_PARSE); - expect(e.message).toMatchSnapshot(); + expect(e.message).toMatchInlineSnapshot('"Failed to parse size of xref table."'); } }); }); describe(readRefTable, () => { - it('Expects to merge correctly the refTable of resources', () => { - [ - 'signed-once.pdf', - 'signed-twice.pdf', - 'contributing.pdf', - 'formexample.pdf', - 'incrementally_signed.pdf', - 'signed.pdf', - 'w3dummy.pdf', - ].forEach((fileName) => { - const pdf = readTestResource(fileName); - const r = readRefTable(pdf); - expect(r).toMatchSnapshot(); - }); - }); + it.each([ + {resource: 'signed-once.pdf', startxref: 19174}, + {resource: 'signed-twice.pdf', startxref: 25264}, + {resource: 'contributing.pdf', startxref: 72203}, + {resource: 'formexample.pdf', startxref: 64251}, + {resource: 'incrementally_signed.pdf', startxref: 17125}, + {resource: 'signed.pdf', startxref: 4220}, + {resource: 'w3dummy.pdf', startxref: 12787}, + ])( + 'Expects to merge correctly the refTable of $resource', + ({resource, startxref}) => { + const pdf = readTestResource(resource); + const r = readRefTable(pdf, startxref); + expect(r).toMatchSnapshot(resource); + }, + ); });