Skip to content

Commit

Permalink
feat: optionally expand MERGED_CELL blocks from table analysis (#66)
Browse files Browse the repository at this point in the history
* feat: optionally expand MERGED_CELL blocks from table analysis

* Minify test json
  • Loading branch information
alexbostock authored Jul 9, 2024
1 parent 0d0108d commit a1e2790
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 8 deletions.
8 changes: 5 additions & 3 deletions src/Document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,15 @@ export class Document {
pages: Page[];
blocks: BlockStruct[];
blockMap: BlockMap;
options: { expandMergedTableCells?: boolean } | undefined;

/**
* @param blocks - Block objects as returned by Textract
* @throws {ParseError} - If the blocks are not valid
* @throws {UnknownError} - If an unknown error occurs
*/
constructor(blocks: BlockStruct[]) {
constructor(blocks: BlockStruct[], options?: { expandMergedTableCells?: boolean }) {
this.options = options;
try {
const ret = BlockStructSchema.array().safeParse(blocks);
if (!ret.success) {
Expand Down Expand Up @@ -74,14 +76,14 @@ export class Document {
for (const b of this.blocks) {
if (b.BlockType === 'PAGE') {
if (blocks.length > 0) {
this.pages.push(new Page(blocks, this.blockMap));
this.pages.push(new Page(blocks, this.blockMap, this.options));
}
blocks = [];
}
blocks.push(b);
}
if (blocks.length > 0) {
this.pages.push(new Page(blocks, this.blockMap));
this.pages.push(new Page(blocks, this.blockMap, this.options));
}
}

Expand Down
10 changes: 8 additions & 2 deletions src/Page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,14 @@ export class Page {
geometry?: Geometry;
id: string;
content: (Line | Table | Form | Field)[];
options: { expandMergedTableCells?: boolean } | undefined;

constructor(blocks: BlockStruct[], blockMap: BlockMap) {
constructor(
blocks: BlockStruct[],
blockMap: BlockMap,
options?: { expandMergedTableCells?: boolean }
) {
this.options = options;
this.blocks = blocks;
this.text = '';
this.lines = [];
Expand Down Expand Up @@ -48,7 +54,7 @@ export class Page {
break;
}
case 'TABLE': {
const tbl = new Table(b, blockMap);
const tbl = new Table(b, blockMap, this.options);
this.tables.push(tbl);
this.content.push(tbl);
break;
Expand Down
49 changes: 46 additions & 3 deletions src/Table/Table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@ import { Geometry } from '../Geometry/index.js';
import { Cell } from './Cell.js';
import { Row } from './Row.js';

import type { BlockMap, CellBlock, TableBlock } from '../BlockStruct.js';
import type { BlockMap, CellBlock, MergedCellBlock, TableBlock } from '../BlockStruct.js';

export class Table {
block: TableBlock;
confidence: number;
geometry: Geometry;
id: string;
rows: Row[];

constructor(block: TableBlock, blockMap: BlockMap) {
constructor(
block: TableBlock,
blockMap: BlockMap,
options?: { expandMergedTableCells?: boolean }
) {
this.block = block;
this.confidence = block.Confidence;
this.geometry = new Geometry(block.Geometry);
Expand Down Expand Up @@ -39,6 +42,46 @@ export class Table {
}
}
}

if (options?.expandMergedTableCells) {
this.expandMergedCells(block, blockMap);
}
}

/**
 * Copies the combined text of every MERGED_CELL group onto each cell in that group.
 *
 * For each MERGED_CELL block referenced by the table's relationships, the text
 * of its child CELL blocks is joined with single spaces. Every cell in
 * `this.rows` whose id belongs to such a group then has its `text` replaced by
 * the joined string.
 *
 * @param block - The TABLE block whose `MERGED_CELL` relationships are expanded
 * @param blockMap - Lookup from block id to block, used to resolve relationship ids
 */
expandMergedCells(block: TableBlock, blockMap: BlockMap) {
  // Explicit type arguments: a bare `new Map()` infers `Map<any, any>` and
  // would let `any` flow into `cell.text` below.
  const textByCellId = new Map<string, string>();

  const mergedCells = block.Relationships.filter(({ Type }) => Type === 'MERGED_CELL').flatMap(
    ({ Ids }) =>
      Ids.map((id) => blockMap.get(id)).filter(
        (candidate): candidate is MergedCellBlock => candidate?.BlockType === 'MERGED_CELL'
      )
  );

  for (const mergedCell of mergedCells) {
    const cells = mergedCell.Relationships?.filter((rs) => rs.Type === 'CHILD')
      .flatMap((rs) => rs.Ids.map((id) => blockMap.get(id)))
      .filter((candidate): candidate is CellBlock => candidate?.BlockType === 'CELL');
    if (!cells) {
      // The merged cell has no relationships at all, so there is nothing to expand.
      continue;
    }

    // Build the combined text once; every member of the group receives the same string.
    const mergedText = cells.map((cell) => new Cell(cell, blockMap).text).join(' ');
    for (const cell of cells) {
      textByCellId.set(cell.Id, mergedText);
    }
  }

  for (const row of this.rows) {
    for (const cell of row.cells) {
      const overrideContent = textByCellId.get(cell.id);
      // Truthiness check is deliberate: cells outside any merged group
      // (undefined) and empty overrides ('') both keep their current text.
      if (overrideContent) {
        cell.text = overrideContent;
      }
    }
  }
}

toString() {
Expand Down
20 changes: 20 additions & 0 deletions tests/all.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,23 @@ describe('Rows from table created correctly', () => {
}
});
});

// Checks that MERGED_CELL blocks change the rendered table only when the
// `expandMergedTableCells` option is passed to `Document`.
describe('MERGED_CELL expansion', () => {
  // Reads a file from the `merged` fixture directory as UTF-8 text.
  const readFixture = (fileName: string) => readFile(join(root, 'merged', fileName), 'utf8');

  it('ignores MERGED_CELL blocks by default', async () => {
    const expected = await readFixture('withoutExpansion.txt');
    const blocks = JSON.parse(await readFixture('blocks.json'));

    const doc = new Document(blocks);
    const rendered = doc.pages?.[0]?.tables?.[0]?.toString();

    expect(rendered).toEqual(expected);
  });

  it('merges content of cells with `expandMergedTableCells` option', async () => {
    const expected = await readFixture('withExpansion.txt');
    const blocks = JSON.parse(await readFixture('blocks.json'));

    const doc = new Document(blocks, { expandMergedTableCells: true });
    const rendered = doc.pages?.[0]?.tables?.[0]?.toString();

    expect(rendered).toEqual(expected);
  });
});
1 change: 1 addition & 0 deletions tests/merged/blocks.json

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions tests/merged/withExpansion.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Table
==========
Row
==========
[][Old][Old][New][New]
Row
==========
[][Revenue][Profit][Revenue][Profit]
Row
==========
[January][1][2][3][4]
Row
==========
[February][5][6][7][8]
Row
==========
[February][9][10][11][12]
17 changes: 17 additions & 0 deletions tests/merged/withoutExpansion.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Table
==========
Row
==========
[][Old][][New][]
Row
==========
[][Revenue][Profit][Revenue][Profit]
Row
==========
[January][1][2][3][4]
Row
==========
[February][5][6][7][8]
Row
==========
[][9][10][11][12]

0 comments on commit a1e2790

Please sign in to comment.