Skip to content

Commit

Permalink
pdfjs: image generation (just in case)
Browse files Browse the repository at this point in the history
  • Loading branch information
enricoros committed May 10, 2024
1 parent b8aaa4b commit e4e7ac2
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,11 @@ export function AttachmentMenu(props: {
{/* Converters: {aConverters.map(((converter, idx) => ` ${converter.id}${(idx === aConverterIdx) ? '*' : ''}`)).join(', ')}*/}
{/*</Typography>*/}
<Typography level='body-xs'>
🡒 {isOutputMissing ? 'empty' : aOutputs.map(output => `${output.type}, ${output.type === 'text-block' ? output.text.length.toLocaleString() : '(base64 image)'} bytes`).join(' · ')}
🡒 {isOutputMissing ? 'empty' : aOutputs.map(output => `${output.type}, ${output.type === 'text-block'
? output.text.length.toLocaleString()
: output.type === 'image-part'
? output.base64Url.length.toLocaleString()
: '(other)'} bytes`).join(' · ')}
</Typography>
{!!tokenCountApprox && <Typography level='body-xs'>
🡒 {tokenCountApprox.toLocaleString()} tokens
Expand Down
28 changes: 25 additions & 3 deletions src/apps/chat/components/composer/attachments/pipeline.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { callBrowseFetchPage } from '~/modules/browse/browse.client';

import { createBase36Uid } from '~/common/util/textUtils';
import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown';
import { pdfToText } from '~/common/util/pdfUtils';
import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils';

import type { Attachment, AttachmentConverter, AttachmentId, AttachmentInput, AttachmentSource } from './store-attachments';
import type { ComposerOutputMultiPart } from '../composer.types';
Expand Down Expand Up @@ -297,7 +297,7 @@ export async function attachmentPerformConversion(attachment: Readonly<Attachmen

case 'pdf-text':
if (!(input.data instanceof ArrayBuffer)) {
console.log('Expected ArrayBuffer for PDF converter, got:', typeof input.data);
console.log('Expected ArrayBuffer for PDF text converter, got:', typeof input.data);
break;
}
// duplicate the ArrayBuffer to avoid mutation
Expand All @@ -312,7 +312,29 @@ export async function attachmentPerformConversion(attachment: Readonly<Attachmen
break;

case 'pdf-images':
// TODO: extract all pages as individual images
if (!(input.data instanceof ArrayBuffer)) {
console.log('Expected ArrayBuffer for PDF images converter, got:', typeof input.data);
break;
}
// duplicate the ArrayBuffer to avoid mutation
const pdfData2 = new Uint8Array(input.data.slice(0));
try {
const imageDataURLs = await pdfToImageDataURLs(pdfData2);
imageDataURLs.forEach((pdfImg, index) => {
outputs.push({
type: 'image-part',
base64Url: pdfImg.base64Url,
metadata: {
title: `Page ${index + 1}`,
width: pdfImg.width,
height: pdfImg.height,
},
collapsible: false,
});
});
} catch (error) {
console.error('Error converting PDF to images:', error);
}
break;

case 'image':
Expand Down
7 changes: 7 additions & 0 deletions src/apps/chat/components/composer/composer.types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ export type ComposerOutputPart = {
// TODO: not implemented yet
type: 'image-part',
base64Url: string,
metadata: {
title?: string,
generatedBy?: string,
altText?: string,
width?: number,
height?: number,
},
collapsible: false,
};

Expand Down
57 changes: 51 additions & 6 deletions src/common/util/pdfUtils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,7 @@
* @param pdfBuffer The content of a PDF file
*/
export async function pdfToText(pdfBuffer: ArrayBuffer): Promise<string> {
// Dynamically import the 'pdfjs-dist' library [nextjs]
const { getDocument, GlobalWorkerOptions } = await import('pdfjs-dist');

// Set the worker script path
GlobalWorkerOptions.workerSrc = '/workers/pdf.worker.min.mjs';

const { getDocument } = await dynamicImportPdfJs();
const pdf = await getDocument(pdfBuffer).promise;
const textPages: string[] = []; // Initialize an array to hold text from all pages

Expand Down Expand Up @@ -52,6 +47,56 @@ export async function pdfToText(pdfBuffer: ArrayBuffer): Promise<string> {
return textPages.join('\n\n'); // Join all the page texts at the end
}


/**
 * One rendered PDF page, as produced by pdfToImageDataURLs.
 *
 * - base64Url: the page image as a data URL (obtained from canvas.toDataURL with 'image/jpeg')
 * - scale: the scale factor the page was rendered at
 * - width, height: size of the rendered page at that scale
 */
type PdfPageImage = { base64Url: string, scale: number, width: number, height: number };

/**
 * Renders all pages of a PDF to JPEG images, returned as data URLs in page order.
 *
 * Pages are rendered one at a time, each onto its own off-DOM canvas. The canvas
 * backing store, the page resources and the loaded document are released as soon
 * as they are no longer needed, so large PDFs do not accumulate memory.
 *
 * @param pdfBuffer The content of a PDF file (raw buffer, or a byte view over it)
 * @param scale The scale factor for the image resolution (default 1.5 for moderate quality)
 * @returns One entry per page: the data URL, the scale used, and the pixel size of the image
 * @throws Error if a 2D canvas context cannot be obtained; loading/rendering errors propagate
 */
export async function pdfToImageDataURLs(pdfBuffer: ArrayBuffer | Uint8Array, scale = 1.5): Promise<PdfPageImage[]> {
  const { getDocument } = await dynamicImportPdfJs();
  const pdf = await getDocument({ data: pdfBuffer }).promise;
  const images: PdfPageImage[] = [];

  try {
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const viewport = page.getViewport({ scale });

      // Canvas dimensions are integers: truncate explicitly so the size we report
      // below is the real pixel size of the generated image, not the fractional viewport.
      const canvas = document.createElement('canvas');
      canvas.width = Math.floor(viewport.width);
      canvas.height = Math.floor(viewport.height);

      // getContext may return null (e.g. context limit reached): fail loudly instead of asserting
      const context = canvas.getContext('2d');
      if (!context)
        throw new Error(`Unable to get a 2D canvas context to render PDF page ${pageNumber}`);

      try {
        await page.render({
          canvasContext: context,
          viewport,
        }).promise;

        images.push({
          base64Url: canvas.toDataURL('image/jpeg'),
          scale,
          width: canvas.width,
          height: canvas.height,
        });
      } finally {
        // Release per-page resources and the canvas backing store before the next page
        page.cleanup();
        canvas.width = 0;
        canvas.height = 0;
      }
    }
  } finally {
    // Always dispose of the document (and its worker), including on failure
    await pdf.destroy();
  }

  return images;
}


/**
 * Loads the 'pdfjs-dist' library on demand (dynamic import, tagged [nextjs] by the
 * original author), points it at the worker script, and exposes `getDocument`.
 */
async function dynamicImportPdfJs() {
  const pdfJs = await import('pdfjs-dist');

  // The worker script path must be configured before any document is opened
  pdfJs.GlobalWorkerOptions.workerSrc = '/workers/pdf.worker.min.mjs';

  const { getDocument } = pdfJs;
  return { getDocument };
}

// Type guard to check if an item has a 'str' property
function isTextItem(item: any): item is { str: string } {
return 'str' in item && typeof item.str === 'string';
Expand Down

0 comments on commit e4e7ac2

Please sign in to comment.