From e4e7ac260aff2936fe1244fca05d8f7fbd8830ce Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Fri, 10 May 2024 02:46:28 -0700 Subject: [PATCH] pdfjs: image generation (just in case) --- .../composer/attachments/AttachmentMenu.tsx | 6 +- .../composer/attachments/pipeline.tsx | 28 ++++++++- .../components/composer/composer.types.ts | 7 +++ src/common/util/pdfUtils.ts | 57 +++++++++++++++++-- 4 files changed, 88 insertions(+), 10 deletions(-) diff --git a/src/apps/chat/components/composer/attachments/AttachmentMenu.tsx b/src/apps/chat/components/composer/attachments/AttachmentMenu.tsx index 5e378c9ec..9c9fcfc3b 100644 --- a/src/apps/chat/components/composer/attachments/AttachmentMenu.tsx +++ b/src/apps/chat/components/composer/attachments/AttachmentMenu.tsx @@ -153,7 +153,11 @@ export function AttachmentMenu(props: { {/* Converters: {aConverters.map(((converter, idx) => ` ${converter.id}${(idx === aConverterIdx) ? '*' : ''}`)).join(', ')}*/} {/**/} - 🡒 {isOutputMissing ? 'empty' : aOutputs.map(output => `${output.type}, ${output.type === 'text-block' ? output.text.length.toLocaleString() : '(base64 image)'} bytes`).join(' · ')} + 🡒 {isOutputMissing ? 'empty' : aOutputs.map(output => `${output.type}, ${output.type === 'text-block' + ? output.text.length.toLocaleString() + : output.type === 'image-part' + ? output.base64Url.length.toLocaleString() + : '(other)'} bytes`).join(' · ')} {!!tokenCountApprox && 🡒 {tokenCountApprox.toLocaleString()} tokens diff --git a/src/apps/chat/components/composer/attachments/pipeline.tsx b/src/apps/chat/components/composer/attachments/pipeline.tsx index 45baf058a..f77eef785 100644 --- a/src/apps/chat/components/composer/attachments/pipeline.tsx +++ b/src/apps/chat/components/composer/attachments/pipeline.tsx @@ -2,7 +2,7 @@ import { callBrowseFetchPage } from '~/modules/browse/browse.client'; import { createBase36Uid } from '~/common/util/textUtils'; import { htmlTableToMarkdown } from '~/common/util/htmlTableToMarkdown'; -import { pdfToText } from '~/common/util/pdfUtils'; +import { pdfToImageDataURLs, pdfToText } from '~/common/util/pdfUtils'; import type { Attachment, AttachmentConverter, AttachmentId, AttachmentInput, AttachmentSource } from './store-attachments'; import type { ComposerOutputMultiPart } from '../composer.types'; @@ -297,7 +297,7 @@ export async function attachmentPerformConversion(attachment: Readonly { + outputs.push({ + type: 'image-part', + base64Url: pdfImg.base64Url, + metadata: { + title: `Page ${index + 1}`, + width: pdfImg.width, + height: pdfImg.height, + }, + collapsible: false, + }); + }); + } catch (error) { + console.error('Error converting PDF to images:', error); + } break; case 'image': diff --git a/src/apps/chat/components/composer/composer.types.ts b/src/apps/chat/components/composer/composer.types.ts index af354f3a2..425fda58f 100644 --- a/src/apps/chat/components/composer/composer.types.ts +++ b/src/apps/chat/components/composer/composer.types.ts @@ -9,6 +9,13 @@ export type ComposerOutputPart = { // TODO: not implemented yet type: 'image-part', base64Url: string, + metadata: { + title?: string, + generatedBy?: string, + altText?: string, + width?: number, + height?: number, + }, collapsible: false, }; diff --git a/src/common/util/pdfUtils.ts b/src/common/util/pdfUtils.ts index 207e442d7..fce030487 100644 --- a/src/common/util/pdfUtils.ts +++ b/src/common/util/pdfUtils.ts @@ -10,12 +10,7 @@ * @param pdfBuffer The content of a PDF file */ export async function pdfToText(pdfBuffer: ArrayBuffer): Promise { - // Dynamically import the 'pdfjs-dist' library [nextjs] - const { getDocument, GlobalWorkerOptions } = await import('pdfjs-dist'); - - // Set the worker script path - GlobalWorkerOptions.workerSrc = '/workers/pdf.worker.min.mjs'; - + const { getDocument } = await dynamicImportPdfJs(); const pdf = await getDocument(pdfBuffer).promise; const textPages: string[] = []; // Initialize an array to hold text from all pages @@ -52,6 +47,56 @@ export async function pdfToText(pdfBuffer: ArrayBuffer): Promise { return textPages.join('\n\n'); // Join all the page texts at the end } + +type PdfPageImage = { base64Url: string, scale: number, width: number, height: number }; + +/** + * Renders all pages of a PDF to images + * + * @param pdfBuffer The content of a PDF file + * @param scale The scale factor for the image resolution (default 1.5 for moderate quality) + */ +export async function pdfToImageDataURLs(pdfBuffer: ArrayBuffer, scale = 1.5): Promise { + const { getDocument } = await dynamicImportPdfJs(); + const pdf = await getDocument({ data: pdfBuffer }).promise; + const images: PdfPageImage[] = []; + + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const viewport = page.getViewport({ scale }); + const canvas = document.createElement('canvas'); + const context = canvas.getContext('2d'); + canvas.height = viewport.height; + canvas.width = viewport.width; + + await page.render({ + canvasContext: context!, + viewport, + }).promise; + + images.push({ + base64Url: canvas.toDataURL('image/jpeg'), + scale, + width: viewport.width, + height: viewport.height, + }); + } + + return images; +} + + +// Dynamically import the 'pdfjs-dist' library +async function dynamicImportPdfJs() { + // Dynamically import the 'pdfjs-dist' library [nextjs] + const { getDocument, GlobalWorkerOptions } = await import('pdfjs-dist'); + + // Set the worker script path + GlobalWorkerOptions.workerSrc = '/workers/pdf.worker.min.mjs'; + + return { getDocument }; +} + // Type guard to check if an item has a 'str' property function isTextItem(item: any): item is { str: string } { return 'str' in item && typeof item.str === 'string';