From f9d33d4888a1937d437e409ccf0eb4864f8cca48 Mon Sep 17 00:00:00 2001 From: Enrico Ros Date: Thu, 16 May 2024 02:57:50 -0700 Subject: [PATCH] Page download: improve --- pages/link/share_target.tsx | 9 +- .../composer/attachments/pipeline.tsx | 16 +-- src/apps/chat/editors/browse-load.ts | 3 +- src/modules/aifn/react/react.ts | 3 +- src/modules/browse/browse.client.ts | 37 +++--- src/modules/browse/browse.router.ts | 105 +++++++++--------- 6 files changed, 90 insertions(+), 83 deletions(-) diff --git a/pages/link/share_target.tsx b/pages/link/share_target.tsx index 5614121cf..be29c7e65 100644 --- a/pages/link/share_target.tsx +++ b/pages/link/share_target.tsx @@ -77,9 +77,12 @@ function AppShareTarget() { setIsDownloading(true); callBrowseFetchPage(intentURL) .then(page => { - if (page.stopReason !== 'error') - queueComposerTextAndLaunchApp('\n\n```' + intentURL + '\n' + page.content + '\n```\n'); - else + if (page.stopReason !== 'error') { + let pageContent = page.content.markdown || page.content.text || page.content.html || ''; + if (pageContent) + pageContent = '\n\n```' + intentURL + '\n' + pageContent + '\n```\n'; + queueComposerTextAndLaunchApp(pageContent); + } else setErrorMessage('Could not read any data' + page.error ? ': ' + page.error : ''); }) .catch(error => setErrorMessage(error?.message || error || 'Unknown error')) diff --git a/src/apps/chat/components/composer/attachments/pipeline.tsx b/src/apps/chat/components/composer/attachments/pipeline.tsx index f77eef785..4aeda14b3 100644 --- a/src/apps/chat/components/composer/attachments/pipeline.tsx +++ b/src/apps/chat/components/composer/attachments/pipeline.tsx @@ -58,16 +58,12 @@ export async function attachmentLoadInputAsync(source: Readonly { async function browse(url: string): Promise { try { const page = await callBrowseFetchPage(url); - return JSON.stringify(page.content ? { text: page.content } : { error: 'Issue reading the page' }); + const pageContent = page.content.markdown || page.content.text || page.content.html || ''; + return JSON.stringify(pageContent ? { text: pageContent } : { error: 'Issue reading the page' }); } catch (error) { console.error('Error browsing:', (error as Error).message); return 'An error occurred while browsing to the URL. Missing WSS Key?'; diff --git a/src/modules/browse/browse.client.ts b/src/modules/browse/browse.client.ts index 806aa6de5..9e68dcf05 100644 --- a/src/modules/browse/browse.client.ts +++ b/src/modules/browse/browse.client.ts @@ -1,4 +1,4 @@ -import { BrowsePageTransform, useBrowseStore } from '~/modules/browse/store-module-browsing'; +import { useBrowseStore } from '~/modules/browse/store-module-browsing'; import { apiAsyncNode } from '~/common/util/trpc.client'; @@ -7,34 +7,39 @@ import { apiAsyncNode } from '~/common/util/trpc.client'; const DEBUG_SHOW_SCREENSHOT = false; -export async function callBrowseFetchPage(url: string, forceTransform?: BrowsePageTransform) { +// export function - // thow if no URL is provided +export async function callBrowseFetchPage( + url: string, + // transforms?: BrowsePageTransform[], + // screenshotOptions?: { width: number, height: number, quality?: number }, +) { + + // validate url url = url?.trim() || ''; if (!url) throw new Error('Browsing error: Invalid URL'); - // assume https if no protocol is provided - // noinspection HttpUrlsUsage + // noinspection HttpUrlsUsage: assume https if no protocol is provided if (!url.startsWith('http://') && !url.startsWith('https://')) url = 'https://' + url; - const { wssEndpoint: clientWssEndpoint, pageTransform } = useBrowseStore.getState(); + const { wssEndpoint, pageTransform } = useBrowseStore.getState(); const { pages } = await apiAsyncNode.browse.fetchPages.mutate({ access: { dialect: 'browse-wss', - ...(!!clientWssEndpoint && { wssEndpoint: clientWssEndpoint }), + ...(!!wssEndpoint && { wssEndpoint }), }, - subjects: [{ + requests: [{ url, - transform: pageTransform || 'text', + transforms: /*transforms ? transforms :*/ [pageTransform], + screenshot: /*screenshotOptions ? screenshotOptions :*/ !DEBUG_SHOW_SCREENSHOT ? undefined : { + width: 512, + height: 512, + // quality: 100, + }, }], - screenshot: DEBUG_SHOW_SCREENSHOT ? { - width: 512, - height: 512, - // quality: 100, - } : undefined, }); if (pages.length !== 1) @@ -45,7 +50,7 @@ export async function callBrowseFetchPage(url: string, forceTransform?: BrowsePa // DEBUG: if there's a screenshot, append it to the dom if (DEBUG_SHOW_SCREENSHOT && page.screenshot) { const img = document.createElement('img'); - img.src = page.screenshot.imageDataUrl; + img.src = page.screenshot.webpDataUrl; img.style.width = `${page.screenshot.width}px`; img.style.height = `${page.screenshot.height}px`; document.body.appendChild(img); @@ -54,7 +59,7 @@ export async function callBrowseFetchPage(url: string, forceTransform?: BrowsePa // throw if there's an error if (page.error) { console.warn('Browsing service error:', page.error); - if (!page.content) + if (!Object.keys(page.content).length) throw new Error(page.error); } diff --git a/src/modules/browse/browse.router.ts b/src/modules/browse/browse.router.ts index 280b2e7bd..3e11247d0 100644 --- a/src/modules/browse/browse.router.ts +++ b/src/modules/browse/browse.router.ts @@ -19,21 +19,22 @@ const browseAccessSchema = z.object({ dialect: z.enum(['browse-wss']), wssEndpoint: z.string().trim().optional(), }); +type BrowseAccessSchema = z.infer; const pageTransformSchema = z.enum(['html', 'text', 'markdown']); type PageTransformSchema = z.infer; const fetchPageInputSchema = z.object({ access: browseAccessSchema, - subjects: z.array(z.object({ + requests: z.array(z.object({ url: z.string().url(), - transform: pageTransformSchema, + transforms: z.array(pageTransformSchema), + screenshot: z.object({ + width: z.number(), + height: z.number(), + quality: z.number().optional(), + }).optional(), })), - screenshot: z.object({ - width: z.number(), - height: z.number(), - quality: z.number().optional(), - }).optional(), }); @@ -41,16 +42,18 @@ const fetchPageInputSchema = z.object({ const fetchPageWorkerOutputSchema = z.object({ url: z.string(), - content: z.string(), + content: z.record(pageTransformSchema, z.string()), error: z.string().optional(), stopReason: z.enum(['end', 'timeout', 'error']), screenshot: z.object({ - imageDataUrl: z.string().startsWith('data:image/'), + webpDataUrl: z.string().startsWith('data:image/webp'), mimeType: z.string().startsWith('image/'), width: z.number(), height: z.number(), }).optional(), }); +type FetchPageWorkerOutputSchema = z.infer; + const fetchPagesOutputSchema = z.object({ pages: z.array(fetchPageWorkerOutputSchema), @@ -62,21 +65,23 @@ export const browseRouter = createTRPCRouter({ fetchPages: publicProcedure .input(fetchPageInputSchema) .output(fetchPagesOutputSchema) - .mutation(async ({ input: { access, subjects, screenshot } }) => { - const pages: FetchPageWorkerOutputSchema[] = []; - - for (const subject of subjects) { - try { - pages.push(await workerPuppeteer(access, subject.url, subject.transform, screenshot?.width, screenshot?.height, screenshot?.quality)); - } catch (error: any) { - pages.push({ - url: subject.url, - content: '', - error: error?.message || JSON.stringify(error) || 'Unknown fetch error', + .mutation(async ({ input: { access, requests } }) => { + + const pagePromises = requests.map(request => + workerPuppeteer(access, request.url, request.transforms, request.screenshot)); + + const results = await Promise.allSettled(pagePromises); + + const pages: FetchPageWorkerOutputSchema[] = results.map((result, index) => + result.status === 'fulfilled' + ? result.value + : { + url: requests[index].url, + content: {}, + error: result.reason?.message || 'Unknown fetch error', stopReason: 'error', - }); - } - } + }, + ); return { pages }; }), @@ -84,18 +89,13 @@ export const browseRouter = createTRPCRouter({ }); -type BrowseAccessSchema = z.infer; -type FetchPageWorkerOutputSchema = z.infer; - - async function workerPuppeteer( access: BrowseAccessSchema, targetUrl: string, - transform: PageTransformSchema, - ssWidth: number | undefined, - ssHeight: number | undefined, - ssQuality: number | undefined, + transforms: PageTransformSchema[], + screenshotOptions?: { width: number, height: number, quality?: number }, ): Promise { + const browserWSEndpoint = (access.wssEndpoint || env.PUPPETEER_WSS_ENDPOINT || '').trim(); const isLocalBrowser = browserWSEndpoint.startsWith('ws://'); if (!browserWSEndpoint || (!browserWSEndpoint.startsWith('wss://') && !isLocalBrowser)) @@ -106,7 +106,7 @@ async function workerPuppeteer( const result: FetchPageWorkerOutputSchema = { url: targetUrl, - content: '', + content: {}, error: undefined, stopReason: 'error', screenshot: undefined, @@ -144,21 +144,23 @@ async function workerPuppeteer( // transform the content of the page as text try { if (result.stopReason !== 'error') { - switch (transform) { - case 'html': - result.content = await page.content(); - break; - case 'text': - result.content = await page.evaluate(() => document.body.innerText || document.textContent || ''); - break; - case 'markdown': - const html = await page.content(); - const cleanedHtml = cleanHtml(html); - const turndownService = new TurndownService({ headingStyle: 'atx' }); - result.content = turndownService.turndown(cleanedHtml); - break; + for (const transform of transforms) { + switch (transform) { + case 'html': + result.content.html = cleanHtml(await page.content()); + break; + case 'text': + result.content.text = await page.evaluate(() => document.body.innerText || document.textContent || ''); + break; + case 'markdown': + const html = await page.content(); + const cleanedHtml = cleanHtml(html); + const turndownService = new TurndownService({ headingStyle: 'atx' }); + result.content.markdown = turndownService.turndown(cleanedHtml); + break; + } } - if (!result.content) + if (!Object.keys(result.content).length) result.error = '[Puppeteer] Empty content'; } } catch (error: any) { @@ -167,10 +169,9 @@ async function workerPuppeteer( // get a screenshot of the page try { - if (ssWidth && ssHeight) { - const width = ssWidth; - const height = ssHeight; - const scale = Math.round(100 * ssWidth / 1024) / 100; + if (screenshotOptions?.width && screenshotOptions?.height) { + const { width, height, quality } = screenshotOptions; + const scale = Math.round(100 * width / 1024) / 100; await page.setViewport({ width: width / scale, height: height / scale, deviceScaleFactor: scale }); @@ -181,10 +182,10 @@ async function workerPuppeteer( type: imageType, encoding: 'base64', clip: { x: 0, y: 0, width: width / scale, height: height / scale }, - ...(ssQuality && { quality: ssQuality }), + ...(quality && { quality }), }) as string; - result.screenshot = { imageDataUrl: `data:${mimeType};base64,${dataString}`, mimeType, width, height }; + result.screenshot = { webpDataUrl: `data:${mimeType};base64,${dataString}`, mimeType, width, height }; } } catch (error: any) { console.error('workerPuppeteer: page.screenshot', error);