Skip to content

Commit

Permalink
Page download: improve
Browse files Browse the repository at this point in the history
  • Loading branch information
enricoros committed May 16, 2024
1 parent 81d99f1 commit f9d33d4
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 83 deletions.
9 changes: 6 additions & 3 deletions pages/link/share_target.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,12 @@ function AppShareTarget() {
setIsDownloading(true);
callBrowseFetchPage(intentURL)
.then(page => {
if (page.stopReason !== 'error')
queueComposerTextAndLaunchApp('\n\n```' + intentURL + '\n' + page.content + '\n```\n');
else
if (page.stopReason !== 'error') {
let pageContent = page.content.markdown || page.content.text || page.content.html || '';
if (pageContent)
pageContent = '\n\n```' + intentURL + '\n' + pageContent + '\n```\n';
queueComposerTextAndLaunchApp(pageContent);
} else
setErrorMessage('Could not read any data' + page.error ? ': ' + page.error : '');
})
.catch(error => setErrorMessage(error?.message || error || 'Unknown error'))
Expand Down
16 changes: 6 additions & 10 deletions src/apps/chat/components/composer/attachments/pipeline.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,12 @@ export async function attachmentLoadInputAsync(source: Readonly<AttachmentSource
edit({ label: source.refUrl, ref: source.refUrl });
try {
const page = await callBrowseFetchPage(source.url);
if (page.content) {
edit({
input: {
mimeType: 'text/plain',
data: page.content,
dataSize: page.content.length,
},
});
} else
edit({ inputError: 'No content found at this link' });
edit(
page.content.markdown ? { input: { mimeType: 'text/markdown', data: page.content.markdown, dataSize: page.content.markdown.length } }
: page.content.text ? { input: { mimeType: 'text/plain', data: page.content.text, dataSize: page.content.text.length } }
: page.content.html ? { input: { mimeType: 'text/html', data: page.content.html, dataSize: page.content.html.length } }
: { inputError: 'No content found at this link' },
);
} catch (error: any) {
edit({ inputError: `Issue downloading page: ${error?.message || (typeof error === 'string' ? error : JSON.stringify(error))}` });
}
Expand Down
3 changes: 2 additions & 1 deletion src/apps/chat/editors/browse-load.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ export const runBrowseGetPageUpdatingState = async (cHandler: ConversationHandle

try {
const page = await callBrowseFetchPage(url);
cHandler.messageEdit(assistantMessageId, { text: page.content || 'Issue: page load did not produce an answer: no text found', typing: false }, true);
const pageContent = page.content.markdown || page.content.text || page.content.html || 'Issue: page load did not produce an answer: no text found';
cHandler.messageEdit(assistantMessageId, { text: pageContent, typing: false }, true);
return true;
} catch (error: any) {
console.error(error);
Expand Down
3 changes: 2 additions & 1 deletion src/modules/aifn/react/react.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ async function search(query: string): Promise<string> {
async function browse(url: string): Promise<string> {
try {
const page = await callBrowseFetchPage(url);
return JSON.stringify(page.content ? { text: page.content } : { error: 'Issue reading the page' });
const pageContent = page.content.markdown || page.content.text || page.content.html || '';
return JSON.stringify(pageContent ? { text: pageContent } : { error: 'Issue reading the page' });
} catch (error) {
console.error('Error browsing:', (error as Error).message);
return 'An error occurred while browsing to the URL. Missing WSS Key?';
Expand Down
37 changes: 21 additions & 16 deletions src/modules/browse/browse.client.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { BrowsePageTransform, useBrowseStore } from '~/modules/browse/store-module-browsing';
import { useBrowseStore } from '~/modules/browse/store-module-browsing';

import { apiAsyncNode } from '~/common/util/trpc.client';

Expand All @@ -7,34 +7,39 @@ import { apiAsyncNode } from '~/common/util/trpc.client';
const DEBUG_SHOW_SCREENSHOT = false;


export async function callBrowseFetchPage(url: string, forceTransform?: BrowsePageTransform) {
// export function

// throw if no URL is provided
export async function callBrowseFetchPage(
url: string,
// transforms?: BrowsePageTransform[],
// screenshotOptions?: { width: number, height: number, quality?: number },
) {

// validate url
url = url?.trim() || '';
if (!url)
throw new Error('Browsing error: Invalid URL');

// assume https if no protocol is provided
// noinspection HttpUrlsUsage
// noinspection HttpUrlsUsage: assume https if no protocol is provided
if (!url.startsWith('http://') && !url.startsWith('https://'))
url = 'https://' + url;

const { wssEndpoint: clientWssEndpoint, pageTransform } = useBrowseStore.getState();
const { wssEndpoint, pageTransform } = useBrowseStore.getState();

const { pages } = await apiAsyncNode.browse.fetchPages.mutate({
access: {
dialect: 'browse-wss',
...(!!clientWssEndpoint && { wssEndpoint: clientWssEndpoint }),
...(!!wssEndpoint && { wssEndpoint }),
},
subjects: [{
requests: [{
url,
transform: pageTransform || 'text',
transforms: /*transforms ? transforms :*/ [pageTransform],
screenshot: /*screenshotOptions ? screenshotOptions :*/ !DEBUG_SHOW_SCREENSHOT ? undefined : {
width: 512,
height: 512,
// quality: 100,
},
}],
screenshot: DEBUG_SHOW_SCREENSHOT ? {
width: 512,
height: 512,
// quality: 100,
} : undefined,
});

if (pages.length !== 1)
Expand All @@ -45,7 +50,7 @@ export async function callBrowseFetchPage(url: string, forceTransform?: BrowsePa
// DEBUG: if there's a screenshot, append it to the dom
if (DEBUG_SHOW_SCREENSHOT && page.screenshot) {
const img = document.createElement('img');
img.src = page.screenshot.imageDataUrl;
img.src = page.screenshot.webpDataUrl;
img.style.width = `${page.screenshot.width}px`;
img.style.height = `${page.screenshot.height}px`;
document.body.appendChild(img);
Expand All @@ -54,7 +59,7 @@ export async function callBrowseFetchPage(url: string, forceTransform?: BrowsePa
// throw if there's an error
if (page.error) {
console.warn('Browsing service error:', page.error);
if (!page.content)
if (!Object.keys(page.content).length)
throw new Error(page.error);
}

Expand Down
105 changes: 53 additions & 52 deletions src/modules/browse/browse.router.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,41 @@ const browseAccessSchema = z.object({
dialect: z.enum(['browse-wss']),
wssEndpoint: z.string().trim().optional(),
});
type BrowseAccessSchema = z.infer<typeof browseAccessSchema>;

const pageTransformSchema = z.enum(['html', 'text', 'markdown']);
type PageTransformSchema = z.infer<typeof pageTransformSchema>;

const fetchPageInputSchema = z.object({
access: browseAccessSchema,
subjects: z.array(z.object({
requests: z.array(z.object({
url: z.string().url(),
transform: pageTransformSchema,
transforms: z.array(pageTransformSchema),
screenshot: z.object({
width: z.number(),
height: z.number(),
quality: z.number().optional(),
}).optional(),
})),
screenshot: z.object({
width: z.number(),
height: z.number(),
quality: z.number().optional(),
}).optional(),
});


// Output schemas

const fetchPageWorkerOutputSchema = z.object({
url: z.string(),
content: z.string(),
content: z.record(pageTransformSchema, z.string()),
error: z.string().optional(),
stopReason: z.enum(['end', 'timeout', 'error']),
screenshot: z.object({
imageDataUrl: z.string().startsWith('data:image/'),
webpDataUrl: z.string().startsWith('data:image/webp'),
mimeType: z.string().startsWith('image/'),
width: z.number(),
height: z.number(),
}).optional(),
});
type FetchPageWorkerOutputSchema = z.infer<typeof fetchPageWorkerOutputSchema>;


const fetchPagesOutputSchema = z.object({
pages: z.array(fetchPageWorkerOutputSchema),
Expand All @@ -62,40 +65,37 @@ export const browseRouter = createTRPCRouter({
fetchPages: publicProcedure
.input(fetchPageInputSchema)
.output(fetchPagesOutputSchema)
.mutation(async ({ input: { access, subjects, screenshot } }) => {
const pages: FetchPageWorkerOutputSchema[] = [];

for (const subject of subjects) {
try {
pages.push(await workerPuppeteer(access, subject.url, subject.transform, screenshot?.width, screenshot?.height, screenshot?.quality));
} catch (error: any) {
pages.push({
url: subject.url,
content: '',
error: error?.message || JSON.stringify(error) || 'Unknown fetch error',
.mutation(async ({ input: { access, requests } }) => {

const pagePromises = requests.map(request =>
workerPuppeteer(access, request.url, request.transforms, request.screenshot));

const results = await Promise.allSettled(pagePromises);

const pages: FetchPageWorkerOutputSchema[] = results.map((result, index) =>
result.status === 'fulfilled'
? result.value
: {
url: requests[index].url,
content: {},
error: result.reason?.message || 'Unknown fetch error',
stopReason: 'error',
});
}
}
},
);

return { pages };
}),

});


type BrowseAccessSchema = z.infer<typeof browseAccessSchema>;
type FetchPageWorkerOutputSchema = z.infer<typeof fetchPageWorkerOutputSchema>;


async function workerPuppeteer(
access: BrowseAccessSchema,
targetUrl: string,
transform: PageTransformSchema,
ssWidth: number | undefined,
ssHeight: number | undefined,
ssQuality: number | undefined,
transforms: PageTransformSchema[],
screenshotOptions?: { width: number, height: number, quality?: number },
): Promise<FetchPageWorkerOutputSchema> {

const browserWSEndpoint = (access.wssEndpoint || env.PUPPETEER_WSS_ENDPOINT || '').trim();
const isLocalBrowser = browserWSEndpoint.startsWith('ws://');
if (!browserWSEndpoint || (!browserWSEndpoint.startsWith('wss://') && !isLocalBrowser))
Expand All @@ -106,7 +106,7 @@ async function workerPuppeteer(

const result: FetchPageWorkerOutputSchema = {
url: targetUrl,
content: '',
content: {},
error: undefined,
stopReason: 'error',
screenshot: undefined,
Expand Down Expand Up @@ -144,21 +144,23 @@ async function workerPuppeteer(
// transform the content of the page as text
try {
if (result.stopReason !== 'error') {
switch (transform) {
case 'html':
result.content = await page.content();
break;
case 'text':
result.content = await page.evaluate(() => document.body.innerText || document.textContent || '');
break;
case 'markdown':
const html = await page.content();
const cleanedHtml = cleanHtml(html);
const turndownService = new TurndownService({ headingStyle: 'atx' });
result.content = turndownService.turndown(cleanedHtml);
break;
for (const transform of transforms) {
switch (transform) {
case 'html':
result.content.html = cleanHtml(await page.content());
break;
case 'text':
result.content.text = await page.evaluate(() => document.body.innerText || document.textContent || '');
break;
case 'markdown':
const html = await page.content();
const cleanedHtml = cleanHtml(html);
const turndownService = new TurndownService({ headingStyle: 'atx' });
result.content.markdown = turndownService.turndown(cleanedHtml);
break;
}
}
if (!result.content)
if (!Object.keys(result.content).length)
result.error = '[Puppeteer] Empty content';
}
} catch (error: any) {
Expand All @@ -167,10 +169,9 @@ async function workerPuppeteer(

// get a screenshot of the page
try {
if (ssWidth && ssHeight) {
const width = ssWidth;
const height = ssHeight;
const scale = Math.round(100 * ssWidth / 1024) / 100;
if (screenshotOptions?.width && screenshotOptions?.height) {
const { width, height, quality } = screenshotOptions;
const scale = Math.round(100 * width / 1024) / 100;

await page.setViewport({ width: width / scale, height: height / scale, deviceScaleFactor: scale });

Expand All @@ -181,10 +182,10 @@ async function workerPuppeteer(
type: imageType,
encoding: 'base64',
clip: { x: 0, y: 0, width: width / scale, height: height / scale },
...(ssQuality && { quality: ssQuality }),
...(quality && { quality }),
}) as string;

result.screenshot = { imageDataUrl: `data:${mimeType};base64,${dataString}`, mimeType, width, height };
result.screenshot = { webpDataUrl: `data:${mimeType};base64,${dataString}`, mimeType, width, height };
}
} catch (error: any) {
console.error('workerPuppeteer: page.screenshot', error);
Expand Down

0 comments on commit f9d33d4

Please sign in to comment.