Skip to content
This repository has been archived by the owner on Sep 11, 2024. It is now read-only.

Commit

Permalink
initial osrun commit for arkalis
Browse files Browse the repository at this point in the history
  • Loading branch information
lg committed Sep 8, 2023
1 parent 7fe1b3c commit e4e4a23
Show file tree
Hide file tree
Showing 11 changed files with 292 additions and 217 deletions.
18 changes: 11 additions & 7 deletions .devcontainer/devcontainer.json → .devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,18 @@
"ghcr.io/guiyomh/features/just:0": {},
"ghcr.io/devcontainers-contrib/features/actionlint:1": {},
"ghcr.io/dhoeric/features/hadolint:1": {},
"./feature-playwright": { "version": "1.31.1" },
"./feature-novnc": {},
"ghcr.io/lukewiwa/features/shellcheck:0": {}
"ghcr.io/lukewiwa/features/shellcheck:0": {},
"ghcr.io/devcontainers/features/docker-in-docker:2": {}
},
"privileged": true, // to pass through /dev/kvm for osrun (in docker-in-docker)
"mounts": [ // for osrun to store its Windows image
{
"source": "osrun-cache",
"target": "/osruncache",
"type": "volume"
}
],
"postCreateCommand": "npm i",
"containerEnv": {
"CHROME_PATH": "/ms-playwright/chromium-1048/chrome-linux/chrome"
},
"customizations": {
"vscode": {
"extensions": [
Expand All @@ -24,4 +28,4 @@
]
}
}
}
}
13 changes: 0 additions & 13 deletions .devcontainer/feature-novnc/devcontainer-feature.json

This file was deleted.

44 changes: 0 additions & 44 deletions .devcontainer/feature-novnc/install.sh

This file was deleted.

19 changes: 0 additions & 19 deletions .devcontainer/feature-playwright/devcontainer-feature.json

This file was deleted.

9 changes: 0 additions & 9 deletions .devcontainer/feature-playwright/install.sh

This file was deleted.

4 changes: 3 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
"program": "${workspaceFolder}/dist/awardwiz-scrapers/main-debug.js",
"args": "${input:run-scraper-params}",
"envFile": "${workspaceFolder}/.env",
"env": {
"TZ": "America/Los_Angeles",
},
"request": "launch",
"skipFiles": ["<node_internals>/**", "**/node_modules/**"],
"type": "node",
"preLaunchTask": "tsc: build - tsconfig.json",
"console": "integratedTerminal",
"internalConsoleOptions": "openOnSessionStart",
}
],
"inputs": [
Expand Down
2 changes: 1 addition & 1 deletion Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ check: build test
actionlint -color
hadolint **/Dockerfile
npm exec -- ajv -s config.schema.json -d config.json
shellcheck **/*.sh .devcontainer/**/*.sh
shellcheck **/*.sh
NODE_NO_WARNINGS=1 npm exec -- depcheck --ignores depcheck,npm-check,typescript,devtools-protocol,@types/har-format,@iconify/json,~icons,@vitest/coverage-c8,vite-node,node-fetch,geo-tz,@types/node-fetch,@svgr/plugin-jsx,typescript-json-schema,ajv-cli
@echo 'ok'

Expand Down
3 changes: 2 additions & 1 deletion arkalis/arkalis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ export type ArkalisCore = {
pause: () => Promise<unknown>,
scraperMeta: Required<ScraperMetadata>,
debugOptions: Required<DebugOptions>,
identifier: string,
}

type ArkalisPluginBuiltins = {
Expand Down Expand Up @@ -149,7 +150,7 @@ async function runArkalisAttempt<T>(code: (arkalis: Arkalis) => Promise<T>, debu
log(`Starting Arkalis run for scraper ${scraperMeta.name}`)

const loadedPlugins: ArkalisPluginExports[] = []
const arkalisCore: ArkalisCore = { client: undefined! as CDP.Client, log, warn, wait, scraperMeta, debugOptions, pause }
const arkalisCore: ArkalisCore = { client: undefined! as CDP.Client, log, warn, wait, scraperMeta, debugOptions, pause, identifier }

// Loading plugins one at a time, populating the Arkalis object with their exports. Note that though we cast this
// object as ArkalisCore, it can be recast to Arkalis in the plugin, allowing access to previous plugins' exports.
Expand Down
146 changes: 89 additions & 57 deletions arkalis/browser.ts
Original file line number Diff line number Diff line change
@@ -1,62 +1,112 @@
import { promisify } from "util"
import { exec as execNoPromise } from "node:child_process"
import url from "node:url"
import ChromeLauncher from "chrome-launcher"
import { Arkalis, ArkalisCore } from "./arkalis.js"
import CDP from "chrome-remote-interface"
import Dockerode from "dockerode"
import path from "node:path"
import fs from "node:fs"

const launchChromeViaOsRunDocker = async (arkalis: ArkalisCore, switches: string[]) => {
switches.push(...[
"user-data-dir=c:\\chrome-user-data",
arkalis.scraperMeta.useGlobalBrowserCache ? "disk-cache-dir=\"\\\\10.0.2.4\\qemu\\chrome-cache\"" : "",
].filter(s => s !== ""))

const command =
"netsh interface portproxy add v4tov4 listenaddress=10.0.2.15 listenport=9222 connectaddress=127.0.0.1 connectport=9222 " +
` & "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" ${switches.map(s => s.length > 0 ? `--${s}` : "").join(" ")}`
arkalis.debugOptions.browserDebug && arkalis.log(`Launching chrome with command: ${command}`)

const globalBrowserCacheDir = path.resolve(arkalis.debugOptions.globalBrowserCacheDir)
if (arkalis.scraperMeta.useGlobalBrowserCache && !fs.existsSync(globalBrowserCacheDir))
fs.mkdirSync(globalBrowserCacheDir, { recursive: true })
arkalis.debugOptions.browserDebug && arkalis.log(`Using global browser cache: ${globalBrowserCacheDir}`)

const docker = new Dockerode()
const containerName = `arkalis-${arkalis.identifier}`
void docker.run("ghcr.io/lg/osrun", ["-f", "9222", command], process.stdout, {
name: containerName,
ExposedPorts: { "8000/tcp": {}, "9222/tcp": {} },
Env: [ "TZ=America/Los_Angeles" ],
HostConfig: {
AutoRemove: true,
Mounts: [
{ Type: "bind", Source: "/osruncache", Target: "/cache" },
arkalis.scraperMeta.useGlobalBrowserCache ? { Type: "bind", Source: globalBrowserCacheDir, Target: "/tmp/qemu-status/chrome-cache" } : undefined,
].filter(m => !!m) as Dockerode.MountConfig,
Devices: [ { PathOnHost: "/dev/kvm", PathInContainer: "/dev/kvm", CGroupPermissions: "rwm" } ],
PortBindings: { "8000/tcp": [{ HostPort: "8000" }], "9222/tcp": [{ HostPort: "9222" }] },
}
}, )

const exec = promisify(execNoPromise)
arkalis.debugOptions.browserDebug && arkalis.log("Waiting for chrome to be ready on port 9222")
let client = undefined
while ((client = await CDP({ port: 9222 }).catch(() => undefined)) === undefined) {
await arkalis.wait(500)
}
arkalis.debugOptions.browserDebug && arkalis.log("Chrome ready on port 9222")

export const arkalisBrowser = async (arkalis: ArkalisCore) => {
async function genWindowCoords() {
const screenResolution = await exec("xdpyinfo | grep dimensions")
const rawRes = / (?<res>\d+x\d+) /u.exec(screenResolution.stdout)?.groups?.["res"]?.trim().split("x")
if (!rawRes || rawRes.length !== 2)
throw new Error("Unable to get screen resolution")
const res = (rawRes as [string, string]).map(num => parseInt(num)) as [number, number]
const size = [Math.ceil(res[0] * (Math.random() * 0.2 + 0.8)), Math.ceil(res[1] * (Math.random() * 0.2 + 0.8))] as const
return {
size,
pos: [Math.ceil((res[0] - size[0]) * Math.random()), Math.ceil((res[1] - size[1]) * Math.random())] as const
return {
client,
closeBrowser: async () => {
await closeCDPClient(arkalis)

arkalis.debugOptions.browserDebug && arkalis.log("Waiting for browser to close on its own")
const startTime = Date.now()
const container = docker.getContainer(containerName)
while (await container.inspect().catch(() => undefined) !== undefined && Date.now() - startTime < 3000) {
await arkalis.wait(200)
}
if (await container.inspect().catch(() => undefined) !== undefined) {
arkalis.debugOptions.browserDebug && arkalis.log("Browser did not close on its own after 3 seconds, killing it")
await container.stop({ t: 0, signal: "SIGINT" }).catch(() => {})
}
}
}
}

/**
 * Tears down the CDP session held in `arkalis.client`.
 *
 * Disables the Network, Page, Runtime and DOM domains one after another, asks the browser to close,
 * and finally closes the client connection. Every step is best-effort: a rejection from any single
 * call is swallowed so the remaining steps still run (for example when the connection has already
 * dropped).
 *
 * @param arkalis Core object whose `client` is shut down. `debugOptions.browserDebug` gates the
 *   single debug message written through `log`.
 * @returns A promise that resolves once all teardown calls have settled.
 */
const closeCDPClient = async (arkalis: ArkalisCore) => {
  arkalis.debugOptions.browserDebug && arkalis.log("Closing cdp client")

  // Domains are disabled sequentially, in the same order they are listed here.
  for (const domain of [arkalis.client.Network, arkalis.client.Page, arkalis.client.Runtime, arkalis.client.DOM])
    await domain.disable().catch(() => {})

  // Ask the browser to exit before dropping the connection used to send that request.
  await arkalis.client.Browser.close().catch(() => {})
  await arkalis.client.close().catch(() => {})
}

export const arkalisBrowser = async (arkalis: ArkalisCore) => {
// these domains are used by the browser when creating a new profile
const blockDomains = [
"accounts.google.com", "clients2.google.com", "optimizationguide-pa.googleapis.com",
"content-autofill.googleapis.com"
"accounts.google.com", "clients2.google.com", "optimizationguide-pa.googleapis.com", "edgedl.me.gvt1.com",
"content-autofill.googleapis.com", "update.googleapis.com"
]

const switches = [
// these should all be undetectable, but speed things up
"disable-sync", "disable-backgrounding-occluded-windows", "disable-breakpad",
"disable-domain-reliability", "disable-background-networking", "disable-features=AutofillServerCommunication",
"disable-features=CertificateTransparencyComponentUpdater", "enable-crash-reporter-for-testing", "no-service-autorun",
"disable-features=OptimizationHints,MediaRouter,AutofillServerCommunication,CertificateTransparencyComponentUpdater,CalculateNativeWinOcclusion,InterestFeedContentSuggestions,Translate",
"disable-sync", "disable-backgrounding-occluded-windows", "disable-breakpad", "disable-renderer-backgrounding",
"disable-domain-reliability", "disable-background-networking", "disable-background-timer-throttling",
"enable-crash-reporter-for-testing", "no-service-autorun", "disable-ipc-flooding-protection", "password-store=basic",
"no-first-run", "no-default-browser-check", "disable-prompt-on-repost", "disable-client-side-phishing-detection",
"disable-features=InterestFeedContentSuggestions", "disable-features=Translate", "disable-hang-monitor",
"autoplay-policy=no-user-gesture-required", "use-mock-keychain", "disable-omnibox-autocomplete-off-method",
"disable-hang-monitor", "autoplay-policy=no-user-gesture-required", "use-mock-keychain", "disable-omnibox-autocomplete-off-method",
"disable-gaia-services", "disable-crash-reporter", "noerrdialogs", "disable-component-update",
"disable-features=MediaRouter", "metrics-recording-only", "disable-features=OptimizationHints",
"disable-component-update", "disable-features=CalculateNativeWinOcclusion", "enable-precise-memory-info",
"metrics-recording-only", "disable-component-update", "enable-precise-memory-info",
"force-fieldtrials=*BackgroundTracing/default/",

"no-sandbox", "disable-dev-shm-usage", // for linux docker
// "no-sandbox", "disable-dev-shm-usage", // for linux docker

// "disable-blink-features=AutomationControlled", // not working
// "auto-open-devtools-for-tabs",
// "log-net-log=tmp/out.json", "net-log-capture-mode=Everything", // note, does not log requests
// TODO: pass this in dynamically from a hook in the har scraper
"log-net-log=./tmp/netlog.json", "net-log-capture-mode=Everything",
//"log-net-log=./tmp/netlog.json", "net-log-capture-mode=Everything",

arkalis.debugOptions.browserDebug === "verbose" ? "enable-logging=stderr": "",
arkalis.debugOptions.browserDebug === "verbose" ? "v=2" : "",
arkalis.scraperMeta.useGlobalBrowserCache ? `disk-cache-dir=${arkalis.debugOptions.globalBrowserCacheDir}` : "",
`window-position=${window.pos[0]},${window.pos[1]}`,
`window-size=${window.size[0]},${window.size[1]}`,
`host-rules=${blockDomains.map(blockDomain => `MAP ${blockDomain} 0.0.0.0`).join(", ")}`, // NOTE: detectable!
]
// arkalis.scraperMeta.useGlobalBrowserCache ? `disk-cache-dir=${arkalis.debugOptions.globalBrowserCacheDir}` : "",
// `window-position=${window.pos[0]},${window.pos[1]}`,
// `window-size=${window.size[0]},${window.size[1]}`,
`host-rules="${blockDomains.map(blockDomain => `MAP ${blockDomain} 0.0.0.0`).join(", ")}"`, // NOTE: detectable!

"remote-debugging-port=9222"
].filter(s => s !== "")

// apply proxy
const proxy = (arkalis as Arkalis).proxy
Expand All @@ -69,15 +119,9 @@ export const arkalisBrowser = async (arkalis: ArkalisCore) => {
}

// launch chrome
const instance = await ChromeLauncher.launch({
chromeFlags: switches.map(s => s.length > 0 ? `--${s}` : ""),
ignoreDefaultFlags: true,
logLevel: arkalis.debugOptions.browserDebug ? "verbose" : "silent",
})

// connect to cdp client
arkalis.debugOptions.browserDebug && arkalis.log("connecting to cdp client")
arkalis.client = await CDP({ port: instance.port })
const { closeBrowser, client } = await launchChromeViaOsRunDocker(arkalis, switches)
arkalis.client = client

await arkalis.client.Network.enable()
await arkalis.client.Page.enable()
await arkalis.client.Runtime.enable()
Expand All @@ -92,18 +136,6 @@ export const arkalisBrowser = async (arkalis: ArkalisCore) => {
await arkalis.client.Network.setBlockedURLs({ urls: arkalis.scraperMeta.blockUrls })

return {
close: async () => {
arkalis.debugOptions.browserDebug && arkalis.log("closing cdp client and browser")

await arkalis.client.Network.disable().catch(() => {})
await arkalis.client.Page.disable().catch(() => {})
await arkalis.client.Runtime.disable().catch(() => {})
await arkalis.client.DOM.disable().catch(() => {})

await arkalis.client.Browser.close().catch(() => {})
await arkalis.client.close().catch(() => {})

instance.kill()
}
close: closeBrowser
}
}
Loading

0 comments on commit e4e4a23

Please sign in to comment.