Skip to content

Commit

Permalink
added benchmark code for individual page requests
Browse files Browse the repository at this point in the history
  • Loading branch information
amygao9 committed Dec 21, 2020
1 parent 2450bea commit 0449b09
Show file tree
Hide file tree
Showing 5 changed files with 1,302 additions and 7,823 deletions.
14 changes: 12 additions & 2 deletions newCrawler/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ const Apify = require('apify');
const path = require('path');
var { Readability } = require('@mozilla/readability');
var JSDOM = require('jsdom').JSDOM;
const { performance } = require('perf_hooks');

var fs = require('fs');
var util = require('util');
Expand Down Expand Up @@ -100,6 +101,7 @@ Apify.main(async () => {
useChrome: false,
},
handlePageFunction: async ({ request, page }) => {
const t2 = performance.now();
const title = await page.title(); // Get the title of the page.
let domainNameIndex = 5;
let general_regex = /(http(s)?:\/\/((w|W){3}\.)?)([^.]+)((\.[a-zA-Z]+)+)/;
Expand Down Expand Up @@ -195,22 +197,30 @@ Apify.main(async () => {
}
// Add this list to the dict.
output_dict[request.url] = elem;

const t3 = performance.now();
// Log the time for this request.
console.log(`Call to "${request.url}" took ${t3/1000.0 - t2/1000.0} seconds.`);
// Enqueue the deeper URLs to crawl.
await Apify.utils.enqueueLinks({ page, selector: 'a', pseudoUrls, requestQueue });
},
// The max concurrency and max requests to crawl through.
maxRequestsPerCrawl: 20,
maxConcurrency: 10,
});

// High-resolution timestamp (in milliseconds) taken just before the crawl starts.
const t0 = performance.now();
// Run the crawler.
await crawler.run();
const t1 = performance.now();
// Log the time to run the crawler. performance.now() reports milliseconds,
// so the difference is divided by 1000 and labelled in seconds, matching the
// unit used by the per-request timing log.
console.log(`Call to run Crawler took ${(t1 - t0) / 1000.0} seconds.`);

// Delete the apify storage.
// Note: If the apify_storage file is not removed, it doesn't crawl
// during subsequent runs.
// Implementation of rmdir.
console.log(JSON.stringify(output_dict));
// console.log(JSON.stringify(output_dict));

const rmDir = function (dirPath, removeSelf) {
if (removeSelf === undefined)
removeSelf = true;
Expand Down
74 changes: 45 additions & 29 deletions newCrawler/debug.log
Original file line number Diff line number Diff line change
@@ -1,33 +1,49 @@
INFO: System info {"apifyVersion":"0.19.1","apifyClientVersion":"0.5.26","osType":"Darwin","nodeVersion":"v12.18.3"}
WARNING: Neither APIFY_LOCAL_STORAGE_DIR nor APIFY_TOKEN environment variable is set, defaulting to APIFY_LOCAL_STORAGE_DIR="/Volumes/Raiyan-External-HDD/School/University/Work-Study/Repositories/mediacat-domain-crawler/newCrawler/apify_storage"
[
'https://www.cnn.com/',
'https://www.aljazeera.com/',
'https://www.nytimes.com/'
]
WARNING: Neither APIFY_LOCAL_STORAGE_DIR nor APIFY_TOKEN environment variable is set, defaulting to APIFY_LOCAL_STORAGE_DIR="/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/apify_storage"
[ 'https://www.aljazeera.com/news/' ]
INFO: AutoscaledPool: Setting max memory of this run to 2048 MB. Use the APIFY_MEMORY_MBYTES environment variable to override it.
INFO: AutoscaledPool state {"currentConcurrency":0,"desiredConcurrency":2,"systemStatus":{"isSystemIdle":true,"memInfo":{"isOverloaded":false,"limitRatio":0.2,"actualRatio":null},"eventLoopInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":null},"cpuInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":null},"clientInfo":{"isOverloaded":false,"limitRatio":0.3,"actualRatio":null}}}
INFO: Launching Puppeteer {"headless":true,"stealth":false,"useChrome":false,"args":["--no-sandbox","--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"],"timeout":15000,"defaultViewport":{"width":1366,"height":768}}
Title of "https://www.aljazeera.com/" is "Breaking News, World News and Video from Al Jazeera"
Title of "https://www.cnn.com/" is "CNN - Breaking News, Latest News and Videos"
Title of "https://www.nytimes.com/" is "The New York Times - Breaking News, US News, World News and Videos"
Title of "https://www.aljazeera.com/news/" is "News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/africa/" is "Africa News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/middle-east/" is "Middle East News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/latin-america/" is "Latin America News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/asia/" is "Asia News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/live" is "LIVE - Al Jazeera English News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/us-canada/" is "US & Canada News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/asia-pacific/" is "Asia Pacific News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/features/" is "Features | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/europe/" is "Europe News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/us-elections-2020/" is "US Elections 2020 News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/economy/" is "Economy News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/opinion/" is "Opinion | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/videos/" is "Videos News | Today's latest from Al Jazeera"
Title of "https://www.aljazeera.com/investigations/" is "Investigations News | Today's latest from Al Jazeera"
Title of "https://www.cnn.com/privacy" is "CNN Privacy Statement - CNN"
Title of "https://www.cnn.com/terms" is "CNN Terms of Use - CNN"
Title of "https://www.cnn.com/us" is "US News – Top national stories and latest headlines - CNN"
INFO: BasicCrawler: Crawler reached the maxRequestsPerCrawl limit of 20 requests and will shut down soon. Requests that are in progress will be allowed to finish.
INFO: BasicCrawler: Earlier, the crawler reached the maxRequestsPerCrawl limit of 20 requests and all requests that were in progress at that time have now finished. In total, the crawler processed 21 requests and will shut down.
INFO: Crawler request statistics: {"avgDurationMillis":null,"perMinute":0,"finished":0,"failed":0,"retryHistogram":[]}
INFO: AutoscaledPool state {"currentConcurrency":4,"desiredConcurrency":2,"systemStatus":{"isSystemIdle":false,"memInfo":{"isOverloaded":true,"limitRatio":0.2,"actualRatio":1},"eventLoopInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":0.039},"cpuInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":0},"clientInfo":{"isOverloaded":false,"limitRatio":0.3,"actualRatio":0}}}
ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.aljazeera.com/news/2020/12/18/us-muslims-press-organization-of-islamic-cooperation-on-china","retryCount":1,"id":"08gT7tqg6RvqM6b"}
TimeoutError: Navigation timeout of 60000 ms exceeded
at /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/Page.js:675:49)
at Page.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:112:23)
at gotoExtended (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/puppeteer_utils.js:473:15)
at PuppeteerCrawler._defaultGotoFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:365:46)
at PuppeteerCrawler._handleRequestFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:325:35)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/utils.js:409:20
ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.aljazeera.com/news/2020/12/11/china-visa","retryCount":1,"id":"bCtLGC9H7yxQnv9"}
TimeoutError: Navigation timeout of 60000 ms exceeded
at /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/Page.js:675:49)
at Page.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:112:23)
at gotoExtended (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/puppeteer_utils.js:473:15)
at PuppeteerCrawler._defaultGotoFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:365:46)
at PuppeteerCrawler._handleRequestFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:325:35)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/utils.js:409:20
ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.aljazeera.com/news/2020/12/18/in-wake-of-election-loss-trumps-enemies-list-keeps-expanding","retryCount":1,"id":"k4moHyQJ6b5YJCc"}
TimeoutError: Navigation timeout of 60000 ms exceeded
at /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/Page.js:675:49)
at Page.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:112:23)
at gotoExtended (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/puppeteer_utils.js:473:15)
at PuppeteerCrawler._defaultGotoFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:365:46)
at PuppeteerCrawler._handleRequestFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:325:35)
at runMicrotasks (<anonymous>)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/utils.js:409:20
Title of "https://www.aljazeera.com/news/2020/12/20/trump-campaign-will-again-ask-top-court-to-upend-election-result" is "Trump campaign will again ask top court to upend election result | Donald Trump News | Al Jazeera"
Call to "https://www.aljazeera.com/news/2020/12/20/trump-campaign-will-again-ask-top-court-to-upend-election-result" took 0.9164294899702128 seconds.
Title of "https://www.aljazeera.com/news/2020/12/20/us-senator-romney-says-trump-has-a-russia-blind-spot" is "Trump has a Russia ‘blind spot’, says US Senator Romney | Cybercrime News | Al Jazeera"
Call to "https://www.aljazeera.com/news/2020/12/20/us-senator-romney-says-trump-has-a-russia-blind-spot" took 1.1262937870025667 seconds.
Loading

0 comments on commit 0449b09

Please sign in to comment.