Skip to content

Commit

Permalink
merged branch #19-benchmark-log
Browse files Browse the repository at this point in the history
  • Loading branch information
jacqueline-chan committed Jan 8, 2021
2 parents 7368a55 + 0449b09 commit 822c864
Show file tree
Hide file tree
Showing 5 changed files with 272 additions and 7,907 deletions.
5 changes: 4 additions & 1 deletion newCrawler/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ const Apify = require('apify');
const path = require('path');
var { Readability } = require('@mozilla/readability');
var JSDOM = require('jsdom').JSDOM;

const { v5: uuidv5 } = require('uuid');
const parse = require('csv-parse/lib/sync')
const { performance } = require('perf_hooks');

var fs = require('fs');
var util = require('util');
Expand Down Expand Up @@ -331,7 +333,8 @@ Apify.main(async () => {
// Note: If the apify_storage file is not removed, it doesn't crawl
// during subsequent runs.
// Implementation of rmdir.
console.log(JSON.stringify(output_dict));
// console.log(JSON.stringify(output_dict));

const rmDir = function (dirPath, removeSelf) {
if (removeSelf === undefined)
removeSelf = true;
Expand Down
154 changes: 46 additions & 108 deletions newCrawler/debug.log
Original file line number Diff line number Diff line change
@@ -1,112 +1,50 @@
INFO: System info {"apifyVersion":"0.19.1","apifyClientVersion":"0.5.26","osType":"Darwin","nodeVersion":"v12.18.3"}
WARNING: Neither APIFY_LOCAL_STORAGE_DIR nor APIFY_TOKEN environment variable is set, defaulting to APIFY_LOCAL_STORAGE_DIR="/Volumes/Raiyan-External-HDD/School/University/Work-Study/Repositories/mediacat-domain-crawler/newCrawler/apify_storage"
SCOPE:
[
'https://www.tvnz.co.nz/one-news/',
'https://7days.ru/',
'https://www.aamulehti.fi/',
'https://abcnews.go.com/',
'https://www.aberdeennews.com/',
'https://www.ad.nl/',
'https://www.adnkronos.com/',
'https://africanmanager.com/',
'https://www.afp.com/en/',
'https://www.afp.com/',
'https://www.agi.it/',
'https://www.alarab.com/',
'https://www.alarabiya.net/',
'https://www.alaraby.co.uk/',
'https://www.albawaba.com/',
'https://alghad.com/',
'https://www.aljazeera.com/',
'http://almashareq.com/',
'http://english.ahram.org.eg/',
'https://www.al-akhbar.com/',
'http://aljazeerah.info/',
'https://www.al-monitor.com/',
'http://www.aps.dz/',
'https://allafrica.com/',
'https://www.alternet.org/',
'https://www.ambito.com/',
'https://www.americanisraelite.com/',
'https://www.adn.com/',
'https://www.aa.com.tr/',
'https://www.anp.nl/',
'http://www.ansa.it/',
'https://answersingenesis.org/answers/magazine/',
'https://en.antaranews.com/',
'https://www.antiwar.com/',
'https://www.aporrea.org/',
'https://www.appeal-democrat.com/',
'http://www.arabnews.com/',
'https://www.artforum.com/',
'https://ag.org/',
'https://atlantajewishtimes.timesofisrael.com/',
'https://gellerreport.com/',
'https://www.keystone-sda.ch/',
'https://www.statesman.com/',
'https://www.afr.com/',
'https://www.axios.com/',
'http://axisoflogic.com/',
'https://www.bna.bh/',
'https://jewishtimes.com/',
'http://www.bpnews.net/',
'https://www.bbc.com/news/',
'https://www.bharian.com.my/',
'https://www.morgenpost.de/',
'http://bernama.com/en/',
'https://www.bloomberg.com/',
'https://www.bndestem.nl/',
'http://bostonglobe.com/',
'https://www.bd.nl/',
'https://www.breakingisraelnews.com/',
'https://www.breakingnews.ie/',
'https://briarpatchmagazine.com/',
'https://www.breitbart.com/',
'http://buffalonews.com/',
'https://www.buzzfeed.com/',
'https://calgaryherald.com/',
'https://www.capebretonpost.com/',
'http://www.caretas.com.pe/',
'http://www.cartacapital.com.br/',
'https://www.cbsnews.com/',
'http://www.charentelibre.fr/',
'https://www.charismamag.com/',
'https://www.chicagojewishnews.com/',
'https://chicago.suntimes.com/',
'https://www.chicagotribune.com/',
'http://chinadaily.com.cn/',
'http://www.ecns.cn/',
'https://www1.cbn.com/',
'https://www.christiancentury.org/',
'https://www.christianheadlines.com/',
'https://christiannewsjournal.com/',
'https://www.csmonitor.com/',
'https://www.christianitytoday.com/',
'https://citizentruth.org/',
'http://www.clarin.com/',
'https://www.clevelandjewishnews.com/',
'https://www.cnn.com/',
'https://www.cogencis.com/newssection/news/',
'http://www.cjr.org/',
'http://columbian.com/',
'https://columbustelegram.com/',
'https://www.commondreams.org/',
'https://www.concordmonitor.com/',
'https://www.corriere.it/',
'https://www.counter-currents.com/',
'http://www.counterpunch.org/',
'https://www.crikey.com.au/',
'https://www.ctk.eu/',
'https://cyprus-mail.com/',
'https://www.dvhn.nl/',
'https://www.dailybreeze.com/',
'https://www.dailymail.co.uk/',
... 1271 more items
]
Results folder created.
WARNING: Neither APIFY_LOCAL_STORAGE_DIR nor APIFY_TOKEN environment variable is set, defaulting to APIFY_LOCAL_STORAGE_DIR="/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/apify_storage"
[ 'https://www.aljazeera.com/news/' ]
INFO: AutoscaledPool: Setting max memory of this run to 2048 MB. Use the APIFY_MEMORY_MBYTES environment variable to override it.
INFO: AutoscaledPool state {"currentConcurrency":0,"desiredConcurrency":2,"systemStatus":{"isSystemIdle":true,"memInfo":{"isOverloaded":false,"limitRatio":0.2,"actualRatio":null},"eventLoopInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":null},"cpuInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":null},"clientInfo":{"isOverloaded":false,"limitRatio":0.3,"actualRatio":null}}}
INFO: Launching Puppeteer {"headless":true,"stealth":false,"useChrome":false,"args":["--no-sandbox","--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"],"timeout":15000,"defaultViewport":{"width":1366,"height":768}}
Title of "https://www.tvnz.co.nz/one-news" is "Home | 1 NEWS | TVNZ"
Title of "https://7days.ru" is "7Дней.ру - Новости, интервью, фото и видео звезд"
INFO: Crawler request statistics: {"avgDurationMillis":null,"perMinute":0,"finished":0,"failed":0,"retryHistogram":[]}
INFO: AutoscaledPool state {"currentConcurrency":4,"desiredConcurrency":2,"systemStatus":{"isSystemIdle":false,"memInfo":{"isOverloaded":true,"limitRatio":0.2,"actualRatio":1},"eventLoopInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":0.039},"cpuInfo":{"isOverloaded":false,"limitRatio":0.4,"actualRatio":0},"clientInfo":{"isOverloaded":false,"limitRatio":0.3,"actualRatio":0}}}
ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.aljazeera.com/news/2020/12/18/us-muslims-press-organization-of-islamic-cooperation-on-china","retryCount":1,"id":"08gT7tqg6RvqM6b"}
TimeoutError: Navigation timeout of 60000 ms exceeded
at /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/Page.js:675:49)
at Page.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:112:23)
at gotoExtended (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/puppeteer_utils.js:473:15)
at PuppeteerCrawler._defaultGotoFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:365:46)
at PuppeteerCrawler._handleRequestFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:325:35)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/utils.js:409:20
ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.aljazeera.com/news/2020/12/11/china-visa","retryCount":1,"id":"bCtLGC9H7yxQnv9"}
TimeoutError: Navigation timeout of 60000 ms exceeded
at /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/Page.js:675:49)
at Page.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:112:23)
at gotoExtended (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/puppeteer_utils.js:473:15)
at PuppeteerCrawler._defaultGotoFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:365:46)
at PuppeteerCrawler._handleRequestFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:325:35)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/utils.js:409:20
ERROR: BasicCrawler: handleRequestFunction failed, reclaiming failed request back to the list or queue {"url":"https://www.aljazeera.com/news/2020/12/18/in-wake-of-election-loss-trumps-enemies-list-keeps-expanding","retryCount":1,"id":"k4moHyQJ6b5YJCc"}
TimeoutError: Navigation timeout of 60000 ms exceeded
at /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:111:15)
at Page.goto (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/Page.js:675:49)
at Page.<anonymous> (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/node_modules/puppeteer/lib/helper.js:112:23)
at gotoExtended (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/puppeteer_utils.js:473:15)
at PuppeteerCrawler._defaultGotoFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:365:46)
at PuppeteerCrawler._handleRequestFunction (/Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/crawlers/puppeteer_crawler.js:325:35)
at runMicrotasks (<anonymous>)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async /Users/Amy0127/Desktop/Mediacat/mediacat-domain-crawler/newCrawler/node_modules/apify/build/utils.js:409:20
Title of "https://www.aljazeera.com/news/2020/12/20/trump-campaign-will-again-ask-top-court-to-upend-election-result" is "Trump campaign will again ask top court to upend election result | Donald Trump News | Al Jazeera"
Call to "https://www.aljazeera.com/news/2020/12/20/trump-campaign-will-again-ask-top-court-to-upend-election-result" took 0.9164294899702128 seconds.
Title of "https://www.aljazeera.com/news/2020/12/20/us-senator-romney-says-trump-has-a-russia-blind-spot" is "Trump has a Russia ‘blind spot’, says US Senator Romney | Cybercrime News | Al Jazeera"
Call to "https://www.aljazeera.com/news/2020/12/20/us-senator-romney-says-trump-has-a-russia-blind-spot" took 1.1262937870025667 seconds.
>>>>>>> origin/#19-benchmark-log
Loading

0 comments on commit 822c864

Please sign in to comment.