Skip to content

Commit

Permalink
Merge pull request #21 from UTMediaCAT/#16-Metascraper-Integration
Browse files Browse the repository at this point in the history
#16 metascraper integration
  • Loading branch information
kstapelfeldt authored Jan 28, 2021
2 parents 59f4f95 + c3db58d commit 82ef730
Show file tree
Hide file tree
Showing 326 changed files with 540 additions and 80 deletions.
33 changes: 33 additions & 0 deletions newCrawler/DBHelpers.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
const mongoose = require('mongoose');
let db = require('./database.js')

// Report connection activity on the console: a single notice the first time the
// connection emits 'open', and a message each time it emits 'error'.
mongoose.connection.once('open', function onOpen() {
    console.log('Connected to DB');
});
mongoose.connection.on('error', function onError(error) {
    console.log("Your Error", error);
});
/**
 * Deletes every document in the `metaModel` collection, then closes the
 * mongoose connection.
 *
 * The connection is closed whether the deletion succeeds or fails, so a failed
 * deletion no longer leaves the connection open.
 *
 * @returns {Promise<void>} Resolves once the deletion has been attempted and
 *     `mongoose.connection.close()` has been called. A deletion failure is
 *     logged, not rethrown.
 */
async function clearDB() {
    try {
        await db.metaModel.deleteMany({});
        console.log("Data deleted"); // Success
    } catch (error) {
        console.log(error); // Failure
    } finally {
        // Runs on both paths; previously only the success path closed the connection.
        mongoose.connection.close();
    }
}
/**
 * Sets `updated` to false on every document in the `metaModel` collection,
 * then closes the mongoose connection.
 *
 * The update uses an empty filter, so it already covers every entry whose
 * `updated` value is true; no preliminary lookup of those entries is needed.
 * The connection is closed whether the update succeeds or fails.
 *
 * @returns {Promise<void>} Resolves once the update has been attempted and
 *     `mongoose.connection.close()` has been called. An update failure is
 *     logged, not rethrown.
 */
async function resetUpdatedVar() {
    try {
        await db.metaModel.updateMany({}, {'updated': false});
        console.log("Data reset to updated = false");
    } catch (err) {
        // Previously a rejected update escaped as an unhandled rejection.
        console.log(err);
    } finally {
        // Previously the error paths returned without closing the connection.
        mongoose.connection.close();
    }
}
// Command-line dispatch: the first argument after the script path selects the
// maintenance task ('clear' or 'reset'); any other value runs nothing.
let arg = process.argv[2];
console.log(arg);
switch (arg) {
    case 'clear':
        clearDB();
        break;
    case 'reset':
        resetUpdatedVar();
        break;
}
8 changes: 5 additions & 3 deletions newCrawler/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -278,15 +278,17 @@ Apify.main(async () => {

let elem = {
title: parsedArticle.title,
title_metascraper: '',
url: request.url,
author_metadata: parsedArticle.byline,
author_metascraper: '',
date: '',
html_content: parsedArticle.content,
article_text: parsedArticle.textContent,
article_len: parsedArticle.length,
domain: url_list[listIndex],
found_urls: tuple_list,
out_of_scope_urls: local_out_of_scope
domain: domain_url,
updated: false,
found_urls: tuple_list
}

// Create a JSON for this link with a uuid.
Expand Down
3 changes: 3 additions & 0 deletions newCrawler/database.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@ mongoose.connect('mongodb://localhost/articles', {useNewUrlParser: true, useUnif

const schema = new mongoose.Schema({
title: String,
title_metascraper: String,
url: String,
author_metadata: String,
author_metascraper: String,
date: String,
html_content: String,
article_text: String,
article_len: String,
domain: String,
updated: Boolean,
found_urls: [{
title: String,
url: String
Expand Down
1 change: 1 addition & 0 deletions newCrawler/domainTest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"nytimes":["https://www.nytimes.com/live/2021/01/07/us/capitol-building-trump?action=click&module=Spotlight&pgtype=Homepage","https://www.nytimes.com/2021/01/07/technology/trump-social-media.html","https://www.nytimes.com/2021/01/07/us/now-it-can-be-told-how-neil-sheehan-got-the-pentagon-papers.html","https://www.nytimes.com/2021/01/07/style/archives-hashtag-party.html","https://www.nytimes.com/2021/01/07/opinion/capitol-riot-trump-america.html?action=click&module=Opinion&pgtype=Homepage"],"cnn":["https://www.cnn.com/2021/01/07/politics/trump-biden-us-capitol-electoral-college-insurrection/index.html","https://www.cnn.com/2021/01/07/politics/law-enforcement-capitol-riot/index.html","https://www.cnn.com/politics","https://www.cnn.com/2021/01/07/politics/georgia-senate-runoff-loeffler-warnock/index.html"]}
132 changes: 55 additions & 77 deletions newCrawler/getDates.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,129 +11,107 @@ const metascraper = require('metascraper')([
const linkFile = './link_title_list.json';

const got = require('got');
var fs = require('fs') ;

exports.getDate = (file) => {
readFile(file, promiseLoop);
}


/**
 * Reads `linkFile` with fs.readFile, parses its contents as JSON and passes
 * the parsed value to `callback`.
 *
 * Errors are not passed to `callback`: a read error, a JSON parse error, or an
 * error thrown by `callback` itself is thrown from inside the fs.readFile
 * completion handler.
 *
 * @param {string} linkFile - path of the JSON file to read.
 * @param {function(*): void} callback - receives the parsed JSON value.
 */
function readFile(linkFile, callback){
    const onRead = (readError, contents) => {
        if (readError) {
            throw readError;
        }
        // A parse failure or a throwing callback propagates unchanged.
        callback(JSON.parse(contents));
    };

    fs.readFile(linkFile, onRead);
}
const mongoose = require('mongoose');
let db = require('./database.js')

// Report connection activity on the console: a single notice the first time the
// connection emits 'open', and a message each time it emits 'error'.
mongoose.connection.once('open', function onOpen() {
    console.log('Connected to DB');
});
mongoose.connection.on('error', function onError(error) {
    console.log("Your Error", error);
});
exports.getDate = () => {
ongoing = true;
databaseLoop();
}
/**
 * Queries for documents whose `updated` flag is false.
 *
 * If any are found, logs how many there are and passes them to promiseLoop.
 * If none are found, queries for the documents flagged `updated: true` so that
 * their count can be logged, and logs the completion message.
 * Query errors are passed to handleError.
 */
function databaseLoop() {
    // Runs after the "nothing left to process" branch: print how many documents are updated.
    const reportUpdated = (err, updatedDocs) => {
        if (err) return handleError(err);
        console.log(updatedDocs.length);
    };

    const handlePending = (err, pendingDocs) => {
        if (err) return handleError(err);

        if (pendingDocs.length === 0) {
            db.metaModel.find({'updated': true}, reportUpdated);
            console.log('we are done! :)');
            return;
        }

        // Some links still have not been updated: print the count and process them.
        console.log(pendingDocs.length);
        promiseLoop(pendingDocs);
    };

    db.metaModel.find({'updated': false}, handlePending);
}

async function promiseLoop(json) {
let newJson = await promiseLoopAsync(json);
console.log(newJson);

console.log('we are done! :)');

fs.writeFile('metadata_modified_list.json', JSON.stringify(newJson), function(err) {
// console.log(newJson)
if (err) {
console.log('The date json has been processed with an error');
console.log(err);
} else {
console.log('complete!');
}
});
await promiseLoopAsync(json);
databaseLoop();
}


async function promiseLoopAsync (json) {

let newJson = {};
let key;

for (key in json) {
let articlesList = json[key];
let promises = [];
let i;
for (i = 0; i < articlesList.length; i++) {
let article = articlesList[i];
promises.push(promiseDate(article));
}
await Promise.allSettled(promises).then( (promiseResults) => {
promises = []
for (let i = 0; i < json.length; i++) {
let article = json[i];
promises.push(promiseDate(article));
await Promise.allSettled(promises).then( (promiseResults) => {
let articleListMetadata = [];
promiseResults.forEach(
(result) => {
async (result) => {
if (result.status === 'fulfilled') {
// change from list to object here TODO: Raiyan
let link = result.value[0];
let metascraper = result.value[result.value.length - 1];
let date = metascraper.date
// console.log("result : ", result)
console.log('Valid link ' + link + 'with date ' + date);
articleListMetadata.push(result.value);
newArticle = result.value;
let date = newArticle["date"];
console.log('Valid link ' + newArticle["url"] + ' with date ' + date + ' with author ' + newArticle["author_metadata"] +
' with title ' + newArticle["title"]);
await db.metaModel.updateOne({"_id":newArticle["_id"]},
{"date":date, "updated":true, "author_metascraper":newArticle["author_metascraper"], "title_metascraper":newArticle["title_metascraper"]});
} else {
// console.log("result : ", result)
console.log('Failed link ' + result.reason);
articleListMetadata.push(result.reason);
}
}
);

newJson[key] = articleListMetadata;

}).catch ((e) =>
// this part is suppose to be unreachable
}).catch ((e) =>
// this part is supposed to be unreachable
console.log(e)
);
);

}

return newJson;

}


// article is an array
function promiseDate (article) {

let promise = new Promise ((resolve, reject) => {
let link = article[0];
let link = article['url'];
got(link)
.on('request', request => setTimeout(() => request.destroy(), 20000))
.then(({ body: html, url }) => {
metascraper({ html, url }).then((metadata) => {
console.log('metadata : ', metadata);
console.log('Pass: '+link+ ' '+metadata['date']);
article.push(metadata);
article['date'] = metadata['date'];
article['title_metascraper'] = metadata['title'];
article['author_metascraper'] = metadata['author'];
resolve(article);
}).catch ((e) => {
console.log('error : ', e.hostname);
console.log('Fail: '+link);
article.push(null);
reject(article);
});
}).catch((e) => {
console.log('error : ', e.hostname);
console.log('Fail: at get request: ' + link);
article.push(null);
reject(article);
});
});
return promise;
}

/**
 * Reports an error by printing it to the console.
 *
 * @param {*} err - the error value to print.
 * @returns {undefined} Nothing is returned.
 */
function handleError(err) {
    console.log(err);
}
if (require.main === module) {
this.getDate(linkFile);
this.getDate();
}
1 change: 1 addition & 0 deletions newCrawler/metadata_modified_list.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"https://www.aljazeera.com/":[],"https://www.cnn.com/":[],"https://www.nytimes.com/":[],"https://www.aljazeera.com/news/":[],"https://www.aljazeera.com/africa/":[],"https://www.aljazeera.com/middle-east/":[],"https://www.aljazeera.com/latin-america/":[],"https://www.aljazeera.com/asia/":[],"https://www.aljazeera.com/us-canada/":[],"https://www.aljazeera.com/live":[],"https://www.aljazeera.com/asia-pacific/":[],"https://www.aljazeera.com/europe/":[],"https://www.aljazeera.com/features/":[],"https://www.aljazeera.com/us-elections-2020/":[],"https://www.aljazeera.com/economy/":[],"https://www.aljazeera.com/opinion/":[],"https://www.aljazeera.com/investigations/":[],"https://www.aljazeera.com/videos/":[],"https://www.cnn.com/privacy":[],"https://www.cnn.com/terms":[],"https://www.cnn.com/us":[]}
Loading

0 comments on commit 82ef730

Please sign in to comment.