Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Site Name #11

Open
wants to merge 26 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,22 @@ var title = require('./lib/parser/title');
var summary = require('./lib/parser/summary');

// NOTE(review): this span is a rendered diff of index.js; the removed and the added
// versions of `extractData` appear interleaved below, so it is not runnable as shown.
module.exports = {
extractData: function (articleUrl, callback) {
request(articleUrl, function (err, response, body) {
var data = {};
var preppedHtml = cleaner.prepForParsing(body);
extractData: function (articleUrl, callback) {
// NOTE(review): `err` is never inspected in the visible code; `body` is used unconditionally.
request(articleUrl, function (err, response, body) {

data.domain = url.parse(articleUrl).host;
data.author = author.getAuthor(preppedHtml);
data.title = title.getTitle(preppedHtml);
data.content = content.getArticleContent(preppedHtml, data.host);
data.summary = summary.getSummary(preppedHtml, data.content);
var preppedHtml = cleaner.prepForParsing(body);
// Added version: the result carries the prepared HTML under `body` and the parsed
// fields under `article`.
var data = {
body:preppedHtml,
article: {}
};

callback(null, data);
});
}
data.article.domain = url.parse(articleUrl).host;
data.article.author = author.getAuthor(preppedHtml);
data.article.title = title.getTitle(preppedHtml);
// NOTE(review): `data.host` is never assigned in the visible code (the host is stored on
// `data.article.domain`), so the second argument looks like it is always undefined - confirm.
data.article.content = content.getArticleContent(preppedHtml, data.host);
// NOTE(review): `data.content` is likewise never assigned; the content was just written to
// `data.article.content` - confirm which one getSummary should receive.
data.article.summary = summary.getSummary(preppedHtml, data.content);
callback(null, data);

});
}
}
18 changes: 18 additions & 0 deletions lib/cleaner/clean-formatting-async.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
var jsdom = require("jsdom-no-contextify");
var fs = require("fs");
var jquery = fs.readFileSync(__dirname + '/../jquery.js', "utf-8");
/**
* Cleans up parsed HTML formatting by removing newlines.
*
* @param rawHtml
* @returns {string}
*/
module.exports = function (rawHtml,cb) {
setTimeout(function(){
rawHtml = rawHtml
.replace(/\n/g, '')
.trim();

cb(rawHtml);
});
};
18 changes: 18 additions & 0 deletions lib/cleaner/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,15 @@

var prepForParsing = require('./prep-for-parsing');
var removeAttributes = require('./remove-attributes');
var removeAttributesAsync = require('./remove-attributes-async');
var cleanFormatting = require('./clean-formatting');
var cleanFormattingAsync = require('./clean-formatting-async');
var removeSocialElements = require('./remove-social-elements');
var removeSocialElementsAsync = require('./remove-social-elements-async');
var removeNavigationalElements = require('./remove-navigational-elements');
var removeNavigationalElementsAsync = require('./remove-navigational-elements-async');
var removeEmptyElements = require('./remove-empty-elements');
var removeEmptyElementsAsync = require('./remove-empty-elements-async');

module.exports = {
prepForParsing: prepForParsing,
Expand All @@ -25,5 +30,18 @@ module.exports = {
rawHtml = cleanFormatting(rawHtml);

return rawHtml;
},
cleanAfterParsingAsync: function (rawHtml, host,cb) {
removeAttributesAsync(rawHtml,function(rawHtml){
removeSocialElementsAsync(rawHtml,function(rawHtml){
removeNavigationalElementsAsync(rawHtml, host,function(rawHtml){
removeEmptyElementsAsync(rawHtml,function(rawHtml){
cleanFormattingAsync(rawHtml,function(rawHtml){
cb(rawHtml);
})
});
})
})
});
}
};
5 changes: 5 additions & 0 deletions lib/cleaner/prep-for-parsing.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@ var blacklistRegex = /ads|social|comment/i;
* @param rawHtml
*/
module.exports = function (rawHtml) {

if(!rawHtml){
return '';
}

var $ = cheerio.load(rawHtml);

var $body = $('body');
Expand Down
45 changes: 45 additions & 0 deletions lib/cleaner/remove-attributes-async.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
var cheerio = require('cheerio');
var _ = require('lodash');
var jsdom = require("jsdom-no-contextify");
var fs = require("fs");
var jquery = fs.readFileSync(__dirname + '/../jquery.js', "utf-8");

var attributesToKeep = [
'src',
'href',
'target'
];

/**
* Removes all attributes from a given HTML string, except for the ones we're still interested in, such as img src,
* anchor hrefs, ...
*
* @param rawHtml
*/
module.exports = function (rawHtml,cb) {

if(!rawHtml){
cb('');
}

jsdom.env({
html: rawHtml,
src: [jquery],
done: function (errors, window) {
var $ = require("jquery")(window);
$('*').each(function () {
var element = this;
var attributes = _.chain(element.attribs)
.keys()
.difference(attributesToKeep)
.value();

attributes.forEach(function (attribute) {
$(element).removeAttr(attribute);
});
});

cb($.html());
}
});
};
5 changes: 5 additions & 0 deletions lib/cleaner/remove-attributes.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ var attributesToKeep = [
* @param rawHtml
*/
module.exports = function (rawHtml) {

if(!rawHtml){
return '';
}

var $ = cheerio.load(rawHtml);

$('*').each(function () {
Expand Down
36 changes: 36 additions & 0 deletions lib/cleaner/remove-empty-elements-async.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
var cheerio = require('cheerio');
var _ = require('lodash');
var jsdom = require("jsdom-no-contextify");
var fs = require("fs");
var jquery = fs.readFileSync(__dirname + '/../jquery.js', "utf-8");

/**
* Removes all empty elements.
*
* @param rawHtml
*/
module.exports = function (rawHtml,cb) {
if(!rawHtml){
cb('');
}

jsdom.env({
html: rawHtml,
src: [jquery],
done: function (errors, window) {
var $ = require("jquery")(window);
$('*').each(function () {
var children = $(this).children().length;
var content = $(this).text().replace(/\t|\s/g, '');
var isImage = $(this)[0].tagName === 'img';

if (!children && !content && !isImage) {
$(this).remove();
}
});

cb($.html());
}
});

}
3 changes: 3 additions & 0 deletions lib/cleaner/remove-empty-elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ var _ = require('lodash');
* @param rawHtml
*/
module.exports = function (rawHtml) {
if(!rawHtml){
return '';
}
var $ = cheerio.load(rawHtml);

$('*').each(function () {
Expand Down
60 changes: 60 additions & 0 deletions lib/cleaner/remove-navigational-elements-async.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
var cheerio = require('cheerio');
var _ = require('lodash');
var jsdom = require("jsdom-no-contextify");
var fs = require("fs");
var jquery = fs.readFileSync(__dirname + '/../jquery.js', "utf-8");

/**
* Removes all elements that are used for navigation (such as 'to top' links, article tags, ...)
*
* @param rawHtml
*/
module.exports = function (rawHtml, host,cb) {
if(!rawHtml){
cb('');
}
jsdom.env({
html: rawHtml,
src: [jquery],
done: function (errors, window) {
var $ = require("jquery")(window);
// Filter out 'back to top' links
$('a').filter(function () {
var hasTopInText = $(this).text().toLowerCase().indexOf('top') > -1;

var hasHashInHref;
if($(this).attr('href')){
hasHashInHref = $(this).attr('href').indexOf('#') > -1;
}
return hasTopInText && hasHashInHref;
}).remove();

// Filter out any links that have the `rel="tag"` attribute, or link back to the same host with 'tag' in the URL.
$('a').each(function () {
var relTag = $(this).attr('rel');
var href = $(this).attr('href');

if(!href){
return;
}
var isRelTag = relTag === 'tag';
var isPartOfList = $(this).parents('ul').length > 0;
var containsUrlWithTag = href.indexOf(host) > -1 && href.indexOf('tag') > -1;

if (isRelTag || containsUrlWithTag) {
if (isPartOfList) {
$(this).parents('ul').remove();
}
else {
$(this).remove();
}
}

// Remove any other elements with a `tags` class.
$('.tags').remove();
});

cb($.html());
}
});
}
12 changes: 11 additions & 1 deletion lib/cleaner/remove-navigational-elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,19 @@ var _ = require('lodash');
* @param rawHtml
*/
module.exports = function (rawHtml, host) {
if(!rawHtml){
return '';
}
var $ = cheerio.load(rawHtml);

// Filter out 'back to top' links
$('a').filter(function () {
var hasTopInText = $(this).text().toLowerCase().indexOf('top') > -1;
var hasHashInHref = $(this).attr('href').indexOf('#') > -1;

var hasHashInHref;
if($(this).attr('href')){
hasHashInHref = $(this).attr('href').indexOf('#') > -1;
}
return hasTopInText && hasHashInHref;
}).remove();

Expand All @@ -21,6 +28,9 @@ module.exports = function (rawHtml, host) {
var relTag = $(this).attr('rel');
var href = $(this).attr('href');

if(!href){
return;
}
var isRelTag = relTag === 'tag';
var isPartOfList = $(this).parents('ul').length > 0;
var containsUrlWithTag = href.indexOf(host) > -1 && href.indexOf('tag') > -1;
Expand Down
3 changes: 3 additions & 0 deletions lib/cleaner/remove-post-data-elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ var _ = require('lodash');
* @param rawHtml
*/
module.exports = function (rawHtml, host) {
if(!rawHtml){
return '';
}
var $ = cheerio.load(rawHtml);

$('*[property="author"]').remove();
Expand Down
49 changes: 49 additions & 0 deletions lib/cleaner/remove-social-elements-async.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
var cheerio = require('cheerio');
var _ = require('lodash');
var jsdom = require("jsdom-no-contextify");
var fs = require("fs");
var jquery = fs.readFileSync(__dirname + '/../jquery.js', "utf-8");

var shareUrls = [
'twitter.com/intent',
'facebook.com/sharer'
];

/**
* Removes all elements that contain any social keywords.
*
* @param rawHtml
*/
module.exports = function (rawHtml,cb) {
if(!rawHtml){
cb('');
}

jsdom.env({
html: rawHtml,
src: [jquery],
done: function (errors, window) {
var $ = require("jquery")(window);
$('*').each(function () {
var text = $(this).text().toLowerCase();
var possibleSocialElement = text.indexOf('share on') > -1;

if (possibleSocialElement) {
var anchors = $(this).find('a');
anchors.each(function () {
var $anchor = $(this);
var href = $anchor.attr('href');

_.each(shareUrls, function (shareUrl) {
if (href.indexOf(shareUrl) > -1) {
$anchor.remove();
}
});
});
}
});

cb($.html());
}
});
}
3 changes: 3 additions & 0 deletions lib/cleaner/remove-social-elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ var shareUrls = [
* @param rawHtml
*/
module.exports = function (rawHtml) {
if(!rawHtml){
return '';
}
var $ = cheerio.load(rawHtml);

$('*').each(function () {
Expand Down
4 changes: 4 additions & 0 deletions lib/jquery.js

Large diffs are not rendered by default.

Loading