Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements in data structure (keywords and image added) and corrections in href logic #12

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
node_modules/
data/
data/
12 changes: 11 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,16 @@ The result looks like this:
"author": "Thomas Tuts",
"title": "Article Extractor Demo",
"summary": "A Node.js module to retrieve article content and metadata from a URL.",
"content": "<p>This is the article content.</p>"
"content": "<p>This is the article content.</p>",
"keywords": "article,extractor,demo,node,module,metadata",
"image": "http://ep.yimg.com/ca/I/paulgraham_2271_3232"
}
```

## Change log

### 1.1.0
* Corrected the logic that checks for the presence of an `href` attribute on `<a>` elements and for its '#' marker
* Improved structure with keywords based on the meta tags `<meta name="keywords">` and `<meta class="swiftype" name="tags">`
* Improved structure with image, chosen by a scored rank (how close each image is to the `<main>` or `<body>` element) or taken from `<meta class="swiftype" name="image">`
* Added an option to use `<meta class="swiftype" name="blogger_name">` as a source for the author name
4 changes: 4 additions & 0 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ var author = require('./lib/parser/author');
var content = require('./lib/parser/content');
var title = require('./lib/parser/title');
var summary = require('./lib/parser/summary');
var keywords = require('./lib/parser/keywords');
var image = require('./lib/parser/image');

module.exports = {
extractData: function (articleUrl, callback) {
Expand All @@ -17,6 +19,8 @@ module.exports = {
data.title = title.getTitle(preppedHtml);
data.content = content.getArticleContent(preppedHtml, data.host);
data.summary = summary.getSummary(preppedHtml, data.content);
data.keywords = keywords.getKeywords(preppedHtml);
data.image = image.getImage(preppedHtml);

callback(null, data);
});
Expand Down
4 changes: 2 additions & 2 deletions lib/cleaner/remove-navigational-elements.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ module.exports = function (rawHtml, host) {
// Filter out 'back to top' links
$('a').filter(function () {
var hasTopInText = $(this).text().toLowerCase().indexOf('top') > -1;
var hasHashInHref = $(this).attr('href').indexOf('#') > -1;
var hasHashInHref = $(this).attr('href') ? $(this).attr('href').indexOf('#') > -1 : false;
return hasTopInText && hasHashInHref;
}).remove();

Expand All @@ -23,7 +23,7 @@ module.exports = function (rawHtml, host) {

var isRelTag = relTag === 'tag';
var isPartOfList = $(this).parents('ul').length > 0;
var containsUrlWithTag = href.indexOf(host) > -1 && href.indexOf('tag') > -1;
var containsUrlWithTag = href ? href.indexOf(host) > -1 && href.indexOf('tag') > -1 : false;

if (isRelTag || containsUrlWithTag) {
if (isPartOfList) {
Expand Down
3 changes: 2 additions & 1 deletion lib/parser/author.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@ var cheerio = require('cheerio');
function getAuthor(html) {
// Parse the HTML string so the author candidates below can be queried with selectors.
var $ = cheerio.load(html);

// Candidate sources for the author name:
//  - content of the first <meta class="swiftype" name="blogger_name">
//  - content of <meta name="author">
//  - text of the first element with rel="author"
//  - text of the first element with class "author"
var swifttypeAuthor = $('meta[class="swiftype"]').filter('[name="blogger_name"]').eq(0).attr('content');
var metatagAuthor = $('meta[name="author"]').attr('content');
var semanticAuthor = $('*[rel="author"]').eq(0).text();
var classAuthor = $('.author').eq(0).text();
// NOTE(review): this hunk appears to show both sides of the diff (2 additions, 1 deletion).
// The return directly below looks like the removed line; as written it makes the next
// return unreachable and leaves swifttypeAuthor unused — confirm against the real file.
return metatagAuthor || semanticAuthor || classAuthor;
// Presumably the replacement line: the first truthy candidate wins, swiftype value first.
return swifttypeAuthor || metatagAuthor || semanticAuthor || classAuthor;
}

module.exports = {
Expand Down
43 changes: 43 additions & 0 deletions lib/parser/image.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
var cheerio = require('cheerio');

/**
 * Picks the most prominent <img> inside the given body selection.
 *
 * Every image with a non-empty `src` is scored by how many ancestors separate it
 * from the reference element: the first <main> when one exists, otherwise the
 * body itself. Fewer ancestors means a higher score. Images that are not inside
 * the reference element are scored by their full ancestor chain.
 *
 * @param {object} body cheerio selection wrapping the document's <body>
 * @returns {string|undefined} `src` of the best-scored image, or undefined when
 *   the body contains no image with a `src`
 */
function getImageCandidate(body) {
  var main = body.find('main').eq(0);
  // A cheerio selection is truthy even when it is empty, so `main || body` would
  // never fall back to body; check the length instead.
  var reference = main.length > 0 ? main : body;
  var imgs = body.find('img');
  var best;

  imgs.each(function (index) {
    var img = imgs.eq(index);
    var src = img.attr('src');

    // An image without a source cannot be returned as a result; skip it.
    if (!src) {
      return;
    }

    // The 9 is only an offset: what matters is the relative order of the scores.
    var score = 9 - img.parentsUntil(reference).get().length;

    // Strict comparison keeps the first image in document order on ties.
    if (best === undefined || score > best.score) {
      best = { score: score, src: src };
    }
  });

  // No usable image: return undefined instead of throwing on an empty list.
  return best ? best.src : undefined;
}

/**
 * Resolves the article image from two sources, in order of preference:
 * the scored candidate computed by getImageCandidate() over the <body>, then
 * the content of the first `<meta class="swiftype" name="image">` tag.
 *
 * @param html
 * @returns {string}
 */
function getImage(html) {
  var $ = cheerio.load(html);

  // Read the meta fallback first, then compute the scored candidate.
  var swiftypeMeta = $('meta[class="swiftype"]').filter('[name="image"]').eq(0);
  var metaImage = swiftypeMeta.attr('content');
  var scoredImage = getImageCandidate($('body'));

  if (scoredImage) {
    return scoredImage;
  }
  return metaImage;
}

module.exports = {
getImage: getImage
};
20 changes: 20 additions & 0 deletions lib/parser/keywords.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
var cheerio = require('cheerio');

/**
 * Tries to get the keywords from two sources, in order of preference:
 * every `<meta class="swiftype" name="tags">` tag (contents joined with commas),
 * then the first `<meta name="keywords">` tag.
 *
 * @param html
 * @returns {string|undefined} comma-separated keywords, or undefined when
 *   neither source provides any
 */
function getKeywords(html) {
  var $ = cheerio.load(html);

  var swifttypeTags = $('meta[class="swiftype"]')
    .filter('[name="tags"]')
    .get()
    .map(tag => $(tag).attr('content'))
    // join() renders undefined as '', so a tag without content would produce
    // stray commas (e.g. ",") and mask the <meta name="keywords"> fallback.
    .filter(content => Boolean(content && content.trim()))
    .join();
  var metatagKeywords = $('meta[name="keywords"]').eq(0).attr('content');
  return swifttypeTags || metatagKeywords;
}

module.exports = {
getKeywords: getKeywords
};
Loading