diff --git a/.gitignore b/.gitignore
index 902b281..ff227fe 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,2 @@
node_modules/
-data/
+data/
\ No newline at end of file
diff --git a/README.md b/README.md
index fcff3a3..b21594d 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,16 @@ The result looks like this:
"author": "Thomas Tuts",
"title": "Article Extractor Demo",
"summary": "A Node.js module to retrieve article content and metadata from a URL.",
- "content": "
This is the article content.
"
+ "content": "This is the article content.
",
+ "keywords": "article,extractor,demo,node,module,metadata",
+ "image": "http://ep.yimg.com/ca/I/paulgraham_2271_3232"
}
```
+
+## Change log
+
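+### 1.1.0
+* Fixed the handling of `<a>` elements that have no `href` attribute when checking for the '#' href marker and tag links
+* Extended the result with keywords taken from the meta tags `<meta class="swiftype" name="tags">` and `<meta name="keywords">`
+* Extended the result with an image chosen by a scored rank (how close each `<img>` is to the `<main>` or `<body>` element) or taken from `<meta class="swiftype" name="image">`
+* Added an option to use `<meta class="swiftype" name="blogger_name">` as a source for the author name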
diff --git a/index.js b/index.js
index 9872a24..925af10 100644
--- a/index.js
+++ b/index.js
@@ -5,6 +5,8 @@ var author = require('./lib/parser/author');
var content = require('./lib/parser/content');
var title = require('./lib/parser/title');
var summary = require('./lib/parser/summary');
+var keywords = require('./lib/parser/keywords');
+var image = require('./lib/parser/image');
module.exports = {
extractData: function (articleUrl, callback) {
@@ -17,6 +19,8 @@ module.exports = {
data.title = title.getTitle(preppedHtml);
data.content = content.getArticleContent(preppedHtml, data.host);
data.summary = summary.getSummary(preppedHtml, data.content);
+ data.keywords = keywords.getKeywords(preppedHtml);
+ data.image = image.getImage(preppedHtml);
callback(null, data);
});
diff --git a/lib/cleaner/remove-navigational-elements.js b/lib/cleaner/remove-navigational-elements.js
index 7985ebf..e7b8e41 100644
--- a/lib/cleaner/remove-navigational-elements.js
+++ b/lib/cleaner/remove-navigational-elements.js
@@ -12,7 +12,7 @@ module.exports = function (rawHtml, host) {
// Filter out 'back to top' links
$('a').filter(function () {
var hasTopInText = $(this).text().toLowerCase().indexOf('top') > -1;
- var hasHashInHref = $(this).attr('href').indexOf('#') > -1;
+ var hasHashInHref = $(this).attr('href') ? $(this).attr('href').indexOf('#') > -1 : false;
return hasTopInText && hasHashInHref;
}).remove();
@@ -23,7 +23,7 @@ module.exports = function (rawHtml, host) {
var isRelTag = relTag === 'tag';
var isPartOfList = $(this).parents('ul').length > 0;
- var containsUrlWithTag = href.indexOf(host) > -1 && href.indexOf('tag') > -1;
+ var containsUrlWithTag = href ? href.indexOf(host) > -1 && href.indexOf('tag') > -1 : false;
if (isRelTag || containsUrlWithTag) {
if (isPartOfList) {
diff --git a/lib/parser/author.js b/lib/parser/author.js
index 1c4dc51..8585444 100644
--- a/lib/parser/author.js
+++ b/lib/parser/author.js
@@ -10,10 +10,11 @@ var cheerio = require('cheerio');
function getAuthor(html) {
var $ = cheerio.load(html);
+ var swifttypeAuthor = $('meta[class="swiftype"]').filter('[name="blogger_name"]').eq(0).attr('content');
var metatagAuthor = $('meta[name="author"]').attr('content');
var semanticAuthor = $('*[rel="author"]').eq(0).text();
var classAuthor = $('.author').eq(0).text();
- return metatagAuthor || semanticAuthor || classAuthor;
+ return swifttypeAuthor || metatagAuthor || semanticAuthor || classAuthor;
}
module.exports = {
diff --git a/lib/parser/image.js b/lib/parser/image.js
new file mode 100644
index 0000000..c10d8ab
--- /dev/null
+++ b/lib/parser/image.js
@@ -0,0 +1,43 @@
+var cheerio = require('cheerio');
+
+/**
+ * Picks the `<img>` nested least deeply below `<main>` (or below `<body>` when there is no `<main>`).
+ * Score is 9 minus the number of ancestors between the image and that root; the highest score wins.
+ *
+ * @param {object} body cheerio selection wrapping the document `<body>`
+ * @returns {string|undefined} `src` of the best image, or undefined when no image has a `src`
+ */
+function getImageCandidate(body) {
+  var main = body.find('main').eq(0);
+  // A cheerio selection is always truthy, so test its length to detect a missing <main>.
+  var root = main.length ? main : body;
+  var best;
+
+  body.find('img').each((index, element) => {
+    var img = cheerio(element);
+    var candidate = { score: 9 - img.parentsUntil(root).length, src: img.attr('src') };
+    // Strict `>` keeps the first image in document order on ties; images without `src` are skipped.
+    if (candidate.src && (!best || candidate.score > best.score)) best = candidate;
+  });
+  return best ? best.src : undefined; // undefined (not a TypeError) when the page has no usable image
+}
+
+/**
+ * Resolves the article image: the candidate returned by `getImageCandidate` for `<body>` is preferred,
+ * otherwise the `content` of the first `<meta class="swiftype" name="image">` tag is returned.
+ *
+ * @param html markup of the page, as accepted by `cheerio.load`
+ * @returns {string} image URL (undefined when neither source yields a value)
+ */
+function getImage(html) {
+  var $ = cheerio.load(html);
+  var bodyImage = getImageCandidate($('body'));
+  if (bodyImage) {
+    return bodyImage;
+  }
+  return $('meta[class="swiftype"]').filter('[name="image"]').eq(0).attr('content');
+}
+
+module.exports = {
+ getImage: getImage
+};
\ No newline at end of file
diff --git a/lib/parser/keywords.js b/lib/parser/keywords.js
new file mode 100644
index 0000000..37e768b
--- /dev/null
+++ b/lib/parser/keywords.js
@@ -0,0 +1,20 @@
+var cheerio = require('cheerio');
+
+/**
+ * Collects the article keywords. Every `<meta class="swiftype" name="tags">` tag contributes its `content`,
+ * comma-joined; when that yields an empty string the first `<meta name="keywords">` `content` is returned.
+ *
+ * @param html markup of the page, as accepted by `cheerio.load`
+ * @returns {string} comma-separated keywords
+ */
+function getKeywords(html) {
+  var $ = cheerio.load(html);
+  var tagValues = [];
+  $('meta[class="swiftype"]').filter('[name="tags"]').each((index, tag) => { tagValues.push($(tag).attr('content')); });
+  var joinedTags = tagValues.join();
+  return joinedTags ? joinedTags : $('meta[name="keywords"]').eq(0).attr('content');
+}
+
+ module.exports = {
+ getKeywords: getKeywords
+ };
\ No newline at end of file
diff --git a/package-lock.json b/package-lock.json
new file mode 100644
index 0000000..18259e5
--- /dev/null
+++ b/package-lock.json
@@ -0,0 +1,828 @@
+{
+ "name": "article-extractor",
+ "version": "1.0.2",
+ "lockfileVersion": 1,
+ "requires": true,
+ "dependencies": {
+ "ajv": {
+ "version": "5.5.2",
+ "resolved": "https://registry.npmjs.org/ajv/-/ajv-5.5.2.tgz",
+ "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=",
+ "requires": {
+ "co": "^4.6.0",
+ "fast-deep-equal": "^1.0.0",
+ "fast-json-stable-stringify": "^2.0.0",
+ "json-schema-traverse": "^0.3.0"
+ }
+ },
+ "asn1": {
+ "version": "0.2.4",
+ "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz",
+ "integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==",
+ "requires": {
+ "safer-buffer": "~2.1.0"
+ }
+ },
+ "assert-plus": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
+ "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU="
+ },
+ "async": {
+ "version": "0.9.2",
+ "resolved": "http://registry.npmjs.org/async/-/async-0.9.2.tgz",
+ "integrity": "sha1-rqdNXmHB+JlhO/ZL2mbUx48v0X0="
+ },
+ "asynckit": {
+ "version": "0.4.0",
+ "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
+ "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k="
+ },
+ "aws-sign2": {
+ "version": "0.7.0",
+ "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz",
+ "integrity": "sha1-tG6JCTSpWR8tL2+G1+ap8bP+dqg="
+ },
+ "aws4": {
+ "version": "1.8.0",
+ "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.8.0.tgz",
+ "integrity": "sha512-ReZxvNHIOv88FlT7rxcXIIC0fPt4KZqZbOlivyWtXLt8ESx84zd3kMC6iK5jVeS2qt+g7ftS7ye4fi06X5rtRQ=="
+ },
+ "backoff": {
+ "version": "2.5.0",
+ "resolved": "https://registry.npmjs.org/backoff/-/backoff-2.5.0.tgz",
+ "integrity": "sha1-9hbtqdPktmuMp/ynn2lXIsX44m8=",
+ "dev": true,
+ "requires": {
+ "precond": "0.2"
+ }
+ },
+ "balanced-match": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz",
+ "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=",
+ "dev": true,
+ "optional": true
+ },
+ "bcrypt-pbkdf": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz",
+ "integrity": "sha1-pDAdOJtqQ/m2f/PKEaP2Y342Dp4=",
+ "requires": {
+ "tweetnacl": "^0.14.3"
+ }
+ },
+ "boolbase": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
+ "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
+ },
+ "brace-expansion": {
+ "version": "1.1.11",
+ "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
+ "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "balanced-match": "^1.0.0",
+ "concat-map": "0.0.1"
+ }
+ },
+ "bunyan": {
+ "version": "1.3.4",
+ "resolved": "https://registry.npmjs.org/bunyan/-/bunyan-1.3.4.tgz",
+ "integrity": "sha1-DvkW1JeRKJbKQIDg+HZL1AhzXIw=",
+ "dev": true,
+ "requires": {
+ "dtrace-provider": "~0.4",
+ "mv": "~2",
+ "safe-json-stringify": "~1"
+ }
+ },
+ "caseless": {
+ "version": "0.12.0",
+ "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz",
+ "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw="
+ },
+ "cheerio": {
+ "version": "0.19.0",
+ "resolved": "http://registry.npmjs.org/cheerio/-/cheerio-0.19.0.tgz",
+ "integrity": "sha1-dy5wFfLuKZZQltcepBdbdas1SSU=",
+ "requires": {
+ "css-select": "~1.0.0",
+ "dom-serializer": "~0.1.0",
+ "entities": "~1.1.1",
+ "htmlparser2": "~3.8.1",
+ "lodash": "^3.2.0"
+ }
+ },
+ "co": {
+ "version": "4.6.0",
+ "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
+ "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ="
+ },
+ "combined-stream": {
+ "version": "1.0.7",
+ "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.7.tgz",
+ "integrity": "sha512-brWl9y6vOB1xYPZcpZde3N9zDByXTosAeMDo4p1wzo6UMOX4vumB+TP1RZ76sfE6Md68Q0NJSrE/gbezd4Ul+w==",
+ "requires": {
+ "delayed-stream": "~1.0.0"
+ }
+ },
+ "concat-map": {
+ "version": "0.0.1",
+ "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
+ "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=",
+ "dev": true,
+ "optional": true
+ },
+ "core-util-is": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
+ "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac="
+ },
+ "css-select": {
+ "version": "1.0.0",
+ "resolved": "http://registry.npmjs.org/css-select/-/css-select-1.0.0.tgz",
+ "integrity": "sha1-sRIcpRhI3SZOIkTQWM7iVN7rRLA=",
+ "requires": {
+ "boolbase": "~1.0.0",
+ "css-what": "1.0",
+ "domutils": "1.4",
+ "nth-check": "~1.0.0"
+ }
+ },
+ "css-what": {
+ "version": "1.0.0",
+ "resolved": "http://registry.npmjs.org/css-what/-/css-what-1.0.0.tgz",
+ "integrity": "sha1-18wt9FGAZm+Z0rFEYmOUaeAPc2w="
+ },
+ "csv": {
+ "version": "0.4.6",
+ "resolved": "http://registry.npmjs.org/csv/-/csv-0.4.6.tgz",
+ "integrity": "sha1-jbrn3f26rmLB6ph8Pg+Kmsc3tz0=",
+ "dev": true,
+ "requires": {
+ "csv-generate": "^0.0.6",
+ "csv-parse": "^1.0.0",
+ "csv-stringify": "^0.0.8",
+ "stream-transform": "^0.1.0"
+ }
+ },
+ "csv-generate": {
+ "version": "0.0.6",
+ "resolved": "http://registry.npmjs.org/csv-generate/-/csv-generate-0.0.6.tgz",
+ "integrity": "sha1-l+TmOuRrIZEs2UdbwxRp0m9a3mY=",
+ "dev": true
+ },
+ "csv-parse": {
+ "version": "1.3.3",
+ "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-1.3.3.tgz",
+ "integrity": "sha1-0c/YdDwvhJoKuy/VRNtWaV0ZpJA=",
+ "dev": true
+ },
+ "csv-stringify": {
+ "version": "0.0.8",
+ "resolved": "http://registry.npmjs.org/csv-stringify/-/csv-stringify-0.0.8.tgz",
+ "integrity": "sha1-Usw7PfwZd1jFWtMlqVvoUHH55Rs=",
+ "dev": true
+ },
+ "ctype": {
+ "version": "0.5.3",
+ "resolved": "http://registry.npmjs.org/ctype/-/ctype-0.5.3.tgz",
+ "integrity": "sha1-gsGMJGH3QRTvFsE1IkrQuRRMoS8=",
+ "dev": true
+ },
+ "dashdash": {
+ "version": "1.14.1",
+ "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz",
+ "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=",
+ "requires": {
+ "assert-plus": "^1.0.0"
+ }
+ },
+ "deep-equal": {
+ "version": "1.0.1",
+ "resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz",
+ "integrity": "sha1-9dJgKStmDghO/0zbyfCK0yR0SLU=",
+ "dev": true
+ },
+ "delayed-stream": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
+ "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk="
+ },
+ "dom-serializer": {
+ "version": "0.1.0",
+ "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz",
+ "integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=",
+ "requires": {
+ "domelementtype": "~1.1.1",
+ "entities": "~1.1.1"
+ },
+ "dependencies": {
+ "domelementtype": {
+ "version": "1.1.3",
+ "resolved": "http://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz",
+ "integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs="
+ }
+ }
+ },
+ "domelementtype": {
+ "version": "1.2.1",
+ "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.2.1.tgz",
+ "integrity": "sha512-SQVCLFS2E7G5CRCMdn6K9bIhRj1bS6QBWZfF0TUPh4V/BbqrQ619IdSS3/izn0FZ+9l+uODzaZjb08fjOfablA=="
+ },
+ "domhandler": {
+ "version": "2.3.0",
+ "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.3.0.tgz",
+ "integrity": "sha1-LeWaCCLVAn+r/28DLCsloqir5zg=",
+ "requires": {
+ "domelementtype": "1"
+ }
+ },
+ "domutils": {
+ "version": "1.4.3",
+ "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.4.3.tgz",
+ "integrity": "sha1-CGVRN5bGswYDGFDhdVFrr4C3Km8=",
+ "requires": {
+ "domelementtype": "1"
+ }
+ },
+ "dtrace-provider": {
+ "version": "0.4.0",
+ "resolved": "https://registry.npmjs.org/dtrace-provider/-/dtrace-provider-0.4.0.tgz",
+ "integrity": "sha1-C2e8HMd+eb+IuHrSBmT0p1POPyY=",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "nan": "~1.5.1"
+ }
+ },
+ "ecc-jsbn": {
+ "version": "0.1.2",
+ "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz",
+ "integrity": "sha1-OoOpBOVDUyh4dMVkt1SThoSamMk=",
+ "requires": {
+ "jsbn": "~0.1.0",
+ "safer-buffer": "^2.1.0"
+ }
+ },
+ "entities": {
+ "version": "1.1.2",
+ "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz",
+ "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w=="
+ },
+ "escape-regexp-component": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/escape-regexp-component/-/escape-regexp-component-1.0.2.tgz",
+ "integrity": "sha1-nGO20LJf8qiMOtvRjFthrMO5+qI=",
+ "dev": true
+ },
+ "extend": {
+ "version": "3.0.2",
+ "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
+ "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="
+ },
+ "extsprintf": {
+ "version": "1.3.0",
+ "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz",
+ "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU="
+ },
+ "fast-deep-equal": {
+ "version": "1.1.0",
+ "resolved": "http://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz",
+ "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ="
+ },
+ "fast-json-stable-stringify": {
+ "version": "2.0.0",
+ "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.0.0.tgz",
+ "integrity": "sha1-1RQsDK7msRifh9OnYREGT4bIu/I="
+ },
+ "forever-agent": {
+ "version": "0.6.1",
+ "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz",
+ "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE="
+ },
+ "form-data": {
+ "version": "2.3.3",
+ "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
+ "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
+ "requires": {
+ "asynckit": "^0.4.0",
+ "combined-stream": "^1.0.6",
+ "mime-types": "^2.1.12"
+ }
+ },
+ "formidable": {
+ "version": "1.2.1",
+ "resolved": "https://registry.npmjs.org/formidable/-/formidable-1.2.1.tgz",
+ "integrity": "sha512-Fs9VRguL0gqGHkXS5GQiMCr1VhZBxz0JnJs4JmMp/2jL18Fmbzvv7vOFRU+U8TBkHEE/CX1qDXzJplVULgsLeg==",
+ "dev": true
+ },
+ "getpass": {
+ "version": "0.1.7",
+ "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz",
+ "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=",
+ "requires": {
+ "assert-plus": "^1.0.0"
+ }
+ },
+ "glob": {
+ "version": "6.0.4",
+ "resolved": "https://registry.npmjs.org/glob/-/glob-6.0.4.tgz",
+ "integrity": "sha1-DwiGD2oVUSey+t1PnOJLGqtuTSI=",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "inflight": "^1.0.4",
+ "inherits": "2",
+ "minimatch": "2 || 3",
+ "once": "^1.3.0",
+ "path-is-absolute": "^1.0.0"
+ }
+ },
+ "har-schema": {
+ "version": "2.0.0",
+ "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz",
+ "integrity": "sha1-qUwiJOvKwEeCoNkDVSHyRzW37JI="
+ },
+ "har-validator": {
+ "version": "5.1.0",
+ "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.0.tgz",
+ "integrity": "sha512-+qnmNjI4OfH2ipQ9VQOw23bBd/ibtfbVdK2fYbY4acTDqKTW/YDp9McimZdDbG8iV9fZizUqQMD5xvriB146TA==",
+ "requires": {
+ "ajv": "^5.3.0",
+ "har-schema": "^2.0.0"
+ }
+ },
+ "htmlparser2": {
+ "version": "3.8.3",
+ "resolved": "http://registry.npmjs.org/htmlparser2/-/htmlparser2-3.8.3.tgz",
+ "integrity": "sha1-mWwosZFRaovoZQGn15dX5ccMEGg=",
+ "requires": {
+ "domelementtype": "1",
+ "domhandler": "2.3",
+ "domutils": "1.5",
+ "entities": "1.0",
+ "readable-stream": "1.1"
+ },
+ "dependencies": {
+ "domutils": {
+ "version": "1.5.1",
+ "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz",
+ "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=",
+ "requires": {
+ "dom-serializer": "0",
+ "domelementtype": "1"
+ }
+ },
+ "entities": {
+ "version": "1.0.0",
+ "resolved": "http://registry.npmjs.org/entities/-/entities-1.0.0.tgz",
+ "integrity": "sha1-sph6o4ITR/zeZCsk/fyeT7cSvyY="
+ }
+ }
+ },
+ "http-signature": {
+ "version": "1.2.0",
+ "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz",
+ "integrity": "sha1-muzZJRFHcvPZW2WmCruPfBj7rOE=",
+ "requires": {
+ "assert-plus": "^1.0.0",
+ "jsprim": "^1.2.2",
+ "sshpk": "^1.7.0"
+ }
+ },
+ "inflight": {
+ "version": "1.0.6",
+ "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
+ "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "once": "^1.3.0",
+ "wrappy": "1"
+ }
+ },
+ "inherits": {
+ "version": "2.0.3",
+ "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz",
+ "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4="
+ },
+ "is-typedarray": {
+ "version": "1.0.0",
+ "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz",
+ "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo="
+ },
+ "isarray": {
+ "version": "0.0.1",
+ "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz",
+ "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8="
+ },
+ "isstream": {
+ "version": "0.1.2",
+ "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
+ "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo="
+ },
+ "jsbn": {
+ "version": "0.1.1",
+ "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
+ "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM="
+ },
+ "json-schema": {
+ "version": "0.2.3",
+ "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz",
+ "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM="
+ },
+ "json-schema-traverse": {
+ "version": "0.3.1",
+ "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.3.1.tgz",
+ "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A="
+ },
+ "json-stringify-safe": {
+ "version": "5.0.1",
+ "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz",
+ "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus="
+ },
+ "jsprim": {
+ "version": "1.4.1",
+ "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz",
+ "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=",
+ "requires": {
+ "assert-plus": "1.0.0",
+ "extsprintf": "1.3.0",
+ "json-schema": "0.2.3",
+ "verror": "1.10.0"
+ }
+ },
+ "keep-alive-agent": {
+ "version": "0.0.1",
+ "resolved": "https://registry.npmjs.org/keep-alive-agent/-/keep-alive-agent-0.0.1.tgz",
+ "integrity": "sha1-RIR8o5TOjWtSGuhYFr1kUJlCs4U=",
+ "dev": true
+ },
+ "lodash": {
+ "version": "3.10.1",
+ "resolved": "http://registry.npmjs.org/lodash/-/lodash-3.10.1.tgz",
+ "integrity": "sha1-W/Rejkm6QYnhfUgnid/RW9FAt7Y="
+ },
+ "lru-cache": {
+ "version": "2.7.3",
+ "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-2.7.3.tgz",
+ "integrity": "sha1-bUUk6LlV+V1PW1iFHOId1y+06VI=",
+ "dev": true
+ },
+ "mime": {
+ "version": "1.6.0",
+ "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz",
+ "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==",
+ "dev": true
+ },
+ "mime-db": {
+ "version": "1.37.0",
+ "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz",
+ "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg=="
+ },
+ "mime-types": {
+ "version": "2.1.21",
+ "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz",
+ "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==",
+ "requires": {
+ "mime-db": "~1.37.0"
+ }
+ },
+ "minimatch": {
+ "version": "3.0.4",
+ "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz",
+ "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "brace-expansion": "^1.1.7"
+ }
+ },
+ "minimist": {
+ "version": "0.0.8",
+ "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz",
+ "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=",
+ "dev": true,
+ "optional": true
+ },
+ "mkdirp": {
+ "version": "0.5.1",
+ "resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz",
+ "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "minimist": "0.0.8"
+ }
+ },
+ "mv": {
+ "version": "2.1.1",
+ "resolved": "https://registry.npmjs.org/mv/-/mv-2.1.1.tgz",
+ "integrity": "sha1-rmzg1vbV4KT32JN5jQPB6pVZtqI=",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "mkdirp": "~0.5.1",
+ "ncp": "~2.0.0",
+ "rimraf": "~2.4.0"
+ }
+ },
+ "nan": {
+ "version": "1.5.3",
+ "resolved": "http://registry.npmjs.org/nan/-/nan-1.5.3.tgz",
+ "integrity": "sha1-TNDswTO3sHAKSSpkat1CeuijGOs=",
+ "dev": true,
+ "optional": true
+ },
+ "ncp": {
+ "version": "2.0.0",
+ "resolved": "http://registry.npmjs.org/ncp/-/ncp-2.0.0.tgz",
+ "integrity": "sha1-GVoh1sRuNh0vsSgbo4uR6d9727M=",
+ "dev": true,
+ "optional": true
+ },
+ "negotiator": {
+ "version": "0.5.3",
+ "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.5.3.tgz",
+ "integrity": "sha1-Jp1cR2gQ7JLtvntsLygxY4T5p+g=",
+ "dev": true
+ },
+ "nth-check": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz",
+ "integrity": "sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==",
+ "requires": {
+ "boolbase": "~1.0.0"
+ }
+ },
+ "oauth-sign": {
+ "version": "0.9.0",
+ "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz",
+ "integrity": "sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ=="
+ },
+ "once": {
+ "version": "1.4.0",
+ "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
+ "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=",
+ "dev": true,
+ "requires": {
+ "wrappy": "1"
+ }
+ },
+ "path-is-absolute": {
+ "version": "1.0.1",
+ "resolved": "http://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
+ "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=",
+ "dev": true,
+ "optional": true
+ },
+ "performance-now": {
+ "version": "2.1.0",
+ "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
+ "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns="
+ },
+ "precond": {
+ "version": "0.2.3",
+ "resolved": "https://registry.npmjs.org/precond/-/precond-0.2.3.tgz",
+ "integrity": "sha1-qpWRvKokkj8eD0hJ0kD0fvwQdaw=",
+ "dev": true
+ },
+ "psl": {
+ "version": "1.1.29",
+ "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.29.tgz",
+ "integrity": "sha512-AeUmQ0oLN02flVHXWh9sSJF7mcdFq0ppid/JkErufc3hGIV/AMa8Fo9VgDo/cT2jFdOWoFvHp90qqBH54W+gjQ=="
+ },
+ "punycode": {
+ "version": "1.4.1",
+ "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz",
+ "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4="
+ },
+ "qs": {
+ "version": "6.5.2",
+ "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
+ "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
+ },
+ "readable-stream": {
+ "version": "1.1.14",
+ "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.1.14.tgz",
+ "integrity": "sha1-fPTFTvZI44EwhMY23SB54WbAgdk=",
+ "requires": {
+ "core-util-is": "~1.0.0",
+ "inherits": "~2.0.1",
+ "isarray": "0.0.1",
+ "string_decoder": "~0.10.x"
+ }
+ },
+ "request": {
+ "version": "2.88.0",
+ "resolved": "https://registry.npmjs.org/request/-/request-2.88.0.tgz",
+ "integrity": "sha512-NAqBSrijGLZdM0WZNsInLJpkJokL72XYjUpnB0iwsRgxh7dB6COrHnTBNwN0E+lHDAJzu7kLAkDeY08z2/A0hg==",
+ "requires": {
+ "aws-sign2": "~0.7.0",
+ "aws4": "^1.8.0",
+ "caseless": "~0.12.0",
+ "combined-stream": "~1.0.6",
+ "extend": "~3.0.2",
+ "forever-agent": "~0.6.1",
+ "form-data": "~2.3.2",
+ "har-validator": "~5.1.0",
+ "http-signature": "~1.2.0",
+ "is-typedarray": "~1.0.0",
+ "isstream": "~0.1.2",
+ "json-stringify-safe": "~5.0.1",
+ "mime-types": "~2.1.19",
+ "oauth-sign": "~0.9.0",
+ "performance-now": "^2.1.0",
+ "qs": "~6.5.2",
+ "safe-buffer": "^5.1.2",
+ "tough-cookie": "~2.4.3",
+ "tunnel-agent": "^0.6.0",
+ "uuid": "^3.3.2"
+ }
+ },
+ "restify": {
+ "version": "3.0.3",
+ "resolved": "http://registry.npmjs.org/restify/-/restify-3.0.3.tgz",
+ "integrity": "sha1-dpOYIemR05ULcwXwJA6J9rqTfKU=",
+ "dev": true,
+ "requires": {
+ "assert-plus": "^0.1.5",
+ "backoff": "^2.4.0",
+ "bunyan": "1.3.4",
+ "csv": "^0.4.0",
+ "deep-equal": "^1.0.0",
+ "dtrace-provider": "^0.4.0",
+ "escape-regexp-component": "^1.0.2",
+ "formidable": "^1.0.14",
+ "http-signature": "^0.10.0",
+ "keep-alive-agent": "^0.0.1",
+ "lru-cache": "^2.5.0",
+ "mime": "^1.2.11",
+ "negotiator": "^0.5.1",
+ "node-uuid": "^1.4.1",
+ "once": "^1.3.0",
+ "qs": "^2.4.1",
+ "semver": "^4.3.3",
+ "spdy": "^1.26.5",
+ "tunnel-agent": "^0.4.0",
+ "verror": "^1.4.0"
+ },
+ "dependencies": {
+ "asn1": {
+ "version": "0.1.11",
+ "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.1.11.tgz",
+ "integrity": "sha1-VZvhg3bQik7E2+gId9J4GGObLfc=",
+ "dev": true
+ },
+ "assert-plus": {
+ "version": "0.1.5",
+ "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-0.1.5.tgz",
+ "integrity": "sha1-7nQAlBMALYTOxyGcasgRgS5yMWA=",
+ "dev": true
+ },
+ "http-signature": {
+ "version": "0.10.1",
+ "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-0.10.1.tgz",
+ "integrity": "sha1-T72sEyVZqoMjEh5UB3nAoBKyfmY=",
+ "dev": true,
+ "requires": {
+ "asn1": "0.1.11",
+ "assert-plus": "^0.1.5",
+ "ctype": "0.5.3"
+ }
+ },
+ "node-uuid": {
+ "version": "1.4.8",
+ "resolved": "https://registry.npmjs.org/node-uuid/-/node-uuid-1.4.8.tgz",
+ "integrity": "sha1-sEDrCSOWivq/jTL7HxfxFn/auQc=",
+ "dev": true
+ },
+ "qs": {
+ "version": "2.4.2",
+ "resolved": "https://registry.npmjs.org/qs/-/qs-2.4.2.tgz",
+ "integrity": "sha1-9854jld33wtQENp/fE5zujJHD1o=",
+ "dev": true
+ },
+ "tunnel-agent": {
+ "version": "0.4.3",
+ "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.4.3.tgz",
+ "integrity": "sha1-Y3PbdpCf5XDgjXNYM2Xtgop07us=",
+ "dev": true
+ }
+ }
+ },
+ "rimraf": {
+ "version": "2.4.5",
+ "resolved": "http://registry.npmjs.org/rimraf/-/rimraf-2.4.5.tgz",
+ "integrity": "sha1-7nEM5dk6j9uFb7Xqj/Di11k0sto=",
+ "dev": true,
+ "optional": true,
+ "requires": {
+ "glob": "^6.0.1"
+ }
+ },
+ "safe-buffer": {
+ "version": "5.1.2",
+ "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz",
+ "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g=="
+ },
+ "safe-json-stringify": {
+ "version": "1.2.0",
+ "resolved": "https://registry.npmjs.org/safe-json-stringify/-/safe-json-stringify-1.2.0.tgz",
+ "integrity": "sha512-gH8eh2nZudPQO6TytOvbxnuhYBOvDBBLW52tz5q6X58lJcd/tkmqFR+5Z9adS8aJtURSXWThWy/xJtJwixErvg==",
+ "dev": true,
+ "optional": true
+ },
+ "safer-buffer": {
+ "version": "2.1.2",
+ "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
+ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="
+ },
+ "semver": {
+ "version": "4.3.6",
+ "resolved": "http://registry.npmjs.org/semver/-/semver-4.3.6.tgz",
+ "integrity": "sha1-MAvG4OhjdPe6YQaLWx7NV/xlMto=",
+ "dev": true
+ },
+ "spdy": {
+ "version": "1.32.5",
+ "resolved": "http://registry.npmjs.org/spdy/-/spdy-1.32.5.tgz",
+ "integrity": "sha1-cO/yPN5Ol9UqRF9lr93MVpXrXts=",
+ "dev": true
+ },
+ "sshpk": {
+ "version": "1.15.2",
+ "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.2.tgz",
+ "integrity": "sha512-Ra/OXQtuh0/enyl4ETZAfTaeksa6BXks5ZcjpSUNrjBr0DvrJKX+1fsKDPpT9TBXgHAFsa4510aNVgI8g/+SzA==",
+ "requires": {
+ "asn1": "~0.2.3",
+ "assert-plus": "^1.0.0",
+ "bcrypt-pbkdf": "^1.0.0",
+ "dashdash": "^1.12.0",
+ "ecc-jsbn": "~0.1.1",
+ "getpass": "^0.1.1",
+ "jsbn": "~0.1.0",
+ "safer-buffer": "^2.0.2",
+ "tweetnacl": "~0.14.0"
+ }
+ },
+ "stream-transform": {
+ "version": "0.1.2",
+ "resolved": "https://registry.npmjs.org/stream-transform/-/stream-transform-0.1.2.tgz",
+ "integrity": "sha1-fY5rTgOsR4F3j4x5UXUBv7B2Kp8=",
+ "dev": true
+ },
+ "string_decoder": {
+ "version": "0.10.31",
+ "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
+ "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ="
+ },
+ "tough-cookie": {
+ "version": "2.4.3",
+ "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.4.3.tgz",
+ "integrity": "sha512-Q5srk/4vDM54WJsJio3XNn6K2sCG+CQ8G5Wz6bZhRZoAe/+TxjWB/GlFAnYEbkYVlON9FMk/fE3h2RLpPXo4lQ==",
+ "requires": {
+ "psl": "^1.1.24",
+ "punycode": "^1.4.1"
+ }
+ },
+ "tunnel-agent": {
+ "version": "0.6.0",
+ "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
+ "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=",
+ "requires": {
+ "safe-buffer": "^5.0.1"
+ }
+ },
+ "tweetnacl": {
+ "version": "0.14.5",
+ "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz",
+ "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q="
+ },
+ "uuid": {
+ "version": "3.3.2",
+ "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.3.2.tgz",
+ "integrity": "sha512-yXJmeNaw3DnnKAOKJE51sL/ZaYfWJRl1pK9dr19YFCu0ObS231AB1/LbqTKRAQ5kw8A90rA6fr4riOUpTZvQZA=="
+ },
+ "verror": {
+ "version": "1.10.0",
+ "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz",
+ "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=",
+ "requires": {
+ "assert-plus": "^1.0.0",
+ "core-util-is": "1.0.2",
+ "extsprintf": "^1.2.0"
+ }
+ },
+ "wrappy": {
+ "version": "1.0.2",
+ "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
+ "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=",
+ "dev": true
+ }
+ }
+}
diff --git a/package.json b/package.json
index 3cdde00..eed3b3e 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "article-extractor",
- "version": "1.0.2",
+ "version": "1.1.0",
"description": "Extract metadata and content from web articles.",
"main": "index.js",
"keywords": [
diff --git a/test.js b/test.js
index f9f8dfb..3e55cf6 100644
--- a/test.js
+++ b/test.js
@@ -4,6 +4,14 @@ var path = require('path');
var async = require('async');
var articlesToParse = [
+ {
+ url: 'https://www.engadget.com/2018/11/08/samsung-foldable-phone-infinity-flex/',
+ filename: 'engadget'
+ },
+ {
+ url: 'https://www.engadget.com/2018/11/08/youtube-available-for-switch/',
+ filename: 'engadget2'
+ },
{
url: 'http://gizmodo.com/watch-a-single-day-on-the-london-tube-in-two-minutes-1692810056',
filename: 'gizmodo'
@@ -43,7 +51,12 @@ async.each(articlesToParse, function (articleToParse, parseCallback) {
console.log('Parsed article:', data.title);
console.log(data.summary);
console.log('-----');
- fs.writeFileSync(path.join(process.cwd(), 'data/' + articleToParse.filename + '.html'), data.content);
+
+ const rootPath = path.join(path.join(process.cwd(), 'data'));
+ if (!fs.existsSync(rootPath)) fs.mkdirSync(rootPath);
+
+ fs.writeFileSync(path.join(rootPath, '/' + articleToParse.filename + '.js'), JSON.stringify(data));
+ fs.writeFileSync(path.join(rootPath, '/' + articleToParse.filename + '.html'), data.content);
parseCallback();
});
}, function () {