diff --git a/.gitignore b/.gitignore index 902b281..ff227fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ node_modules/ -data/ +data/ \ No newline at end of file diff --git a/README.md b/README.md index fcff3a3..b21594d 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,16 @@ The result looks like this: "author": "Thomas Tuts", "title": "Article Extractor Demo", "summary": "A Node.js module to retrieve article content and metadata from a URL.", - "content": "

This is the article content.

" + "content": "

This is the article content.

", + "keywords": "article,extractor,demo,node,module,metadata", + "image": "http://ep.yimg.com/ca/I/paulgraham_2271_3232" } ``` + +## Change log + +#1.1.0 +* Corrected a logic related to parse and extract information abou href presence in `` elements and it's '#' href marker +* Improved structure with keywords based on meta tags `` and `` +* Improved structure with image based on scored rank (`how far images are related to
or element`) or `` +* Added a option to use `` as a source for author name diff --git a/index.js b/index.js index 9872a24..925af10 100644 --- a/index.js +++ b/index.js @@ -5,6 +5,8 @@ var author = require('./lib/parser/author'); var content = require('./lib/parser/content'); var title = require('./lib/parser/title'); var summary = require('./lib/parser/summary'); +var keywords = require('./lib/parser/keywords'); +var image = require('./lib/parser/image'); module.exports = { extractData: function (articleUrl, callback) { @@ -17,6 +19,8 @@ module.exports = { data.title = title.getTitle(preppedHtml); data.content = content.getArticleContent(preppedHtml, data.host); data.summary = summary.getSummary(preppedHtml, data.content); + data.keywords = keywords.getKeywords(preppedHtml); + data.image = image.getImage(preppedHtml); callback(null, data); }); diff --git a/lib/cleaner/remove-navigational-elements.js b/lib/cleaner/remove-navigational-elements.js index 7985ebf..e7b8e41 100644 --- a/lib/cleaner/remove-navigational-elements.js +++ b/lib/cleaner/remove-navigational-elements.js @@ -12,7 +12,7 @@ module.exports = function (rawHtml, host) { // Filter out 'back to top' links $('a').filter(function () { var hasTopInText = $(this).text().toLowerCase().indexOf('top') > -1; - var hasHashInHref = $(this).attr('href').indexOf('#') > -1; + var hasHashInHref = $(this).attr('href') ? $(this).attr('href').indexOf('#') > -1 : false; return hasTopInText && hasHashInHref; }).remove(); @@ -23,7 +23,7 @@ module.exports = function (rawHtml, host) { var isRelTag = relTag === 'tag'; var isPartOfList = $(this).parents('ul').length > 0; - var containsUrlWithTag = href.indexOf(host) > -1 && href.indexOf('tag') > -1; + var containsUrlWithTag = href ? href.indexOf(host) > -1 && href.indexOf('tag') > -1 : false; if (isRelTag || containsUrlWithTag) { if (isPartOfList) { diff --git a/lib/parser/author.js b/lib/parser/author.js index 1c4dc51..8585444 100644 --- a/lib/parser/author.js +++ b/lib/parser/author.js @@ -10,10 +10,11 @@ var cheerio = require('cheerio'); function getAuthor(html) { var $ = cheerio.load(html); + var swifttypeAuthor = $('meta[class="swiftype"]').filter('[name="blogger_name"]').eq(0).attr('content'); var metatagAuthor = $('meta[name="author"]').attr('content'); var semanticAuthor = $('*[rel="author"]').eq(0).text(); var classAuthor = $('.author').eq(0).text(); - return metatagAuthor || semanticAuthor || classAuthor; + return swifttypeAuthor || metatagAuthor || semanticAuthor || classAuthor; } module.exports = { diff --git a/lib/parser/image.js b/lib/parser/image.js new file mode 100644 index 0000000..c10d8ab --- /dev/null +++ b/lib/parser/image.js @@ -0,0 +1,43 @@ +var cheerio = require('cheerio'); + +/** + * Fetch and
elements from DOM and, for each , try to score its distance from + * parent body / main. Highest score is used to retrieve image. + * + * @param {object} body + */ +function getImageCandidate(body) { + var scoredImages = []; + var main = body.find('main').eq(0); + + // fetching all other images from body and calculating distance to main or body section + var imgs = body.find('img'); + imgs.each(index => { + var img = cheerio(imgs.get(index)); + scoredImages.push({ score: 9 - img.parentsUntil(main || body).get().length, src: img.attr('src') }) + }) + + // sorting + scoredImages = scoredImages.sort((a, b) => a.score > b.score ? -1 : 1); + return scoredImages[0].src; +} + +/** + * Tries to get the image from three sources: a scored rank based on how far the images are from
or elements, + * or uses contents from the `` tag. + * + * @param html + * @returns {string} + */ +function getImage(html) { + var $ = cheerio.load(html); + + var swifttypeImage = $('meta[class="swiftype"]').filter('[name="image"]').eq(0).attr('content'); + var imageCandidate = getImageCandidate($('body')); + + return imageCandidate || swifttypeImage; +} + +module.exports = { + getImage: getImage +}; \ No newline at end of file diff --git a/lib/parser/keywords.js b/lib/parser/keywords.js new file mode 100644 index 0000000..37e768b --- /dev/null +++ b/lib/parser/keywords.js @@ -0,0 +1,20 @@ +var cheerio = require('cheerio'); + +/** + * Tries to get the keywords from two sources: the `` tag, + * or from tag. + * + * @param html + * @returns {string} + */ +function getKeywords(html) { + var $ = cheerio.load(html); + + var swifttypeTags = $('meta[class="swiftype"]').filter('[name="tags"]').get().map(tag => $(tag).attr('content')).join(); + var metatagKeywords = $('meta[name="keywords"]').eq(0).attr('content'); + return swifttypeTags || metatagKeywords; + } + + module.exports = { + getKeywords: getKeywords + }; \ No newline at end of file diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..18259e5 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,828 @@ +{ + "name": "article-extractor", + "version": "1.0.2", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "ajv": { + "version": "5.5.2", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-5.5.2.tgz", + "integrity": "sha1-c7Xuyj+rZT49P5Qis0GtQiBdyWU=", + "requires": { + "co": "^4.6.0", + "fast-deep-equal": "^1.0.0", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.3.0" + } + }, + "asn1": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz", + "integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==", + "requires": { + "safer-buffer": "~2.1.0" + } + }, + "assert-plus": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz", + "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=" + }, + "async": { + "version": "0.9.2", + "resolved": "http://registry.npmjs.org/async/-/async-0.9.2.tgz", + "integrity": "sha1-rqdNXmHB+JlhO/ZL2mbUx48v0X0=" + }, + "asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k=" + }, + "aws-sign2": { + "version": "0.7.0", + "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz", + "integrity": "sha1-tG6JCTSpWR8tL2+G1+ap8bP+dqg=" + }, + "aws4": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.8.0.tgz", + "integrity": "sha512-ReZxvNHIOv88FlT7rxcXIIC0fPt4KZqZbOlivyWtXLt8ESx84zd3kMC6iK5jVeS2qt+g7ftS7ye4fi06X5rtRQ==" + }, + "backoff": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/backoff/-/backoff-2.5.0.tgz", + "integrity": "sha1-9hbtqdPktmuMp/ynn2lXIsX44m8=", + "dev": true, + "requires": { + "precond": "0.2" + } + }, + "balanced-match": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", + "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", + "dev": true, + "optional": true + }, + "bcrypt-pbkdf": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz", + "integrity": "sha1-pDAdOJtqQ/m2f/PKEaP2Y342Dp4=", + "requires": { + "tweetnacl": "^0.14.3" + } + }, + "boolbase": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz", + "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24=" + }, + "brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "dev": true, + "optional": true, + "requires": { + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" + } + }, + "bunyan": { + "version": "1.3.4", + "resolved": "https://registry.npmjs.org/bunyan/-/bunyan-1.3.4.tgz", + "integrity": "sha1-DvkW1JeRKJbKQIDg+HZL1AhzXIw=", + "dev": true, + "requires": { + "dtrace-provider": "~0.4", + "mv": "~2", + "safe-json-stringify": "~1" + } + }, + "caseless": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz", + "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=" + }, + "cheerio": { + "version": "0.19.0", + "resolved": "http://registry.npmjs.org/cheerio/-/cheerio-0.19.0.tgz", + "integrity": "sha1-dy5wFfLuKZZQltcepBdbdas1SSU=", + "requires": { + "css-select": "~1.0.0", + "dom-serializer": "~0.1.0", + "entities": "~1.1.1", + "htmlparser2": "~3.8.1", + "lodash": "^3.2.0" + } + }, + "co": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", + "integrity": "sha1-bqa989hTrlTMuOR7+gvz+QMfsYQ=" + }, + "combined-stream": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.7.tgz", + "integrity": "sha512-brWl9y6vOB1xYPZcpZde3N9zDByXTosAeMDo4p1wzo6UMOX4vumB+TP1RZ76sfE6Md68Q0NJSrE/gbezd4Ul+w==", + "requires": { + "delayed-stream": "~1.0.0" + } + }, + "concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", + "dev": true, + "optional": true + }, + "core-util-is": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", + "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=" + }, + "css-select": { + "version": "1.0.0", + "resolved": "http://registry.npmjs.org/css-select/-/css-select-1.0.0.tgz", + "integrity": "sha1-sRIcpRhI3SZOIkTQWM7iVN7rRLA=", + "requires": { + "boolbase": "~1.0.0", + "css-what": "1.0", + "domutils": "1.4", + "nth-check": "~1.0.0" + } + }, + "css-what": { + "version": "1.0.0", + "resolved": "http://registry.npmjs.org/css-what/-/css-what-1.0.0.tgz", + "integrity": "sha1-18wt9FGAZm+Z0rFEYmOUaeAPc2w=" + }, + "csv": { + "version": "0.4.6", + "resolved": "http://registry.npmjs.org/csv/-/csv-0.4.6.tgz", + "integrity": "sha1-jbrn3f26rmLB6ph8Pg+Kmsc3tz0=", + "dev": true, + "requires": { + "csv-generate": "^0.0.6", + "csv-parse": "^1.0.0", + "csv-stringify": "^0.0.8", + "stream-transform": "^0.1.0" + } + }, + "csv-generate": { + "version": "0.0.6", + "resolved": "http://registry.npmjs.org/csv-generate/-/csv-generate-0.0.6.tgz", + "integrity": "sha1-l+TmOuRrIZEs2UdbwxRp0m9a3mY=", + "dev": true + }, + "csv-parse": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-1.3.3.tgz", + "integrity": "sha1-0c/YdDwvhJoKuy/VRNtWaV0ZpJA=", + "dev": true + }, + "csv-stringify": { + "version": "0.0.8", + "resolved": "http://registry.npmjs.org/csv-stringify/-/csv-stringify-0.0.8.tgz", + "integrity": "sha1-Usw7PfwZd1jFWtMlqVvoUHH55Rs=", + "dev": true + }, + "ctype": { + "version": "0.5.3", + "resolved": "http://registry.npmjs.org/ctype/-/ctype-0.5.3.tgz", + "integrity": "sha1-gsGMJGH3QRTvFsE1IkrQuRRMoS8=", + "dev": true + }, + "dashdash": { + "version": "1.14.1", + "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", + "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=", + "requires": { + "assert-plus": "^1.0.0" + } + }, + "deep-equal": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz", + "integrity": "sha1-9dJgKStmDghO/0zbyfCK0yR0SLU=", + "dev": true + }, + "delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk=" + }, + "dom-serializer": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-0.1.0.tgz", + "integrity": "sha1-BzxpdUbOB4DOI75KKOKT5AvDDII=", + "requires": { + "domelementtype": "~1.1.1", + "entities": "~1.1.1" + }, + "dependencies": { + "domelementtype": { + "version": "1.1.3", + "resolved": "http://registry.npmjs.org/domelementtype/-/domelementtype-1.1.3.tgz", + "integrity": "sha1-vSh3PiZCiBrsUVRJJCmcXNgiGFs=" + } + } + }, + "domelementtype": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-1.2.1.tgz", + "integrity": "sha512-SQVCLFS2E7G5CRCMdn6K9bIhRj1bS6QBWZfF0TUPh4V/BbqrQ619IdSS3/izn0FZ+9l+uODzaZjb08fjOfablA==" + }, + "domhandler": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-2.3.0.tgz", + "integrity": "sha1-LeWaCCLVAn+r/28DLCsloqir5zg=", + "requires": { + "domelementtype": "1" + } + }, + "domutils": { + "version": "1.4.3", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.4.3.tgz", + "integrity": "sha1-CGVRN5bGswYDGFDhdVFrr4C3Km8=", + "requires": { + "domelementtype": "1" + } + }, + "dtrace-provider": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/dtrace-provider/-/dtrace-provider-0.4.0.tgz", + "integrity": "sha1-C2e8HMd+eb+IuHrSBmT0p1POPyY=", + "dev": true, + "optional": true, + "requires": { + "nan": "~1.5.1" + } + }, + "ecc-jsbn": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz", + "integrity": "sha1-OoOpBOVDUyh4dMVkt1SThoSamMk=", + "requires": { + "jsbn": "~0.1.0", + "safer-buffer": "^2.1.0" + } + }, + "entities": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/entities/-/entities-1.1.2.tgz", + "integrity": "sha512-f2LZMYl1Fzu7YSBKg+RoROelpOaNrcGmE9AZubeDfrCEia483oW4MI4VyFd5VNHIgQ/7qm1I0wUHK1eJnn2y2w==" + }, + "escape-regexp-component": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/escape-regexp-component/-/escape-regexp-component-1.0.2.tgz", + "integrity": "sha1-nGO20LJf8qiMOtvRjFthrMO5+qI=", + "dev": true + }, + "extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" + }, + "extsprintf": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz", + "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=" + }, + "fast-deep-equal": { + "version": "1.1.0", + "resolved": "http://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz", + "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ=" + }, + "fast-json-stable-stringify": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.0.0.tgz", + "integrity": "sha1-1RQsDK7msRifh9OnYREGT4bIu/I=" + }, + "forever-agent": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz", + "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=" + }, + "form-data": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz", + "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==", + "requires": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.6", + "mime-types": "^2.1.12" + } + }, + "formidable": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/formidable/-/formidable-1.2.1.tgz", + "integrity": "sha512-Fs9VRguL0gqGHkXS5GQiMCr1VhZBxz0JnJs4JmMp/2jL18Fmbzvv7vOFRU+U8TBkHEE/CX1qDXzJplVULgsLeg==", + "dev": true + }, + "getpass": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz", + "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=", + "requires": { + "assert-plus": "^1.0.0" + } + }, + "glob": { + "version": "6.0.4", + "resolved": "https://registry.npmjs.org/glob/-/glob-6.0.4.tgz", + "integrity": "sha1-DwiGD2oVUSey+t1PnOJLGqtuTSI=", + "dev": true, + "optional": true, + "requires": { + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "2 || 3", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + } + }, + "har-schema": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz", + "integrity": "sha1-qUwiJOvKwEeCoNkDVSHyRzW37JI=" + }, + "har-validator": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.0.tgz", + "integrity": "sha512-+qnmNjI4OfH2ipQ9VQOw23bBd/ibtfbVdK2fYbY4acTDqKTW/YDp9McimZdDbG8iV9fZizUqQMD5xvriB146TA==", + "requires": { + "ajv": "^5.3.0", + "har-schema": "^2.0.0" + } + }, + "htmlparser2": { + "version": "3.8.3", + "resolved": "http://registry.npmjs.org/htmlparser2/-/htmlparser2-3.8.3.tgz", + "integrity": "sha1-mWwosZFRaovoZQGn15dX5ccMEGg=", + "requires": { + "domelementtype": "1", + "domhandler": "2.3", + "domutils": "1.5", + "entities": "1.0", + "readable-stream": "1.1" + }, + "dependencies": { + "domutils": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-1.5.1.tgz", + "integrity": "sha1-3NhIiib1Y9YQeeSMn3t+Mjc2gs8=", + "requires": { + "dom-serializer": "0", + "domelementtype": "1" + } + }, + "entities": { + "version": "1.0.0", + "resolved": "http://registry.npmjs.org/entities/-/entities-1.0.0.tgz", + "integrity": "sha1-sph6o4ITR/zeZCsk/fyeT7cSvyY=" + } + } + }, + "http-signature": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz", + "integrity": "sha1-muzZJRFHcvPZW2WmCruPfBj7rOE=", + "requires": { + "assert-plus": "^1.0.0", + "jsprim": "^1.2.2", + "sshpk": "^1.7.0" + } + }, + "inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "dev": true, + "optional": true, + "requires": { + "once": "^1.3.0", + "wrappy": "1" + } + }, + "inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=" + }, + "is-typedarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz", + "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=" + }, + "isarray": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz", + "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=" + }, + "isstream": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", + "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=" + }, + "jsbn": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz", + "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=" + }, + "json-schema": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz", + "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=" + }, + "json-schema-traverse": { + "version": "0.3.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.3.1.tgz", + "integrity": "sha1-NJptRMU6Ud6JtAgFxdXlm0F9M0A=" + }, + "json-stringify-safe": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz", + "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=" + }, + "jsprim": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz", + "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=", + "requires": { + "assert-plus": "1.0.0", + "extsprintf": "1.3.0", + "json-schema": "0.2.3", + "verror": "1.10.0" + } + }, + "keep-alive-agent": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/keep-alive-agent/-/keep-alive-agent-0.0.1.tgz", + "integrity": "sha1-RIR8o5TOjWtSGuhYFr1kUJlCs4U=", + "dev": true + }, + "lodash": { + "version": "3.10.1", + "resolved": "http://registry.npmjs.org/lodash/-/lodash-3.10.1.tgz", + "integrity": "sha1-W/Rejkm6QYnhfUgnid/RW9FAt7Y=" + }, + "lru-cache": { + "version": "2.7.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-2.7.3.tgz", + "integrity": "sha1-bUUk6LlV+V1PW1iFHOId1y+06VI=", + "dev": true + }, + "mime": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", + "integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg==", + "dev": true + }, + "mime-db": { + "version": "1.37.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz", + "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg==" + }, + "mime-types": { + "version": "2.1.21", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz", + "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==", + "requires": { + "mime-db": "~1.37.0" + } + }, + "minimatch": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", + "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "dev": true, + "optional": true, + "requires": { + "brace-expansion": "^1.1.7" + } + }, + "minimist": { + "version": "0.0.8", + "resolved": "http://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=", + "dev": true, + "optional": true + }, + "mkdirp": { + "version": "0.5.1", + "resolved": "http://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", + "dev": true, + "optional": true, + "requires": { + "minimist": "0.0.8" + } + }, + "mv": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/mv/-/mv-2.1.1.tgz", + "integrity": "sha1-rmzg1vbV4KT32JN5jQPB6pVZtqI=", + "dev": true, + "optional": true, + "requires": { + "mkdirp": "~0.5.1", + "ncp": "~2.0.0", + "rimraf": "~2.4.0" + } + }, + "nan": { + "version": "1.5.3", + "resolved": "http://registry.npmjs.org/nan/-/nan-1.5.3.tgz", + "integrity": "sha1-TNDswTO3sHAKSSpkat1CeuijGOs=", + "dev": true, + "optional": true + }, + "ncp": { + "version": "2.0.0", + "resolved": "http://registry.npmjs.org/ncp/-/ncp-2.0.0.tgz", + "integrity": "sha1-GVoh1sRuNh0vsSgbo4uR6d9727M=", + "dev": true, + "optional": true + }, + "negotiator": { + "version": "0.5.3", + "resolved": "https://registry.npmjs.org/negotiator/-/negotiator-0.5.3.tgz", + "integrity": "sha1-Jp1cR2gQ7JLtvntsLygxY4T5p+g=", + "dev": true + }, + "nth-check": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/nth-check/-/nth-check-1.0.2.tgz", + "integrity": "sha512-WeBOdju8SnzPN5vTUJYxYUxLeXpCaVP5i5e0LF8fg7WORF2Wd7wFX/pk0tYZk7s8T+J7VLy0Da6J1+wCT0AtHg==", + "requires": { + "boolbase": "~1.0.0" + } + }, + "oauth-sign": { + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz", + "integrity": "sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==" + }, + "once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "dev": true, + "requires": { + "wrappy": "1" + } + }, + "path-is-absolute": { + "version": "1.0.1", + "resolved": "http://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", + "dev": true, + "optional": true + }, + "performance-now": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz", + "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns=" + }, + "precond": { + "version": "0.2.3", + "resolved": "https://registry.npmjs.org/precond/-/precond-0.2.3.tgz", + "integrity": "sha1-qpWRvKokkj8eD0hJ0kD0fvwQdaw=", + "dev": true + }, + "psl": { + "version": "1.1.29", + "resolved": "https://registry.npmjs.org/psl/-/psl-1.1.29.tgz", + "integrity": "sha512-AeUmQ0oLN02flVHXWh9sSJF7mcdFq0ppid/JkErufc3hGIV/AMa8Fo9VgDo/cT2jFdOWoFvHp90qqBH54W+gjQ==" + }, + "punycode": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha1-wNWmOycYgArY4esPpSachN1BhF4=" + }, + "qs": { + "version": "6.5.2", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz", + "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==" + }, + "readable-stream": { + "version": "1.1.14", + "resolved": "http://registry.npmjs.org/readable-stream/-/readable-stream-1.1.14.tgz", + "integrity": "sha1-fPTFTvZI44EwhMY23SB54WbAgdk=", + "requires": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.1", + "isarray": "0.0.1", + "string_decoder": "~0.10.x" + } + }, + "request": { + "version": "2.88.0", + "resolved": "https://registry.npmjs.org/request/-/request-2.88.0.tgz", + "integrity": "sha512-NAqBSrijGLZdM0WZNsInLJpkJokL72XYjUpnB0iwsRgxh7dB6COrHnTBNwN0E+lHDAJzu7kLAkDeY08z2/A0hg==", + "requires": { + "aws-sign2": "~0.7.0", + "aws4": "^1.8.0", + "caseless": "~0.12.0", + "combined-stream": "~1.0.6", + "extend": "~3.0.2", + "forever-agent": "~0.6.1", + "form-data": "~2.3.2", + "har-validator": "~5.1.0", + "http-signature": "~1.2.0", + "is-typedarray": "~1.0.0", + "isstream": "~0.1.2", + "json-stringify-safe": "~5.0.1", + "mime-types": "~2.1.19", + "oauth-sign": "~0.9.0", + "performance-now": "^2.1.0", + "qs": "~6.5.2", + "safe-buffer": "^5.1.2", + "tough-cookie": "~2.4.3", + "tunnel-agent": "^0.6.0", + "uuid": "^3.3.2" + } + }, + "restify": { + "version": "3.0.3", + "resolved": "http://registry.npmjs.org/restify/-/restify-3.0.3.tgz", + "integrity": "sha1-dpOYIemR05ULcwXwJA6J9rqTfKU=", + "dev": true, + "requires": { + "assert-plus": "^0.1.5", + "backoff": "^2.4.0", + "bunyan": "1.3.4", + "csv": "^0.4.0", + "deep-equal": "^1.0.0", + "dtrace-provider": "^0.4.0", + "escape-regexp-component": "^1.0.2", + "formidable": "^1.0.14", + "http-signature": "^0.10.0", + "keep-alive-agent": "^0.0.1", + "lru-cache": "^2.5.0", + "mime": "^1.2.11", + "negotiator": "^0.5.1", + "node-uuid": "^1.4.1", + "once": "^1.3.0", + "qs": "^2.4.1", + "semver": "^4.3.3", + "spdy": "^1.26.5", + "tunnel-agent": "^0.4.0", + "verror": "^1.4.0" + }, + "dependencies": { + "asn1": { + "version": "0.1.11", + "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.1.11.tgz", + "integrity": "sha1-VZvhg3bQik7E2+gId9J4GGObLfc=", + "dev": true + }, + "assert-plus": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-0.1.5.tgz", + "integrity": "sha1-7nQAlBMALYTOxyGcasgRgS5yMWA=", + "dev": true + }, + "http-signature": { + "version": "0.10.1", + "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-0.10.1.tgz", + "integrity": "sha1-T72sEyVZqoMjEh5UB3nAoBKyfmY=", + "dev": true, + "requires": { + "asn1": "0.1.11", + "assert-plus": "^0.1.5", + "ctype": "0.5.3" + } + }, + "node-uuid": { + "version": "1.4.8", + "resolved": "https://registry.npmjs.org/node-uuid/-/node-uuid-1.4.8.tgz", + "integrity": "sha1-sEDrCSOWivq/jTL7HxfxFn/auQc=", + "dev": true + }, + "qs": { + "version": "2.4.2", + "resolved": "https://registry.npmjs.org/qs/-/qs-2.4.2.tgz", + "integrity": "sha1-9854jld33wtQENp/fE5zujJHD1o=", + "dev": true + }, + "tunnel-agent": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.4.3.tgz", + "integrity": "sha1-Y3PbdpCf5XDgjXNYM2Xtgop07us=", + "dev": true + } + } + }, + "rimraf": { + "version": "2.4.5", + "resolved": "http://registry.npmjs.org/rimraf/-/rimraf-2.4.5.tgz", + "integrity": "sha1-7nEM5dk6j9uFb7Xqj/Di11k0sto=", + "dev": true, + "optional": true, + "requires": { + "glob": "^6.0.1" + } + }, + "safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==" + }, + "safe-json-stringify": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/safe-json-stringify/-/safe-json-stringify-1.2.0.tgz", + "integrity": "sha512-gH8eh2nZudPQO6TytOvbxnuhYBOvDBBLW52tz5q6X58lJcd/tkmqFR+5Z9adS8aJtURSXWThWy/xJtJwixErvg==", + "dev": true, + "optional": true + }, + "safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==" + }, + "semver": { + "version": "4.3.6", + "resolved": "http://registry.npmjs.org/semver/-/semver-4.3.6.tgz", + "integrity": "sha1-MAvG4OhjdPe6YQaLWx7NV/xlMto=", + "dev": true + }, + "spdy": { + "version": "1.32.5", + "resolved": "http://registry.npmjs.org/spdy/-/spdy-1.32.5.tgz", + "integrity": "sha1-cO/yPN5Ol9UqRF9lr93MVpXrXts=", + "dev": true + }, + "sshpk": { + "version": "1.15.2", + "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.2.tgz", + "integrity": "sha512-Ra/OXQtuh0/enyl4ETZAfTaeksa6BXks5ZcjpSUNrjBr0DvrJKX+1fsKDPpT9TBXgHAFsa4510aNVgI8g/+SzA==", + "requires": { + "asn1": "~0.2.3", + "assert-plus": "^1.0.0", + "bcrypt-pbkdf": "^1.0.0", + "dashdash": "^1.12.0", + "ecc-jsbn": "~0.1.1", + "getpass": "^0.1.1", + "jsbn": "~0.1.0", + "safer-buffer": "^2.0.2", + "tweetnacl": "~0.14.0" + } + }, + "stream-transform": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/stream-transform/-/stream-transform-0.1.2.tgz", + "integrity": "sha1-fY5rTgOsR4F3j4x5UXUBv7B2Kp8=", + "dev": true + }, + "string_decoder": { + "version": "0.10.31", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz", + "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=" + }, + "tough-cookie": { + "version": "2.4.3", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.4.3.tgz", + "integrity": "sha512-Q5srk/4vDM54WJsJio3XNn6K2sCG+CQ8G5Wz6bZhRZoAe/+TxjWB/GlFAnYEbkYVlON9FMk/fE3h2RLpPXo4lQ==", + "requires": { + "psl": "^1.1.24", + "punycode": "^1.4.1" + } + }, + "tunnel-agent": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz", + "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=", + "requires": { + "safe-buffer": "^5.0.1" + } + }, + "tweetnacl": { + "version": "0.14.5", + "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz", + "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=" + }, + "uuid": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.3.2.tgz", + "integrity": "sha512-yXJmeNaw3DnnKAOKJE51sL/ZaYfWJRl1pK9dr19YFCu0ObS231AB1/LbqTKRAQ5kw8A90rA6fr4riOUpTZvQZA==" + }, + "verror": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz", + "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=", + "requires": { + "assert-plus": "^1.0.0", + "core-util-is": "1.0.2", + "extsprintf": "^1.2.0" + } + }, + "wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", + "dev": true + } + } +} diff --git a/package.json b/package.json index 3cdde00..eed3b3e 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "article-extractor", - "version": "1.0.2", + "version": "1.1.0", "description": "Extract metadata and content from web articles.", "main": "index.js", "keywords": [ diff --git a/test.js b/test.js index f9f8dfb..3e55cf6 100644 --- a/test.js +++ b/test.js @@ -4,6 +4,14 @@ var path = require('path'); var async = require('async'); var articlesToParse = [ + { + url: 'https://www.engadget.com/2018/11/08/samsung-foldable-phone-infinity-flex/', + filename: 'engadget' + }, + { + url: 'https://www.engadget.com/2018/11/08/youtube-available-for-switch/', + filename: 'engadget2' + }, { url: 'http://gizmodo.com/watch-a-single-day-on-the-london-tube-in-two-minutes-1692810056', filename: 'gizmodo' @@ -43,7 +51,12 @@ async.each(articlesToParse, function (articleToParse, parseCallback) { console.log('Parsed article:', data.title); console.log(data.summary); console.log('-----'); - fs.writeFileSync(path.join(process.cwd(), 'data/' + articleToParse.filename + '.html'), data.content); + + const rootPath = path.join(path.join(process.cwd(), 'data')); + if (!fs.existsSync(rootPath)) fs.mkdirSync(rootPath); + + fs.writeFileSync(path.join(rootPath, '/' + articleToParse.filename + '.js'), JSON.stringify(data)); + fs.writeFileSync(path.join(rootPath, '/' + articleToParse.filename + '.html'), data.content); parseCallback(); }); }, function () {