From a64f336d4f44b2389466b617ff7440e374dea646 Mon Sep 17 00:00:00 2001 From: Ron S Date: Mon, 12 Jul 2021 17:49:17 -0400 Subject: [PATCH 1/3] feat: Add range to nodes and fix whitespace issue (fixes #137) --- src/nodes/comment.ts | 4 +- src/nodes/html.ts | 105 +++++++++++++++++++++++++++---------------- src/nodes/node.ts | 12 ++++- src/nodes/text.ts | 4 +- test/node-ranges.js | 85 +++++++++++++++++++++++++++++++++++ test/parse.js | 5 +++ 6 files changed, 171 insertions(+), 44 deletions(-) create mode 100755 test/node-ranges.js diff --git a/src/nodes/comment.ts b/src/nodes/comment.ts index c9df5eb..7d9ff79 100644 --- a/src/nodes/comment.ts +++ b/src/nodes/comment.ts @@ -3,8 +3,8 @@ import NodeType from './type'; import HTMLElement from './html'; export default class CommentNode extends Node { - public constructor(public rawText: string, parentNode: HTMLElement) { - super(parentNode); + public constructor(public rawText: string, parentNode: HTMLElement, range?: [ number, number ]) { + super(parentNode, range); } /** diff --git a/src/nodes/html.ts b/src/nodes/html.ts index 902c71f..e327b2d 100644 --- a/src/nodes/html.ts +++ b/src/nodes/html.ts @@ -197,8 +197,14 @@ export default class HTMLElement extends Node { * * @memberof HTMLElement */ - public constructor(tagName: string, keyAttrs: KeyAttributes, private rawAttrs = '', parentNode: HTMLElement | null) { - super(parentNode); + public constructor( + tagName: string, + keyAttrs: KeyAttributes, + private rawAttrs = '', + parentNode: HTMLElement | null, + range?: [ number, number ] + ) { + super(parentNode, range); this.rawTagName = tagName; this.rawAttrs = rawAttrs || ''; this.id = keyAttrs.id || ''; @@ -1012,88 +1018,109 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co return it.test(tag); }); } - const root = new HTMLElement(null, {}, '', null); + const createRange = (startPos: number, endPos: number): [ number, number ] => + [ startPos - frameFlagOffset, endPos - frameFlagOffset 
]; + + const root = new HTMLElement(null, {}, '', null, [ 0, data.length ]); let currentParent = root; const stack = [root]; let lastTextPos = -1; let match: RegExpExecArray; // https://github.com/taoqf/node-html-parser/issues/38 data = `<${frameflag}>${data}`; + + const dataEndPos = data.length - (frameflag.length + 2); + const frameFlagOffset = frameflag.length + 2; + while ((match = kMarkupPattern.exec(data))) { + const tagStartPos = kMarkupPattern.lastIndex - match[0].length; + const tagEndPos = kMarkupPattern.lastIndex; + + // Add TextNode if content if (lastTextPos > -1) { - if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) { - // if has content - const text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length); - currentParent.appendChild(new TextNode(text, currentParent)); + if (lastTextPos + match[0].length < tagEndPos) { + const text = data.substring(lastTextPos, tagStartPos); + currentParent.appendChild(new TextNode(text, currentParent, createRange(lastTextPos, tagStartPos))); } } + lastTextPos = kMarkupPattern.lastIndex; - if (match[2] === frameflag) { - continue; - } + + // https://github.com/taoqf/node-html-parser/issues/38 + // Skip frameflag node + if (match[2] === frameflag) continue; + + // Handle comments if (match[0][1] === '!') { - // this is a comment if (options.comment) { // Only keep what is in between - const text = data.substring(lastTextPos - 3, lastTextPos - match[0].length + 4); - currentParent.appendChild(new CommentNode(text, currentParent)); + const text = data.substring(tagStartPos + 4, tagEndPos - 3); + currentParent.appendChild(new CommentNode(text, currentParent, createRange(tagStartPos, tagEndPos))); } continue; } - if (options.lowerCaseTagName) { - match[2] = match[2].toLowerCase(); - } + + /* -- Handle tag matching -- */ + // Fix tag casing if necessary + if (options.lowerCaseTagName) match[2] = match[2].toLowerCase(); + + // Handle opening tags (ie. not ) if (!match[1]) { - // not or ... 
+ // Find closing tag const closeMarkup = ``; - const index = (() => { - if (options.lowerCaseTagName) { - return data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex); - } - return data.indexOf(closeMarkup, kMarkupPattern.lastIndex); - })(); + const closeIndex = options.lowerCaseTagName + ? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex) + : data.indexOf(closeMarkup, kMarkupPattern.lastIndex); + const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex; + if (element_should_be_ignore(match[2])) { - let text: string; - if (index === -1) { - // there is no matching ending for the text element. - text = data.substr(kMarkupPattern.lastIndex); - } else { - text = data.substring(kMarkupPattern.lastIndex, index); - } - if (text.length > 0) { - currentParent.appendChild(new TextNode(text, currentParent)); + const text = data.substring(tagEndPos, textEndPos); + if (text.length > 0 && /\S/.test(text)) { + currentParent.appendChild(new TextNode(text, currentParent, createRange(tagEndPos, textEndPos))); } } - if (index === -1) { + + if (closeIndex === -1) { lastTextPos = kMarkupPattern.lastIndex = data.length + 1; } else { - lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length; + lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length; + // Cause to be treated as self-closing, because no close found match[1] = 'true'; } } } + + // Handle closing tags or self-closed elements (ie or
) if (match[1] || match[4] || kSelfClosingElements[match[2]]) { - // or
etc. while (true) { if (currentParent.rawTagName === match[2]) { + // Update range end for closed tag + (<[ number, number ]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1]; stack.pop(); currentParent = arr_back(stack); break; diff --git a/src/nodes/node.ts b/src/nodes/node.ts index 7a46b3d..ddc7d70 100644 --- a/src/nodes/node.ts +++ b/src/nodes/node.ts @@ -8,11 +8,21 @@ import HTMLElement from './html'; export default abstract class Node { abstract nodeType: NodeType; public childNodes = [] as Node[]; + public range: readonly [ number, number ]; abstract text: string; abstract rawText: string; // abstract get rawText(): string; abstract toString(): string; - public constructor(public parentNode = null as HTMLElement | null) { + public constructor( + public parentNode = null as HTMLElement | null, + range?: [ number, number ] + ) { + Object.defineProperty(this, 'range', { + enumerable: false, + writable: true, + configurable: true, + value: range ?? [ -1, -1 ] + }); } public get innerText() { return this.rawText; diff --git a/src/nodes/text.ts b/src/nodes/text.ts index 755aa75..3f844ee 100644 --- a/src/nodes/text.ts +++ b/src/nodes/text.ts @@ -8,8 +8,8 @@ import NodeType from './type'; * @param {string} value [description] */ export default class TextNode extends Node { - public constructor(rawText: string, parentNode: HTMLElement) { - super(parentNode); + public constructor(rawText: string, parentNode: HTMLElement, range?: [ number, number ]) { + super(parentNode, range); this._rawText = rawText; } diff --git a/test/node-ranges.js b/test/node-ranges.js new file mode 100755 index 0000000..af1bbae --- /dev/null +++ b/test/node-ranges.js @@ -0,0 +1,85 @@ +const { parse, HTMLElement, TextNode, CommentNode } = require('../dist'); +const hp2 = require('htmlparser2') +const mochaEach = require('mocha-each'); + +// Use https://astexplorer.net/ to compare +const html = ` +Leading text + + +
+

Text Content

+ Goes Here +
+ + + +
+  block Text
+
+The space between us is vast +Closing text +`; + +function prepare() { + const nodeMeta = []; + const abbreviate = (s, maxLen = 8) => + (s.length > maxLen ? s.slice(0, maxLen) + '...' : s).replace(/(\r?\n)/g, '\\n'); + + // Parse AST + const hp2ast = hp2.parseDocument(html, { withEndIndices: true, withStartIndices: true }); + const ast = parse(html, { comment: true }); + + // Prepare flatNodes + ast.childNodes.forEach((n, idx, arr) => walk(arr, idx, hp2ast.childNodes)); + + return { nodeMeta, ast, hp2ast }; + + function walk(nodeArr, idx, mirrorArr) { + const node = nodeArr[idx]; + const mirrorNode = mirrorArr[idx]; + + const label = mirrorNode.type !== 'tag' ? `<${mirrorNode.type}: '${abbreviate(node.text)}'>` : node.tagName; + nodeMeta.push([ label, node, mirrorNode ]); + + node.childNodes.forEach((n, idx, arr) => walk(arr, idx, mirrorNode.childNodes)); + } +} + +// See: https://github.com/taoqf/node-html-parser/issues/137 +describe(`Elements ranges`, function () { + const { nodeMeta, ast } = prepare(); + + before(() => { + // Pre-check to make sure configured html is not altered + ast.childNodes.length.should.be.greaterThan(2); + }); + + describe(`parsed elements created with proper ranges`, () => { + mochaEach(nodeMeta).it(`%s`, (label, node, hp2Node) => { + /* Ensure we have the right node mapping */ + const expectedProto = hp2Node.type === 'comment' ? CommentNode : + hp2Node.type === 'text' ? 
TextNode : + HTMLElement; + Object.getPrototypeOf(node).constructor.should.eql(expectedProto); + if (expectedProto === HTMLElement) node.tagName.toLocaleLowerCase().should.eql(hp2Node.name.toLocaleLowerCase()); + + // Check range + node.range.should.eql([ hp2Node.startIndex, hp2Node.endIndex + 1 ]); + }); + }); + + it(`new nodes are created with [ -1, -1 ] range by default`, () => { + const nodes = [ + new HTMLElement('B', {}, '', null), + new TextNode('text', null), + new CommentNode('text', null) + ]; + + for (const node of nodes) node.range.should.eql([ -1, -1 ]); + }); +}); \ No newline at end of file diff --git a/test/parse.js b/test/parse.js index d00b29c..2d44562 100644 --- a/test/parse.js +++ b/test/parse.js @@ -18,4 +18,9 @@ describe('HTML Parser', function () { const root = parse(''); root.toString().should.eql(''); }); + // See: https://github.com/taoqf/node-html-parser/issues/137 + it(`parses all whitespace`, () => { + const root = parse(`test1 test2\ntest3\r\ntest4`); + root.text.should.eql('test1 test2\ntest3\r\ntest4'); + }); }); From eeda8cdddd37ee78fd110db4c65c844d050faeb4 Mon Sep 17 00:00:00 2001 From: Ron S Date: Mon, 12 Jul 2021 18:29:38 -0400 Subject: [PATCH 2/3] docs(readme): Add `range` property to readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index c52c4a9..be4f385 100644 --- a/README.md +++ b/README.md @@ -249,3 +249,7 @@ Get all attributes of current element. **Notice: do not try to change the return ### HTMLElement#classList Get all attributes of current element. 
**Notice: do not try to change the returned value.** + +### HTMLElement#range + +Start and end indexes of the element in the source code (e.g. [ 0, 40 ]) \ No newline at end of file From 188a158c44a0c2a9e1acabc3b829004fc7c45ab2 Mon Sep 17 00:00:00 2001 From: Ron S Date: Mon, 12 Jul 2021 18:48:59 -0400 Subject: [PATCH 3/3] build(deps): Add devDependency mocha-each --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 1dbdab1..ce6d401 100644 --- a/package.json +++ b/package.json @@ -54,6 +54,7 @@ "htmlparser-benchmark": "^1.1.3", "htmlparser2": "^6.0.0", "mocha": "latest", + "mocha-each": "^2.0.1", "np": "latest", "parse5": "^6.0.1", "should": "latest",