From 5cf9c52f8048711eb003b5efb3d6b646925a4a4f Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Thu, 26 Oct 2023 11:54:45 +0300 Subject: [PATCH 01/49] Importer --- package.json | 12 ++-- tools/importer/import.js | 151 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+), 6 deletions(-) create mode 100644 tools/importer/import.js diff --git a/package.json b/package.json index f011c3bf..9062cec2 100644 --- a/package.json +++ b/package.json @@ -20,19 +20,19 @@ }, "homepage": "https://github.com/adobe/aem-boilerplate#readme", "devDependencies": { + "@babel/core": "7.21.0", + "@babel/eslint-parser": "7.19.1", + "@esm-bundle/chai": "4.3.4-fix.0", "@semantic-release/changelog": "6.0.3", "@semantic-release/exec": "6.0.3", "@semantic-release/git": "10.0.1", - "semantic-release": "21.0.5", - "@babel/core": "7.21.0", - "@babel/eslint-parser": "7.19.1", + "@web/test-runner": "0.15.1", + "@web/test-runner-commands": "0.6.5", "chai": "4.3.7", "eslint": "8.35.0", "eslint-config-airbnb-base": "15.0.0", "eslint-plugin-import": "2.27.5", - "@esm-bundle/chai": "4.3.4-fix.0", - "@web/test-runner": "0.15.1", - "@web/test-runner-commands": "0.6.5", + "semantic-release": "21.0.5", "sinon": "15.0.1", "stylelint": "15.2.0", "stylelint-config-standard": "30.0.1" diff --git a/tools/importer/import.js b/tools/importer/import.js new file mode 100644 index 00000000..c1998bad --- /dev/null +++ b/tools/importer/import.js @@ -0,0 +1,151 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +/* global WebImporter */ +/* eslint-disable no-console, class-methods-use-this */ + +const createMetadataBlock = (main, document) => { + const meta = {}; + + // Author + const authorLink = document.querySelector('.cmp-blog-author-info__author a'); + if (authorLink) { + meta.Author = authorLink.textContent; + meta['Author Link'] = authorLink.href; + } + + // Publication Date + const date = document.querySelector('.cmp-blog-author-info__date'); + if (date) { + // parse date which is in format "October 23, 2023" considering local en-US + const dateStr = date.textContent.trim(); + const dateObj = new Date(dateStr); + // format date to mm/dd/yyyy + meta['Publication Date'] = dateObj.toLocaleDateString('en-US'); + } + + // Title + const title = document.querySelector('title'); + if (title) { + meta.Title = title.textContent.replace(/[\n\t]/gm, ''); + } + + // Keywords + const keywords = document.querySelector('meta[name="keywords"]'); + if (keywords && keywords.content) { + meta.Keywords = keywords.content; + } + + // Description + const desc = document.querySelector('[property="og:description"]'); + if (desc) { + meta.Description = desc.content; + } + + // Template + meta.Template = 'Blog Article'; + + // Category + const tags = document.querySelectorAll('.cmp-blog-author-info__tags li'); + if (tags) { + // for each tag, if present in a predefined list, then add it to the metadata + const topics = ['AI and Automation', + 'Application Development', + 'Careers', + 'Crisis Management', + 'Culture', + 'Customer Experience', + 'Customer Stories', + 'Cybersecurity and Risk', + 'Education', + 'Employee Experience', + 'Events', + 'Financial Services', + 'Government', + 'Healthcare', + 'IT Management', + 'Manufacturing', + 'Now on Now', + 'Now Platform', + 'Telecommunications']; + + const categories = ['Trends and Research', 'About ServiceNow', 'Solutions', 'Life at Now']; + + // for each tag if present in the topics array, then create a list of topics + + for (let i = 0; i < tags.length; i++) { + if (categories.includes(tags[i].textContent.trim())) { + if (meta.Category) { + meta.Category = meta.Category + ', ' + tags[i].textContent.trim(); + } + else { + meta.Category = tags[i].textContent.trim(); + } + } + + if (topics.includes(tags[i].textContent.trim())) { + meta.Topic = tags[i].textContent.trim(); + } + + } + } + + // Image + const img = document.querySelector('[property="og:image"]'); + if (img && img.content) { + const el = document.createElement('img'); + el.src = img.content; + meta.Image = el; + } + + const block = WebImporter.Blocks.getMetadataBlock(document, meta); + main.append(block); + + return meta; +}; + +export default { + /** + * Apply DOM operations to the provided document and return + * the root element to be then transformed to Markdown. + * @param {HTMLDocument} document The document + * @param {string} url The url of the page imported + * @param {string} html The raw html (the document is cleaned up during preprocessing) + * @param {object} params Object containing some parameters given by the import process. + * @returns {HTMLElement} The root element to be transformed + */ + transformDOM: ({ + // eslint-disable-next-line no-unused-vars + document, url, html, params, + }) => { + const main = document.querySelector('body'); + + createMetadataBlock(main, document); + + main.querySelectorAll('.legacyHtml, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); + return main; + + }, + + /** + * Return a path that describes the document being transformed (file name, nesting...). + * The path is then used to create the corresponding Word document. + * @param {HTMLDocument} document The document + * @param {string} url The url of the page imported + * @param {string} html The raw html (the document is cleaned up during preprocessing) + * @param {object} params Object containing some parameters given by the import process. + * @return {string} The path + */ + generateDocumentPath: ({ + // eslint-disable-next-line no-unused-vars + document, url, html, params, + }) => WebImporter.FileUtils.sanitizePath(new URL(url).pathname.replace(/\.html$/, '').replace(/\/$/, '')), +}; \ No newline at end of file From f39b66d843168b77598cb8b58b88e6a2aa7113b6 Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Thu, 26 Oct 2023 12:37:32 +0300 Subject: [PATCH 02/49] Added topics in other languages --- tools/importer/import.js | 67 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index c1998bad..271bcd8c 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -25,7 +25,6 @@ const createMetadataBlock = (main, document) => { // Publication Date const date = document.querySelector('.cmp-blog-author-info__date'); if (date) { - // parse date which is in format "October 23, 2023" considering local en-US const dateStr = date.textContent.trim(); const dateObj = new Date(dateStr); // format date to mm/dd/yyyy @@ -75,7 +74,71 @@ const createMetadataBlock = (main, document) => { 'Manufacturing', 'Now on Now', 'Now Platform', - 'Telecommunications']; + 'Telecommunications', + + // de-DE + 'Application Development', + 'Behörden', + 'Bildungswesen', + 'Crisis Management', + 'Cybersicherheit und Risikomanagement', + 'Events', + 'Fertigungsindustrie', + 'Finanzindustrie', + 'Gesundheitswesen', + 'IT-Management', + 'Karriere', + 'KI und Automatisierung', + 'Kultur', + 'Kunden-Experience', + 'Kundengeschichten', + 'Mitarbeiter-Experience', + 'Now on Now', + 'Now Platform', + 'Telekommunikation', + + // fr-FR + 'Carrières', + 'Culture', + 'Cyber-sécurité et risques', + 'Développement d’applications', + 'Expérience client', + 'Expérience des employés', + 'Gestion de crise', + 'Gestion de l’IT', + 'Gouvernement', + 'IA et automatisation', + 'Now on Now', + 'Now Platform', + 'Production industrielle', + 'Santé', + 'Services financiers', + 'Télécommunications', + 'Témoignages Client', + 'Éducation', + 'Événements', + // nl-NL + 'AI en automatisering', + 'Applicatieontwikkeling', + 'Carrière', + 'Crisismanagement', + 'Cultuur', + 'Cybersecurity and Risk', + 'Events', + 'Financiële dienstverlening', + 'Gezondheidszorg', + 'IT-beheer', + 'Klantervaring', + 'Klantverhalen', + 'Now on Now', + 'Now Platform', + 'Opleiding', + 'Overheid', + 'Productie', + 'Telecommunicatie', + 'Werknemerservaring' + + ]; const categories = ['Trends and Research', 'About ServiceNow', 'Solutions', 'Life at Now']; From c01028f44bac33a847e7281d3518e1db8c7a4e67 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 10 Nov 2023 17:05:45 +0100 Subject: [PATCH 03/49] clean whitespaces. move metadata at the beginning. remove metadata image --- tools/importer/import.js | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 271bcd8c..5b8496cd 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -162,15 +162,15 @@ const createMetadataBlock = (main, document) => { } // Image - const img = document.querySelector('[property="og:image"]'); - if (img && img.content) { - const el = document.createElement('img'); - el.src = img.content; - meta.Image = el; - } + // const img = document.querySelector('[property="og:image"]'); + // if (img && img.content) { + // const el = document.createElement('img'); + // el.src = img.content; + // meta.Image = el; + // } const block = WebImporter.Blocks.getMetadataBlock(document, meta); - main.append(block); + main.prepend(block); return meta; }; @@ -194,6 +194,7 @@ export default { createMetadataBlock(main, document); main.querySelectorAll('.legacyHtml, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); + main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); return main; }, From 81109beaafbebe358d28c72cda0d2be618aafb27 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 10 Nov 2023 20:52:31 +0100 Subject: [PATCH 04/49] fix date format --- tools/importer/import.js | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 5b8496cd..8f6cd006 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -12,6 +12,10 @@ /* global WebImporter */ /* eslint-disable no-console, class-methods-use-this */ +const pageUrl = "https://main--servicenow--hlxsites.hlx.page"; +const liveUrl = "https://main--servicenow--hlxsites.hlx.page"; + + const createMetadataBlock = (main, document) => { const meta = {}; @@ -19,7 +23,7 @@ const createMetadataBlock = (main, document) => { const authorLink = document.querySelector('.cmp-blog-author-info__author a'); if (authorLink) { meta.Author = authorLink.textContent; - meta['Author Link'] = authorLink.href; + meta['Author Link'] = new URL(authorLink.href, pageUrl); } // Publication Date @@ -28,7 +32,10 @@ const createMetadataBlock = (main, document) => { const dateStr = date.textContent.trim(); const dateObj = new Date(dateStr); // format date to mm/dd/yyyy - meta['Publication Date'] = dateObj.toLocaleDateString('en-US'); + meta['Publication Date'] = dateObj.toLocaleDateString( + 'en-US', + { day: '2-digit', month: '2-digit', year: 'numeric' }, + ); } // Title From 1596810047014472023b2aef47ae03ed2ab26e7c Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Wed, 15 Nov 2023 17:55:43 +0100 Subject: [PATCH 05/49] use AEM JSON rendition to extract tags and map them with the xcel sheet --- tools/importer/import.js | 210 +++++++++++++++------------------------ 1 file changed, 82 insertions(+), 128 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 8f6cd006..96aab0a8 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -15,15 +15,81 @@ const pageUrl = "https://main--servicenow--hlxsites.hlx.page"; const liveUrl = "https://main--servicenow--hlxsites.hlx.page"; +function fetchSync(method, url) { + // we use old XMLHttpRequest as fetch seams to have problems in bulk import + const request = new XMLHttpRequest(); + request.open(method, url, false); + request.overrideMimeType('text/json; UTF-8'); + request.send(null); + return { + status: request.status, + body: request.responseText, + } +} + +function jsonRenditionURL(url) { + return url.replace('.html', '.1.json'); +} + +function getAllTags() { + const tagsURL = 'https://main--servicenow--hlxsites.hlx.live/blogs/tags.json'; + const response = fetchSync('GET', tagsURL); + if (response.status === 200) { + return JSON.parse(response.body); + } + return {}; +} + +function getOriginalTags(jsonRendition) { + return jsonRendition['jcr:content']['cq:tags']; +} + +function getOriginalCategoryTag(originalTags) { + return originalTags.find((tag) => tag.startsWith('sn-blog-docs:category')); +} + +function getOriginalTopicTag(originalTags) { + // TODO is this the correct tag or sn-blog-docs:new-trend ? + return originalTags.find((tag) => tag.startsWith('sn-blog-docs:topic')); +} + +const createMetadataBlock = (main, document, url) => { + const jsonRendition = JSON.parse(fetchSync('GET', jsonRenditionURL(url)).body); + const allTags = getAllTags(); + const originalTags = getOriginalTags(jsonRendition); + + const originalCategoryTag = getOriginalCategoryTag(originalTags); + const originalTopicTag = getOriginalTopicTag(originalTags); -const createMetadataBlock = (main, document) => { const meta = {}; // Author const authorLink = document.querySelector('.cmp-blog-author-info__author a'); if (authorLink) { meta.Author = authorLink.textContent; - meta['Author Link'] = new URL(authorLink.href, pageUrl); + + // TODO whether we remove .html might depend on the outcome of https://github.com/hlxsites/servicenow/issues/24 + meta['Author Link'] = new URL(new URL(authorLink.href.replace('.html', '')).pathname, pageUrl); + } + + // Category + meta.Category = allTags.category.data.find((tag) => tag['legacy-identifier'] === originalCategoryTag)?.identifier; + + // Topic + meta.Topic = allTags.topic.data.find((tag) => tag['legacy-identifier'] === originalTopicTag)?.identifier; + + // New Trend - TODO what is it used for? + + // Title + const title = document.querySelector('title'); + if (title) { + meta.Title = title.textContent.replace(/[\n\t]/gm, ''); + } + + // Description + const desc = document.querySelector('[property="og:description"]'); + if (desc) { + meta.Description = desc.content; } // Publication Date @@ -38,137 +104,16 @@ const createMetadataBlock = (main, document) => { ); } - // Title - const title = document.querySelector('title'); - if (title) { - meta.Title = title.textContent.replace(/[\n\t]/gm, ''); - } - // Keywords const keywords = document.querySelector('meta[name="keywords"]'); if (keywords && keywords.content) { meta.Keywords = keywords.content; } - // Description - const desc = document.querySelector('[property="og:description"]'); - if (desc) { - meta.Description = desc.content; - } - - // Template - meta.Template = 'Blog Article'; - - // Category - const tags = document.querySelectorAll('.cmp-blog-author-info__tags li'); - if (tags) { - // for each tag, if present in a predefined list, then add it to the metadata - const topics = ['AI and Automation', - 'Application Development', - 'Careers', - 'Crisis Management', - 'Culture', - 'Customer Experience', - 'Customer Stories', - 'Cybersecurity and Risk', - 'Education', - 'Employee Experience', - 'Events', - 'Financial Services', - 'Government', - 'Healthcare', - 'IT Management', - 'Manufacturing', - 'Now on Now', - 'Now Platform', - 'Telecommunications', - - // de-DE - 'Application Development', - 'Behörden', - 'Bildungswesen', - 'Crisis Management', - 'Cybersicherheit und Risikomanagement', - 'Events', - 'Fertigungsindustrie', - 'Finanzindustrie', - 'Gesundheitswesen', - 'IT-Management', - 'Karriere', - 'KI und Automatisierung', - 'Kultur', - 'Kunden-Experience', - 'Kundengeschichten', - 'Mitarbeiter-Experience', - 'Now on Now', - 'Now Platform', - 'Telekommunikation', - - // fr-FR - 'Carrières', - 'Culture', - 'Cyber-sécurité et risques', - 'Développement d’applications', - 'Expérience client', - 'Expérience des employés', - 'Gestion de crise', - 'Gestion de l’IT', - 'Gouvernement', - 'IA et automatisation', - 'Now on Now', - 'Now Platform', - 'Production industrielle', - 'Santé', - 'Services financiers', - 'Télécommunications', - 'Témoignages Client', - 'Éducation', - 'Événements', - // nl-NL - 'AI en automatisering', - 'Applicatieontwikkeling', - 'Carrière', - 'Crisismanagement', - 'Cultuur', - 'Cybersecurity and Risk', - 'Events', - 'Financiële dienstverlening', - 'Gezondheidszorg', - 'IT-beheer', - 'Klantervaring', - 'Klantverhalen', - 'Now on Now', - 'Now Platform', - 'Opleiding', - 'Overheid', - 'Productie', - 'Telecommunicatie', - 'Werknemerservaring' - - ]; - - const categories = ['Trends and Research', 'About ServiceNow', 'Solutions', 'Life at Now']; - - // for each tag if present in the topics array, then create a list of topics - - for (let i = 0; i < tags.length; i++) { - if (categories.includes(tags[i].textContent.trim())) { - if (meta.Category) { - meta.Category = meta.Category + ', ' + tags[i].textContent.trim(); - } - else { - meta.Category = tags[i].textContent.trim(); - } - } + // Template set in the metadata.xls + // meta.Template = 'Blog Article'; - if (topics.includes(tags[i].textContent.trim())) { - meta.Topic = tags[i].textContent.trim(); - } - - } - } - - // Image + // Image - TODO if we find blogs for which the first image is not the same as the og:image // const img = document.querySelector('[property="og:image"]'); // if (img && img.content) { // const el = document.createElement('img'); @@ -198,10 +143,19 @@ export default { }) => { const main = document.querySelector('body'); - createMetadataBlock(main, document); + createMetadataBlock(main, document, url); - main.querySelectorAll('.legacyHtml, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); + main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); + + // TODO is this ok? main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); + + // Remove copyright as we create a fragment with it. + main.querySelectorAll('p').forEach((paragraph) => { + if (paragraph.textContent.includes('ServiceNow, Inc. All rights reserved.')) { + paragraph.remove(); + } + }) return main; }, From 0d418c43fca6947a9f6325d990d7339395b827ae Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Thu, 16 Nov 2023 13:02:13 +0200 Subject: [PATCH 06/49] Crawling of topics and categories --- package-lock.json | 34 ++++++++++++- package.json | 4 +- tools/importer/tags.js | 112 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 tools/importer/tags.js diff --git a/package-lock.json b/package-lock.json index 6fc073c6..4f08b109 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24,7 +24,9 @@ "semantic-release": "21.0.5", "sinon": "15.0.1", "stylelint": "15.2.0", - "stylelint-config-standard": "30.0.1" + "stylelint-config-standard": "30.0.1", + "xmldom": "0.6.0", + "xmlhttprequest": "1.8.0" } }, "node_modules/@ampproject/remapping": { @@ -13126,6 +13128,24 @@ } } }, + "node_modules/xmldom": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.6.0.tgz", + "integrity": "sha512-iAcin401y58LckRZ0TkI4k0VSM1Qg0KGSc3i8rU+xrxe19A/BN1zHyVSJY7uoutVlaTSzYyk/v5AmkewAP7jtg==", + "dev": true, + "engines": { + "node": ">=10.0.0" + } + }, + "node_modules/xmlhttprequest": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/xmlhttprequest/-/xmlhttprequest-1.8.0.tgz", + "integrity": "sha512-58Im/U0mlVBLM38NdZjHyhuMtCqa61469k2YP/AaPbvCoV9aQGUpbJBj1QRm2ytRiVQBD/fsw7L2bJGDVQswBA==", + "dev": true, + "engines": { + "node": ">=0.4.0" + } + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", @@ -22709,6 +22729,18 @@ "dev": true, "requires": {} }, + "xmldom": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.6.0.tgz", + "integrity": "sha512-iAcin401y58LckRZ0TkI4k0VSM1Qg0KGSc3i8rU+xrxe19A/BN1zHyVSJY7uoutVlaTSzYyk/v5AmkewAP7jtg==", + "dev": true + }, + "xmlhttprequest": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/xmlhttprequest/-/xmlhttprequest-1.8.0.tgz", + "integrity": "sha512-58Im/U0mlVBLM38NdZjHyhuMtCqa61469k2YP/AaPbvCoV9aQGUpbJBj1QRm2ytRiVQBD/fsw7L2bJGDVQswBA==", + "dev": true + }, "xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", diff --git a/package.json b/package.json index 9062cec2..766073a6 100644 --- a/package.json +++ b/package.json @@ -35,6 +35,8 @@ "semantic-release": "21.0.5", "sinon": "15.0.1", "stylelint": "15.2.0", - "stylelint-config-standard": "30.0.1" + "stylelint-config-standard": "30.0.1", + "xmlhttprequest" : "1.8.0", + "xmldom" : "0.6.0" } } diff --git a/tools/importer/tags.js b/tools/importer/tags.js new file mode 100644 index 00000000..4d7bc3df --- /dev/null +++ b/tools/importer/tags.js @@ -0,0 +1,112 @@ +// script that is called from command line to fetch a sitemap.xml and iterate over it +// +// usage: +// node tools/importer/import.js https://main--servicenow--hlxsites.hlx.page + + + +/* eslint-disable no-console, class-methods-use-this */ + + + +const XMLHttpRequest = require('xmlhttprequest').XMLHttpRequest; +var DOMParser = require('xmldom').DOMParser; + +function fetchSync(method, url) { + // we use old XMLHttpRequest as fetch seams to have problems in bulk import + const request = new XMLHttpRequest(); + request.open(method, url, false); + // request.overrideMimeType('text/json; UTF-8'); + request.send(null); + return { + status: request.status, + body: request.responseText, + } +} + +function jsonRenditionURL(url) { + return url.replace('.html', '.1.json'); +} + +function getOriginalTags(jsonRendition) { + return jsonRendition['jcr:content']['cq:tags']; +} + + + + +// get sitemap content and parse it to Document Object Model +function getXMLSitemapObject(sitemapFile, callback) { + var xhttp = new XMLHttpRequest(); + xhttp.onreadystatechange = function() { + if ((this.readyState === 4) && (this.status === 200)) { + var sitemapContent = this.responseText; + var sitemapObject = parseXMLSitemap(sitemapContent); + callback(sitemapObject); + } + }; + xhttp.open('GET', sitemapFile, true); + xhttp.send(); +} + +// parse a text string into an XML DOM object +function parseXMLSitemap(sitemapContent) { + var parser = new DOMParser(); + var xmlDoc = parser.parseFromString(sitemapContent, 'text/xml'); + return xmlDoc; +} + +function getOriginalCategoryTag(originalTags) { + return originalTags.find((tag) => tag.startsWith('sn-blog-docs:category')); +} + +function getOriginalTopicTag(originalTags) { + // TODO is this the correct tag or sn-blog-docs:new-trend ? + return originalTags.find((tag) => tag.startsWith('sn-blog-docs:topic')); +} + +getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapObject) { + // retrieve properties from the sitemap object + var urls = sitemapObject.getElementsByTagName('url'); + + // create an empty list of topics + var topics = []; + var categories = []; + + for (var i = 0; i < urls.length; i++) { + var urlElement = urls[i]; + + var loc = urlElement.getElementsByTagName('loc')[0].textContent; + + // if loc contains regular expression /blogs/*.html + if (loc.match(/\/blogs\/[0-9]*\/.*\.html/)) { + console.log(i + ' of ' + urls.length + ': ' + loc); + let pageAsJson = fetchSync('GET', jsonRenditionURL(loc)); + if (pageAsJson.status === 200) { + const jsonRendition = JSON.parse(pageAsJson.body); + const originalTags = getOriginalTags(jsonRendition); + if (originalTags !== undefined) { + let originalTopicTag = getOriginalTopicTag(originalTags); + let originalCategoryTag = getOriginalCategoryTag(originalTags); + + // console.log(originalTopicTag); + // if topics does not contain originalTopicTag add it + if (!topics.includes(originalTopicTag)) { + topics.push(originalTopicTag); + } + + if (!categories.includes(originalCategoryTag)) { + categories.push(originalCategoryTag); + } + } else { + console.log('originalTags is undefined'); + } + } + } + } + + console.log(topics); + + console.log(categories); + +}); From ffedc455b4e90d1d02be870b2b33d17dfd7017a0 Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Thu, 16 Nov 2023 14:41:34 +0200 Subject: [PATCH 07/49] Removed hero image --- tools/importer/import.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 96aab0a8..80b74b29 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -78,6 +78,8 @@ const createMetadataBlock = (main, document, url) => { // Topic meta.Topic = allTags.topic.data.find((tag) => tag['legacy-identifier'] === originalTopicTag)?.identifier; + meta.Template = 'Blog Article'; + // New Trend - TODO what is it used for? // Title @@ -145,7 +147,7 @@ export default { createMetadataBlock(main, document, url); - main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); + main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4, .hero-image').forEach(el => el.remove()); // TODO is this ok? main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); From 7bb9803650843aa17fa9b22491d3c9cfc93d5b56 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 17 Nov 2023 16:43:18 +0100 Subject: [PATCH 08/49] extract new trend tags --- tools/importer/tags.js | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/importer/tags.js b/tools/importer/tags.js index 4d7bc3df..af8356ed 100644 --- a/tools/importer/tags.js +++ b/tools/importer/tags.js @@ -61,10 +61,13 @@ function getOriginalCategoryTag(originalTags) { } function getOriginalTopicTag(originalTags) { - // TODO is this the correct tag or sn-blog-docs:new-trend ? return originalTags.find((tag) => tag.startsWith('sn-blog-docs:topic')); } +function getOriginalNewTrendTag(originalTags) { + return originalTags.find((tag) => tag.startsWith('sn-blog-docs:new-trend')); +} + getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapObject) { // retrieve properties from the sitemap object var urls = sitemapObject.getElementsByTagName('url'); @@ -72,6 +75,7 @@ getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapOb // create an empty list of topics var topics = []; var categories = []; + var newTrends = []; for (var i = 0; i < urls.length; i++) { var urlElement = urls[i]; @@ -88,6 +92,7 @@ getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapOb if (originalTags !== undefined) { let originalTopicTag = getOriginalTopicTag(originalTags); let originalCategoryTag = getOriginalCategoryTag(originalTags); + let originalNewTrendTag = getOriginalNewTrendTag(originalTags); // console.log(originalTopicTag); // if topics does not contain originalTopicTag add it @@ -98,6 +103,10 @@ getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapOb if (!categories.includes(originalCategoryTag)) { categories.push(originalCategoryTag); } + + if (!newTrends.includes(originalNewTrendTag)) { + newTrends.push(originalNewTrendTag); + } } else { console.log('originalTags is undefined'); } @@ -105,8 +114,9 @@ getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapOb } } - console.log(topics); + console.log(topics.sort()); - console.log(categories); + console.log(categories.sort()); + console.log(newTrends.sort()); }); From 8bf22d7800a34d377db5d12281217e6847732622 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 17 Nov 2023 17:18:05 +0100 Subject: [PATCH 09/49] extract new trend tags --- tools/importer/import.js | 20 +++++++++++++++++--- tools/importer/tags.js | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 80b74b29..64442cad 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -53,6 +53,10 @@ function getOriginalTopicTag(originalTags) { return originalTags.find((tag) => tag.startsWith('sn-blog-docs:topic')); } +function getOriginalNewTrendTag(originalTags) { + return originalTags.find((tag) => tag.startsWith('sn-blog-docs:new-trend')); +} + const createMetadataBlock = (main, document, url) => { const jsonRendition = JSON.parse(fetchSync('GET', jsonRenditionURL(url)).body); const allTags = getAllTags(); @@ -60,6 +64,7 @@ const createMetadataBlock = (main, document, url) => { const originalCategoryTag = getOriginalCategoryTag(originalTags); const originalTopicTag = getOriginalTopicTag(originalTags); + const originalNewTrendTag = getOriginalNewTrendTag(originalTags); const meta = {}; @@ -76,11 +81,20 @@ const createMetadataBlock = (main, document, url) => { meta.Category = allTags.category.data.find((tag) => tag['legacy-identifier'] === originalCategoryTag)?.identifier; // Topic - meta.Topic = allTags.topic.data.find((tag) => tag['legacy-identifier'] === originalTopicTag)?.identifier; + meta.Topic = allTags.topic.data.find((tag) => tag['legacy-identifier-topic'] === originalTopicTag)?.identifier; + + const newTredTag = allTags.topic.data.find((tag) => tag['legacy-identifier-newtrend'] === originalTopicTag)?.identifier; + if (newTredTag) { + meta['New Trend'] = newTredTag; + } - meta.Template = 'Blog Article'; + if (originalTags.includes('sn-blog-docs:trend/trends-research')) { + meta['Trend'] = 'Trends and Research'; + } + + // This comes from the bulk metadata for Blog articles + // meta.Template = 'Blog Article'; - // New Trend - TODO what is it used for? // Title const title = document.querySelector('title'); diff --git a/tools/importer/tags.js b/tools/importer/tags.js index af8356ed..18524fb9 100644 --- a/tools/importer/tags.js +++ b/tools/importer/tags.js @@ -68,6 +68,7 @@ function getOriginalNewTrendTag(originalTags) { return originalTags.find((tag) => tag.startsWith('sn-blog-docs:new-trend')); } +console.log('running'); getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapObject) { // retrieve properties from the sitemap object var urls = sitemapObject.getElementsByTagName('url'); From 3434bc7f9db759bf61e72dac553d3c100940433c Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Sat, 18 Nov 2023 18:53:32 +0100 Subject: [PATCH 10/49] extract new trend tags --- tools/importer/import.js | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 64442cad..397068a6 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -49,7 +49,6 @@ function getOriginalCategoryTag(originalTags) { } function getOriginalTopicTag(originalTags) { - // TODO is this the correct tag or sn-blog-docs:new-trend ? return originalTags.find((tag) => tag.startsWith('sn-blog-docs:topic')); } @@ -83,7 +82,7 @@ const createMetadataBlock = (main, document, url) => { // Topic meta.Topic = allTags.topic.data.find((tag) => tag['legacy-identifier-topic'] === originalTopicTag)?.identifier; - const newTredTag = allTags.topic.data.find((tag) => tag['legacy-identifier-newtrend'] === originalTopicTag)?.identifier; + const newTredTag = allTags.topic.data.find((tag) => tag['legacy-identifier-newtrend'] === originalNewTrendTag)?.identifier; if (newTredTag) { meta['New Trend'] = newTredTag; } From b882dea4850e8670571cadfea34aabb3c2da1545 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Sun, 19 Nov 2023 19:10:31 +0100 Subject: [PATCH 11/49] process links --- tools/importer/import.js | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 397068a6..b9a47892 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -14,6 +14,7 @@ const pageUrl = "https://main--servicenow--hlxsites.hlx.page"; const liveUrl = "https://main--servicenow--hlxsites.hlx.page"; +const servicenowUrl = 'https://www.servicenow.com'; function fetchSync(method, url) { // we use old XMLHttpRequest as fetch seams to have problems in bulk import @@ -56,6 +57,28 @@ function getOriginalNewTrendTag(originalTags) { return originalTags.find((tag) => tag.startsWith('sn-blog-docs:new-trend')); } +function isServiceNowLink(link) { + return (link.host.startsWith('localhost') || link.host.endsWith('servicenow.com')); +} + +function isBlogLink(link) { + + return isServiceNowLink(link) && + (link.pathname.startsWith('/blogs') + || link.pathname.startsWith('/fr/blogs') + || link.pathname.startsWith('/de/blogs') + || link.pathname.startsWith('/uk/blogs') + || link.pathname.startsWith('/nk/blogs')); +} + +function getServiceNowUrl(link) { + return new URL(new URL(link.href).pathname, servicenowUrl); +} + +function getPageUrl(link) { + return new URL(new URL(link.href).pathname.replace('.html', ''), pageUrl); +} + const createMetadataBlock = (main, document, url) => { const jsonRendition = JSON.parse(fetchSync('GET', jsonRenditionURL(url)).body); const allTags = getAllTags(); @@ -160,17 +183,29 @@ export default { createMetadataBlock(main, document, url); + // CLEANUP main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4, .hero-image').forEach(el => el.remove()); - // TODO is this ok? main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); - + main.querySelectorAll('img[src^="/akam/13/pixel"]').forEach((el) => el.remove()); // Remove copyright as we create a fragment with it. main.querySelectorAll('p').forEach((paragraph) => { if (paragraph.textContent.includes('ServiceNow, Inc. All rights reserved.')) { paragraph.remove(); } }) + + // Processing... + main.querySelectorAll('a').forEach((link) => { + if (isServiceNowLink(link)) { + if (isBlogLink(link)) { + link.href = getPageUrl(link); + } else { + link.href = getServiceNowUrl(link); + } + } + }); + return main; }, From bbe8bf2f85b45129d26d61a82d9ca791bb15ffda Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Mon, 20 Nov 2023 13:05:03 +0100 Subject: [PATCH 12/49] import filter pages: topic, category, year --- tools/importer/import-filter-page.js | 143 +++++++++++++++++++++++++++ tools/importer/import.js | 11 ++- 2 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 tools/importer/import-filter-page.js diff --git a/tools/importer/import-filter-page.js b/tools/importer/import-filter-page.js new file mode 100644 index 00000000..9e32f58a --- /dev/null +++ b/tools/importer/import-filter-page.js @@ -0,0 +1,143 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +/* global WebImporter */ +/* eslint-disable no-console, class-methods-use-this */ + +function fetchSync(method, url) { + // we use old XMLHttpRequest as fetch seams to have problems in bulk import + const request = new XMLHttpRequest(); + request.open(method, url, false); + request.overrideMimeType('text/json; UTF-8'); + request.send(null); + return { + status: request.status, + body: request.responseText, + } +} + +function jsonRenditionURL(url, depth = 1) { + return url.replace('.html', `.${depth}.json`); +} + +function getAllTags() { + const tagsURL = 'https://main--servicenow--hlxsites.hlx.live/blogs/tags.json'; + const response = fetchSync('GET', tagsURL); + if (response.status === 200) { + return JSON.parse(response.body); + } + return {}; +} + +const createMetadataBlock = (main, document, url) => { + const meta = {}; + + // Title + const title = document.querySelector('title'); + if (title) { + let titleText = title.textContent.replace(/[\n\t]/gm, '').trim(); + const suffix = ' - ServiceNow Blog'; + if (titleText.endsWith(suffix)) { + titleText = titleText.substring(0, titleText.length - suffix.length); + } + meta.Title = titleText; + } + + // Description + const desc = document.querySelector('[property="og:description"]'); + if (desc && desc.content.trim()) { + meta.Description = desc.content.trim(); + } else { + meta.Description = 'Read about ServiceNow\'s Company News, Announcements, and Updates.'; + } + + // Keywords + const keywords = document.querySelector('meta[name="keywords"]'); + if (keywords && keywords.content) { + meta.Keywords = keywords.content; + } + + const block = WebImporter.Blocks.getMetadataBlock(document, meta); + main.append(block); + + return meta; +}; + +export default { + /** + * Apply DOM operations to the provided document and return + * the root element to be then transformed to Markdown. + * @param {HTMLDocument} document The document + * @param {string} url The url of the page imported + * @param {string} html The raw html (the document is cleaned up during preprocessing) + * @param {object} params Object containing some parameters given by the import process. + * @returns {HTMLElement} The root element to be transformed + */ + transformDOM: ({ + // eslint-disable-next-line no-unused-vars + document, url, html, params, + }) => { + const main = document.querySelector('body'); + + // CLEANUP + main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4, .hero-image').forEach(el => el.remove()); + // TODO is this ok? + main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); + main.querySelectorAll('img[src^="/akam/13/pixel"]').forEach((el) => el.remove()); + main.querySelectorAll('.servicenow-blog-list--block, .cmp-list').forEach((el) => el.remove()); + + main.append(document.createElement('hr')); // create section + + const jsonRendition = JSON.parse(fetchSync('GET', jsonRenditionURL(url, 5)).body); + const listTag = jsonRendition['jcr:content']?.root?.responsivegrid?.responsivegrid?.list?.tags[0]?.trim(); + + const allTags = getAllTags(); + let blogList = null; + if (url.toString().includes('/topics/')) { + console.log('here'); + // topic page + const newTredTag = allTags.topic.data.find((tag) => tag['legacy-identifier-newtrend'] === listTag)?.identifier; + blogList = WebImporter.DOMUtils.createTable([['Blog List'], ['New Trend', newTredTag]], document); + } else if (url.toString().includes('/category/')) { + // category page + const categoryTag = allTags.category.data.find((tag) => tag['legacy-identifier'] === listTag)?.identifier; + blogList = WebImporter.DOMUtils.createTable([['Blog List'], ['Category', categoryTag]], document); + } else if (params.originalURL.match(/\b\d{4}\.html$/)){ + const urlParts = params.originalURL.replace('.html', '').split('/'); + const year = urlParts[urlParts.length - 1]; + // year page + blogList = WebImporter.DOMUtils.createTable([['Blog List'], ['Year', year]], document); + } + + // TODO author + + if (blogList) { + main.append(blogList); + } + + createMetadataBlock(main, document, url); + return main; + }, + + /** + * Return a path that describes the document being transformed (file name, nesting...). + * The path is then used to create the corresponding Word document. + * @param {HTMLDocument} document The document + * @param {string} url The url of the page imported + * @param {string} html The raw html (the document is cleaned up during preprocessing) + * @param {object} params Object containing some parameters given by the import process. + * @return {string} The path + */ + generateDocumentPath: ({ + // eslint-disable-next-line no-unused-vars + document, url, html, params, + }) => WebImporter.FileUtils.sanitizePath(new URL(url).pathname.replace(/\.html$/, '').replace(/\/$/, '')), +}; \ No newline at end of file diff --git a/tools/importer/import.js b/tools/importer/import.js index b9a47892..2f0aa7ae 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -121,13 +121,18 @@ const createMetadataBlock = (main, document, url) => { // Title const title = document.querySelector('title'); if (title) { - meta.Title = title.textContent.replace(/[\n\t]/gm, ''); + let titleText = title.textContent.replace(/[\n\t]/gm, ''); + const suffix = ' – ServiceNow Blog'; + if (titleText.endsWith(suffix)) { + titleText = titleText.substring(0, titleText.length - suffix.length); + } + meta.Title = titleText; } // Description const desc = document.querySelector('[property="og:description"]'); - if (desc) { - meta.Description = desc.content; + if (desc && desc.content.trim()) { + meta.Description = desc.content.trim(); } // Publication Date From 51ae0297b4f5e6e6b96e67c46905701c98c18310 Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Mon, 20 Nov 2023 16:39:33 +0200 Subject: [PATCH 13/49] Extracted new category --- tools/importer/tags.js | 150 ++++++++++++++++++++++++----------------- 1 file changed, 89 insertions(+), 61 deletions(-) diff --git a/tools/importer/tags.js b/tools/importer/tags.js index 18524fb9..8dc92a75 100644 --- a/tools/importer/tags.js +++ b/tools/importer/tags.js @@ -3,14 +3,10 @@ // usage: // node tools/importer/import.js https://main--servicenow--hlxsites.hlx.page - - /* eslint-disable no-console, class-methods-use-this */ - - -const XMLHttpRequest = require('xmlhttprequest').XMLHttpRequest; -var DOMParser = require('xmldom').DOMParser; +const { XMLHttpRequest } = require('xmlhttprequest'); +const { DOMParser } = require('xmldom'); function fetchSync(method, url) { // we use old XMLHttpRequest as fetch seams to have problems in bulk import @@ -21,7 +17,7 @@ function fetchSync(method, url) { return { status: request.status, body: request.responseText, - } + }; } function jsonRenditionURL(url) { @@ -32,16 +28,13 @@ function getOriginalTags(jsonRendition) { return jsonRendition['jcr:content']['cq:tags']; } - - - // get sitemap content and parse it to Document Object Model function getXMLSitemapObject(sitemapFile, callback) { - var xhttp = new XMLHttpRequest(); - xhttp.onreadystatechange = function() { + const xhttp = new XMLHttpRequest(); + xhttp.onreadystatechange = function () { if ((this.readyState === 4) && (this.status === 200)) { - var sitemapContent = this.responseText; - var sitemapObject = parseXMLSitemap(sitemapContent); + const sitemapContent = this.responseText; + const sitemapObject = parseXMLSitemap(sitemapContent); callback(sitemapObject); } }; @@ -51,8 +44,8 @@ function getXMLSitemapObject(sitemapFile, callback) { // parse a text string into an XML DOM object function parseXMLSitemap(sitemapContent) { - var parser = new DOMParser(); - var xmlDoc = parser.parseFromString(sitemapContent, 'text/xml'); + const parser = new DOMParser(); + const xmlDoc = parser.parseFromString(sitemapContent, 'text/xml'); return xmlDoc; } @@ -68,56 +61,91 @@ function getOriginalNewTrendTag(originalTags) { return originalTags.find((tag) => tag.startsWith('sn-blog-docs:new-trend')); } +const sitemaps = [ + // { locale: 'en-US', sitemap: 'https://www.servicenow.com/sitemap.xml' }, + // { locale: 'fr-FR', sitemap: 'https://www.servicenow.com/fr/sitemap.xml' }, + // { locale: 'de-DE', sitemap: 'https://www.servicenow.com/de/sitemap.xml' }, + // { locale: 'en-GB', sitemap: 'https://www.servicenow.com/uk/sitemap.xml' }, + { locale: 'nl-NL', sitemap: 'https://www.servicenow.com/nl/sitemap.xml' }, +]; + console.log('running'); -getXMLSitemapObject('https://www.servicenow.com/sitemap.xml', function(sitemapObject) { - // retrieve properties from the sitemap object - var urls = sitemapObject.getElementsByTagName('url'); - - // create an empty list of topics - var topics = []; - var categories = []; - var newTrends = []; - - for (var i = 0; i < urls.length; i++) { - var urlElement = urls[i]; - - var loc = urlElement.getElementsByTagName('loc')[0].textContent; - - // if loc contains regular expression /blogs/*.html - if (loc.match(/\/blogs\/[0-9]*\/.*\.html/)) { - console.log(i + ' of ' + urls.length + ': ' + loc); - let pageAsJson = fetchSync('GET', jsonRenditionURL(loc)); - if (pageAsJson.status === 200) { - const jsonRendition = JSON.parse(pageAsJson.body); - const originalTags = getOriginalTags(jsonRendition); - if (originalTags !== undefined) { - let originalTopicTag = getOriginalTopicTag(originalTags); - let originalCategoryTag = getOriginalCategoryTag(originalTags); - let originalNewTrendTag = getOriginalNewTrendTag(originalTags); - - // console.log(originalTopicTag); - // if topics does not contain originalTopicTag add it - if (!topics.includes(originalTopicTag)) { - topics.push(originalTopicTag); - } - if (!categories.includes(originalCategoryTag)) { - categories.push(originalCategoryTag); +for (let j = 0; j < sitemaps.length; j++) { + getXMLSitemapObject(sitemaps[j].sitemap, (sitemapObject) => { + console.log('Processing locale %s', sitemaps[j].locale); + // create an empty list of topics + const topics = []; + const categories = []; + const newTrends = []; + + const all = []; + + // retrieve properties from the sitemap object + const urls = sitemapObject.getElementsByTagName('url'); + + for (let i = 0; i < urls.length; i++) { + const urlElement = urls[i]; + + const loc = urlElement.getElementsByTagName('loc')[0].textContent; + + // if loc contains regular expression /blogs/*.html + if (loc.match(/\/blogs\/[0-9]*\/.*\.html/)) { + console.log(`${i} of ${urls.length}: ${loc}`); + const pageAsJson = fetchSync('GET', jsonRenditionURL(loc)); + if (pageAsJson.status === 200) { + let jsonRendition = {}; + + try { + jsonRendition = JSON.parse(pageAsJson.body); + } catch (e) { + console.log('error parsing json'); + continue; } - if (!newTrends.includes(originalNewTrendTag)) { - newTrends.push(originalNewTrendTag); + const originalTags = getOriginalTags(jsonRendition); + if (originalTags !== undefined) { + const originalTopicTag = getOriginalTopicTag(originalTags); + const originalCategoryTag = getOriginalCategoryTag(originalTags); + const originalNewTrendTag = getOriginalNewTrendTag(originalTags); + + // console.log(originalTopicTag); + // if topics does not contain originalTopicTag add it + if (!topics.includes(originalTopicTag)) { + topics.push(originalTopicTag); + } + + if (!categories.includes(originalCategoryTag)) { + categories.push(originalCategoryTag); + } + + if (!newTrends.includes(originalNewTrendTag)) { + newTrends.push(originalNewTrendTag); + } + + all.push({ + locale: sitemaps[j].locale, + loc, + topic: originalTopicTag, + category: originalCategoryTag, + newTrend: originalNewTrendTag, + }); + } else { + console.log('originalTags is undefined'); } - } else { - console.log('originalTags is undefined'); } } } - } - - console.log(topics.sort()); - - console.log(categories.sort()); - - console.log(newTrends.sort()); -}); + // console.log(topics.sort()); + // + // console.log(categories.sort()); + // + // console.log(newTrends.sort()); + + // itearte over all array and display properties loc and topic as csv line + for (let k = 0; k < all.length; k++) { + const element = all[k]; + console.log(`${element.locale},${element.loc},${element.topic},${element.category},${element.newTrend}`); + } + }); +} From e6ded05e8b41f5b1580cd4f4b80be613f5ac1e0f Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Thu, 23 Nov 2023 18:33:32 +0100 Subject: [PATCH 14/49] import brightcove --- tools/importer/import.js | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index 2f0aa7ae..a4ca3418 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -211,6 +211,20 @@ export default { } }); + main.querySelectorAll('.brightcove-video-wrapper video').forEach((brightcoveVideo) => { + const defaultBCAccountId = '5703385908001'; + const defaultBCPlayer = 'default'; + + const brightcoveRows = [['Embed (Brightcove)'], ['Video ID', brightcoveVideo.dataset.videoId]]; + if (brightcoveVideo.dataset.account !== defaultBCAccountId) { + brightcoveRows.push(['Account', brightcoveVideo.dataset.account]); + } + if (brightcoveVideo.dataset.player !== defaultBCPlayer) { + brightcoveRows.push(['Player', brightcoveVideo.dataset.player]); + } + brightcoveVideo.replaceWith(WebImporter.DOMUtils.createTable(brightcoveRows, document)); + }); + return main; }, From 9df29ca96013d98b1735e1f3a72fefce54d05793 Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Fri, 24 Nov 2023 10:27:53 +0200 Subject: [PATCH 15/49] Added import for authors --- tools/importer/import-author.js | 141 ++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 tools/importer/import-author.js diff --git a/tools/importer/import-author.js b/tools/importer/import-author.js new file mode 100644 index 00000000..372c7771 --- /dev/null +++ b/tools/importer/import-author.js @@ -0,0 +1,141 @@ +/* + * Copyright 2023 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +/* global WebImporter */ +/* eslint-disable no-console, class-methods-use-this */ + +const pageUrl = "https://main--servicenow--hlxsites.hlx.page"; +const servicenowUrl = 'https://www.servicenow.com'; + +function isServiceNowLink(link) { + return (link.host.startsWith('localhost') || link.host.endsWith('servicenow.com')); +} + +function isBlogLink(link) { + + return isServiceNowLink(link) && + (link.pathname.startsWith('/blogs') + || link.pathname.startsWith('/fr/blogs') + || link.pathname.startsWith('/de/blogs') + || link.pathname.startsWith('/uk/blogs') + || link.pathname.startsWith('/nk/blogs')); +} + +function getServiceNowUrl(link) { + return new URL(new URL(link.href).pathname, servicenowUrl); +} + +function getPageUrl(link) { + return new URL(new URL(link.href).pathname.replace('.html', ''), pageUrl); +} + +const createMetadataBlock = (main, document, url) => { + + const meta = {}; + + // Title + const title = document.querySelector('title'); + if (title) { + let titleText = title.textContent.replace(/[\n\t]/gm, ''); + const suffix = ' – ServiceNow Blog'; + if (titleText.endsWith(suffix)) { + titleText = titleText.substring(0, titleText.length - suffix.length); + } + meta.Title = titleText; + } + + // Description + const desc = document.querySelector('[property="og:description"]'); + if (desc && desc.content.trim()) { + meta.Description = desc.content.trim(); + } + + // Keywords + const keywords = document.querySelector('meta[name="keywords"]'); + if (keywords && keywords.content) { + meta.Keywords = keywords.content; + } + + const author = document.querySelector('[property="og:url"]'); + var authorUrl = ''; + + if (author && author.content.trim()) { + authorUrl = author.content.trim(); + } + // insert new hr element as last element of main + const hr = document.createElement('hr'); + main.append(hr); + + const blogList = WebImporter.DOMUtils.createTable([['Blog List'], ['Author ', authorUrl]], document); + const metadataBlock = WebImporter.Blocks.getMetadataBlock(document, meta); + + main.append(blogList); + main.append(metadataBlock); + + return meta; +}; + +export default { + /** + * Apply DOM operations to the provided document and return + * the root element to be then transformed to Markdown. + * @param {HTMLDocument} document The document + * @param {string} url The url of the page imported + * @param {string} html The raw html (the document is cleaned up during preprocessing) + * @param {object} params Object containing some parameters given by the import process. + * @returns {HTMLElement} The root element to be transformed + */ + transformDOM: ({ + // eslint-disable-next-line no-unused-vars + document, url, html, params, + }) => { + const main = document.querySelector('body'); + + console.debug(url); + + createMetadataBlock(main, document, url); + + // CLEANUP + main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4, .hero-image, .com-seperator, .servicenow-blog-list--block').forEach(el => el.remove()); + + main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); + main.querySelectorAll('img[src^="/akam/13/pixel"]').forEach((el) => el.remove()); + + + // Processing... + main.querySelectorAll('a').forEach((link) => { + if (isServiceNowLink(link)) { + if (isBlogLink(link)) { + link.href = getPageUrl(link); + } else { + link.href = getServiceNowUrl(link); + } + } + }); + + return main; + + }, + + /** + * Return a path that describes the document being transformed (file name, nesting...). + * The path is then used to create the corresponding Word document. + * @param {HTMLDocument} document The document + * @param {string} url The url of the page imported + * @param {string} html The raw html (the document is cleaned up during preprocessing) + * @param {object} params Object containing some parameters given by the import process. + * @return {string} The path + */ + generateDocumentPath: ({ + // eslint-disable-next-line no-unused-vars + document, url, html, params, + }) => WebImporter.FileUtils.sanitizePath(new URL(url).pathname.replace(/\.html$/, '').replace(/\/$/, '')), +}; \ No newline at end of file From ebc083ec926b24e1d3cb4ac3dbe6e7fa1b9c61b3 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 24 Nov 2023 11:04:15 +0100 Subject: [PATCH 16/49] add back hero-image --- tools/importer/import.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index a4ca3418..b736a54e 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -189,7 +189,7 @@ export default { createMetadataBlock(main, document, url); // CLEANUP - main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4, .hero-image').forEach(el => el.remove()); + main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); // TODO is this ok? main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); main.querySelectorAll('img[src^="/akam/13/pixel"]').forEach((el) => el.remove()); @@ -215,7 +215,7 @@ export default { const defaultBCAccountId = '5703385908001'; const defaultBCPlayer = 'default'; - const brightcoveRows = [['Embed (Brightcove)'], ['Video ID', brightcoveVideo.dataset.videoId]]; + const brightcoveRows = [['Embed (Brightcove)'], ['VideoID', brightcoveVideo.dataset.videoId]]; if (brightcoveVideo.dataset.account !== defaultBCAccountId) { brightcoveRows.push(['Account', brightcoveVideo.dataset.account]); } From 358dafc0aeb30abe249e8ab6f8157c12c0162e5e Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 24 Nov 2023 13:31:16 +0100 Subject: [PATCH 17/49] get raw urls from sitemap --- tools/importer/sitemap.js | 44 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tools/importer/sitemap.js diff --git a/tools/importer/sitemap.js b/tools/importer/sitemap.js new file mode 100644 index 00000000..6be66730 --- /dev/null +++ b/tools/importer/sitemap.js @@ -0,0 +1,44 @@ + +const https = require('https'); +const { DOMParser } = require('xmldom'); + +// parse a text string into an XML DOM object +function parseXMLSitemap(sitemapContent) { + const parser = new DOMParser(); + const xmlDoc = parser.parseFromString(sitemapContent, 'text/xml'); + return xmlDoc; +} + +async function getSitemap() { + return new Promise((resolve) => { + https.get('https://www.servicenow.com/sitemap.xml', (res) => { + const data = []; + + res.on('data', (chunk) => { + data.push(chunk); + }); + + res.on('end', () => { + const entries = parseXMLSitemap(Buffer.concat(data).toString()); + resolve(entries); + }); + }).on('error', (err) => { + console.log('Error: ', err.message); + }); + }); +} + +async function main() { + const sitemap = await getSitemap(); + const urls = []; + const urlElements = sitemap.getElementsByTagName('url'); + + for (let i = 0; i < urlElements.length; i++) { + const urlElement = urlElements[i]; + urls.push(urlElement.getElementsByTagName('loc')[0].textContent); + } + + urls.forEach((url) => console.log(url)); +} + +main(); \ No newline at end of file From 55ead3be806aa0552a18778cfc480cfd4d3a0bd6 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 24 Nov 2023 13:44:23 +0100 Subject: [PATCH 18/49] transform legacy section titles into h3s --- tools/importer/import.js | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index b736a54e..1216a50c 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -225,6 +225,18 @@ export default { brightcoveVideo.replaceWith(WebImporter.DOMUtils.createTable(brightcoveRows, document)); }); + main.querySelectorAll('b').forEach((bold) => { + if (bold.textContent.trim() === bold.parentElement.textContent.trim() + && !bold.textContent.startsWith('Click') + && !bold.closest('ul') + && !bold.closest('ol') + ) { + const h3 = document.createElement('h3'); + h3.textContent = bold.textContent; + bold.parentElement.replaceWith(h3); + } + }) + return main; }, From 1576fef8927cb436a83984f0391b8cd9d9463dd2 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 24 Nov 2023 14:40:44 +0100 Subject: [PATCH 19/49] 107 - [Importer] Handle Youtube videos --- tools/importer/import.js | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 1216a50c..d3920b41 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -200,7 +200,7 @@ export default { } }) - // Processing... + // Processing Links main.querySelectorAll('a').forEach((link) => { if (isServiceNowLink(link)) { if (isBlogLink(link)) { @@ -211,6 +211,7 @@ export default { } }); + // Brightcove videos main.querySelectorAll('.brightcove-video-wrapper video').forEach((brightcoveVideo) => { const defaultBCAccountId = '5703385908001'; const defaultBCPlayer = 'default'; @@ -225,6 +226,15 @@ export default { brightcoveVideo.replaceWith(WebImporter.DOMUtils.createTable(brightcoveRows, document)); }); + // Youtube Videos + main.querySelectorAll('.comp-youtube-video iframe').forEach((youtubeIframe) => { + const youtubeLink = document.createElement('a'); + youtubeLink.href = youtubeIframe.src; + youtubeLink.textContent = youtubeIframe.src; + youtubeIframe.replaceWith(WebImporter.DOMUtils.createTable([['Embed'], [ youtubeLink ]], document)); + }); + + // Replace legacy standlone bold elements with h3s main.querySelectorAll('b').forEach((bold) => { if (bold.textContent.trim() === bold.parentElement.textContent.trim() && !bold.textContent.startsWith('Click') @@ -235,10 +245,9 @@ export default { h3.textContent = bold.textContent; bold.parentElement.replaceWith(h3); } - }) + }); return main; - }, /** From c05123a35fa26e6298ba398815d8a5043ace8844 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Mon, 27 Nov 2023 14:45:41 +0100 Subject: [PATCH 20/49] import images with links and use the customer urls --- tools/importer/import.js | 12 ++++++++++-- tools/importer/sitemap.js | 41 +++++++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index d3920b41..8b5139f7 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -12,8 +12,8 @@ /* global WebImporter */ /* eslint-disable no-console, class-methods-use-this */ -const pageUrl = "https://main--servicenow--hlxsites.hlx.page"; -const liveUrl = "https://main--servicenow--hlxsites.hlx.page"; +const pageUrl = "https://main--aemeds--servicenow-martech.hlx.page"; +const liveUrl = "https://main--aemeds--servicenow-martech.hlx.live"; const servicenowUrl = 'https://www.servicenow.com'; function fetchSync(method, url) { @@ -211,6 +211,14 @@ export default { } }); + main.querySelectorAll('a img, a picture').forEach((image) => { + const link = image.closest('a'); + link.before(image, document.createElement('br')); + if (link.textContent.trim() === '') { + link.textContent = link.href; + } + }); + // Brightcove videos main.querySelectorAll('.brightcove-video-wrapper video').forEach((brightcoveVideo) => { const defaultBCAccountId = '5703385908001'; diff --git a/tools/importer/sitemap.js b/tools/importer/sitemap.js index 6be66730..b4433fe8 100644 --- a/tools/importer/sitemap.js +++ b/tools/importer/sitemap.js @@ -1,6 +1,7 @@ const https = require('https'); const { DOMParser } = require('xmldom'); +const fs = require('fs'); // parse a text string into an XML DOM object function parseXMLSitemap(sitemapContent) { @@ -10,35 +11,41 @@ function parseXMLSitemap(sitemapContent) { } async function getSitemap() { - return new Promise((resolve) => { - https.get('https://www.servicenow.com/sitemap.xml', (res) => { - const data = []; - - res.on('data', (chunk) => { - data.push(chunk); - }); - - res.on('end', () => { - const entries = parseXMLSitemap(Buffer.concat(data).toString()); - resolve(entries); - }); - }).on('error', (err) => { - console.log('Error: ', err.message); - }); - }); + return parseXMLSitemap(fs.readFileSync('sitemap.xml').toString()); + //return parseXMLSitemap(); + // return new Promise((resolve) => { + // https.get('https://www.servicenow.com/sitemap.xml', (res) => { + // const data = []; + + // res.on('data', (chunk) => { + // data.push(chunk); + // }); + + // res.on('end', () => { + // console.log(Buffer.concat(data).toString()); + // const entries = parseXMLSitemap(Buffer.concat(data).toString()); + // resolve(entries); + // }); + // }).on('error', (err) => { + // console.log('Error: ', err.message); + // }); + // }); } async function main() { const sitemap = await getSitemap(); const urls = []; + const edsURL = []; const urlElements = sitemap.getElementsByTagName('url'); for (let i = 0; i < urlElements.length; i++) { const urlElement = urlElements[i]; urls.push(urlElement.getElementsByTagName('loc')[0].textContent); + edsURL.push(new URL(new URL(urls[i]).pathname, 'https://main--aemeds--servicenow-martech.hlx.live/').toString()); } - urls.forEach((url) => console.log(url)); + //urls.forEach((url) => console.log(url)); + //edsURL.forEach((url) => console.log(url)); } main(); \ No newline at end of file From 791c87bb316e4036d75b2c81de3c47ee204ec6e3 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Tue, 28 Nov 2023 13:31:25 +0100 Subject: [PATCH 21/49] fix - EDS URL output --- tools/importer/sitemap.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/importer/sitemap.js b/tools/importer/sitemap.js index b4433fe8..2857c067 100644 --- a/tools/importer/sitemap.js +++ b/tools/importer/sitemap.js @@ -41,11 +41,11 @@ async function main() { for (let i = 0; i < urlElements.length; i++) { const urlElement = urlElements[i]; urls.push(urlElement.getElementsByTagName('loc')[0].textContent); - edsURL.push(new URL(new URL(urls[i]).pathname, 'https://main--aemeds--servicenow-martech.hlx.live/').toString()); + edsURL.push(new URL(new URL(urls[i]).pathname.replace('.html', ''), 'https://main--aemeds--servicenow-martech.hlx.live/').toString()); } //urls.forEach((url) => console.log(url)); - //edsURL.forEach((url) => console.log(url)); + edsURL.forEach((url) => console.log(url)); } main(); \ No newline at end of file From a851690f7d5e698d72858a15958df705299d0cc7 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Tue, 28 Nov 2023 17:03:14 +0100 Subject: [PATCH 22/49] use a complete list of tags for import --- tools/importer/import-filter-page.js | 2 +- tools/importer/import.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/importer/import-filter-page.js b/tools/importer/import-filter-page.js index 9e32f58a..e277b76a 100644 --- a/tools/importer/import-filter-page.js +++ b/tools/importer/import-filter-page.js @@ -29,7 +29,7 @@ function jsonRenditionURL(url, depth = 1) { } function getAllTags() { - const tagsURL = 'https://main--servicenow--hlxsites.hlx.live/blogs/tags.json'; + const tagsURL = 'https://main--servicenow--hlxsites.hlx.live/drafts/import/tags.json'; const response = fetchSync('GET', tagsURL); if (response.status === 200) { return JSON.parse(response.body); diff --git a/tools/importer/import.js b/tools/importer/import.js index 8b5139f7..31a26e97 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -33,7 +33,7 @@ function jsonRenditionURL(url) { } function getAllTags() { - const tagsURL = 'https://main--servicenow--hlxsites.hlx.live/blogs/tags.json'; + const tagsURL = 'https://main--servicenow--hlxsites.hlx.live/drafts/import/tags.json'; const response = fetchSync('GET', tagsURL); if (response.status === 200) { return JSON.parse(response.body); From 563361fc4009f4ea00b223d467953cc7cafffb34 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Wed, 29 Nov 2023 13:19:44 +0100 Subject: [PATCH 23/49] 124 - Import links to subdomains of servicenow.com --- tools/importer/import.js | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 31a26e97..f6e40ef3 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -58,11 +58,14 @@ function getOriginalNewTrendTag(originalTags) { } function isServiceNowLink(link) { - return (link.host.startsWith('localhost') || link.host.endsWith('servicenow.com')); + // we don't want to overwrite subdomains, so we use strict host check + // see https://github.com/hlxsites/servicenow/issues/124 + return (link.host.startsWith('localhost') + || link.host === 'www.servicenow.com' + || link.host === 'servicenow.com'); } function isBlogLink(link) { - return isServiceNowLink(link) && (link.pathname.startsWith('/blogs') || link.pathname.startsWith('/fr/blogs') From ee5967c94ccbc1b949e3762b44b92b3653c3ac29 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Wed, 29 Nov 2023 13:25:10 +0100 Subject: [PATCH 24/49] 139 - Browser tab title suffix --- tools/importer/import.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index f6e40ef3..4773ba7c 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -125,9 +125,9 @@ const createMetadataBlock = (main, document, url) => { const title = document.querySelector('title'); if (title) { let titleText = title.textContent.replace(/[\n\t]/gm, ''); - const suffix = ' – ServiceNow Blog'; + const suffix = 'ServiceNow Blog'; if (titleText.endsWith(suffix)) { - titleText = titleText.substring(0, titleText.length - suffix.length); + titleText = titleText.substring(0, titleText.length - (suffix.length + 3)).trim(); } meta.Title = titleText; } From 8d484cdc4ec3e6005cc8b569d2daa59f74e0b5f9 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Sat, 2 Dec 2023 20:56:02 +0100 Subject: [PATCH 25/49] fix - linked images, partial brs --- tools/importer/import.js | 46 ++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 4773ba7c..44735cbb 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -173,6 +173,10 @@ const createMetadataBlock = (main, document, url) => { return meta; }; +const h3ConvertExceptions = [ + 'https://www.servicenow.com/blogs/2022/nvidia-gtc-22-leaders-ai-panel-q-and-a.html', +]; + export default { /** * Apply DOM operations to the provided document and return @@ -187,6 +191,7 @@ export default { // eslint-disable-next-line no-unused-vars document, url, html, params, }) => { + const main = document.querySelector('body'); createMetadataBlock(main, document, url); @@ -194,7 +199,6 @@ export default { // CLEANUP main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); // TODO is this ok? - main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); main.querySelectorAll('img[src^="/akam/13/pixel"]').forEach((el) => el.remove()); // Remove copyright as we create a fragment with it. main.querySelectorAll('p').forEach((paragraph) => { @@ -216,7 +220,7 @@ export default { main.querySelectorAll('a img, a picture').forEach((image) => { const link = image.closest('a'); - link.before(image, document.createElement('br')); + link.parentElement.before(image); if (link.textContent.trim() === '') { link.textContent = link.href; } @@ -245,18 +249,34 @@ export default { youtubeIframe.replaceWith(WebImporter.DOMUtils.createTable([['Embed'], [ youtubeLink ]], document)); }); - // Replace legacy standlone bold elements with h3s - main.querySelectorAll('b').forEach((bold) => { - if (bold.textContent.trim() === bold.parentElement.textContent.trim() - && !bold.textContent.startsWith('Click') - && !bold.closest('ul') - && !bold.closest('ol') - ) { - const h3 = document.createElement('h3'); - h3.textContent = bold.textContent; - bold.parentElement.replaceWith(h3); + if (!h3ConvertExceptions.includes(params.originalURL)) { + // Replace legacy standlone bold elements with h3s + main.querySelectorAll('b').forEach((bold) => { + if (bold.textContent.trim() === bold.parentElement.textContent.trim() + && (bold.textContent.length < 100) + && !bold.textContent.startsWith('Click') + && !bold.closest('ul') + && !bold.closest('ol') + ) { + const h3 = document.createElement('h3'); + h3.textContent = bold.textContent; + bold.parentElement.replaceWith(h3); + } + }); + } else { + console.log('Skipping h3 conversion for URL', params.originalURL) + } + + main.querySelectorAll('br').forEach((br) => { + const parent = br.parentElement; + if (!parent) return; + if (!parent.nextElementSibling) return; + + if (parent.innerHTML.trim().endsWith('
') && parent.nextElementSibling.tagName === 'P') { + parent.innerHTML += '\n' + parent.nextElementSibling.innerHTML; + parent.nextElementSibling.remove(); } - }); + }) return main; }, From 000cec2ba51474aa6695ddcb59541bea0d89bf30 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Sat, 2 Dec 2023 21:50:17 +0100 Subject: [PATCH 26/49] fix more spacing issues in import --- tools/importer/import.js | 44 +++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 44735cbb..b3d688cd 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -267,16 +267,40 @@ export default { console.log('Skipping h3 conversion for URL', params.originalURL) } - main.querySelectorAll('br').forEach((br) => { - const parent = br.parentElement; - if (!parent) return; - if (!parent.nextElementSibling) return; - - if (parent.innerHTML.trim().endsWith('
') && parent.nextElementSibling.tagName === 'P') { - parent.innerHTML += '\n' + parent.nextElementSibling.innerHTML; - parent.nextElementSibling.remove(); - } - }) + let edited; + do { + edited = false; + const BRs = [...main.querySelectorAll('br')]; + for (let i = 0; i < BRs.length; i++) { + const br = BRs[i]; + const parent = br.parentElement; + if (!parent) return; + + // used in https://www.servicenow.com/blogs/2022/data-augmentation-intent-classification.html + if (parent.previousElementSibling + && parent.innerHTML.trim().startsWith('
\n') + && parent.previousElementSibling.tagName === 'P') { + + while(parent.querySelector('br:last-child')) { + parent.querySelector('br:last-child').remove(); + } + + parent.previousElementSibling.innerHTML += '\n
\n' + parent.innerHTML; + parent.remove(); + edited = true; + break; + } + + if (parent.nextElementSibling + && parent.innerHTML.trim().endsWith('
') + && parent.nextElementSibling.tagName === 'P') { + parent.innerHTML += '\n' + parent.nextElementSibling.innerHTML; + parent.nextElementSibling.remove(); + edited = true; + break; + } + }; + } while(edited); return main; }, From 4934fd311fa9f4b1eea23a07c64f61a5487920cc Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Sun, 3 Dec 2023 19:00:37 +0100 Subject: [PATCH 27/49] add exception --- tools/importer/import.js | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index b3d688cd..f8e7a0cb 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -175,6 +175,7 @@ const createMetadataBlock = (main, document, url) => { const h3ConvertExceptions = [ 'https://www.servicenow.com/blogs/2022/nvidia-gtc-22-leaders-ai-panel-q-and-a.html', + 'https://www.servicenow.com/blogs/2022/knowledge-2022-social-media-contest-rules.html', ]; export default { From aaf5eb71b29469e28b96e8bf1e354b994c3cc6fc Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Sun, 3 Dec 2023 20:05:57 +0100 Subject: [PATCH 28/49] fix more BR issues --- tools/importer/import.js | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index f8e7a0cb..1e6264c6 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -293,6 +293,7 @@ export default { } if (parent.nextElementSibling + && parent.tagName !== 'H3' && parent.innerHTML.trim().endsWith('
') && parent.nextElementSibling.tagName === 'P') { parent.innerHTML += '\n' + parent.nextElementSibling.innerHTML; From 20e7c758efcf62d4f90fb8c7cff12ab7ad5f49cb Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Mon, 4 Dec 2023 16:45:12 +0100 Subject: [PATCH 29/49] use columns block for authors --- tools/importer/import-author.js | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tools/importer/import-author.js b/tools/importer/import-author.js index 372c7771..f4fc2844 100644 --- a/tools/importer/import-author.js +++ b/tools/importer/import-author.js @@ -45,9 +45,9 @@ const createMetadataBlock = (main, document, url) => { const title = document.querySelector('title'); if (title) { let titleText = title.textContent.replace(/[\n\t]/gm, ''); - const suffix = ' – ServiceNow Blog'; + const suffix = 'ServiceNow Blog'; if (titleText.endsWith(suffix)) { - titleText = titleText.substring(0, titleText.length - suffix.length); + titleText = titleText.substring(0, titleText.length - (suffix.length + 3)).trim(); } meta.Title = titleText; } @@ -70,6 +70,16 @@ const createMetadataBlock = (main, document, url) => { if (author && author.content.trim()) { authorUrl = author.content.trim(); } + + const authorPicture = document.querySelector('.image img'); + const authorDescription = document.querySelectorAll('.text p'); + + const authorColumns = WebImporter.DOMUtils.createTable([ + ['Columns (Blog Authors)'], + [ authorPicture, [...authorDescription]] + ], document); + main.append(authorColumns); + // insert new hr element as last element of main const hr = document.createElement('hr'); main.append(hr); From e82744809987fdbb06557899c3ea3bd3a2bcc1e4 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Mon, 4 Dec 2023 18:58:07 +0100 Subject: [PATCH 30/49] fixes for pages and authors --- tools/importer/import-author.js | 24 ++++++++++++++++++------ tools/importer/import.js | 2 +- tools/importer/sitemap.js | 4 ++-- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/tools/importer/import-author.js b/tools/importer/import-author.js index f4fc2844..6fc26c32 100644 --- a/tools/importer/import-author.js +++ b/tools/importer/import-author.js @@ -12,7 +12,7 @@ /* global WebImporter */ /* eslint-disable no-console, class-methods-use-this */ -const pageUrl = "https://main--servicenow--hlxsites.hlx.page"; +const pageUrl = "https://main--aemeds--servicenow-martech.hlx.page"; const servicenowUrl = 'https://www.servicenow.com'; function isServiceNowLink(link) { @@ -34,7 +34,7 @@ function getServiceNowUrl(link) { } function getPageUrl(link) { - return new URL(new URL(link.href).pathname.replace('.html', ''), pageUrl); + return new URL(new URL(link).pathname.replace('.html', ''), pageUrl); } const createMetadataBlock = (main, document, url) => { @@ -44,11 +44,12 @@ const createMetadataBlock = (main, document, url) => { // Title const title = document.querySelector('title'); if (title) { - let titleText = title.textContent.replace(/[\n\t]/gm, ''); + let titleText = title.textContent.replace(/[\n\t]/gm, '').trim(); const suffix = 'ServiceNow Blog'; if (titleText.endsWith(suffix)) { titleText = titleText.substring(0, titleText.length - (suffix.length + 3)).trim(); } + console.log(titleText); meta.Title = titleText; } @@ -71,8 +72,19 @@ const createMetadataBlock = (main, document, url) => { authorUrl = author.content.trim(); } - const authorPicture = document.querySelector('.image img'); - const authorDescription = document.querySelectorAll('.text p'); + let authorPicture = document.querySelector('.image img'); + if (!authorPicture) { + authorPicture = document.createElement('img'); + authorPicture.src = '/blogs/author/servicenow-blog/_jcr_content/root/responsivegrid/responsivegrid/responsivegrid_387144068/image.coreimg.png/1619481053507/servicenow-blog.png'; + } + + let authorDescription = document.querySelectorAll('.text p'); + if (authorDescription.length === 0 && document.querySelector('.cmp-text')) { + const paragraph = document.createElement('p'); + paragraph.textContent = document.querySelector('.cmp-text').textContent; + document.querySelector('.cmp-text').remove(); + authorDescription = [ paragraph ]; + } const authorColumns = WebImporter.DOMUtils.createTable([ ['Columns (Blog Authors)'], @@ -84,7 +96,7 @@ const createMetadataBlock = (main, document, url) => { const hr = document.createElement('hr'); main.append(hr); - const blogList = WebImporter.DOMUtils.createTable([['Blog List'], ['Author ', authorUrl]], document); + const blogList = WebImporter.DOMUtils.createTable([['Blog List'], ['Author ', getPageUrl(authorUrl)]], document); const metadataBlock = WebImporter.Blocks.getMetadataBlock(document, meta); main.append(blogList); diff --git a/tools/importer/import.js b/tools/importer/import.js index 1e6264c6..b986f251 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -124,7 +124,7 @@ const createMetadataBlock = (main, document, url) => { // Title const title = document.querySelector('title'); if (title) { - let titleText = title.textContent.replace(/[\n\t]/gm, ''); + let titleText = title.textContent.replace(/[\n\t]/gm, '').trim(); const suffix = 'ServiceNow Blog'; if (titleText.endsWith(suffix)) { titleText = titleText.substring(0, titleText.length - (suffix.length + 3)).trim(); diff --git a/tools/importer/sitemap.js b/tools/importer/sitemap.js index 2857c067..8a3278b0 100644 --- a/tools/importer/sitemap.js +++ b/tools/importer/sitemap.js @@ -44,8 +44,8 @@ async function main() { edsURL.push(new URL(new URL(urls[i]).pathname.replace('.html', ''), 'https://main--aemeds--servicenow-martech.hlx.live/').toString()); } - //urls.forEach((url) => console.log(url)); - edsURL.forEach((url) => console.log(url)); + urls.forEach((url) => console.log(url)); + //edsURL.forEach((url) => console.log(url)); } main(); \ No newline at end of file From 867f44c1e585f0fc722419308c61756124f69410 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Tue, 5 Dec 2023 13:34:05 +0100 Subject: [PATCH 31/49] fix - whitespaces --- tools/importer/import.js | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index b986f251..57325c8e 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -179,6 +179,15 @@ const h3ConvertExceptions = [ ]; export default { + onLoad: async ({ document, url, params }) => { + [...document.querySelectorAll('p')].forEach((paragraph) => { + if (paragraph.querySelector('br') && paragraph.innerHTML.trim().endsWith(' ')) { + paragraph.dataset.endsWithNBSPPresent = true; + } + }); + + }, + /** * Apply DOM operations to the provided document and return * the root element to be then transformed to Markdown. @@ -296,6 +305,10 @@ export default { && parent.tagName !== 'H3' && parent.innerHTML.trim().endsWith('
') && parent.nextElementSibling.tagName === 'P') { + if (parent.dataset.endsWithNBSPPresent) { + parent.innerHTML += '\n
'; + } + parent.innerHTML += '\n' + parent.nextElementSibling.innerHTML; parent.nextElementSibling.remove(); edited = true; From 6388d3d12b3d7aced0771186f053fb80f4b129fc Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Tue, 5 Dec 2023 15:41:01 +0100 Subject: [PATCH 32/49] fix - h6s --- tools/importer/import.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index 57325c8e..fbab42c4 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -273,6 +273,12 @@ export default { bold.parentElement.replaceWith(h3); } }); + + main.querySelectorAll('h6').forEach((h6) => { + const h3 = document.createElement('h3'); + h3.textContent = h6.textContent; + h6.replaceWith(h3); + }); } else { console.log('Skipping h3 conversion for URL', params.originalURL) } From 463c1c7b1ad2c2acb1ad15ca17620564f043ceeb Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Tue, 5 Dec 2023 16:07:08 +0100 Subject: [PATCH 33/49] fix - more h3s --- tools/importer/import.js | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index fbab42c4..1db4e700 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -272,8 +272,19 @@ export default { h3.textContent = bold.textContent; bold.parentElement.replaceWith(h3); } + + if ((bold.nextElementSibling.tagName === 'BR' || bold.innerHTML.trim().endsWith('
')) && bold.textContent.length < 100) { + const h3 = document.createElement('h3'); + h3.textContent = bold.textContent; + + const parent = bold.parentElement; + bold.remove(); + parent.before(h3); + } }); + + main.querySelectorAll('h6').forEach((h6) => { const h3 = document.createElement('h3'); h3.textContent = h6.textContent; From ac2673dd1e7ab7d477473b3e7ab7f2a0797a25e5 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Tue, 5 Dec 2023 23:00:05 +0100 Subject: [PATCH 34/49] fix - whitespace issues --- tools/importer/import.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 1db4e700..db841c9e 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -273,7 +273,7 @@ export default { bold.parentElement.replaceWith(h3); } - if ((bold.nextElementSibling.tagName === 'BR' || bold.innerHTML.trim().endsWith('
')) && bold.textContent.length < 100) { + if (((bold.nextSibling && bold.nextSibling.tagName === 'BR') || bold.innerHTML.trim().endsWith('
')) && bold.textContent.length < 100) { const h3 = document.createElement('h3'); h3.textContent = bold.textContent; From ee32a02dda5896582ee97807a9281ad9facfc2d5 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Thu, 7 Dec 2023 08:30:42 +0100 Subject: [PATCH 35/49] fix - remove copyright variant --- tools/importer/import.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index db841c9e..b1bd4f01 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -215,6 +215,10 @@ export default { if (paragraph.textContent.includes('ServiceNow, Inc. All rights reserved.')) { paragraph.remove(); } + + if (paragraph.textContent.includes('ServiceNow, the ServiceNow logo, Now, and other ServiceNow marks are trademarks')) { + paragraph.remove(); + } }) // Processing Links From a92dd5cda4a5475e84a04658aab23c8bb40ec111 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Thu, 7 Dec 2023 13:15:43 +0100 Subject: [PATCH 36/49] fix - filter page meta title --- tools/importer/import-filter-page.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/importer/import-filter-page.js b/tools/importer/import-filter-page.js index e277b76a..036199c4 100644 --- a/tools/importer/import-filter-page.js +++ b/tools/importer/import-filter-page.js @@ -44,9 +44,9 @@ const createMetadataBlock = (main, document, url) => { const title = document.querySelector('title'); if (title) { let titleText = title.textContent.replace(/[\n\t]/gm, '').trim(); - const suffix = ' - ServiceNow Blog'; + const suffix = 'ServiceNow Blog'; if (titleText.endsWith(suffix)) { - titleText = titleText.substring(0, titleText.length - suffix.length); + titleText = titleText.substring(0, titleText.length - (suffix.length + 3)).trim(); } meta.Title = titleText; } From 128577a47b96ca4df98a6c22b2c1c97e55338541 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Thu, 7 Dec 2023 15:36:53 +0100 Subject: [PATCH 37/49] fixes --- tools/importer/import.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index b1bd4f01..793c586c 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -85,7 +85,7 @@ function getPageUrl(link) { const createMetadataBlock = (main, document, url) => { const jsonRendition = JSON.parse(fetchSync('GET', jsonRenditionURL(url)).body); const allTags = getAllTags(); - const originalTags = getOriginalTags(jsonRendition); + const originalTags = getOriginalTags(jsonRendition) || []; const originalCategoryTag = getOriginalCategoryTag(originalTags); const originalTopicTag = getOriginalTopicTag(originalTags); @@ -265,7 +265,7 @@ export default { if (!h3ConvertExceptions.includes(params.originalURL)) { // Replace legacy standlone bold elements with h3s - main.querySelectorAll('b').forEach((bold) => { + main.querySelectorAll('strong, b').forEach((bold) => { if (bold.textContent.trim() === bold.parentElement.textContent.trim() && (bold.textContent.length < 100) && !bold.textContent.startsWith('Click') From f4d587bceabeb5b1413bb788a448ebdddfceb5ea Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 8 Dec 2023 13:47:47 +0100 Subject: [PATCH 38/49] fixes --- tools/importer/import-author.js | 2 +- tools/importer/import-filter-page.js | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/importer/import-author.js b/tools/importer/import-author.js index 6fc26c32..2d62be98 100644 --- a/tools/importer/import-author.js +++ b/tools/importer/import-author.js @@ -129,7 +129,7 @@ export default { main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4, .hero-image, .com-seperator, .servicenow-blog-list--block').forEach(el => el.remove()); main.querySelectorAll('br, nbsp').forEach((el) => el.remove()); - main.querySelectorAll('img[src^="/akam/13/pixel"]').forEach((el) => el.remove()); + main.querySelectorAll('img[src^="/akam/13/pixel"], noscript').forEach((el) => el.remove()); // Processing... diff --git a/tools/importer/import-filter-page.js b/tools/importer/import-filter-page.js index 036199c4..70690f7f 100644 --- a/tools/importer/import-filter-page.js +++ b/tools/importer/import-filter-page.js @@ -55,9 +55,10 @@ const createMetadataBlock = (main, document, url) => { const desc = document.querySelector('[property="og:description"]'); if (desc && desc.content.trim()) { meta.Description = desc.content.trim(); - } else { - meta.Description = 'Read about ServiceNow\'s Company News, Announcements, and Updates.'; - } + } + // else { + // meta.Description = 'Read about ServiceNow\'s Company News, Announcements, and Updates.'; + // } // Keywords const keywords = document.querySelector('meta[name="keywords"]'); @@ -86,6 +87,7 @@ export default { document, url, html, params, }) => { const main = document.querySelector('body'); + main.querySelectorAll('img[src^="/akam/13/pixel"], noscript').forEach((el) => el.remove()); // CLEANUP main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4, .hero-image').forEach(el => el.remove()); From a0bbfa5989087b2e5f011d1eda9af782e47aeb32 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 8 Dec 2023 15:15:20 +0100 Subject: [PATCH 39/49] fixes --- tools/importer/import.js | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 793c586c..8eee8ad2 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -141,8 +141,7 @@ const createMetadataBlock = (main, document, url) => { // Publication Date const date = document.querySelector('.cmp-blog-author-info__date'); if (date) { - const dateStr = date.textContent.trim(); - const dateObj = new Date(dateStr); + const dateObj = new Date(jsonRendition['jcr:content'].date); // format date to mm/dd/yyyy meta['Publication Date'] = dateObj.toLocaleDateString( 'en-US', @@ -207,9 +206,9 @@ export default { createMetadataBlock(main, document, url); // CLEANUP - main.querySelectorAll('.legacyHTML, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); + main.querySelectorAll('.legacyHTML, .social-sharing, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); // TODO is this ok? - main.querySelectorAll('img[src^="/akam/13/pixel"]').forEach((el) => el.remove()); + main.querySelectorAll('img[src^="/akam/13/pixel"], noscript').forEach((el) => el.remove()); // Remove copyright as we create a fragment with it. main.querySelectorAll('p').forEach((paragraph) => { if (paragraph.textContent.includes('ServiceNow, Inc. All rights reserved.')) { @@ -219,6 +218,10 @@ export default { if (paragraph.textContent.includes('ServiceNow, the ServiceNow logo, Now, and other ServiceNow marks are trademarks')) { paragraph.remove(); } + + if (paragraph.textContent.includes('ServiceNow, Inc. Alle Rechte vorbehalten. ServiceNow, das ServiceNow-Logo, Now und andere Marken')) { + paragraph.remove(); + } }) // Processing Links From faa1fd56af5421ff0867021c1dc9833a19aa9c42 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 8 Dec 2023 18:36:28 +0100 Subject: [PATCH 40/49] fixes --- tools/importer/import.js | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 8eee8ad2..0d62c1df 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -175,6 +175,8 @@ const createMetadataBlock = (main, document, url) => { const h3ConvertExceptions = [ 'https://www.servicenow.com/blogs/2022/nvidia-gtc-22-leaders-ai-panel-q-and-a.html', 'https://www.servicenow.com/blogs/2022/knowledge-2022-social-media-contest-rules.html', + 'https://www.servicenow.com/de/blogs/2020/servicenow-bei-boehringer-ingelheim-one-stop-shop-fuer-exzellenten-service-und-best-in-class-customer-experience.html', + 'https://www.servicenow.com/de/blogs/2020/loesungen-fuer-die-finanzbranche-quasi-make-up-mit-tiefenwirkung.html', ]; export default { @@ -184,7 +186,6 @@ export default { paragraph.dataset.endsWithNBSPPresent = true; } }); - }, /** @@ -207,7 +208,7 @@ export default { // CLEANUP main.querySelectorAll('.legacyHTML, .social-sharing, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); - // TODO is this ok? + main.querySelectorAll('img[src^="/akam/13/pixel"], noscript').forEach((el) => el.remove()); // Remove copyright as we create a fragment with it. main.querySelectorAll('p').forEach((paragraph) => { @@ -290,8 +291,6 @@ export default { } }); - - main.querySelectorAll('h6').forEach((h6) => { const h3 = document.createElement('h3'); h3.textContent = h6.textContent; From e46642d1209f9a28d67d6a42fb671c1df6cfa1f5 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Sat, 9 Dec 2023 20:21:02 +0100 Subject: [PATCH 41/49] fix - copyright translations --- tools/importer/import.js | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/importer/import.js b/tools/importer/import.js index 0d62c1df..2911ef61 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -223,6 +223,14 @@ export default { if (paragraph.textContent.includes('ServiceNow, Inc. Alle Rechte vorbehalten. ServiceNow, das ServiceNow-Logo, Now und andere Marken')) { paragraph.remove(); } + + if (paragraph.textContent.includes('ServiceNow, Inc. Tous droits réservés. ServiceNow, le logo ServiceNow, Now et les autres')) { + paragraph.remove(); + } + + if (paragraph.textContent.includes('ServiceNow, Inc. Alle rechten voorbehouden. ServiceNow, het ServiceNow-logo, Now en alle overige')) { + paragraph.remove(); + } }) // Processing Links From e3ee791849c432dc95275636000b5fb353c48464 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Mon, 11 Dec 2023 11:04:51 +0100 Subject: [PATCH 42/49] fix - brightcove --- tools/importer/import.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 2911ef61..df8bc66b 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -264,7 +264,7 @@ export default { if (brightcoveVideo.dataset.player !== defaultBCPlayer) { brightcoveRows.push(['Player', brightcoveVideo.dataset.player]); } - brightcoveVideo.replaceWith(WebImporter.DOMUtils.createTable(brightcoveRows, document)); + brightcoveVideo.closest('.brightcove-video').replaceWith(WebImporter.DOMUtils.createTable(brightcoveRows, document)); }); // Youtube Videos From c3d47683bc54d731c72257aae9faead5e9cba99b Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Wed, 13 Dec 2023 16:51:09 +0200 Subject: [PATCH 43/49] Added specifics for 2016-2019 blogs --- tools/importer/import.js | 72 ++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 11 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index df8bc66b..7efb898c 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -82,6 +82,18 @@ function getPageUrl(link) { return new URL(new URL(link.href).pathname.replace('.html', ''), pageUrl); } + +function makeProxySrc(imgSrc) { + // imgSrc comes in format https://www.servicenow.com/content/dam/servicenow-blog-docs/servicematters/2017/ITSM-Dashboard-Final.png + // extract path from imgSrc + const u = new URL(imgSrc); + u.searchParams.append('host', u.origin); + + console.log('makeProxySrc: ', u.pathname, u.search); + + return `http://localhost:3001${u.pathname}${u.search}`; +} + const createMetadataBlock = (main, document, url) => { const jsonRendition = JSON.parse(fetchSync('GET', jsonRenditionURL(url)).body); const allTags = getAllTags(); @@ -117,10 +129,6 @@ const createMetadataBlock = (main, document, url) => { meta['Trend'] = 'Trends and Research'; } - // This comes from the bulk metadata for Blog articles - // meta.Template = 'Blog Article'; - - // Title const title = document.querySelector('title'); if (title) { @@ -158,13 +166,30 @@ const createMetadataBlock = (main, document, url) => { // Template set in the metadata.xls // meta.Template = 'Blog Article'; - // Image - TODO if we find blogs for which the first image is not the same as the og:image - // const img = document.querySelector('[property="og:image"]'); - // if (img && img.content) { - // const el = document.createElement('img'); - // el.src = img.content; - // meta.Image = el; - // } + // Image - if this is an older article with hero being removeed from content (due to actual logic in blogs) + // log the url and check if it should be removed + if (isOlderThan2020(url) && shouldHeroBeRemoved(main)) { + const img = document.querySelector('[property="og:image"]'); + + if (img && img.content) { + + console.log('fould og:image ', img.content); + // console.log('proxySrc: ', makeProxySrc(img.content)); + + const el = document.createElement('img'); + el.src = makeProxySrc(img.content); + meta.Image = el; + } else { + // find image in the div with class hero-image + const heroImage = document.querySelector('.hero-image img'); + if (heroImage) { + const el = document.createElement('img'); + el.src = makeProxySrc(heroImage.src); + meta.Image = el; + } + } + + } const block = WebImporter.Blocks.getMetadataBlock(document, meta); main.prepend(block); @@ -177,8 +202,28 @@ const h3ConvertExceptions = [ 'https://www.servicenow.com/blogs/2022/knowledge-2022-social-media-contest-rules.html', 'https://www.servicenow.com/de/blogs/2020/servicenow-bei-boehringer-ingelheim-one-stop-shop-fuer-exzellenten-service-und-best-in-class-customer-experience.html', 'https://www.servicenow.com/de/blogs/2020/loesungen-fuer-die-finanzbranche-quasi-make-up-mit-tiefenwirkung.html', + 'https://www.servicenow.com/blogs/2017/4-years-row-servicenow-named-leader-gartner-magic-quadrant-service-management-tools.html', + 'https://www.servicenow.com/blogs/2017/business-value-servicenow-look-hr-business-apps-customer-service.html', + 'https://www.servicenow.com/blogs/2017/meet-servicenow-agent-intelligence-machine-learning-for-everyday-work.html', + ]; +function shouldHeroBeRemoved(main) { + return main.querySelectorAll('.sn-blog-main-copy-wrapper').length > 0; +} + +function cleanup2016to2019Blogs(main) { + // remove hero image if there is one div with class sn-blog-main-copy-wrapper + if (shouldHeroBeRemoved(main)) { + main.querySelectorAll('.hero-image').forEach(el => el.remove()); + } +} + +function isOlderThan2020(url) { + // return true if url contains /blogs/2016 or /blogs/2017 or /blogs/2018 or /blogs/2019 + return url.includes('/blogs/2016') || url.includes('/blogs/2017') || url.includes('/blogs/2018') || url.includes('/blogs/2019'); +} + export default { onLoad: async ({ document, url, params }) => { [...document.querySelectorAll('p')].forEach((paragraph) => { @@ -209,6 +254,11 @@ export default { // CLEANUP main.querySelectorAll('.legacyHTML, .social-sharing, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); + // if url has the following pattern https://www.servicenow.com/blogs/ where year is 2016, 2017, 2018 or 2019 then do the cleanup + if (isOlderThan2020(url)) { + cleanup2016to2019Blogs(main); + } + main.querySelectorAll('img[src^="/akam/13/pixel"], noscript').forEach((el) => el.remove()); // Remove copyright as we create a fragment with it. main.querySelectorAll('p').forEach((paragraph) => { From e37e33b4f8b7e1e3fb74cfd09722a1eeef42c0cc Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Wed, 13 Dec 2023 22:07:15 +0200 Subject: [PATCH 44/49] include embed-video --- tools/importer/import.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 7efb898c..48ab3ef2 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -318,7 +318,7 @@ export default { }); // Youtube Videos - main.querySelectorAll('.comp-youtube-video iframe').forEach((youtubeIframe) => { + main.querySelectorAll('.comp-youtube-video iframe, .embed-youtube iframe').forEach((youtubeIframe) => { const youtubeLink = document.createElement('a'); youtubeLink.href = youtubeIframe.src; youtubeLink.textContent = youtubeIframe.src; From ae90459237cffa84b1e42dc0dfa227ce2dc3cc32 Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Mon, 18 Dec 2023 16:54:28 +0200 Subject: [PATCH 45/49] Fixed import date --- tools/importer/import.js | 41 +++++++++++++++++++---- tools/importer/tags.js | 70 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 102 insertions(+), 9 deletions(-) diff --git a/tools/importer/import.js b/tools/importer/import.js index 48ab3ef2..58946df9 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -83,6 +83,10 @@ function getPageUrl(link) { } +const timezoneMap = { + '+0530': 'Asia/Calcutta', +}; + function makeProxySrc(imgSrc) { // imgSrc comes in format https://www.servicenow.com/content/dam/servicenow-blog-docs/servicematters/2017/ITSM-Dashboard-Final.png // extract path from imgSrc @@ -149,14 +153,39 @@ const createMetadataBlock = (main, document, url) => { // Publication Date const date = document.querySelector('.cmp-blog-author-info__date'); if (date) { - const dateObj = new Date(jsonRendition['jcr:content'].date); - // format date to mm/dd/yyyy - meta['Publication Date'] = dateObj.toLocaleDateString( - 'en-US', - { day: '2-digit', month: '2-digit', year: 'numeric' }, - ); + let dateString = jsonRendition['jcr:content'].date; + + // find the tzMatch in the dateString which has the followin format Wed Mar 24 2021 16:11:00 GMT-0700 + const tzMatch = dateString.match(/GMT[+-][0-9]{4}/); + let timeZone= ''; + if (tzMatch) { + // tzMatch[0] has the folllowing format "GMT" e.g. GMT-0700 or GMT+0200. Extract the offset from it + const offset = tzMatch[0].substring(3); + // if the offset ends in 00 remove it from offset + if (offset.endsWith('00')) { + const o = offset.substring(1, offset.length - 2); + const sign = offset.startsWith('-') ? '+' : '-'; + // remove any leading zeroes from o e.g. -0700 becomes -7 + timeZone = `Etc/GMT${sign}${o.replace(/^0+/, '')}`; + + console.log('date:', dateString, ' timeZone: ', timeZone); + } else { + timeZone = timezoneMap[offset]; + console.log('Odd offset found: ', offset); + } } + const dateObj = new Date(dateString); + + console.log('date in seconds ', dateObj.getTime(), ' Date object: ', dateObj); + + // format date to mm/dd/yyyy + meta['Publication Date'] = dateObj.toLocaleDateString( + 'en-US', + { day: '2-digit', month: '2-digit', year: 'numeric' , timeZone}, + ); + } + // Keywords const keywords = document.querySelector('meta[name="keywords"]'); if (keywords && keywords.content) { diff --git a/tools/importer/tags.js b/tools/importer/tags.js index 8dc92a75..82123f2a 100644 --- a/tools/importer/tags.js +++ b/tools/importer/tags.js @@ -61,12 +61,67 @@ function getOriginalNewTrendTag(originalTags) { return originalTags.find((tag) => tag.startsWith('sn-blog-docs:new-trend')); } +function getPublishedDate(jsonRendition) { + return jsonRendition['jcr:content'].date; +} + +const timezoneMap = { + '+0530': 'Asia/Calcutta', +}; + +function getCorrectedDate(dateString) { + const tzMatch = dateString.match(/GMT[+-][0-9]{4}/); + let timeZone = ''; + if (tzMatch) { + const offset = tzMatch[0].substring(3); + // if the offset ends in 00 remove it from offset + if (offset.endsWith('00')) { + + // remove 00 suffix + const o = offset.substring(1, offset.length - 2); + const absolute = o.replace(/^0+/, ''); + + const sign = offset.startsWith('-') ? '+' : '-'; + // remove any leading zeroes from o e.g. -0700 becomes -7 + timeZone = absolute === '' ? 'Etc/GMT' : `Etc/GMT${sign}${absolute}`; + + console.log('date:', dateString, ' timeZone: ', timeZone); + } else { + timeZone = timezoneMap[offset]; + console.log('Odd offset found: ', offset); + } + } else { + console.log('date does not match time zone pattern: ', dateString, ' timeZone: ', timeZone); + } + + const dateObj = new Date(dateString); + + console.log('Converting date: ', dateString, ' timeZone: ', timeZone); + + return dateObj.toLocaleDateString( + 'en-US', + { + day: '2-digit', month: '2-digit', year: 'numeric', timeZone, + }, + ); +} + +function getPreviousDate(dateString) { + const dateObj = new Date(dateString); + return dateObj.toLocaleDateString( + 'en-US', + { + day: '2-digit', month: '2-digit', year: 'numeric', + }, + ); +} + const sitemaps = [ // { locale: 'en-US', sitemap: 'https://www.servicenow.com/sitemap.xml' }, // { locale: 'fr-FR', sitemap: 'https://www.servicenow.com/fr/sitemap.xml' }, // { locale: 'de-DE', sitemap: 'https://www.servicenow.com/de/sitemap.xml' }, - // { locale: 'en-GB', sitemap: 'https://www.servicenow.com/uk/sitemap.xml' }, - { locale: 'nl-NL', sitemap: 'https://www.servicenow.com/nl/sitemap.xml' }, + { locale: 'en-GB', sitemap: 'https://www.servicenow.com/uk/sitemap.xml' }, + // { locale: 'nl-NL', sitemap: 'https://www.servicenow.com/nl/sitemap.xml' }, ]; console.log('running'); @@ -123,12 +178,21 @@ for (let j = 0; j < sitemaps.length; j++) { newTrends.push(originalNewTrendTag); } + const publishedDate = getPublishedDate(jsonRendition); + const correctedDate = getCorrectedDate(publishedDate); + const previousDate = getPreviousDate(publishedDate); + + const wrongDate = previousDate !== correctedDate; + all.push({ locale: sitemaps[j].locale, loc, topic: originalTopicTag, category: originalCategoryTag, newTrend: originalNewTrendTag, + previousDate, + correctedDate, + wrongDate, }); } else { console.log('originalTags is undefined'); @@ -145,7 +209,7 @@ for (let j = 0; j < sitemaps.length; j++) { // itearte over all array and display properties loc and topic as csv line for (let k = 0; k < all.length; k++) { const element = all[k]; - console.log(`${element.locale},${element.loc},${element.topic},${element.category},${element.newTrend}`); + console.log(`${element.locale},${element.loc},${element.topic},${element.category},${element.newTrend},${element.previousDate},${element.correctedDate},${element.wrongDate}`); } }); } From 129c2f28ac85cbf4d47142ef729d43267b396edd Mon Sep 17 00:00:00 2001 From: Florentin Sardan Date: Thu, 21 Dec 2023 16:19:43 +0100 Subject: [PATCH 46/49] sitemap tools --- tools/sitemaps/.gitignore | 3 + tools/sitemaps/app.mjs | 383 +++++++++++++++++++++++++++++++ tools/sitemaps/package-lock.json | 39 ++++ tools/sitemaps/package.json | 17 ++ 4 files changed, 442 insertions(+) create mode 100644 tools/sitemaps/.gitignore create mode 100644 tools/sitemaps/app.mjs create mode 100644 tools/sitemaps/package-lock.json create mode 100644 tools/sitemaps/package.json diff --git a/tools/sitemaps/.gitignore b/tools/sitemaps/.gitignore new file mode 100644 index 00000000..142523fc --- /dev/null +++ b/tools/sitemaps/.gitignore @@ -0,0 +1,3 @@ +node_modules/ +data_old/ +data_new/ \ No newline at end of file diff --git a/tools/sitemaps/app.mjs b/tools/sitemaps/app.mjs new file mode 100644 index 00000000..f84a267d --- /dev/null +++ b/tools/sitemaps/app.mjs @@ -0,0 +1,383 @@ +import * as fs from 'fs'; +import { mkdir, readFile } from 'node:fs/promises'; +import { DOMParser } from 'xmldom'; +import { program } from 'commander'; +import { Readable } from 'stream'; +import { finished } from 'stream/promises'; +import * as path from 'path'; + +/* +Usage: +node app.mjs diff +node app.mjs download_old +node app.mjs download_new +node app.mjs find uppercase +node app.mjs find nonalnum +node app.mjs redirects | pbcopy +paste into redirects.xls + click on Ctrl (paste options) > Text to columns > Select 'Comma' > Apply +*/ + +const SKIP_URLS = ` +https://www.servicenow.com/blogs/topics/work-better.html +https://www.servicenow.com/blogs/topics/talent-and-engagement.html +https://www.servicenow.com/blogs/topics/strategy.html +https://www.servicenow.com/blogs/topics/service-delivery-and-management.html +https://www.servicenow.com/blogs/topics/partners.html +https://www.servicenow.com/blogs/topics/organizational-change-management.html +https://www.servicenow.com/blogs/topics/operations-and-uptime.html +https://www.servicenow.com/blogs/topics/low-code.html +https://www.servicenow.com/blogs/topics/life-at-now.html +https://www.servicenow.com/blogs/topics/knowledge.html +https://www.servicenow.com/blogs/topics/knowledge-2020.html +https://www.servicenow.com/blogs/topics/industry.html +https://www.servicenow.com/blogs/topics/global-impact.html +https://www.servicenow.com/blogs/topics/diversity-inclusion-and-belonging.html +https://www.servicenow.com/blogs/topics/digital-workflows.html +https://www.servicenow.com/blogs/topics/data-and-analytics.html +https://www.servicenow.com/blogs/topics/customer-service.html +https://www.servicenow.com/blogs/topics/creatorcon.html +https://www.servicenow.com/blogs/topics/covid-19.html +https://www.servicenow.com/blogs/topics/corporate-announcement.html +https://www.servicenow.com/blogs/topics/asset-and-cost-management.html +https://www.servicenow.com/blogs/topics.html +https://www.servicenow.com/blogs/category.html +https://www.servicenow.com/blogs/blogs/2023/leader-low-code-no-code-development.html +https://www.servicenow.com/blogs/blogs/2022/leader-low-code-development-platforms.html +https://www.servicenow.com/blogs/blogs/2021/leader-in-low-code-development-platforms.html +https://www.servicenow.com/blogs/author.html +https://www.servicenow.com/uk/blogs/topics.html +https://www.servicenow.com/uk/blogs/category.html +https://www.servicenow.com/uk/blogs/author.html +https://www.servicenow.com/uk/blogs/2022/iamremarkable-how-servicenow-is-supporting-women-to-the-top.html +https://www.servicenow.com/de/blogs/topics/work-better.html +https://www.servicenow.com/de/blogs/topics/template.html +https://www.servicenow.com/de/blogs/topics/talent-and-engagement.html +https://www.servicenow.com/de/blogs/topics/strategy.html +https://www.servicenow.com/de/blogs/topics/research.html +https://www.servicenow.com/de/blogs/topics/partners.html +https://www.servicenow.com/de/blogs/topics/life-at-now.html +https://www.servicenow.com/de/blogs/topics/knowledge.html +https://www.servicenow.com/de/blogs/topics/knowledge-2020.html +https://www.servicenow.com/de/blogs/topics/industry.html +https://www.servicenow.com/de/blogs/topics/global-impact.html +https://www.servicenow.com/de/blogs/topics/diversity-inclusion-and-belonging.html +https://www.servicenow.com/de/blogs/topics/digital-workflows.html +https://www.servicenow.com/de/blogs/topics/digital-transformation.html +https://www.servicenow.com/de/blogs/topics/data-and-analytics.html +https://www.servicenow.com/de/blogs/topics/app-engine.html +https://www.servicenow.com/de/blogs/topics.html +https://www.servicenow.com/de/blogs/template.html +https://www.servicenow.com/de/blogs/category.html +https://www.servicenow.com/fr/blogs/topics/work-better.html +https://www.servicenow.com/fr/blogs/topics/template.html +https://www.servicenow.com/fr/blogs/topics/talent-and-engagement.html +https://www.servicenow.com/fr/blogs/topics/strategy.html +https://www.servicenow.com/fr/blogs/topics/research.html +https://www.servicenow.com/fr/blogs/topics/partners.html +https://www.servicenow.com/fr/blogs/topics/life-at-now.html +https://www.servicenow.com/fr/blogs/topics/knowledge.html +https://www.servicenow.com/fr/blogs/topics/knowledge-2020.html +https://www.servicenow.com/fr/blogs/topics/industry.html +https://www.servicenow.com/fr/blogs/topics/healthcare.html +https://www.servicenow.com/fr/blogs/topics/global-impact.html +https://www.servicenow.com/fr/blogs/topics/education.html +https://www.servicenow.com/fr/blogs/topics/diversity-inclusion-and-belonging.html +https://www.servicenow.com/fr/blogs/topics/digital-workflows.html +https://www.servicenow.com/fr/blogs/topics/digital-transformation.html +https://www.servicenow.com/fr/blogs/topics/data-and-analytics.html +https://www.servicenow.com/fr/blogs/topics/covid-19.html +https://www.servicenow.com/fr/blogs/topics/business-impact.html +https://www.servicenow.com/fr/blogs/topics.html +https://www.servicenow.com/fr/blogs/template.html +https://www.servicenow.com/fr/blogs/fr/blogs/2023/lia-et-les-competences-de-demain.html +https://www.servicenow.com/fr/blogs/fr/blogs/2020/se-preparer-a-la-normalite-de-demain.html +https://www.servicenow.com/fr/blogs/category/home.html +https://www.servicenow.com/fr/blogs/category.html +https://www.servicenow.com/fr/blogs/author/matt-schvimmer/mike-vilimek.html +https://www.servicenow.com/fr/blogs/author.html +https://www.servicenow.com/nl/blogs/topics/work-better.html +https://www.servicenow.com/nl/blogs/topics/strategy.html +https://www.servicenow.com/nl/blogs/topics/research.html +https://www.servicenow.com/nl/blogs/topics/partners.html +https://www.servicenow.com/nl/blogs/topics/life-at-now.html +https://www.servicenow.com/nl/blogs/topics/knowledge.html +https://www.servicenow.com/nl/blogs/topics/knowledge-2020.html +https://www.servicenow.com/nl/blogs/topics/industry.html +https://www.servicenow.com/nl/blogs/topics/global-impact.html +https://www.servicenow.com/nl/blogs/topics/diversity-inclusion-and-belonging.html +https://www.servicenow.com/nl/blogs/topics/digital-workflows.html +https://www.servicenow.com/nl/blogs/topics/digital-transformation.html +https://www.servicenow.com/nl/blogs/topics/covid-19.html +https://www.servicenow.com/nl/blogs/topics/business-impact.html +https://www.servicenow.com/nl/blogs/topics.html +https://www.servicenow.com/nl/blogs/category.html +https://www.servicenow.com/nl/blogs/author.html +`; + +const LANGUAGES = ['us', 'uk', 'fr', 'de', 'nl']; +const DATA_OLD = 'data_old'; +const DATA_NEW = 'data_new'; + +function old2newLink(pathname) { + const ext = '.html'; + if (!pathname.toLowerCase().endsWith(ext)) { + throw new Error(`${pathname} does not contain .html`); + } + return `${pathname.substring(0, pathname.lastIndexOf('.html')).toLowerCase()}`; +} + +async function downloadFile(url, fileName) { + const headers = new Headers({ + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/81.0', + }); + const res = await fetch(url, { headers }); + if (!fs.existsSync(path.dirname(fileName))) await mkdir(path.dirname(fileName)); + const fileStream = fs.createWriteStream(fileName, { flags: 'w' }); + await finished(Readable.fromWeb(res.body).pipe(fileStream)); +} + +async function getDoc(file) { + const xmlString = await readFile(file, 'utf8'); + const parser = new DOMParser(); + const xmlDoc = parser.parseFromString(xmlString, 'text/xml'); + return xmlDoc; +} + +// topics, category, author, year +async function getSegments(lang, name) { + // TODO: error handling + const xmlDoc = await getDoc(`./${DATA_OLD}/${lang}.sitemap.xml`); + + // Find all loc elements that contain '/blogs/category/' + const locs = xmlDoc.getElementsByTagName('loc'); + const checkPath = (lang === 'us') ? `com/blogs/${name}/` : `com/${lang}/blogs/${name}/`; + const result = []; + + for (let i = 0; i < locs.length; i += 1) { + const loc = locs[i]; + if (loc.textContent.includes(checkPath) && !loc.textContent.endsWith('/home.html')) { + // ignore /blogs/category/home.html which redirects to /blogs.html + const url = new URL(loc.textContent); + // get last segment from url.pathname + const segments = url.pathname.split('/'); + const category = segments[segments.length - 1]; + const categoryText = category.replace('.html', '').replace(/-/g, ' ').toLowerCase(); + result.push(categoryText); + } + } + return result; +} + +async function getLinks(lang, name) { + // TODO: error handling + const xmlDoc = await getDoc(`./${DATA_OLD}/${lang}.sitemap.xml`); + + const locs = xmlDoc.getElementsByTagName('loc'); + let result = ''; + const checkPath = (lang === 'us') ? `com/blogs/${name}/` : `com/${lang}/blogs/${name}/`; + for (let i = 0; i < locs.length; i += 1) { + const loc = locs[i]; + if (loc.textContent.includes(checkPath)) { + result += `${loc.textContent.trim()}\n`; + } + } + return result; +} + + +program + .version('1.0.0') + .description('A CLI tool for extracting data from sitemaps'); + +/* +node app.mjs names us category +node app.mjs names nl category +node app.mjs names us topics +node app.mjs names us author +node app.mjs names us 2021 +*/ +program.command('names') + .description('Prints all names of a certain type') + .argument('') + .argument('', 'One of: category, topics, author, 2023, 2022, etc') + .action(async (lang, name) => { + const result = await getSegments(lang, name); + console.log(result); + }); + +/* +node app.mjs links nl author +node app.mjs links us author +node app.mjs links uk author +node app.mjs links uk 2021 +*/ +program.command('links') + .description('Prints all links') + .argument('') + .argument('') + .action(async (lang, name) => { + const result = await getLinks(lang, name); + console.log(result); + }); + +// node app.mjs all-categories +program.command('all-categories') + .description('Prints all categories') + .action(async () => { + const result = {}; + const type = 'category'; + result.us = await getSegments('us', type); + result.uk = await getSegments('uk', type); + result.fr = await getSegments('fr', type); + result.de = await getSegments('de', type); + result.nl = await getSegments('nl', type); + console.log(result); + }); + +// node app.mjs all-topics +program.command('all-topics') + .description('Prints all topics') + .action(async () => { + const result = {}; + const type = 'topics'; + result.us = await getSegments('us', type); + result.uk = await getSegments('uk', type); + result.fr = await getSegments('fr', type); + result.de = await getSegments('de', type); + result.nl = await getSegments('nl', type); + console.log(result); + }); + +// node app.mjs all-articles us +program.command('all-articles') + .description('Prints all articles') + .argument('') + .action(async (lang) => { + // let result = ''; + const promisesArray = []; + for (let year = 2013; year <= 2024; year += 1) { + promisesArray.push(getLinks(lang, year)); + } + const result = await Promise.all(promisesArray); + const res = result.filter((x) => x); + console.log(res.join('')); + }); + +// node app.mjs redirects | pbcopy +program.command('redirects') + .description('Prints a comma separated list of blog redirects') + .action(async () => { + const results = []; + + // eslint-disable-next-line no-restricted-syntax + for await (const lang of LANGUAGES) { + const checkPath = (lang === 'us') ? '/blogs/' : `/${lang}/blogs/`; + const xmlDoc = await getDoc(`./${DATA_OLD}/${lang}.sitemap.xml`); + const locs = xmlDoc.getElementsByTagName('loc'); + for (let i = 0; i < locs.length; i += 1) { + const url = new URL(locs[i].textContent); + if (url.pathname.startsWith(checkPath)) { + results.push(`${url.pathname},${old2newLink(url.pathname)}`); + } + } + } + console.log([...new Set(results)].join('\n')); + }); + +// node app.mjs redirects | pbcopy +program.command('find') + .description('Finds uppercase letters in urls') + .argument('') + .action(async (kind) => { + const results = []; + // eslint-disable-next-line no-restricted-syntax + for await (const lang of LANGUAGES) { + const xmlDoc = await getDoc(`./${DATA_OLD}/${lang}.sitemap.xml`); + const locs = xmlDoc.getElementsByTagName('loc'); + for (let i = 0; i < locs.length; i += 1) { + const checkPath = (lang === 'us') ? '/blogs/' : `/${lang}/blogs/`; + const url = new URL(locs[i].textContent); + if (url.pathname.startsWith(checkPath)) { + switch (kind) { + case 'uppercase': { + const uppercaseLetters = url.pathname.match(/[A-Z]/g); + if (uppercaseLetters) { + results.push(url.pathname); + } + break; + } + case 'nonalnum': { + // const found = url.pathname.match(/[^[:alnum:]_]/g); + const found = url.pathname.match(/[^-\w./]/g); + if (found) { + results.push(url.pathname); + } + break; + } + default: + } + } + } + } + console.log([...new Set(results)].join('\n')); + }); + +program.command('download_old') + .description('Downloads a separate sitemap.xml for each language') + .action(async () => { + LANGUAGES.forEach(async (lang) => { + const pathName = (lang === 'us') ? '/sitemap.xml' : `/${lang}/sitemap.xml`; + const url = `https://www.servicenow.com${pathName}`; + await downloadFile(url, `./${DATA_OLD}/${lang}.sitemap.xml`); + }); + }); + +program.command('download_new') + .description('Downloads the ESD sitemap.xml') + .action(async () => { + const url = 'https://main--servicenow--hlxsites.hlx.live/blogs/sitemap.xml'; + await downloadFile(url, `./${DATA_NEW}/sitemap.xml`); + }); + +program.command('diff') + .description('Diff between old and new sitemap links') + .action(async () => { + const oldLinks = []; + const newLinks = []; + const skipLinks = SKIP_URLS.split('\n').filter((u) => u).map((u) => { const url = new URL(u); return old2newLink(url.pathname); }); + // console.log(skipLinks); + + // eslint-disable-next-line no-restricted-syntax + for await (const lang of LANGUAGES) { + const xmlDoc = await getDoc(`./${DATA_OLD}/${lang}.sitemap.xml`); + const locs = xmlDoc.getElementsByTagName('loc'); + for (let i = 0; i < locs.length; i += 1) { + const checkPath = (lang === 'us') ? '/blogs/' : `/${lang}/blogs/`; + const url = new URL(locs[i].textContent); + if (url.pathname.startsWith(checkPath)) { + oldLinks.push(old2newLink(url.pathname)); + } + } + } + const xmlDoc = await getDoc(`./${DATA_NEW}/sitemap.xml`); + const locs = xmlDoc.getElementsByTagName('loc'); + for (let i = 0; i < locs.length; i += 1) { + const url = new URL(locs[i].textContent); + newLinks.push(url.pathname); + } + + const differenceOldNew = oldLinks + .filter((x) => !newLinks.includes(x)) + .filter((x) => !skipLinks.includes(x)); + console.log(`oldLinks - newLinks = ${differenceOldNew.length}`); + console.dir(differenceOldNew, { maxArrayLength: null }); + + const differenceNewOld = newLinks + .filter((x) => !oldLinks.includes(x)) + .filter((x) => !skipLinks.includes(x)); + console.log(`newLinks - oldLinks = ${differenceNewOld.length}`); + console.dir(differenceNewOld, { maxArrayLength: null }); + }); + +program.parse(process.argv); diff --git a/tools/sitemaps/package-lock.json b/tools/sitemaps/package-lock.json new file mode 100644 index 00000000..1e8b2e15 --- /dev/null +++ b/tools/sitemaps/package-lock.json @@ -0,0 +1,39 @@ +{ + "name": "sitemaps", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "sitemaps", + "version": "1.0.0", + "license": "ISC", + "dependencies": { + "commander": "^11.1.0", + "filereader": "^0.10.3", + "xmldom": "^0.6.0" + } + }, + "node_modules/commander": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-11.1.0.tgz", + "integrity": "sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ==", + "engines": { + "node": ">=16" + } + }, + "node_modules/filereader": { + "version": "0.10.3", + "resolved": "https://registry.npmjs.org/filereader/-/filereader-0.10.3.tgz", + "integrity": "sha512-7F8w6GSXuHLN80ukaVOcHgBaiTRHUZr8GeEhNdqfAECcnBoROg4i8hTl+KqtF4yUPffOJVHEFg4iDJb7xIYFng==" + }, + "node_modules/xmldom": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/xmldom/-/xmldom-0.6.0.tgz", + "integrity": "sha512-iAcin401y58LckRZ0TkI4k0VSM1Qg0KGSc3i8rU+xrxe19A/BN1zHyVSJY7uoutVlaTSzYyk/v5AmkewAP7jtg==", + "engines": { + "node": ">=10.0.0" + } + } + } +} diff --git a/tools/sitemaps/package.json b/tools/sitemaps/package.json new file mode 100644 index 00000000..cba87ac8 --- /dev/null +++ b/tools/sitemaps/package.json @@ -0,0 +1,17 @@ +{ + "name": "sitemaps", + "version": "1.0.0", + "description": "", + "main": "index.js", + "scripts": { + "test": "echo \"Error: no test specified\" && exit 1", + "app": "app.mjs" + }, + "author": "", + "license": "ISC", + "dependencies": { + "commander": "^11.1.0", + "filereader": "^0.10.3", + "xmldom": "^0.6.0" + } +} From 0e5753a21ea1a86568195b4c94bedc79e4821f38 Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Fri, 22 Dec 2023 10:08:01 +0200 Subject: [PATCH 47/49] remove undeed urls from the diff output --- tools/sitemaps/app.mjs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/sitemaps/app.mjs b/tools/sitemaps/app.mjs index f84a267d..5eecef58 100644 --- a/tools/sitemaps/app.mjs +++ b/tools/sitemaps/app.mjs @@ -113,6 +113,17 @@ https://www.servicenow.com/nl/blogs/topics/business-impact.html https://www.servicenow.com/nl/blogs/topics.html https://www.servicenow.com/nl/blogs/category.html https://www.servicenow.com/nl/blogs/author.html +https://www.servicenow.com/blogs/2015.html +https://www.servicenow.com/blogs/2014.html +https://www.servicenow.com/de/blogs/topics/covid-19.html +https://www.servicenow.com/blogs/2021/meet-yannis-servicenow-france-leader.html +https://www.servicenow.com/de/blogs/author.html +https://www.servicenow.com/blogs/category/home.html +https://www.servicenow.com/blogs/category/home.html +https://www.servicenow.com/de/blogs/category/home.html +https://www.servicenow.com/fr/blogs/category/home.html +https://www.servicenow.com/nl/blogs/category/home.html +https://www.servicenow.com/uk/blogs/category/home.html `; const LANGUAGES = ['us', 'uk', 'fr', 'de', 'nl']; @@ -336,7 +347,7 @@ program.command('download_old') program.command('download_new') .description('Downloads the ESD sitemap.xml') .action(async () => { - const url = 'https://main--servicenow--hlxsites.hlx.live/blogs/sitemap.xml'; + const url = 'https://main--aemeds--servicenow-martech.hlx.live/blogs/sitemap.xml'; await downloadFile(url, `./${DATA_NEW}/sitemap.xml`); }); @@ -376,8 +387,8 @@ program.command('diff') const differenceNewOld = newLinks .filter((x) => !oldLinks.includes(x)) .filter((x) => !skipLinks.includes(x)); - console.log(`newLinks - oldLinks = ${differenceNewOld.length}`); - console.dir(differenceNewOld, { maxArrayLength: null }); + // console.log(`newLinks - oldLinks = ${differenceNewOld.length}`); + // console.dir(differenceNewOld, { maxArrayLength: null }); }); program.parse(process.argv); From c3ae8f6873a5af28ee2991b5626733b46f5124c6 Mon Sep 17 00:00:00 2001 From: Sergiu Coman Date: Thu, 8 Feb 2024 14:54:45 +0200 Subject: [PATCH 48/49] Added canonical url --- tools/importer/tags.js | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tools/importer/tags.js b/tools/importer/tags.js index 82123f2a..07994284 100644 --- a/tools/importer/tags.js +++ b/tools/importer/tags.js @@ -28,6 +28,14 @@ function getOriginalTags(jsonRendition) { return jsonRendition['jcr:content']['cq:tags']; } +function getCanonicalUrl(jsonRendition) { + const canonicalUrl = jsonRendition['jcr:content']['canonicalUrl']; + if (canonicalUrl === undefined) { + return ''; + } + return canonicalUrl; +} + // get sitemap content and parse it to Document Object Model function getXMLSitemapObject(sitemapFile, callback) { const xhttp = new XMLHttpRequest(); @@ -182,6 +190,8 @@ for (let j = 0; j < sitemaps.length; j++) { const correctedDate = getCorrectedDate(publishedDate); const previousDate = getPreviousDate(publishedDate); + const canonicalUrl = getCanonicalUrl(jsonRendition); + const wrongDate = previousDate !== correctedDate; all.push({ @@ -193,6 +203,7 @@ for (let j = 0; j < sitemaps.length; j++) { previousDate, correctedDate, wrongDate, + canonicalUrl, }); } else { console.log('originalTags is undefined'); @@ -209,7 +220,7 @@ for (let j = 0; j < sitemaps.length; j++) { // itearte over all array and display properties loc and topic as csv line for (let k = 0; k < all.length; k++) { const element = all[k]; - console.log(`${element.locale},${element.loc},${element.topic},${element.category},${element.newTrend},${element.previousDate},${element.correctedDate},${element.wrongDate}`); + console.log(`${element.locale},${element.loc},${element.topic},${element.category},${element.newTrend},${element.previousDate},${element.correctedDate},${element.wrongDate},${element.canonicalUrl}`); } }); } From 5ad6e0401f00f52472a9c247cbd133e58adb76bf Mon Sep 17 00:00:00 2001 From: Andrei Tuicu Date: Thu, 15 Feb 2024 11:10:58 +0100 Subject: [PATCH 49/49] import - Canonical URLs --- tools/importer/import-author.js | 7 +++++++ tools/importer/import.js | 33 +++++++++++++++++++++++++++++---- tools/sitemaps/app.mjs | 4 +++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/tools/importer/import-author.js b/tools/importer/import-author.js index 2d62be98..d2025d6b 100644 --- a/tools/importer/import-author.js +++ b/tools/importer/import-author.js @@ -37,6 +37,12 @@ function getPageUrl(link) { return new URL(new URL(link).pathname.replace('.html', ''), pageUrl); } +function removeComments(document) { + document.body.innerHTML = document.body.innerHTML + // remove html comments + .replace(/>\s*/gm, '>'); +} + const createMetadataBlock = (main, document, url) => { const meta = {}; @@ -119,6 +125,7 @@ export default { // eslint-disable-next-line no-unused-vars document, url, html, params, }) => { + removeComments(document); const main = document.querySelector('body'); console.debug(url); diff --git a/tools/importer/import.js b/tools/importer/import.js index 58946df9..b2c48dbc 100644 --- a/tools/importer/import.js +++ b/tools/importer/import.js @@ -82,6 +82,10 @@ function getPageUrl(link) { return new URL(new URL(link.href).pathname.replace('.html', ''), pageUrl); } +function removeComments(document) { + document.body.innerHTML = document.body.innerHTML + .replace(/>\s*/gm, '>'); +} const timezoneMap = { '+0530': 'Asia/Calcutta', @@ -98,7 +102,7 @@ function makeProxySrc(imgSrc) { return `http://localhost:3001${u.pathname}${u.search}`; } -const createMetadataBlock = (main, document, url) => { +const createMetadataBlock = (main, document, url, params) => { const jsonRendition = JSON.parse(fetchSync('GET', jsonRenditionURL(url)).body); const allTags = getAllTags(); const originalTags = getOriginalTags(jsonRendition) || []; @@ -162,7 +166,9 @@ const createMetadataBlock = (main, document, url) => { // tzMatch[0] has the folllowing format "GMT" e.g. GMT-0700 or GMT+0200. Extract the offset from it const offset = tzMatch[0].substring(3); // if the offset ends in 00 remove it from offset - if (offset.endsWith('00')) { + if (offset.endsWith('0000')) { + timeZone = `Etc/GMT+0`; + } else if (offset.endsWith('00')) { const o = offset.substring(1, offset.length - 2); const sign = offset.startsWith('-') ? '+' : '-'; // remove any leading zeroes from o e.g. -0700 becomes -7 @@ -184,7 +190,7 @@ const createMetadataBlock = (main, document, url) => { 'en-US', { day: '2-digit', month: '2-digit', year: 'numeric' , timeZone}, ); - } + } // Keywords const keywords = document.querySelector('meta[name="keywords"]'); @@ -220,6 +226,24 @@ const createMetadataBlock = (main, document, url) => { } + if (document.querySelector('link[rel="canonical"]')) { + let canonical = new URL(document.querySelector('link[rel="canonical"]').href); + console.log('canonical: ', canonical); + if (canonical.hostname === 'localhost') { + canonical.hostname = 'www.servicenow.com'; + canonical.port = ''; + canonical.protocol = 'https'; + } + + if (canonical.toString() !== params.originalURL) { + if (canonical.hostname === 'www.servicenow.com') { + canonical = new URL(canonical.toString().replace('.html', '')); + } + + meta['Canonical'] = canonical.toString(); + } + } + const block = WebImporter.Blocks.getMetadataBlock(document, meta); main.prepend(block); @@ -275,10 +299,11 @@ export default { // eslint-disable-next-line no-unused-vars document, url, html, params, }) => { + removeComments(document); const main = document.querySelector('body'); - createMetadataBlock(main, document, url); + createMetadataBlock(main, document, url, params); // CLEANUP main.querySelectorAll('.legacyHTML, .social-sharing, .servicenow-blog-header, .blog-author-info, .component-tag-path, .aem-GridColumn--default--4').forEach(el => el.remove()); diff --git a/tools/sitemaps/app.mjs b/tools/sitemaps/app.mjs index 5eecef58..c0184999 100644 --- a/tools/sitemaps/app.mjs +++ b/tools/sitemaps/app.mjs @@ -124,6 +124,8 @@ https://www.servicenow.com/de/blogs/category/home.html https://www.servicenow.com/fr/blogs/category/home.html https://www.servicenow.com/nl/blogs/category/home.html https://www.servicenow.com/uk/blogs/category/home.html +https://www.servicenow.com/blogs/blogs/2024/governance-genai-low-code-development.html +https://www.servicenow.com/fr/blogs/author/olivier_pichon.html `; const LANGUAGES = ['us', 'uk', 'fr', 'de', 'nl']; @@ -135,7 +137,7 @@ function old2newLink(pathname) { if (!pathname.toLowerCase().endsWith(ext)) { throw new Error(`${pathname} does not contain .html`); } - return `${pathname.substring(0, pathname.lastIndexOf('.html')).toLowerCase()}`; + return `${pathname.substring(0, pathname.lastIndexOf('.html')).toLowerCase().replace('_', '-')}`; } async function downloadFile(url, fileName) {