Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic News and Healthy thinking importer added #45

Merged
merged 8 commits into from
Sep 21, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions helix-query.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ indices:
breadcrumbtitle:
select: head > meta[name="breadcrumbtitle"]
value: attribute(el, "content")
newsdate:
select: head > meta[name="newsdate"]
publisheddate:
select: head > meta[name="publisheddate"]
value: attribute(el, "content")
pagename:
select: head > meta[name="pagename"]
Expand Down
319 changes: 319 additions & 0 deletions tools/importer/import-news-press-healthythinking.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
/*
* Copyright 2023 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
/* global WebImporter */
/* eslint-disable no-console, class-methods-use-this */
/**
 * Assembles the page metadata, appends the resulting metadata block to `main`
 * and returns the collected values.
 *
 * Values are gathered in this order: `Title`, `Description`, `Image`,
 * `BreadcrumbTitle`, followed by whatever is present on
 * `params.preProcessMetadata` (which overrides same-named keys).
 * @param {HTMLElement} main The element the metadata block is appended to
 * @param {HTMLDocument} document The document
 * @param {object} params Import parameters; `preProcessMetadata` is merged in when non-empty
 * @returns {object} The metadata key/value pairs used to build the block
 */
const createMetadata = (main, document, params) => {
  const meta = {};

  const titleEl = document.querySelector('title');
  if (titleEl) {
    // strip the newlines and tabs from the title text
    meta.Title = titleEl.textContent.replace(/[\n\t]/gm, '');
  }

  const descriptionEl = document.querySelector('[property="og:description"]');
  if (descriptionEl) {
    meta.Description = descriptionEl.content;
  }

  const ogImage = document.querySelector('[property="og:image"]');
  if (ogImage && ogImage.content) {
    const image = document.createElement('img');
    image.src = ogImage.content;
    meta.Image = image;
  }

  // the last breadcrumb item provides the breadcrumb title
  const crumbSection = document.querySelector('.section-breadcrumb');
  const crumbs = crumbSection
    ? crumbSection.querySelectorAll('.ss-breadcrumb .breadcrumb-item')
    : null;
  if (crumbs && crumbs.length) {
    const lastCrumb = crumbs[crumbs.length - 1];
    meta.BreadcrumbTitle = lastCrumb.textContent.trim();
  }

  const extra = params.preProcessMetadata;
  if (extra && Object.keys(extra).length) {
    Object.assign(meta, extra);
  }

  main.append(WebImporter.Blocks.getMetadataBlock(document, meta));
  return meta;
};

/**
 * Replaces every `.wp-block-embed` element that contains an iframe with an
 * `Embed` block table holding a link to the first iframe's `src`.
 * Embeds without an iframe are left untouched.
 * @param {HTMLDocument} document The document
 */
const extractEmbed = (document) => {
  const embeds = document.querySelectorAll('.wp-block-embed');
  if (!embeds || !embeds.length) {
    return;
  }

  embeds.forEach((embed) => {
    const frames = embed.getElementsByTagName('iframe');
    if (!frames || !frames.length) {
      return;
    }

    // only the first iframe of an embed is kept
    const { src } = frames[0];
    const link = document.createElement('a');
    link.href = src;
    link.textContent = src;

    const table = WebImporter.DOMUtils.createTable([['Embed'], [link]], document);
    embed.replaceWith(table);
  });
};

/**
 * Swaps the `.section-breadcrumb` element for a `Breadcrumb` block table and
 * places an `hr` right after it. Does nothing when the page has no breadcrumb.
 * @param {HTMLDocument} document The document
 */
const addBreadCrumb = (document) => {
  const crumbs = document.querySelector('.section-breadcrumb');
  if (!crumbs) {
    return;
  }

  const placeholder = WebImporter.DOMUtils.createTable([['Breadcrumb']], document);
  const rule = document.createElement('hr');
  // insert the rule first so it ends up directly after the replacement table
  crumbs.after(rule);
  crumbs.replaceWith(placeholder);
};

/**
 * Creates a Feature block from each `div.section-container` whose parent
 * element's class name contains `related-article` or `news-featured`.
 *
 * The section is replaced by a one-row block table
 * (`Feature (related-article)` or `Feature (featured-article)`), wrapped in
 * `hr` elements, and the document's first `.slider-title` element (when one
 * exists) is moved in front of it. Other sections are left untouched.
 * @param {HTMLDocument} document The document
 */
const createFeatureBlockFromSection = (document) => {
  document.querySelectorAll('div.section-container').forEach((section) => {
    const parentClasses = section.parentElement ? section.parentElement.className : '';

    let blockDetails = '';
    if (parentClasses.includes('related-article')) {
      blockDetails = 'Feature (related-article)';
    } else if (parentClasses.includes('news-featured')) {
      blockDetails = 'Feature (featured-article)';
    }

    if (!blockDetails) {
      return;
    }

    const table = WebImporter.DOMUtils.createTable([[blockDetails]], document);
    section.before(document.createElement('hr'));

    // before() stringifies non-Node arguments, so passing a missing title
    // (null) would insert the literal text "null" into the page
    const sliderTitle = document.querySelector('.slider-title');
    if (sliderTitle) {
      section.before(sliderTitle);
    }

    section.after(document.createElement('hr'));
    section.replaceWith(table);
  });
};

/**
 * Replaces the `.ss-share` element with a `Social` block table.
 *
 * Each link that contains an image contributes one row: the icon's file name
 * without its extension (e.g. `facebook` for `.../facebook.svg`) and the
 * link's `href`. Links without an image are skipped. Nothing happens when the
 * share element is missing or contains no links.
 * @param {HTMLDocument} document The document
 */
const addSocialBlock = (document) => {
  const socialShare = document.querySelector('.ss-share');
  if (!socialShare) {
    return;
  }

  const socialLinks = socialShare.querySelectorAll('a');
  if (!socialLinks || !socialLinks.length) {
    return;
  }

  // Derives the icon name from an image URL: last path segment, minus any
  // query string/fragment and minus the file extension (not only ".svg").
  const iconName = (url) => {
    const [path] = url.split(/[?#]/);
    const file = path.substring(path.lastIndexOf('/') + 1);
    const dot = file.lastIndexOf('.');
    return dot > 0 ? file.substring(0, dot) : file;
  };

  const cells = [['Social']];
  socialLinks.forEach((link) => {
    const img = link.querySelector('img');
    if (img) {
      cells.push([iconName(img.src), link.href]);
    }
  });

  const table = WebImporter.DOMUtils.createTable(cells, document);
  socialShare.replaceWith(table);
};

/**
 * Prefixes root-relative links (`/path`) with the target domain.
 *
 * A single trailing slash on the path part is removed; any query string or
 * fragment is kept as is. Links whose `href` does not start with `/`, and
 * protocol-relative links (`//host/path`, which point at another host), are
 * left unchanged.
 * @param {HTMLDocument} document The document
 */
const fixRelativeLinks = (document) => {
  const targetDomain = 'https://main--sunstar--hlxsites.hlx.page';

  document.querySelectorAll('a').forEach((a) => {
    const { href } = a;
    if (!href.startsWith('/') || href.startsWith('//')) {
      return;
    }

    // split at the first "?" or "#" so only the path part loses its slash
    const splitAt = href.search(/[?#]/);
    const path = splitAt > 0 ? href.substring(0, splitAt) : href;
    const suffix = splitAt > 0 ? href.substring(splitAt) : '';
    const trimmedPath = path.endsWith('/') ? path.substring(0, path.length - 1) : path;

    a.href = targetDomain + trimmedPath + suffix;
  });
};

/**
 * Replaces the `.ss-tag-container` element with a `Tags` block table holding
 * one row per `.tag-link` element. The container's `.tag-label`, when there
 * is one, is moved in front of the table. Nothing happens when the container
 * is missing or has no tag links.
 * @param {HTMLDocument} document The document
 */
const addTagsBlock = (document) => {
  const section = document.querySelector('.ss-tag-container');
  if (!section) {
    return;
  }

  const tagAnchors = section.querySelectorAll('.tag-link');
  if (!tagAnchors || !tagAnchors.length) {
    return;
  }

  const cells = [['Tags'], ...[...tagAnchors].map((anchor) => [anchor])];
  const table = WebImporter.DOMUtils.createTable(cells, document);

  // before() stringifies non-Node arguments, so a missing label (null)
  // would otherwise show up as the literal text "null"
  const tagLabel = section.querySelector('.tag-label');
  if (tagLabel) {
    section.before(tagLabel);
  }
  section.replaceWith(table);
};

/**
 * Replaces every `blockquote.wp-block-quote` with a `Quote` block table.
 *
 * The quote text is the text of the blockquote's first `<p>`; when the
 * blockquote contains no paragraph, its own text content is used instead of
 * failing on the missing element.
 * @param {HTMLDocument} document The document
 */
const addQuoteBlock = (document) => {
  const blockQuotes = document.querySelectorAll('blockquote.wp-block-quote');
  if (!blockQuotes || !blockQuotes.length) {
    return;
  }

  [...blockQuotes].forEach((quote) => {
    const paragraph = quote.querySelector('p');
    const text = (paragraph || quote).textContent;

    const table = WebImporter.DOMUtils.createTable([['Quote'], [text]], document);
    quote.replaceWith(table);
  });
};

/**
 * Applies all page-specific DOM rewrites to the document, one after the
 * other, in the order listed below.
 * @param {HTMLDocument} document The document
 */
const customImportLogic = (document) => {
  const steps = [
    addBreadCrumb,
    addTagsBlock,
    createFeatureBlockFromSection,
    extractEmbed,
    addSocialBlock,
    addQuoteBlock,
    fixRelativeLinks,
  ];

  steps.forEach((step) => {
    step(document);
  });
};

export default {
  /**
   * Collects metadata from the document before it is transformed and stores
   * it on `params.preProcessMetadata`:
   * - `PageName`: name of the last `BreadcrumbList` item in the aioseo JSON-LD schema
   * - `PublishedDate`: `datePublished` of the schema's `WebPage` node as epoch
   *   milliseconds (newsroom and healthy-thinking pages only)
   * - `Category` / `Topic`: derived from the URL path and a topic element
   * - `Tags`: comma separated text of all `.tag-pill` elements
   *
   * A missing, malformed or unexpectedly shaped schema is skipped so that it
   * cannot abort the import of the page.
   * @param {HTMLDocument} document The document
   * @param {string} url The url of the page imported
   * @param {string} html The raw html (the document is cleaned up during preprocessing)
   * @param {object} params Object containing some parameters given by the import process.
   */
  preprocess: ({
    // eslint-disable-next-line no-unused-vars
    document, url, html, params,
  }) => {
    const metadataDetails = {};
    const { pathname } = new URL(url);
    const isNewsroom = pathname.includes('/newsroom/');
    const isHealthyThinking = pathname.includes('/healthy-thinking/');

    const schemaDetails = document.querySelector('head script.aioseo-schema');
    let graphNode;
    if (schemaDetails) {
      try {
        // textContent does not depend on rendering, unlike innerText
        const jsonSchema = JSON.parse(schemaDetails.textContent);
        graphNode = jsonSchema && jsonSchema['@graph'];
      } catch (error) {
        console.warn(`Skipping unparsable aioseo schema on ${url}: ${error.message}`);
      }
    }

    if (Array.isArray(graphNode)) {
      graphNode.forEach((node) => {
        if (!node) {
          return;
        }
        const nodeType = node['@type'];

        if (nodeType === 'BreadcrumbList' && node.itemListElement && node.itemListElement.length) {
          const lastItem = node.itemListElement[node.itemListElement.length - 1];
          const lastItemDetails = lastItem.item;

          if (lastItemDetails) {
            metadataDetails.PageName = lastItemDetails.name;
          }
        } else if (nodeType === 'WebPage' && (isNewsroom || isHealthyThinking) && node.datePublished) {
          const published = new Date(node.datePublished).getTime();
          // an unparsable date yields NaN, which is not worth storing
          if (!Number.isNaN(published)) {
            metadataDetails.PublishedDate = published;
          }
        }
      });
    }

    if (isNewsroom) {
      metadataDetails.Category = 'Newsroom';

      const span = document.querySelector('span.tag');
      if (span) {
        metadataDetails.Topic = span.textContent;
      }
    }

    if (isHealthyThinking) {
      metadataDetails.Category = 'Healthy Thinking';

      // NOTE(review): the selector is `h6.rabel` - confirm this matches the
      // class used on the source site and is not a typo for `label`
      const firstH6 = document.querySelector('h6.rabel');
      if (firstH6) {
        metadataDetails.Topic = firstH6.textContent;
      }
    }

    const tags = document.querySelectorAll('.tag-pill');
    if (tags && tags.length) {
      metadataDetails.Tags = [...tags].map((x) => x.textContent).join(', ');
    }

    params.preProcessMetadata = metadataDetails;
  },

  /**
   * Apply DOM operations to the provided document and return
   * the root element to be then transformed to Markdown.
   * @param {HTMLDocument} document The document
   * @param {string} url The url of the page imported
   * @param {string} html The raw html (the document is cleaned up during preprocessing)
   * @param {object} params Object containing some parameters given by the import process.
   * @returns {HTMLElement} The root element to be transformed
   */
  transformDOM: ({
    // eslint-disable-next-line no-unused-vars
    document, url, html, params,
  }) => {
    // define the main element: the one that will be transformed to Markdown
    const main = document.body;

    // use helper method to remove header, footer, etc.
    WebImporter.DOMUtils.remove(main, [
      'header',
      'footer',
      'noscript',
    ]);

    customImportLogic(document);
    // create the metadata block and append it to the main element
    createMetadata(main, document, params);

    return main;
  },

  /**
   * Return a path that describes the document being transformed (file name, nesting...).
   * The path is then used to create the corresponding Word document.
   * @param {HTMLDocument} document The document
   * @param {string} url The url of the page imported
   * @param {string} html The raw html (the document is cleaned up during preprocessing)
   * @param {object} params Object containing some parameters given by the import process.
   * @return {string} The path
   */
  generateDocumentPath: ({
    // eslint-disable-next-line no-unused-vars
    document, url, html, params,
  }) => {
    const { pathname } = new URL(url);
    // drop a trailing ".html" and a trailing slash before sanitizing
    const initialReplace = pathname.replace(/\.html$/, '').replace(/\/$/, '');

    console.log(`pathname: ${pathname} -> initialReplace: ${initialReplace}`);
    return WebImporter.FileUtils.sanitizePath(initialReplace);
  },
};