Skip to content

Commit

Permalink
Merge pull request #92 from impresso/helpers
Browse files Browse the repository at this point in the history
Helpers
  • Loading branch information
piconti authored Aug 27, 2024
2 parents 192da84 + 13cb7d9 commit 00ed58b
Show file tree
Hide file tree
Showing 28 changed files with 389 additions and 214 deletions.
Binary file modified docs/_build/doctrees/environment.pickle
Binary file not shown.
Binary file modified docs/_build/doctrees/images.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/index.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/io.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/rebuild.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/utils.doctree
Binary file not shown.
Binary file modified docs/_build/doctrees/versioning.doctree
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/_build/html/.buildinfo
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 22fc4b3edf789b51746d48473d45c93f
config: 8c31fa3be0c6ebef3824e4e08997d35b
tags: 645f666f9bcd5a90fca523b33c5a78b7
2 changes: 1 addition & 1 deletion docs/_build/html/_static/basic.css
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*
* Sphinx stylesheet -- basic theme.
*
* :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
Expand Down
2 changes: 1 addition & 1 deletion docs/_build/html/_static/doctools.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*
* Base JavaScript utilities for all Sphinx HTML documentation.
*
* :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
Expand Down
4 changes: 2 additions & 2 deletions docs/_build/html/_static/language_data.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
* This script contains the language-specific data used by searchtools.js,
* namely the list of stopwords, stemmer, scorer and splitter.
*
* :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/

var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"];


/* Non-minified version is copied as a separate JS file, if available */
/* Non-minified version is copied as a separate JS file, is available */

/**
* Porter Stemmer
Expand Down
170 changes: 62 additions & 108 deletions docs/_build/html/_static/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
*
* Sphinx JavaScript utilities for the full-text search.
*
* :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
Expand Down Expand Up @@ -99,7 +99,7 @@ const _displayItem = (item, searchTerms, highlightTerms) => {
.then((data) => {
if (data)
listItem.appendChild(
Search.makeSearchSummary(data, searchTerms, anchor)
Search.makeSearchSummary(data, searchTerms)
);
// highlight search terms in the summary
if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
Expand All @@ -116,8 +116,8 @@ const _finishSearch = (resultCount) => {
);
else
Search.status.innerText = _(
"Search finished, found ${resultCount} page(s) matching the search query."
).replace('${resultCount}', resultCount);
`Search finished, found ${resultCount} page(s) matching the search query.`
);
};
const _displayNextItem = (
results,
Expand All @@ -137,22 +137,6 @@ const _displayNextItem = (
// search finished, update title and status message
else _finishSearch(resultCount);
};
// Helper function used by query() to order search results.
// Each input is an array of [docname, title, anchor, descr, score, filename].
// Order the results by score (in opposite order of appearance, since the
// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically.
const _orderResultsByScoreThenName = (a, b) => {
const leftScore = a[4];
const rightScore = b[4];
if (leftScore === rightScore) {
// same score: sort alphabetically
const leftTitle = a[1].toLowerCase();
const rightTitle = b[1].toLowerCase();
if (leftTitle === rightTitle) return 0;
return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
}
return leftScore > rightScore ? 1 : -1;
};

/**
* Default splitQuery function. Can be overridden in ``sphinx.search`` with a
Expand All @@ -176,26 +160,13 @@ const Search = {
_queued_query: null,
_pulse_status: -1,

htmlToText: (htmlString, anchor) => {
htmlToText: (htmlString) => {
const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html');
for (const removalQuery of [".headerlink", "script", "style"]) {
htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() });
}
if (anchor) {
const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`);
if (anchorContent) return anchorContent.textContent;

console.warn(
`Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.`
);
}

// if anchor not specified or not found, fall back to main content
htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() });
const docContent = htmlElement.querySelector('[role="main"]');
if (docContent) return docContent.textContent;

if (docContent !== undefined) return docContent.textContent;
console.warn(
"Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template."
"Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template."
);
return "";
},
Expand Down Expand Up @@ -268,7 +239,16 @@ const Search = {
else Search.deferQuery(query);
},

_parseQuery: (query) => {
/**
* execute search (requires search index to be loaded)
*/
query: (query) => {
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const titles = Search._index.titles;
const allTitles = Search._index.alltitles;
const indexEntries = Search._index.indexentries;

// stem the search terms and add them to the correct list
const stemmer = new Stemmer();
const searchTerms = new Set();
Expand Down Expand Up @@ -304,38 +284,21 @@ const Search = {
// console.info("required: ", [...searchTerms]);
// console.info("excluded: ", [...excludedTerms]);

return [query, searchTerms, excludedTerms, highlightTerms, objectTerms];
},

/**
* execute search (requires search index to be loaded)
*/
_performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => {
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const titles = Search._index.titles;
const allTitles = Search._index.alltitles;
const indexEntries = Search._index.indexentries;

// Collect multiple result groups to be sorted separately and then ordered.
// Each is an array of [docname, title, anchor, descr, score, filename].
const normalResults = [];
const nonMainIndexResults = [];

// array of [docname, title, anchor, descr, score, filename]
let results = [];
_removeChildren(document.getElementById("search-progress"));

const queryLower = query.toLowerCase().trim();
const queryLower = query.toLowerCase();
for (const [title, foundTitles] of Object.entries(allTitles)) {
if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) {
if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) {
for (const [file, id] of foundTitles) {
const score = Math.round(Scorer.title * queryLower.length / title.length);
const boost = titles[file] === title ? 1 : 0; // add a boost for document titles
normalResults.push([
let score = Math.round(100 * queryLower.length / title.length)
results.push([
docNames[file],
titles[file] !== title ? `${titles[file]} > ${title}` : title,
id !== null ? "#" + id : "",
null,
score + boost,
score,
filenames[file],
]);
}
Expand All @@ -345,47 +308,46 @@ const Search = {
// search for explicit entries in index directives
for (const [entry, foundEntries] of Object.entries(indexEntries)) {
if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) {
for (const [file, id, isMain] of foundEntries) {
const score = Math.round(100 * queryLower.length / entry.length);
const result = [
for (const [file, id] of foundEntries) {
let score = Math.round(100 * queryLower.length / entry.length)
results.push([
docNames[file],
titles[file],
id ? "#" + id : "",
null,
score,
filenames[file],
];
if (isMain) {
normalResults.push(result);
} else {
nonMainIndexResults.push(result);
}
]);
}
}
}

// lookup as object
objectTerms.forEach((term) =>
normalResults.push(...Search.performObjectSearch(term, objectTerms))
results.push(...Search.performObjectSearch(term, objectTerms))
);

// lookup as search terms in fulltext
normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms));
results.push(...Search.performTermsSearch(searchTerms, excludedTerms));

// let the scorer override scores with a custom scoring function
if (Scorer.score) {
normalResults.forEach((item) => (item[4] = Scorer.score(item)));
nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item)));
}

// Sort each group of results by score and then alphabetically by name.
normalResults.sort(_orderResultsByScoreThenName);
nonMainIndexResults.sort(_orderResultsByScoreThenName);

// Combine the result groups in (reverse) order.
// Non-main index entries are typically arbitrary cross-references,
// so display them after other results.
let results = [...nonMainIndexResults, ...normalResults];
if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item)));

// now sort the results by score (in opposite order of appearance, since the
// display function below uses pop() to retrieve items) and then
// alphabetically
results.sort((a, b) => {
const leftScore = a[4];
const rightScore = b[4];
if (leftScore === rightScore) {
// same score: sort alphabetically
const leftTitle = a[1].toLowerCase();
const rightTitle = b[1].toLowerCase();
if (leftTitle === rightTitle) return 0;
return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
}
return leftScore > rightScore ? 1 : -1;
});

// remove duplicate search results
// note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept
Expand All @@ -399,12 +361,7 @@ const Search = {
return acc;
}, []);

return results.reverse();
},

query: (query) => {
const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query);
const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms);
results = results.reverse();

// for debugging
//Search.lastresults = results.slice(); // a copy
Expand Down Expand Up @@ -509,18 +466,14 @@ const Search = {
// add support for partial matches
if (word.length > 2) {
const escapedWord = _escapeRegExp(word);
if (!terms.hasOwnProperty(word)) {
Object.keys(terms).forEach((term) => {
if (term.match(escapedWord))
arr.push({ files: terms[term], score: Scorer.partialTerm });
});
}
if (!titleTerms.hasOwnProperty(word)) {
Object.keys(titleTerms).forEach((term) => {
if (term.match(escapedWord))
arr.push({ files: titleTerms[term], score: Scorer.partialTitle });
});
}
Object.keys(terms).forEach((term) => {
if (term.match(escapedWord) && !terms[word])
arr.push({ files: terms[term], score: Scorer.partialTerm });
});
Object.keys(titleTerms).forEach((term) => {
if (term.match(escapedWord) && !titleTerms[word])
arr.push({ files: titleTerms[word], score: Scorer.partialTitle });
});
}

// no match but word was a required one
Expand All @@ -543,8 +496,9 @@ const Search = {

// create the mapping
files.forEach((file) => {
if (!fileMap.has(file)) fileMap.set(file, [word]);
else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word);
if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1)
fileMap.get(file).push(word);
else fileMap.set(file, [word]);
});
});

Expand Down Expand Up @@ -595,8 +549,8 @@ const Search = {
* search summary for a given text. keywords is a list
* of stemmed words.
*/
makeSearchSummary: (htmlText, keywords, anchor) => {
const text = Search.htmlToText(htmlText, anchor);
makeSearchSummary: (htmlText, keywords) => {
const text = Search.htmlToText(htmlText);
if (text === "") return null;

const textLower = text.toLowerCase();
Expand Down
22 changes: 15 additions & 7 deletions docs/_build/html/genindex.html
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Index &mdash; Impresso PyCommons documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />


<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<!--[if lt IE 9]>
<script src="_static/js/html5shiv.min.js"></script>
<![endif]-->

<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=9a2dae69"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="#" />
Expand Down Expand Up @@ -314,6 +312,8 @@ <h2 id="G">G</h2>
<li><a href="io.html#impresso_commons.path.path_fs.get_issueshortpath">get_issueshortpath() (in module impresso_commons.path.path_fs)</a>
</li>
<li><a href="images.html#impresso_commons.images.img_utils.get_jpg">get_jpg() (in module impresso_commons.images.img_utils)</a>
</li>
<li><a href="utils.html#impresso_commons.utils.utils.get_list_intersection">get_list_intersection() (in module impresso_commons.utils.utils)</a>
</li>
<li><a href="versioning.html#impresso_commons.versioning.helpers.get_media_item_years">get_media_item_years() (in module impresso_commons.versioning.helpers)</a>
</li>
Expand Down Expand Up @@ -348,6 +348,8 @@ <h2 id="G">G</h2>
<li><a href="images.html#impresso_commons.images.img_utils.get_tif">get_tif() (in module impresso_commons.images.img_utils)</a>
</li>
<li><a href="versioning.html#impresso_commons.versioning.helpers.git_commit_push">git_commit_push() (in module impresso_commons.versioning.helpers)</a>
</li>
<li><a href="utils.html#impresso_commons.utils.utils.glob_with_size">glob_with_size() (in module impresso_commons.utils.utils)</a>
</li>
<li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.granularity">granularity (impresso_commons.versioning.data_statistics.DataStatistics attribute)</a>

Expand Down Expand Up @@ -536,12 +538,16 @@ <h2 id="L">L</h2>
<li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.LANGIDENT">LANGIDENT (impresso_commons.versioning.helpers.DataStage attribute)</a>
</li>
<li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING">LINGUISTIC_PROCESSING (impresso_commons.versioning.helpers.DataStage attribute)</a>
</li>
<li><a href="io.html#impresso_commons.path.path_s3.list_files">list_files() (in module impresso_commons.path.path_s3)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="io.html#impresso_commons.path.path_s3.list_files">list_files() (in module impresso_commons.path.path_s3)</a>
<li><a href="utils.html#impresso_commons.utils.utils.list_local_directories">list_local_directories() (in module impresso_commons.utils.utils)</a>
</li>
<li><a href="io.html#impresso_commons.path.path_s3.list_newspapers">list_newspapers() (in module impresso_commons.path.path_s3)</a>
</li>
<li><a href="utils.html#impresso_commons.utils.s3.list_s3_directories">list_s3_directories() (in module impresso_commons.utils.s3)</a>
</li>
</ul></td>
</tr></table>
Expand Down Expand Up @@ -727,6 +733,8 @@ <h2 id="S">S</h2>
<li><a href="utils.html#impresso_commons.utils.s3.s3_get_articles">s3_get_articles() (in module impresso_commons.utils.s3)</a>
</li>
<li><a href="utils.html#impresso_commons.utils.s3.s3_get_pages">s3_get_pages() (in module impresso_commons.utils.s3)</a>
</li>
<li><a href="utils.html#impresso_commons.utils.s3.s3_glob_with_size">s3_glob_with_size() (in module impresso_commons.utils.s3)</a>
</li>
<li><a href="io.html#impresso_commons.path.path_s3.s3_iter_bucket">s3_iter_bucket() (in module impresso_commons.path.path_s3)</a>
</li>
Expand Down
Loading

0 comments on commit 00ed58b

Please sign in to comment.