diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle
index fbd2236..fc072a8 100644
Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ
diff --git a/docs/_build/doctrees/images.doctree b/docs/_build/doctrees/images.doctree
index ade1033..a2c17a4 100644
Binary files a/docs/_build/doctrees/images.doctree and b/docs/_build/doctrees/images.doctree differ
diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree
index 3ac58e9..1590498 100644
Binary files a/docs/_build/doctrees/index.doctree and b/docs/_build/doctrees/index.doctree differ
diff --git a/docs/_build/doctrees/io.doctree b/docs/_build/doctrees/io.doctree
index 7873d90..d2c1131 100644
Binary files a/docs/_build/doctrees/io.doctree and b/docs/_build/doctrees/io.doctree differ
diff --git a/docs/_build/doctrees/rebuild.doctree b/docs/_build/doctrees/rebuild.doctree
index d9bb08f..dc4e806 100644
Binary files a/docs/_build/doctrees/rebuild.doctree and b/docs/_build/doctrees/rebuild.doctree differ
diff --git a/docs/_build/doctrees/utils.doctree b/docs/_build/doctrees/utils.doctree
index 4245d73..0cc5e7c 100644
Binary files a/docs/_build/doctrees/utils.doctree and b/docs/_build/doctrees/utils.doctree differ
diff --git a/docs/_build/doctrees/versioning.doctree b/docs/_build/doctrees/versioning.doctree
index 3f1affe..34cb70f 100644
Binary files a/docs/_build/doctrees/versioning.doctree and b/docs/_build/doctrees/versioning.doctree differ
diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo
index 9501185..d9e8c4b 100644
--- a/docs/_build/html/.buildinfo
+++ b/docs/_build/html/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 22fc4b3edf789b51746d48473d45c93f
+config: 8c31fa3be0c6ebef3824e4e08997d35b
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/_build/html/_static/basic.css b/docs/_build/html/_static/basic.css
index f316efc..30fee9d 100644
--- a/docs/_build/html/_static/basic.css
+++ b/docs/_build/html/_static/basic.css
@@ -4,7 +4,7 @@
  *
  * Sphinx stylesheet -- basic theme.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
diff --git a/docs/_build/html/_static/doctools.js b/docs/_build/html/_static/doctools.js
index 4d67807..d06a71d 100644
--- a/docs/_build/html/_static/doctools.js
+++ b/docs/_build/html/_static/doctools.js
@@ -4,7 +4,7 @@
  *
  * Base JavaScript utilities for all Sphinx HTML documentation.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
diff --git a/docs/_build/html/_static/language_data.js b/docs/_build/html/_static/language_data.js
index 367b8ed..250f566 100644
--- a/docs/_build/html/_static/language_data.js
+++ b/docs/_build/html/_static/language_data.js
@@ -5,7 +5,7 @@
  *
  * This script contains the language-specific data used by searchtools.js,
  * namely the list of stopwords, stemmer, scorer and splitter.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
@@ -13,7 +13,7 @@
 var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"];
 
-/* Non-minified version is copied as a separate JS file, if available */
+/* Non-minified version is copied as a separate JS file, is available */
 
 /**
  * Porter Stemmer
diff --git a/docs/_build/html/_static/searchtools.js b/docs/_build/html/_static/searchtools.js
index b08d58c..7918c3f 100644
--- a/docs/_build/html/_static/searchtools.js
+++ b/docs/_build/html/_static/searchtools.js
@@ -4,7 +4,7 @@
  *
  * Sphinx JavaScript utilities for the full-text search.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
@@ -99,7 +99,7 @@ const _displayItem = (item, searchTerms, highlightTerms) => {
       .then((data) => {
         if (data)
           listItem.appendChild(
-            Search.makeSearchSummary(data, searchTerms, anchor)
+            Search.makeSearchSummary(data, searchTerms)
           );
         // highlight search terms in the summary
         if (SPHINX_HIGHLIGHT_ENABLED)  // set in sphinx_highlight.js
@@ -116,8 +116,8 @@ const _finishSearch = (resultCount) => {
     );
   else
     Search.status.innerText = _(
-      "Search finished, found ${resultCount} page(s) matching the search query."
-    ).replace('${resultCount}', resultCount);
+      `Search finished, found ${resultCount} page(s) matching the search query.`
+    );
 };
 const _displayNextItem = (
   results,
@@ -137,22 +137,6 @@ const _displayNextItem = (
   // search finished, update title and status message
   else _finishSearch(resultCount);
 };
-// Helper function used by query() to order search results.
-// Each input is an array of [docname, title, anchor, descr, score, filename].
-// Order the results by score (in opposite order of appearance, since the
-// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically.
-const _orderResultsByScoreThenName = (a, b) => {
-  const leftScore = a[4];
-  const rightScore = b[4];
-  if (leftScore === rightScore) {
-    // same score: sort alphabetically
-    const leftTitle = a[1].toLowerCase();
-    const rightTitle = b[1].toLowerCase();
-    if (leftTitle === rightTitle) return 0;
-    return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
-  }
-  return leftScore > rightScore ? 1 : -1;
-};
 
 /**
  * Default splitQuery function. Can be overridden in ``sphinx.search`` with a
@@ -176,26 +160,13 @@ const Search = {
   _queued_query: null,
   _pulse_status: -1,
 
-  htmlToText: (htmlString, anchor) => {
+  htmlToText: (htmlString) => {
     const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html');
-    for (const removalQuery of [".headerlink", "script", "style"]) {
-      htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() });
-    }
-    if (anchor) {
-      const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`);
-      if (anchorContent) return anchorContent.textContent;
-
-      console.warn(
-        `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.`
-      );
-    }
-
-    // if anchor not specified or not found, fall back to main content
+    htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() });
     const docContent = htmlElement.querySelector('[role="main"]');
-    if (docContent) return docContent.textContent;
-
+    if (docContent !== undefined) return docContent.textContent;
     console.warn(
-      "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template."
+      "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template."
     );
     return "";
   },
@@ -268,7 +239,16 @@ const Search = {
     else Search.deferQuery(query);
   },
 
-  _parseQuery: (query) => {
+  /**
+   * execute search (requires search index to be loaded)
+   */
+  query: (query) => {
+    const filenames = Search._index.filenames;
+    const docNames = Search._index.docnames;
+    const titles = Search._index.titles;
+    const allTitles = Search._index.alltitles;
+    const indexEntries = Search._index.indexentries;
+
     // stem the search terms and add them to the correct list
     const stemmer = new Stemmer();
     const searchTerms = new Set();
@@ -304,38 +284,21 @@
     // console.info("required: ", [...searchTerms]);
     // console.info("excluded: ", [...excludedTerms]);
 
-    return [query, searchTerms, excludedTerms, highlightTerms, objectTerms];
-  },
-
-  /**
-   * execute search (requires search index to be loaded)
-   */
-  _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => {
-    const filenames = Search._index.filenames;
-    const docNames = Search._index.docnames;
-    const titles = Search._index.titles;
-    const allTitles = Search._index.alltitles;
-    const indexEntries = Search._index.indexentries;
-
-    // Collect multiple result groups to be sorted separately and then ordered.
-    // Each is an array of [docname, title, anchor, descr, score, filename].
-    const normalResults = [];
-    const nonMainIndexResults = [];
-
+    // array of [docname, title, anchor, descr, score, filename]
+    let results = [];
     _removeChildren(document.getElementById("search-progress"));
 
-    const queryLower = query.toLowerCase().trim();
+    const queryLower = query.toLowerCase();
     for (const [title, foundTitles] of Object.entries(allTitles)) {
-      if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) {
+      if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) {
         for (const [file, id] of foundTitles) {
-          const score = Math.round(Scorer.title * queryLower.length / title.length);
-          const boost = titles[file] === title ? 1 : 0;  // add a boost for document titles
-          normalResults.push([
+          let score = Math.round(100 * queryLower.length / title.length)
+          results.push([
             docNames[file],
             titles[file] !== title ? `${titles[file]} > ${title}` : title,
             id !== null ? "#" + id : "",
             null,
-            score + boost,
+            score,
             filenames[file],
           ]);
         }
@@ -345,47 +308,46 @@
     // search for explicit entries in index directives
     for (const [entry, foundEntries] of Object.entries(indexEntries)) {
       if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) {
-        for (const [file, id, isMain] of foundEntries) {
-          const score = Math.round(100 * queryLower.length / entry.length);
-          const result = [
+        for (const [file, id] of foundEntries) {
+          let score = Math.round(100 * queryLower.length / entry.length)
+          results.push([
             docNames[file],
             titles[file],
             id ? "#" + id : "",
             null,
             score,
             filenames[file],
-          ];
-          if (isMain) {
-            normalResults.push(result);
-          } else {
-            nonMainIndexResults.push(result);
-          }
+          ]);
         }
       }
     }
 
     // lookup as object
     objectTerms.forEach((term) =>
-      normalResults.push(...Search.performObjectSearch(term, objectTerms))
+      results.push(...Search.performObjectSearch(term, objectTerms))
     );
 
     // lookup as search terms in fulltext
-    normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms));
+    results.push(...Search.performTermsSearch(searchTerms, excludedTerms));
 
     // let the scorer override scores with a custom scoring function
-    if (Scorer.score) {
-      normalResults.forEach((item) => (item[4] = Scorer.score(item)));
-      nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item)));
-    }
-
-    // Sort each group of results by score and then alphabetically by name.
-    normalResults.sort(_orderResultsByScoreThenName);
-    nonMainIndexResults.sort(_orderResultsByScoreThenName);
-
-    // Combine the result groups in (reverse) order.
-    // Non-main index entries are typically arbitrary cross-references,
-    // so display them after other results.
-    let results = [...nonMainIndexResults, ...normalResults];
+    if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item)));
+
+    // now sort the results by score (in opposite order of appearance, since the
+    // display function below uses pop() to retrieve items) and then
+    // alphabetically
+    results.sort((a, b) => {
+      const leftScore = a[4];
+      const rightScore = b[4];
+      if (leftScore === rightScore) {
+        // same score: sort alphabetically
+        const leftTitle = a[1].toLowerCase();
+        const rightTitle = b[1].toLowerCase();
+        if (leftTitle === rightTitle) return 0;
+        return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
+      }
+      return leftScore > rightScore ? 1 : -1;
+    });
 
     // remove duplicate search results
     // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept
@@ -399,12 +361,7 @@ const Search = {
       return acc;
     }, []);
 
-    return results.reverse();
-  },
-
-  query: (query) => {
-    const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query);
-    const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms);
+    results = results.reverse();
 
     // for debugging
     //Search.lastresults = results.slice(); // a copy
@@ -509,18 +466,14 @@ const Search = {
     // add support for partial matches
     if (word.length > 2) {
       const escapedWord = _escapeRegExp(word);
-      if (!terms.hasOwnProperty(word)) {
-        Object.keys(terms).forEach((term) => {
-          if (term.match(escapedWord))
-            arr.push({ files: terms[term], score: Scorer.partialTerm });
-        });
-      }
-      if (!titleTerms.hasOwnProperty(word)) {
-        Object.keys(titleTerms).forEach((term) => {
-          if (term.match(escapedWord))
-            arr.push({ files: titleTerms[term], score: Scorer.partialTitle });
-        });
-      }
+      Object.keys(terms).forEach((term) => {
+        if (term.match(escapedWord) && !terms[word])
+          arr.push({ files: terms[term], score: Scorer.partialTerm });
+      });
+      Object.keys(titleTerms).forEach((term) => {
+        if (term.match(escapedWord) && !titleTerms[word])
+          arr.push({ files: titleTerms[word], score: Scorer.partialTitle });
+      });
     }
 
     // no match but word was a required one
@@ -543,8 +496,9 @@ const Search = {
 
     // create the mapping
     files.forEach((file) => {
-      if (!fileMap.has(file)) fileMap.set(file, [word]);
-      else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word);
+      if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1)
+        fileMap.get(file).push(word);
+      else fileMap.set(file, [word]);
     });
   });
 
@@ -595,8 +549,8 @@ const Search = {
    * search summary for a given text. keywords is a list
    * of stemmed words.
    */
-  makeSearchSummary: (htmlText, keywords, anchor) => {
-    const text = Search.htmlToText(htmlText, anchor);
+  makeSearchSummary: (htmlText, keywords) => {
+    const text = Search.htmlToText(htmlText);
     if (text === "") return null;
 
     const textLower = text.toLowerCase();
diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html
index 58a0354..398032e 100644
--- a/docs/_build/html/genindex.html
+++ b/docs/_build/html/genindex.html
@@ -1,13 +1,11 @@
level (int) – desired level of logging (default: logging.INFO)
file (str) –
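These options describe a logger-initialisation helper whose name is not visible in this excerpt. A minimal sketch of how such a helper could combine them, using only the standard logging module (the name init_logger and its body are assumptions, not the library's actual implementation):

    import logging

    def init_logger(level: int = logging.INFO, file: str | None = None) -> logging.Logger:
        # Hypothetical helper: configure the root logger at `level` and,
        # if `file` is given, write log records to that file instead of stderr.
        logger = logging.getLogger()
        logger.setLevel(level)
        handler = logging.FileHandler(file) if file else logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        logger.addHandler(handler)
        return logger

    logger = init_logger(level=logging.DEBUG, file="rebuild.log")
    logger.info("logger configured")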
get_s3_versions()
get_s3_versions_client()
get_storage_options()
list_s3_directories()
read_jsonlines()
readtext_jsonlines()
s3_get_articles()
s3_get_pages()
s3_glob_with_size()
upload()
upload_to_s3()
List all files in a directory with a given suffix and their size in MB.

Parameters:
    directory (str) – The directory path to search for files.
    file_suffix (str) – The file extension or suffix to match.

Returns:
    The matching files with their size in megabytes, rounded to six decimal places.

Return type:
    list[str]
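The helper documented above is not named in this excerpt. A local-filesystem sketch of the described behaviour (hypothetical name; it returns (path, size) tuples rather than the documented list of strings):

    from pathlib import Path

    def list_files_with_size(directory: str, file_suffix: str) -> list[tuple[str, float]]:
        # Hypothetical sketch: collect files under `directory` that end with
        # `file_suffix`, together with their size in MB rounded to six decimals.
        results = []
        for p in Path(directory).glob(f"*{file_suffix}"):
            if p.is_file():
                results.append((str(p), round(p.stat().st_size / (1024 * 1024), 6)))
        return results

    print(list_files_with_size("./data", ".jsonl.bz2"))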
Retrieve the ‘directory’ names (media titles) in an S3 bucket after a given path prefix.

Parameters:
    bucket_name (str) – The name of the S3 bucket.
    prefix (str) – The prefix path within the bucket to search. Default is the root (‘’).

Returns:
    The list of directory names found under the given bucket and prefix.

Return type:
    list
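A generic boto3 sketch of the behaviour described above, not necessarily the library's implementation (the bucket name and prefix in the usage line are placeholders):

    import boto3

    def list_s3_directories_sketch(bucket_name: str, prefix: str = "") -> list[str]:
        # List the "directories" directly under `prefix` by asking S3 for
        # common prefixes with a "/" delimiter, paginating over all pages.
        client = boto3.client("s3")
        paginator = client.get_paginator("list_objects_v2")
        directories = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter="/"):
            for cp in page.get("CommonPrefixes", []):
                # strip the prefix and the trailing slash to keep only the name
                directories.append(cp["Prefix"][len(prefix):].rstrip("/"))
        return directories

    print(list_s3_directories_sketch("my-bucket", "canonical-data/"))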
Custom glob function to list S3 objects matching a pattern. This function works around the 1000-object listing limit in S3 by using boto3 directly.

Parameters:
    path (str) – The S3 path with a wildcard (*) to match files. Example: s3://bucket_name/path/to/files/*.txt.
    boto3_bucket (boto3.Bucket, optional) – An optional boto3 Bucket object. If not provided, it will be created from the path.

Returns:
    The matched files and their sizes in megabytes.

Return type:
    list
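A sketch of how the described workaround can be implemented with boto3 pagination and fnmatch; this illustrates the technique rather than the library's actual code, and the example path is a placeholder:

    import fnmatch
    import boto3

    def s3_glob_with_size_sketch(path: str, boto3_bucket=None) -> list[tuple[str, float]]:
        # Paginate over list_objects_v2 so the 1000-key cap of a single
        # listing call does not truncate results, then keep keys that match
        # the wildcard pattern.
        bucket_name, _, pattern = path.removeprefix("s3://").partition("/")
        base_prefix = pattern.split("*", 1)[0]  # fixed key part, used to narrow the listing
        client = boto3_bucket.meta.client if boto3_bucket is not None else boto3.client("s3")
        paginator = client.get_paginator("list_objects_v2")
        matches = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=base_prefix):
            for obj in page.get("Contents", []):
                if fnmatch.fnmatch(obj["Key"], pattern):
                    size_mb = round(obj["Size"] / (1024 * 1024), 6)
                    matches.append((f"s3://{bucket_name}/{obj['Key']}", size_mb))
        return matches

    print(s3_glob_with_size_sketch("s3://my-bucket/path/to/files/*.txt"))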
impresso_commons.utils.daskutils.partitioner(bag: Bag, path: str, nb_partitions: int) → None

Partition a Dask bag into n partitions and write each to a separate file.

Parameters:
    bag (dask.bag.Bag) – The Dask bag to be partitioned.
    path (str) – Directory path where partitioned files will be saved.
    nb_partitions (int) – Number of partitions to create.

Returns:
    The function writes partitioned files to the specified path.

Return type:
    None
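An illustrative call of the documented signature; the record contents, output directory and resulting file names are assumptions, since the output format is decided by the library:

    import dask.bag as db
    from impresso_commons.utils.daskutils import partitioner

    # Build a small bag of JSON-serialisable records and repartition it on disk.
    records = [{"id": i, "text": f"doc {i}"} for i in range(10_000)]
    bag = db.from_sequence(records, npartitions=4)
    partitioner(bag, path="/tmp/partitions/", nb_partitions=8)
    # /tmp/partitions/ should now contain one file per partition (8 files);
    # the exact naming and serialisation are handled by the library.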
mnf_json (dict) – A dictionary containing manifest data.
extended_summary (bool, optional) – Whether to include extended summary with year statistics. Defaults to False.
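The function these parameters belong to is not shown in this excerpt. A purely illustrative, hypothetical sketch of a summary helper taking such arguments (all names and manifest keys below are placeholder assumptions, not the library's actual schema):

    def manifest_summary(mnf_json: dict, extended_summary: bool = False) -> str:
        # Illustrative only: report a few basic facts about the manifest dict,
        # optionally appending per-year statistics when extended_summary=True.
        lines = [f"Manifest with {len(mnf_json)} top-level fields."]
        if extended_summary:
            for year, stats in mnf_json.get("year_statistics", {}).items():
                lines.append(f"  {year}: {stats}")
        return "\n".join(lines)

    print(manifest_summary({"year_statistics": {"1900": {"issues": 12}}}, extended_summary=True))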