diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle
index fbd2236..fc072a8 100644
Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ
diff --git a/docs/_build/doctrees/images.doctree b/docs/_build/doctrees/images.doctree
index ade1033..a2c17a4 100644
Binary files a/docs/_build/doctrees/images.doctree and b/docs/_build/doctrees/images.doctree differ
diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree
index 3ac58e9..1590498 100644
Binary files a/docs/_build/doctrees/index.doctree and b/docs/_build/doctrees/index.doctree differ
diff --git a/docs/_build/doctrees/io.doctree b/docs/_build/doctrees/io.doctree
index 7873d90..d2c1131 100644
Binary files a/docs/_build/doctrees/io.doctree and b/docs/_build/doctrees/io.doctree differ
diff --git a/docs/_build/doctrees/rebuild.doctree b/docs/_build/doctrees/rebuild.doctree
index d9bb08f..dc4e806 100644
Binary files a/docs/_build/doctrees/rebuild.doctree and b/docs/_build/doctrees/rebuild.doctree differ
diff --git a/docs/_build/doctrees/utils.doctree b/docs/_build/doctrees/utils.doctree
index 4245d73..0cc5e7c 100644
Binary files a/docs/_build/doctrees/utils.doctree and b/docs/_build/doctrees/utils.doctree differ
diff --git a/docs/_build/doctrees/versioning.doctree b/docs/_build/doctrees/versioning.doctree
index 3f1affe..34cb70f 100644
Binary files a/docs/_build/doctrees/versioning.doctree and b/docs/_build/doctrees/versioning.doctree differ
diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo
index 9501185..d9e8c4b 100644
--- a/docs/_build/html/.buildinfo
+++ b/docs/_build/html/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 22fc4b3edf789b51746d48473d45c93f
+config: 8c31fa3be0c6ebef3824e4e08997d35b
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/_build/html/_static/basic.css b/docs/_build/html/_static/basic.css
index f316efc..30fee9d 100644
--- a/docs/_build/html/_static/basic.css
+++ b/docs/_build/html/_static/basic.css
@@ -4,7 +4,7 @@
  *
  * Sphinx stylesheet -- basic theme.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
diff --git a/docs/_build/html/_static/doctools.js b/docs/_build/html/_static/doctools.js
index 4d67807..d06a71d 100644
--- a/docs/_build/html/_static/doctools.js
+++ b/docs/_build/html/_static/doctools.js
@@ -4,7 +4,7 @@
  *
  * Base JavaScript utilities for all Sphinx HTML documentation.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
diff --git a/docs/_build/html/_static/language_data.js b/docs/_build/html/_static/language_data.js
index 367b8ed..250f566 100644
--- a/docs/_build/html/_static/language_data.js
+++ b/docs/_build/html/_static/language_data.js
@@ -5,7 +5,7 @@
  * This script contains the language-specific data used by searchtools.js,
  * namely the list of stopwords, stemmer, scorer and splitter.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
@@ -13,7 +13,7 @@
 var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"];
 
 
-/* Non-minified version is copied as a separate JS file, if available */
+/* Non-minified version is copied as a separate JS file, is available */
 
 /**
  * Porter Stemmer
diff --git a/docs/_build/html/_static/searchtools.js b/docs/_build/html/_static/searchtools.js
index b08d58c..7918c3f 100644
--- a/docs/_build/html/_static/searchtools.js
+++ b/docs/_build/html/_static/searchtools.js
@@ -4,7 +4,7 @@
  *
  * Sphinx JavaScript utilities for the full-text search.
  *
- * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS.
+ * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
  * :license: BSD, see LICENSE for details.
  *
  */
@@ -99,7 +99,7 @@ const _displayItem = (item, searchTerms, highlightTerms) => {
       .then((data) => {
         if (data)
           listItem.appendChild(
-            Search.makeSearchSummary(data, searchTerms, anchor)
+            Search.makeSearchSummary(data, searchTerms)
           );
         // highlight search terms in the summary
         if (SPHINX_HIGHLIGHT_ENABLED)  // set in sphinx_highlight.js
@@ -116,8 +116,8 @@ const _finishSearch = (resultCount) => {
     );
   else
     Search.status.innerText = _(
-      "Search finished, found ${resultCount} page(s) matching the search query."
-    ).replace('${resultCount}', resultCount);
+      `Search finished, found ${resultCount} page(s) matching the search query.`
+    );
 };
 const _displayNextItem = (
   results,
@@ -137,22 +137,6 @@ const _displayNextItem = (
   // search finished, update title and status message
   else _finishSearch(resultCount);
 };
-// Helper function used by query() to order search results.
-// Each input is an array of [docname, title, anchor, descr, score, filename].
-// Order the results by score (in opposite order of appearance, since the
-// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically.
-const _orderResultsByScoreThenName = (a, b) => {
-  const leftScore = a[4];
-  const rightScore = b[4];
-  if (leftScore === rightScore) {
-    // same score: sort alphabetically
-    const leftTitle = a[1].toLowerCase();
-    const rightTitle = b[1].toLowerCase();
-    if (leftTitle === rightTitle) return 0;
-    return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
-  }
-  return leftScore > rightScore ? 1 : -1;
-};
 
 /**
  * Default splitQuery function. Can be overridden in ``sphinx.search`` with a
@@ -176,26 +160,13 @@ const Search = {
   _queued_query: null,
   _pulse_status: -1,
 
-  htmlToText: (htmlString, anchor) => {
+  htmlToText: (htmlString) => {
     const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html');
-    for (const removalQuery of [".headerlink", "script", "style"]) {
-      htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() });
-    }
-    if (anchor) {
-      const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`);
-      if (anchorContent) return anchorContent.textContent;
-
-      console.warn(
-        `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.`
-      );
-    }
-
-    // if anchor not specified or not found, fall back to main content
+    htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() });
     const docContent = htmlElement.querySelector('[role="main"]');
-    if (docContent) return docContent.textContent;
-
+    if (docContent !== undefined) return docContent.textContent;
     console.warn(
-      "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template."
+      "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template."
     );
     return "";
   },
@@ -268,7 +239,16 @@ const Search = {
     else Search.deferQuery(query);
   },
 
-  _parseQuery: (query) => {
+  /**
+   * execute search (requires search index to be loaded)
+   */
+  query: (query) => {
+    const filenames = Search._index.filenames;
+    const docNames = Search._index.docnames;
+    const titles = Search._index.titles;
+    const allTitles = Search._index.alltitles;
+    const indexEntries = Search._index.indexentries;
+
     // stem the search terms and add them to the correct list
     const stemmer = new Stemmer();
     const searchTerms = new Set();
@@ -304,38 +284,21 @@ const Search = {
     // console.info("required: ", [...searchTerms]);
     // console.info("excluded: ", [...excludedTerms]);
 
-    return [query, searchTerms, excludedTerms, highlightTerms, objectTerms];
-  },
-
-  /**
-   * execute search (requires search index to be loaded)
-   */
-  _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => {
-    const filenames = Search._index.filenames;
-    const docNames = Search._index.docnames;
-    const titles = Search._index.titles;
-    const allTitles = Search._index.alltitles;
-    const indexEntries = Search._index.indexentries;
-
-    // Collect multiple result groups to be sorted separately and then ordered.
-    // Each is an array of [docname, title, anchor, descr, score, filename].
-    const normalResults = [];
-    const nonMainIndexResults = [];
-
+    // array of [docname, title, anchor, descr, score, filename]
+    let results = [];
     _removeChildren(document.getElementById("search-progress"));
 
-    const queryLower = query.toLowerCase().trim();
+    const queryLower = query.toLowerCase();
     for (const [title, foundTitles] of Object.entries(allTitles)) {
-      if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) {
+      if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) {
         for (const [file, id] of foundTitles) {
-          const score = Math.round(Scorer.title * queryLower.length / title.length);
-          const boost = titles[file] === title ? 1 : 0;  // add a boost for document titles
-          normalResults.push([
+          let score = Math.round(100 * queryLower.length / title.length)
+          results.push([
             docNames[file],
             titles[file] !== title ? `${titles[file]} > ${title}` : title,
             id !== null ? "#" + id : "",
             null,
-            score + boost,
+            score,
             filenames[file],
           ]);
         }
@@ -345,47 +308,46 @@ const Search = {
     // search for explicit entries in index directives
     for (const [entry, foundEntries] of Object.entries(indexEntries)) {
       if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) {
-        for (const [file, id, isMain] of foundEntries) {
-          const score = Math.round(100 * queryLower.length / entry.length);
-          const result = [
+        for (const [file, id] of foundEntries) {
+          let score = Math.round(100 * queryLower.length / entry.length)
+          results.push([
             docNames[file],
             titles[file],
             id ? "#" + id : "",
             null,
             score,
             filenames[file],
-          ];
-          if (isMain) {
-            normalResults.push(result);
-          } else {
-            nonMainIndexResults.push(result);
-          }
+          ]);
         }
       }
     }
 
     // lookup as object
     objectTerms.forEach((term) =>
-      normalResults.push(...Search.performObjectSearch(term, objectTerms))
+      results.push(...Search.performObjectSearch(term, objectTerms))
     );
 
     // lookup as search terms in fulltext
-    normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms));
+    results.push(...Search.performTermsSearch(searchTerms, excludedTerms));
 
     // let the scorer override scores with a custom scoring function
-    if (Scorer.score) {
-      normalResults.forEach((item) => (item[4] = Scorer.score(item)));
-      nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item)));
-    }
-
-    // Sort each group of results by score and then alphabetically by name.
-    normalResults.sort(_orderResultsByScoreThenName);
-    nonMainIndexResults.sort(_orderResultsByScoreThenName);
-
-    // Combine the result groups in (reverse) order.
-    // Non-main index entries are typically arbitrary cross-references,
-    // so display them after other results.
-    let results = [...nonMainIndexResults, ...normalResults];
+    if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item)));
+
+    // now sort the results by score (in opposite order of appearance, since the
+    // display function below uses pop() to retrieve items) and then
+    // alphabetically
+    results.sort((a, b) => {
+      const leftScore = a[4];
+      const rightScore = b[4];
+      if (leftScore === rightScore) {
+        // same score: sort alphabetically
+        const leftTitle = a[1].toLowerCase();
+        const rightTitle = b[1].toLowerCase();
+        if (leftTitle === rightTitle) return 0;
+        return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
+      }
+      return leftScore > rightScore ? 1 : -1;
+    });
 
     // remove duplicate search results
     // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept
@@ -399,12 +361,7 @@ const Search = {
       return acc;
     }, []);
 
-    return results.reverse();
-  },
-
-  query: (query) => {
-    const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query);
-    const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms);
+    results = results.reverse();
 
     // for debugging
     //Search.lastresults = results.slice();  // a copy
@@ -509,18 +466,14 @@ const Search = {
       // add support for partial matches
       if (word.length > 2) {
         const escapedWord = _escapeRegExp(word);
-        if (!terms.hasOwnProperty(word)) {
-          Object.keys(terms).forEach((term) => {
-            if (term.match(escapedWord))
-              arr.push({ files: terms[term], score: Scorer.partialTerm });
-          });
-        }
-        if (!titleTerms.hasOwnProperty(word)) {
-          Object.keys(titleTerms).forEach((term) => {
-            if (term.match(escapedWord))
-              arr.push({ files: titleTerms[term], score: Scorer.partialTitle });
-          });
-        }
+        Object.keys(terms).forEach((term) => {
+          if (term.match(escapedWord) && !terms[word])
+            arr.push({ files: terms[term], score: Scorer.partialTerm });
+        });
+        Object.keys(titleTerms).forEach((term) => {
+          if (term.match(escapedWord) && !titleTerms[word])
+            arr.push({ files: titleTerms[word], score: Scorer.partialTitle });
+        });
       }
 
       // no match but word was a required one
@@ -543,8 +496,9 @@ const Search = {
 
       // create the mapping
       files.forEach((file) => {
-        if (!fileMap.has(file)) fileMap.set(file, [word]);
-        else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word);
+        if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1)
+          fileMap.get(file).push(word);
+        else fileMap.set(file, [word]);
       });
     });
 
@@ -595,8 +549,8 @@ const Search = {
    * search summary for a given text. keywords is a list
    * of stemmed words.
    */
-  makeSearchSummary: (htmlText, keywords, anchor) => {
-    const text = Search.htmlToText(htmlText, anchor);
+  makeSearchSummary: (htmlText, keywords) => {
+    const text = Search.htmlToText(htmlText);
     if (text === "") return null;
 
     const textLower = text.toLowerCase();
diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html
index 58a0354..398032e 100644
--- a/docs/_build/html/genindex.html
+++ b/docs/_build/html/genindex.html
@@ -1,13 +1,11 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
   <meta charset="utf-8" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Index &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -15,7 +13,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="#" />
@@ -314,6 +312,8 @@ <h2 id="G">G</h2>
       <li><a href="io.html#impresso_commons.path.path_fs.get_issueshortpath">get_issueshortpath() (in module impresso_commons.path.path_fs)</a>
 </li>
       <li><a href="images.html#impresso_commons.images.img_utils.get_jpg">get_jpg() (in module impresso_commons.images.img_utils)</a>
+</li>
+      <li><a href="utils.html#impresso_commons.utils.utils.get_list_intersection">get_list_intersection() (in module impresso_commons.utils.utils)</a>
 </li>
       <li><a href="versioning.html#impresso_commons.versioning.helpers.get_media_item_years">get_media_item_years() (in module impresso_commons.versioning.helpers)</a>
 </li>
@@ -348,6 +348,8 @@ <h2 id="G">G</h2>
       <li><a href="images.html#impresso_commons.images.img_utils.get_tif">get_tif() (in module impresso_commons.images.img_utils)</a>
 </li>
       <li><a href="versioning.html#impresso_commons.versioning.helpers.git_commit_push">git_commit_push() (in module impresso_commons.versioning.helpers)</a>
+</li>
+      <li><a href="utils.html#impresso_commons.utils.utils.glob_with_size">glob_with_size() (in module impresso_commons.utils.utils)</a>
 </li>
       <li><a href="versioning.html#impresso_commons.versioning.data_statistics.DataStatistics.granularity">granularity (impresso_commons.versioning.data_statistics.DataStatistics attribute)</a>
 
@@ -536,12 +538,16 @@ <h2 id="L">L</h2>
       <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.LANGIDENT">LANGIDENT (impresso_commons.versioning.helpers.DataStage attribute)</a>
 </li>
       <li><a href="versioning.html#impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING">LINGUISTIC_PROCESSING (impresso_commons.versioning.helpers.DataStage attribute)</a>
+</li>
+      <li><a href="io.html#impresso_commons.path.path_s3.list_files">list_files() (in module impresso_commons.path.path_s3)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
-      <li><a href="io.html#impresso_commons.path.path_s3.list_files">list_files() (in module impresso_commons.path.path_s3)</a>
+      <li><a href="utils.html#impresso_commons.utils.utils.list_local_directories">list_local_directories() (in module impresso_commons.utils.utils)</a>
 </li>
       <li><a href="io.html#impresso_commons.path.path_s3.list_newspapers">list_newspapers() (in module impresso_commons.path.path_s3)</a>
+</li>
+      <li><a href="utils.html#impresso_commons.utils.s3.list_s3_directories">list_s3_directories() (in module impresso_commons.utils.s3)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -727,6 +733,8 @@ <h2 id="S">S</h2>
       <li><a href="utils.html#impresso_commons.utils.s3.s3_get_articles">s3_get_articles() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="utils.html#impresso_commons.utils.s3.s3_get_pages">s3_get_pages() (in module impresso_commons.utils.s3)</a>
+</li>
+      <li><a href="utils.html#impresso_commons.utils.s3.s3_glob_with_size">s3_glob_with_size() (in module impresso_commons.utils.s3)</a>
 </li>
       <li><a href="io.html#impresso_commons.path.path_s3.s3_iter_bucket">s3_iter_bucket() (in module impresso_commons.path.path_s3)</a>
 </li>
diff --git a/docs/_build/html/images.html b/docs/_build/html/images.html
index f252ddb..0196e80 100644
--- a/docs/_build/html/images.html
+++ b/docs/_build/html/images.html
@@ -1,14 +1,12 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
-  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Image handling &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -16,7 +14,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
diff --git a/docs/_build/html/index.html b/docs/_build/html/index.html
index 1592d5a..4e07105 100644
--- a/docs/_build/html/index.html
+++ b/docs/_build/html/index.html
@@ -1,14 +1,12 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
-  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Welcome to Impresso PyCommons’s documentation! &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -16,7 +14,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
diff --git a/docs/_build/html/io.html b/docs/_build/html/io.html
index 89339a8..259d623 100644
--- a/docs/_build/html/io.html
+++ b/docs/_build/html/io.html
@@ -1,14 +1,12 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
-  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Input/Output &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -16,7 +14,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
diff --git a/docs/_build/html/objects.inv b/docs/_build/html/objects.inv
index 166a721..e514470 100644
Binary files a/docs/_build/html/objects.inv and b/docs/_build/html/objects.inv differ
diff --git a/docs/_build/html/py-modindex.html b/docs/_build/html/py-modindex.html
index 5474c92..be06ca2 100644
--- a/docs/_build/html/py-modindex.html
+++ b/docs/_build/html/py-modindex.html
@@ -1,13 +1,11 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
   <meta charset="utf-8" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Python Module Index &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -15,7 +13,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
diff --git a/docs/_build/html/rebuild.html b/docs/_build/html/rebuild.html
index f01c479..7491e7e 100644
--- a/docs/_build/html/rebuild.html
+++ b/docs/_build/html/rebuild.html
@@ -1,14 +1,12 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
-  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Text Rebuild &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -16,7 +14,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
@@ -204,7 +202,7 @@ <h2>Rebuild functions<a class="headerlink" href="#rebuild-functions" title="Link
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>level</strong> (<em>int</em>) – desired level of logging (default: logging.INFO)</p></li>
-<li><p><strong>file</strong> (<em>str</em>)</p></li>
+<li><p><strong>file</strong> (<em>str</em>) – </p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns<span class="colon">:</span></dt>
diff --git a/docs/_build/html/search.html b/docs/_build/html/search.html
index d737643..7258c61 100644
--- a/docs/_build/html/search.html
+++ b/docs/_build/html/search.html
@@ -1,13 +1,11 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
   <meta charset="utf-8" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Search &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
     
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
@@ -16,7 +14,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <script src="_static/searchtools.js"></script>
diff --git a/docs/_build/html/searchindex.js b/docs/_build/html/searchindex.js
index eafda0a..4cef3b7 100644
--- a/docs/_build/html/searchindex.js
+++ b/docs/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"Apache UIMA XMI Utils Functions": [[4, "module-impresso_commons.utils.uima"]], "Background information": [[0, "background-information"]], "Basic Utils Functions": [[4, "module-impresso_commons.utils.utils"]], "Case 1: tif": [[0, "case-1-tif"]], "Case 2: several png": [[0, "case-2-several-png"]], "Case 3: one png only": [[0, "case-3-one-png-only"]], "Case 4: one jpg only": [[0, "case-4-one-jpg-only"]], "Config File Loader": [[4, "module-impresso_commons.utils.config_loader"]], "Config file example": [[3, "config-file-example"]], "Contents:": [[1, null]], "Dask Utils Functions": [[4, "module-impresso_commons.utils.daskutils"]], "Data Manifest": [[5, "module-impresso_commons.versioning.data_manifest"]], "Data Statistics and Newspaper Statistics": [[5, "module-impresso_commons.versioning.data_statistics"]], "Data Versioning": [[5, null]], "General": [[2, "module-impresso_commons.path"]], "Helpers": [[3, "module-impresso_commons.text.helpers"]], "I/O from S3": [[2, "module-impresso_commons.path.path_s3"]], "I/O from file system": [[2, "module-impresso_commons.path.path_fs"]], "Image Utils": [[0, "module-impresso_commons.images.img_utils"]], "Image handling": [[0, null]], "Input/Output": [[2, null]], "Manifest Computing Script": [[5, "module-impresso_commons.versioning.compute_manifest"]], "Olive Boxes": [[0, "module-impresso_commons.images.olive_boxes"]], "Rebuild functions": [[3, "rebuild-functions"]], "S3 Utils Functions": [[4, "module-impresso_commons.utils.s3"]], "Text Rebuild": [[3, null]], "Utilities": [[4, null]], "Versioning Helpers": [[5, "module-impresso_commons.versioning.helpers"]], "Welcome to Impresso PyCommons\u2019s documentation!": [[1, null]]}, "docnames": ["images", "index", "io", "rebuild", "utils", "versioning"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, 
"sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2}, "filenames": ["images.rst", "index.rst", "io.rst", "rebuild.rst", "utils.rst", "versioning.rst"], "indexentries": {"add_by_ci_id() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_by_ci_id", false]], "add_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_by_title_year", false]], "add_count_list_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_count_list_by_title_year", false]], "add_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.add_counts", false]], "agg() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.agg", false]], "aggregate_stats_for_title() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.aggregate_stats_for_title", false]], "alternative_read_text() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.alternative_read_text", false]], "append_to_notes() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.append_to_notes", false]], "base (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.Base", false]], "boxstrategy (class in impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.BoxStrategy", false]], "bytes_to() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.bytes_to", false]], "canonical (impresso_commons.versioning.helpers.datastage attribute)": [[5, 
"impresso_commons.versioning.helpers.DataStage.CANONICAL", false]], "canonical_path() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.canonical_path", false]], "check_bucket() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_bucket", false]], "check_filenaming() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.check_filenaming", false]], "check_params() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_params", false]], "chunk() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.chunk", false]], "chunk() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.chunk", false]], "cleanup() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.cleanup", false]], "clone_git_repo() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.clone_git_repo", false]], "compose() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.compose", false]], "compress() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.compress", false]], "compute() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.compute", false]], "compute_box() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.compute_box", false]], "compute_image_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.compute_image_links", false]], "compute_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.compute_scale_factor", false]], "compute_stats_for_stage() (in module impresso_commons.versioning.compute_manifest)": [[5, 
"impresso_commons.versioning.compute_manifest.compute_stats_for_stage", false]], "compute_stats_in_canonical_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_canonical_bag", false]], "compute_stats_in_entities_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_entities_bag", false]], "compute_stats_in_langident_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_langident_bag", false]], "compute_stats_in_rebuilt_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_rebuilt_bag", false]], "compute_stats_in_solr_text_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_solr_text_bag", false]], "contentitem (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.ContentItem", false]], "convert_box() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.convert_box", false]], "count_keys (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.count_keys", false]], "count_keys (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.count_keys", false]], "counts (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.counts", false]], "counts (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.counts", false]], "counts_for_canonical_issue() (in module impresso_commons.versioning.helpers)": [[5, 
"impresso_commons.versioning.helpers.counts_for_canonical_issue", false]], "counts_for_rebuilt() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.counts_for_rebuilt", false]], "create_even_partitions() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.create_even_partitions", false]], "create_manifest() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.create_manifest", false]], "datamanifest (class in impresso_commons.versioning.data_manifest)": [[5, "impresso_commons.versioning.data_manifest.DataManifest", false]], "datastage (class in impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.DataStage", false]], "datastatistics (class in impresso_commons.versioning.data_statistics)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics", false]], "date (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.date", false]], "define_update_info_for_title() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.define_update_info_for_title", false]], "detect_canonical_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_canonical_issues", false]], "detect_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_issues", false]], "detect_journal_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_journal_issues", false]], "edition (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.edition", false]], "element (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.element", false]], "element 
(impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.element", false]], "embeddings (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.EMBEDDINGS", false]], "entities (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.ENTITIES", false]], "evenized (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.EVENIZED", false]], "extract_np_key() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.extract_np_key", false]], "extract_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.extract_version", false]], "fetch_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.fetch_files", false]], "filter_new_or_modified_media() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.filter_new_or_modified_media", false]], "finalize() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.finalize", false]], "find_s3_data_manifest_path() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.find_s3_data_manifest_path", false]], "fixed_s3fs_glob() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.fixed_s3fs_glob", false]], "from_json() (impresso_commons.utils.config_loader.base class method)": [[4, "impresso_commons.utils.config_loader.Base.from_json", false]], "generate_media_dict() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.generate_media_dict", false]], "get_boto3_bucket() (in module impresso_commons.utils.s3)": [[4, 
"impresso_commons.utils.s3.get_boto3_bucket", false]], "get_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_bucket", false]], "get_bucket_boto3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_bucket_boto3", false]], "get_count_keys() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.get_count_keys", false]], "get_files_to_consider() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.get_files_to_consider", false]], "get_head_commit_url() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_head_commit_url", false]], "get_iiif_and_coords() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.get_iiif_and_coords", false]], "get_iiif_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.get_iiif_links", false]], "get_iiif_url() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_iiif_url", false]], "get_img_from_archive() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_img_from_archive", false]], "get_imgdimensions() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_imgdimensions", false]], "get_issueshortpath() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.get_issueshortpath", false]], "get_jpg() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_jpg", false]], "get_media_item_years() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_media_item_years", false]], "get_media_titles() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_media_titles", false]], "get_or_create_bucket() 
(in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_or_create_bucket", false]], "get_page_folders() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_page_folders", false]], "get_pkg_resource() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.get_pkg_resource", false]], "get_png() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_png", false]], "get_s3_client() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_client", false]], "get_s3_connection() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_connection", false]], "get_s3_object_size() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_object_size", false]], "get_s3_resource() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_resource", false]], "get_s3_versions() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions", false]], "get_s3_versions_client() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions_client", false]], "get_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_scale_factor", false]], "get_storage_options() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_storage_options", false]], "get_tif() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_tif", false]], "git_commit_push() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.git_commit_push", false]], "granularity (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.granularity", false]], "granularity (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": 
[[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.granularity", false]], "has_title_year_key() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.has_title_year_key", false]], "has_value() (impresso_commons.versioning.helpers.datastage class method)": [[5, "impresso_commons.versioning.helpers.DataStage.has_value", false]], "id2issuedir() (in module impresso_commons.path)": [[2, "impresso_commons.path.id2IssueDir", false]], "impresso_commons.images.img_utils": [[0, "module-impresso_commons.images.img_utils", false]], "impresso_commons.images.olive_boxes": [[0, "module-impresso_commons.images.olive_boxes", false]], "impresso_commons.path": [[2, "module-impresso_commons.path", false]], "impresso_commons.path.path_fs": [[2, "module-impresso_commons.path.path_fs", false]], "impresso_commons.path.path_s3": [[2, "module-impresso_commons.path.path_s3", false]], "impresso_commons.text.helpers": [[3, "module-impresso_commons.text.helpers", false]], "impresso_commons.text.rebuilder": [[3, "module-impresso_commons.text.rebuilder", false]], "impresso_commons.utils.config_loader": [[4, "module-impresso_commons.utils.config_loader", false]], "impresso_commons.utils.daskutils": [[4, "module-impresso_commons.utils.daskutils", false]], "impresso_commons.utils.s3": [[4, "module-impresso_commons.utils.s3", false]], "impresso_commons.utils.uima": [[4, "module-impresso_commons.utils.uima", false]], "impresso_commons.utils.utils": [[4, "module-impresso_commons.utils.utils", false]], "impresso_commons.versioning.compute_manifest": [[5, "module-impresso_commons.versioning.compute_manifest", false]], "impresso_commons.versioning.data_manifest": [[5, "module-impresso_commons.versioning.data_manifest", false]], "impresso_commons.versioning.data_statistics": [[5, "module-impresso_commons.versioning.data_statistics", false]], "impresso_commons.versioning.helpers": [[5, 
"module-impresso_commons.versioning.helpers", false]], "impresso_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.impresso_iter_bucket", false]], "increment_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.increment_version", false]], "init_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.init_counts", false]], "init_logger() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.init_logger", false]], "init_logging() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.init_logging", false]], "init_media_info() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.init_media_info", false]], "init_yearly_count_dict() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.init_yearly_count_dict", false]], "insert_whitespace() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.insert_whitespace", false]], "is_git_repo() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.is_git_repo", false]], "issuedir (class in impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.IssueDir", false]], "issuedir (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.IssueDir", false]], "journal (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.journal", false]], "jpg_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.jpg_highest", false]], "jpg_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.jpg_uniq", false]], "langident 
(impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.LANGIDENT", false]], "linguistic_processing (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING", false]], "list_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_files", false]], "list_newspapers() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_newspapers", false]], "main() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.main", false]], "main() (in module impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.main", false]], "main() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.main", false]], "main() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.main", false]], "manifest_summary() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.manifest_summary", false]], "media_list_from_mft_json() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.media_list_from_mft_json", false]], "module": [[0, "module-impresso_commons.images.img_utils", false], [0, "module-impresso_commons.images.olive_boxes", false], [2, "module-impresso_commons.path", false], [2, "module-impresso_commons.path.path_fs", false], [2, "module-impresso_commons.path.path_s3", false], [3, "module-impresso_commons.text.helpers", false], [3, "module-impresso_commons.text.rebuilder", false], [4, "module-impresso_commons.utils.config_loader", false], [4, "module-impresso_commons.utils.daskutils", false], [4, "module-impresso_commons.utils.s3", false], [4, "module-impresso_commons.utils.uima", false], [4, "module-impresso_commons.utils.utils", false], [5, 
"module-impresso_commons.versioning.compute_manifest", false], [5, "module-impresso_commons.versioning.data_manifest", false], [5, "module-impresso_commons.versioning.data_statistics", false], [5, "module-impresso_commons.versioning.helpers", false]], "mysql_cis (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.MYSQL_CIS", false]], "new_media() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.new_media", false]], "newspaperstatistics (class in impresso_commons.versioning.data_statistics)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics", false]], "ocrqa (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.OCRQA", false]], "output_mft_s3_path (impresso_commons.versioning.data_manifest.datamanifest property)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.output_mft_s3_path", false]], "overall_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.overall_stats", false]], "pages_to_article() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.pages_to_article", false]], "pair_issue() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.pair_issue", false]], "parse_canonical_filename() (in module impresso_commons.path)": [[2, "impresso_commons.path.parse_canonical_filename", false]], "parse_json() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.parse_json", false]], "partitioner() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.partitioner", false]], "partitionerconfig (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.PartitionerConfig", false]], "passim 
(impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.PASSIM", false]], "path (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.path", false]], "png_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_highest", false]], "png_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_uniq", false]], "possible_count_keys (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.possible_count_keys", false]], "pretty_print() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.pretty_print", false]], "pretty_print() (impresso_commons.versioning.data_statistics.newspaperstatistics method)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.pretty_print", false]], "read_issue() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue", false]], "read_issue_pages() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue_pages", false]], "read_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.read_jsonlines", false]], "read_manifest_from_s3() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.read_manifest_from_s3", false]], "read_manifest_from_s3_path() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.read_manifest_from_s3_path", false]], "read_page() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_page", false]], "read_s3_issues() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.read_s3_issues", 
false]], "readtext_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.readtext_jsonlines", false]], "rebuild_for_passim() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_passim", false]], "rebuild_for_solr() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_solr", false]], "rebuild_issues() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_issues", false]], "rebuild_text() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text", false]], "rebuild_text_passim() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text_passim", false]], "rebuilt (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.REBUILT", false]], "rebuilt2xmi() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.rebuilt2xmi", false]], "reconstruct_iiif_link() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.reconstruct_iiif_link", false]], "rejoin_articles() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.rejoin_articles", false]], "remove_media_in_manifest() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.remove_media_in_manifest", false]], "replace_by_ci_id() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.replace_by_ci_id", false]], "replace_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.replace_by_title_year", false]], "run_cmd() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.run_cmd", false]], "s3_filter_archives() (in module 
impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3_filter_archives", false]], "s3_get_articles() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_articles", false]], "s3_get_pages() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_pages", false]], "s3_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3_iter_bucket", false]], "s3contentitem (class in impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3ContentItem", false]], "same_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.same_counts", false]], "same_counts() (impresso_commons.versioning.data_statistics.newspaperstatistics method)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.same_counts", false]], "select_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.select_issues", false]], "solr_embs (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_EMBS", false]], "solr_entities (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_ENTITIES", false]], "solr_text (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_TEXT", false]], "stage (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.stage", false]], "stage (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.stage", false]], "test() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.test", false]], "text_apply_breaks() (in module 
impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.text_apply_breaks", false]], "text_reuse (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.TEXT_REUSE", false]], "tif (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.tif", false]], "title_level_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.title_level_stats", false]], "to_dict() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.to_dict", false]], "topics (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.TOPICS", false]], "update_media_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.update_media_stats", false]], "upload() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.upload", false]], "upload() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload", false]], "upload_to_s3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload_to_s3", false]], "validate_against_schema() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.validate_against_schema", false]], "validate_and_export_manifest() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.validate_and_export_manifest", false]], "validate_config() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.validate_config", false]], "validate_granularity() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_granularity", false]], "validate_stage() (in module 
impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_stage", false]], "validate_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_version", false]], "version_as_list() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.version_as_list", false]], "write_and_push_to_git() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.write_and_push_to_git", false]], "write_dump_to_fs() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.write_dump_to_fs", false]]}, "objects": {"impresso_commons": [[2, 0, 0, "-", "path"]], "impresso_commons.images": [[0, 0, 0, "-", "img_utils"], [0, 0, 0, "-", "olive_boxes"]], "impresso_commons.images.img_utils": [[0, 1, 1, "", "BoxStrategy"], [0, 3, 1, "", "compose"], [0, 3, 1, "", "get_img_from_archive"], [0, 3, 1, "", "get_imgdimensions"], [0, 3, 1, "", "get_jpg"], [0, 3, 1, "", "get_page_folders"], [0, 3, 1, "", "get_png"], [0, 3, 1, "", "get_tif"], [0, 3, 1, "", "run_cmd"]], "impresso_commons.images.img_utils.BoxStrategy": [[0, 2, 1, "", "jpg_highest"], [0, 2, 1, "", "jpg_uniq"], [0, 2, 1, "", "png_highest"], [0, 2, 1, "", "png_uniq"], [0, 2, 1, "", "tif"]], "impresso_commons.images.olive_boxes": [[0, 3, 1, "", "compute_box"], [0, 3, 1, "", "compute_scale_factor"], [0, 3, 1, "", "convert_box"], [0, 3, 1, "", "get_iiif_url"], [0, 3, 1, "", "get_scale_factor"], [0, 3, 1, "", "test"]], "impresso_commons.path": [[2, 3, 1, "", "id2IssueDir"], [2, 3, 1, "", "parse_canonical_filename"], [2, 0, 0, "-", "path_fs"], [2, 0, 0, "-", "path_s3"]], "impresso_commons.path.path_fs": [[2, 2, 1, "", "ContentItem"], [2, 1, 1, "", "IssueDir"], [2, 3, 1, "", "canonical_path"], [2, 3, 1, "", "check_filenaming"], [2, 3, 1, "", "detect_canonical_issues"], [2, 3, 1, "", "detect_issues"], [2, 3, 1, "", "detect_journal_issues"], [2, 3, 1, "", 
"get_issueshortpath"], [2, 3, 1, "", "pair_issue"], [2, 3, 1, "", "select_issues"]], "impresso_commons.path.path_fs.IssueDir": [[2, 2, 1, "", "date"], [2, 2, 1, "", "edition"], [2, 2, 1, "", "journal"], [2, 2, 1, "", "path"]], "impresso_commons.path.path_s3": [[2, 2, 1, "", "IssueDir"], [2, 3, 1, "", "fetch_files"], [2, 3, 1, "", "impresso_iter_bucket"], [2, 3, 1, "", "list_files"], [2, 3, 1, "", "list_newspapers"], [2, 3, 1, "", "read_s3_issues"], [2, 1, 1, "", "s3ContentItem"], [2, 3, 1, "", "s3_filter_archives"], [2, 3, 1, "", "s3_iter_bucket"]], "impresso_commons.text": [[3, 0, 0, "-", "helpers"], [3, 0, 0, "-", "rebuilder"]], "impresso_commons.text.helpers": [[3, 3, 1, "", "get_iiif_and_coords"], [3, 3, 1, "", "insert_whitespace"], [3, 3, 1, "", "pages_to_article"], [3, 3, 1, "", "read_issue"], [3, 3, 1, "", "read_issue_pages"], [3, 3, 1, "", "read_page"], [3, 3, 1, "", "reconstruct_iiif_link"], [3, 3, 1, "", "rejoin_articles"], [3, 3, 1, "", "text_apply_breaks"]], "impresso_commons.text.rebuilder": [[3, 3, 1, "", "cleanup"], [3, 3, 1, "", "compress"], [3, 3, 1, "", "init_logging"], [3, 3, 1, "", "main"], [3, 3, 1, "", "rebuild_for_passim"], [3, 3, 1, "", "rebuild_for_solr"], [3, 3, 1, "", "rebuild_issues"], [3, 3, 1, "", "rebuild_text"], [3, 3, 1, "", "rebuild_text_passim"], [3, 3, 1, "", "upload"]], "impresso_commons.utils": [[4, 0, 0, "-", "config_loader"], [4, 0, 0, "-", "daskutils"], [4, 0, 0, "-", "s3"], [4, 0, 0, "-", "uima"], [4, 0, 0, "-", "utils"]], "impresso_commons.utils.config_loader": [[4, 1, 1, "", "Base"], [4, 1, 1, "", "PartitionerConfig"], [4, 3, 1, "", "main"]], "impresso_commons.utils.config_loader.Base": [[4, 4, 1, "", "check_bucket"], [4, 4, 1, "", "check_params"], [4, 4, 1, "", "from_json"], [4, 4, 1, "", "to_dict"]], "impresso_commons.utils.daskutils": [[4, 3, 1, "", "create_even_partitions"], [4, 3, 1, "", "main"], [4, 3, 1, "", "partitioner"]], "impresso_commons.utils.s3": [[4, 3, 1, "", "alternative_read_text"], [4, 3, 1, "", 
"fixed_s3fs_glob"], [4, 3, 1, "", "get_boto3_bucket"], [4, 3, 1, "", "get_bucket"], [4, 3, 1, "", "get_bucket_boto3"], [4, 3, 1, "", "get_or_create_bucket"], [4, 3, 1, "", "get_s3_client"], [4, 3, 1, "", "get_s3_connection"], [4, 3, 1, "", "get_s3_object_size"], [4, 3, 1, "", "get_s3_resource"], [4, 3, 1, "", "get_s3_versions"], [4, 3, 1, "", "get_s3_versions_client"], [4, 3, 1, "", "get_storage_options"], [4, 3, 1, "", "read_jsonlines"], [4, 3, 1, "", "readtext_jsonlines"], [4, 3, 1, "", "s3_get_articles"], [4, 3, 1, "", "s3_get_pages"], [4, 3, 1, "", "upload"], [4, 3, 1, "", "upload_to_s3"]], "impresso_commons.utils.uima": [[4, 3, 1, "", "compute_image_links"], [4, 3, 1, "", "get_iiif_links"], [4, 3, 1, "", "rebuilt2xmi"]], "impresso_commons.utils.utils": [[4, 3, 1, "", "bytes_to"], [4, 3, 1, "", "chunk"], [4, 3, 1, "", "get_pkg_resource"], [4, 3, 1, "", "init_logger"], [4, 3, 1, "", "parse_json"], [4, 3, 1, "", "validate_against_schema"]], "impresso_commons.versioning": [[5, 0, 0, "-", "compute_manifest"], [5, 0, 0, "-", "data_manifest"], [5, 0, 0, "-", "data_statistics"], [5, 0, 0, "-", "helpers"]], "impresso_commons.versioning.compute_manifest": [[5, 3, 1, "", "compute_stats_for_stage"], [5, 3, 1, "", "create_manifest"], [5, 3, 1, "", "extract_np_key"], [5, 3, 1, "", "get_files_to_consider"], [5, 3, 1, "", "main"], [5, 3, 1, "", "validate_config"]], "impresso_commons.versioning.data_manifest": [[5, 1, 1, "", "DataManifest"]], "impresso_commons.versioning.data_manifest.DataManifest": [[5, 4, 1, "", "add_by_ci_id"], [5, 4, 1, "", "add_by_title_year"], [5, 4, 1, "", "add_count_list_by_title_year"], [5, 4, 1, "", "aggregate_stats_for_title"], [5, 4, 1, "", "append_to_notes"], [5, 4, 1, "", "compute"], [5, 4, 1, "", "define_update_info_for_title"], [5, 4, 1, "", "generate_media_dict"], [5, 4, 1, "", "get_count_keys"], [5, 4, 1, "", "has_title_year_key"], [5, 4, 1, "", "init_yearly_count_dict"], [5, 4, 1, "", "new_media"], [5, 5, 1, "", "output_mft_s3_path"], [5, 4, 
1, "", "overall_stats"], [5, 4, 1, "", "replace_by_ci_id"], [5, 4, 1, "", "replace_by_title_year"], [5, 4, 1, "", "title_level_stats"], [5, 4, 1, "", "update_media_stats"], [5, 4, 1, "", "validate_and_export_manifest"]], "impresso_commons.versioning.data_statistics": [[5, 1, 1, "", "DataStatistics"], [5, 1, 1, "", "NewspaperStatistics"]], "impresso_commons.versioning.data_statistics.DataStatistics": [[5, 4, 1, "", "add_counts"], [5, 2, 1, "", "count_keys"], [5, 2, 1, "", "counts"], [5, 2, 1, "", "element"], [5, 2, 1, "", "granularity"], [5, 4, 1, "", "init_counts"], [5, 4, 1, "", "pretty_print"], [5, 4, 1, "", "same_counts"], [5, 2, 1, "", "stage"]], "impresso_commons.versioning.data_statistics.NewspaperStatistics": [[5, 2, 1, "", "count_keys"], [5, 2, 1, "", "counts"], [5, 2, 1, "", "element"], [5, 2, 1, "", "granularity"], [5, 2, 1, "", "possible_count_keys"], [5, 4, 1, "", "pretty_print"], [5, 4, 1, "", "same_counts"], [5, 2, 1, "", "stage"]], "impresso_commons.versioning.helpers": [[5, 1, 1, "", "DataStage"], [5, 3, 1, "", "agg"], [5, 3, 1, "", "chunk"], [5, 3, 1, "", "clone_git_repo"], [5, 3, 1, "", "compute_stats_in_canonical_bag"], [5, 3, 1, "", "compute_stats_in_entities_bag"], [5, 3, 1, "", "compute_stats_in_langident_bag"], [5, 3, 1, "", "compute_stats_in_rebuilt_bag"], [5, 3, 1, "", "compute_stats_in_solr_text_bag"], [5, 3, 1, "", "counts_for_canonical_issue"], [5, 3, 1, "", "counts_for_rebuilt"], [5, 3, 1, "", "extract_version"], [5, 3, 1, "", "filter_new_or_modified_media"], [5, 3, 1, "", "finalize"], [5, 3, 1, "", "find_s3_data_manifest_path"], [5, 3, 1, "", "get_head_commit_url"], [5, 3, 1, "", "get_media_item_years"], [5, 3, 1, "", "get_media_titles"], [5, 3, 1, "", "git_commit_push"], [5, 3, 1, "", "increment_version"], [5, 3, 1, "", "init_media_info"], [5, 3, 1, "", "is_git_repo"], [5, 3, 1, "", "manifest_summary"], [5, 3, 1, "", "media_list_from_mft_json"], [5, 3, 1, "", "read_manifest_from_s3"], [5, 3, 1, "", "read_manifest_from_s3_path"], [5, 
3, 1, "", "remove_media_in_manifest"], [5, 3, 1, "", "validate_granularity"], [5, 3, 1, "", "validate_stage"], [5, 3, 1, "", "validate_version"], [5, 3, 1, "", "version_as_list"], [5, 3, 1, "", "write_and_push_to_git"], [5, 3, 1, "", "write_dump_to_fs"]], "impresso_commons.versioning.helpers.DataStage": [[5, 2, 1, "", "CANONICAL"], [5, 2, 1, "", "EMBEDDINGS"], [5, 2, 1, "", "ENTITIES"], [5, 2, 1, "", "EVENIZED"], [5, 2, 1, "", "LANGIDENT"], [5, 2, 1, "", "LINGUISTIC_PROCESSING"], [5, 2, 1, "", "MYSQL_CIS"], [5, 2, 1, "", "OCRQA"], [5, 2, 1, "", "PASSIM"], [5, 2, 1, "", "REBUILT"], [5, 2, 1, "", "SOLR_EMBS"], [5, 2, 1, "", "SOLR_ENTITIES"], [5, 2, 1, "", "SOLR_TEXT"], [5, 2, 1, "", "TEXT_REUSE"], [5, 2, 1, "", "TOPICS"], [5, 4, 1, "", "has_value"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"], "4": ["py", "method", "Python method"], "5": ["py", "property", "Python property"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute", "3": "py:function", "4": "py:method", "5": "py:property"}, "terms": {"": [2, 3, 4, 5], "0": [2, 3, 4, 5], "00": 5, "000": 2, "00z": 5, "01": 2, "02": 2, "03": 4, "03t12": 5, "04": 5, "04t12": 5, "06": 0, "1": [2, 3, 4, 5], "10": [0, 2, 4, 5], "1000": 4, "10000": 2, "1024": 4, "10th": 2, "117": 3, "15": [2, 4], "1798": 2, "1889": 5, "1890": 2, "1900": [2, 3], "1930": 0, "1950": 2, "1960": 2, "1970": 2, "1990": 4, "1991": 4, "1998": 4, "1999": [2, 3], "2": [2, 3, 4, 5], "20": 4, "2024": 5, "25": 2, "3": [2, 5], "31": 5, "5": 5, "500": 4, "8": 5, "9": 5, "A": [3, 4, 5], "As": [4, 5], "By": 5, "For": [3, 5], "If": [2, 3, 4, 5], "In": [0, 2, 3, 5], "It": [0, 3, 4, 5], "On": 5, "The": [0, 2, 3, 4, 5], "These": 5, "To": 0, "_description_": [4, 5], "_format": 3, "_generation_d": 5, "_processing_stat": 5, "_vm": 5, "abc": 5, "abl": 5, "about": 5, "abs_path": 5, "absolut": 5, "abstract": 5, "access": 
5, "accod": 5, "accord": [0, 2], "accordingli": 5, "acquir": 0, "acquisit": [0, 3], "acronym": 2, "across": 5, "activ": 5, "actual": 5, "ad": [4, 5], "adapt": [2, 3], "add": 5, "add_by_ci_id": 5, "add_by_title_year": 5, "add_count": 5, "add_count_list_by_title_year": 5, "addit": [0, 3, 5], "advertis": 4, "after": [3, 4, 5], "against": [4, 5], "agg": 5, "agg_tu": 5, "aggreg": 5, "aggregate_stats_for_titl": 5, "agnost": 5, "alia": 2, "alias": 2, "all": [2, 3, 4, 5], "all_count": 5, "allow": 5, "along": 5, "alreadai": 5, "alreadi": 5, "also": [0, 5], "alter": 5, "alternative_read_text": 4, "alwai": [2, 5], "among": 0, "an": [0, 2, 3, 4, 5], "ani": [3, 4, 5], "annot": 4, "anoth": [0, 5], "apach": 1, "api": 3, "appear": 5, "append": [3, 5], "append_to_not": 5, "appli": [3, 5], "approach": 5, "ar": [0, 2, 3, 4, 5], "arbitrari": 5, "archiv": [0, 3, 4], "area": 3, "arg": 4, "argument": [4, 5], "arrai": 2, "arriv": 5, "articl": [2, 3, 4], "as_int": 5, "ask": 4, "associ": [2, 5], "assum": 4, "atom": 5, "attribut": [4, 5], "augment": 5, "avail": 0, "b": [0, 2, 3, 4], "bag": [2, 4, 5], "base": [0, 2, 3, 4, 5], "base_dir": 2, "basestr": 2, "basic": [1, 3, 5], "batch": 4, "been": [3, 5], "befor": 3, "begin": 2, "behavior": 5, "being": 3, "belong": 3, "benoit": 4, "best": 0, "better": 5, "between": [0, 5], "bewteen": 0, "bit": 1, "blank": [0, 2], "blob": 3, "bool": [0, 2, 3, 4, 5], "boolean": [2, 3, 4], "both": [2, 5], "boto": 4, "boto3": [3, 4], "boto3_bucket": 4, "botocor": 2, "boundari": [0, 5], "box": [1, 4], "box_strategi": 0, "boxstrategi": 0, "branch": 5, "break": [3, 4], "bsize": 4, "bucket": [2, 3, 4, 5], "bucket_nam": [2, 3, 4, 5], "bug": 4, "build": 2, "built": 3, "bypass": 4, "byte": [0, 4], "bytes_nb": 4, "bytes_to": 4, "bz2": [2, 4, 5], "call": [4, 5], "can": [0, 3, 4, 5], "cannot": 5, "canon": [2, 3, 4, 5], "canonical_bucket": 4, "canonical_path": 2, "canonical_vers": 2, "case": [3, 4, 5], "ceph": 2, "cf": [4, 5], "ch": [0, 4], "chang": 5, "charact": 3, "check": 
[0, 2, 4, 5], "check_bucket": 4, "check_filenam": 2, "check_param": 4, "child": 5, "choic": 0, "choos": 0, "chunk": [3, 4, 5], "chunksiz": 4, "ci": [3, 4, 5], "ci_id": 5, "citat": 5, "cl": 5, "class": [0, 2, 4, 5], "classic": 4, "classmethod": [4, 5], "cleanup": 3, "clear": 3, "cli": 3, "client": [2, 3, 4, 5], "clone": [3, 5], "clone_git_repo": 5, "close": 4, "cloud": 4, "cluster": 3, "cmd": 0, "code": [1, 2], "collect": 5, "com": [2, 3], "command": 5, "commit": 5, "commit_msg": 5, "compar": 5, "comparison": 5, "compli": 2, "compon": 2, "compos": 0, "compress": 3, "comput": [0, 1, 2, 4], "compute_box": 0, "compute_image_link": 4, "compute_manifest": 5, "compute_scale_factor": 0, "compute_stats_for_stag": 5, "compute_stats_in_canonical_bag": 5, "compute_stats_in_entities_bag": 5, "compute_stats_in_langident_bag": 5, "compute_stats_in_rebuilt_bag": 5, "compute_stats_in_solr_text_bag": 5, "concaten": 3, "conclud": 5, "conduct": 5, "config": [1, 2, 5], "config_dict": [2, 4, 5], "config_fil": 2, "config_load": 4, "config_newspap": 4, "configur": [2, 3, 4, 5], "conform": 3, "connect": [3, 4], "consid": [2, 4, 5], "consist": 5, "consisteni": 5, "construct": [3, 5], "consult": 5, "contain": [2, 3, 4, 5], "content": [2, 3, 4, 5], "content_item": 3, "content_items_in": 5, "content_items_out": 5, "contentitem": [2, 4], "context": [0, 4], "contextlib": 4, "convent": 2, "convers": 4, "convert": [0, 4, 5], "convert_box": 0, "coordin": [0, 3, 4], "copi": 2, "core": [2, 4, 5], "corpu": 5, "correct": 0, "correspond": [0, 2, 4, 5], "could": [3, 4, 5], "count": [4, 5], "count_kei": 5, "counts_for_canonical_issu": 5, "counts_for_rebuilt": 5, "coverag": 0, "cpu": 4, "creat": [2, 3, 4, 5], "create_even_partit": 4, "create_manifest": 5, "critic": 5, "crystal": 5, "current": [0, 3, 5], "custom": 4, "danger": 0, "dask": [1, 2, 3, 5], "dask_client": 3, "daskutil": 4, "data": [1, 2, 3, 4], "data_manifest": 5, "data_stag": 5, "data_statist": 5, "datamanifest": 5, "dataset": 5, "datastag": 5, 
"datastatist": 5, "datastatst": 5, "date": [2, 4, 5], "datetim": 4, "db": [2, 4, 5], "debug": [3, 5], "decid": 4, "default": [2, 3, 4, 5], "defin": 5, "define_update_info_for_titl": 5, "definit": 5, "defit": 4, "depend": 4, "depreci": 4, "descript": 4, "desir": [3, 4, 5], "dest": 0, "destin": 0, "detail": [3, 4], "detect": [2, 4], "detect_canonical_issu": 2, "detect_issu": 2, "detect_journal_issu": 2, "determin": [3, 5], "devis": 0, "dhlabsrv17": [0, 4], "dict": [2, 3, 4, 5], "dictionari": [4, 5], "did": 0, "differ": [0, 3, 4, 5], "dimens": 0, "dir": [2, 3, 5], "directori": [2, 3, 4, 5], "discrep": [0, 4], "displai": 5, "distribut": [2, 4], "dit": 2, "do": [0, 5], "doc": 4, "doc_typ": 2, "document": [2, 3, 5], "doe": 4, "doesn": 4, "don": 5, "drive": 4, "dump": 5, "duplic": 3, "dure": 5, "e": [0, 2, 3, 4, 5], "each": [2, 4, 5], "eas": 5, "edit": 2, "effici": 4, "eg": 5, "either": [2, 5], "element": [2, 4, 5], "els": [2, 3], "emb": 5, "embed": 5, "embeddings_el": 5, "empti": [2, 5], "enabl": 4, "end": [2, 5], "endpoint": [3, 4], "enrich": 5, "ensur": 5, "entiti": 5, "entri": 2, "enum": [0, 5], "environ": 4, "epfl": [0, 3, 4], "equal": 0, "especi": 3, "etc": [3, 5], "even": [4, 5], "everi": 5, "ex": 5, "exabyt": 4, "exact": 5, "exactli": 5, "exampl": [1, 2, 4, 5], "exclud": [2, 5], "exclus": 2, "execut": 0, "exist": [3, 4, 5], "exitstack": 4, "exp": 0, "expect": [3, 5], "explan": 2, "export": [4, 5], "export_to_git_and_s3": 5, "ext_check": 0, "extend": 5, "extended_summari": 5, "extens": [2, 4, 5], "extract": [4, 5], "extract_np_kei": 5, "extract_vers": 5, "f": [2, 3, 4], "factor": 0, "factori": [3, 4], "fail": 5, "fals": [0, 2, 4, 5], "fc": 3, "fetch": [2, 3, 5], "fetch_fil": 2, "field": [2, 5], "file": [0, 1, 5], "file_basenam": 2, "file_cont": 5, "file_extens": 5, "file_manag": 4, "file_typ": 2, "filenam": [2, 4, 5], "filenotfounderror": 4, "filepath": [3, 5], "files_bag": 5, "filesystem": 5, "filter": [2, 3, 4], "filter_config": 2, "filter_languag": 3, 
"filter_new_or_modified_media": 5, "final": 5, "find": 5, "find_s3_data_manifest_path": 5, "first": [2, 3], "fit": 0, "fixed_s3fs_glob": [4, 5], "flag": 5, "float": [0, 4, 5], "fly": 5, "fo": 3, "folder": 0, "follow": [3, 4, 5], "form": [2, 5], "format": [0, 3, 4, 5], "found": [0, 4, 5], "four": 5, "frequenc": 5, "from": [0, 1, 3, 4, 5], "from_json": 4, "from_sequ": 4, "ft": 4, "ft_token": 5, "full": [0, 5], "full_git_filepath": 5, "full_titl": 5, "fulltext": 3, "function": [0, 1, 5], "further": 5, "g": [0, 2, 3, 4], "gdl": [2, 3, 4], "gener": [1, 3, 4, 5], "generate_media_dict": 5, "get": [2, 3, 4, 5], "get_boto3_bucket": 4, "get_bucket": [2, 4], "get_bucket_boto3": 4, "get_count_kei": 5, "get_files_to_consid": 5, "get_head_commit_url": 5, "get_iiif_and_coord": 3, "get_iiif_link": 4, "get_iiif_url": 0, "get_img_from_arch": 0, "get_imgdimens": 0, "get_issueshortpath": 2, "get_jpg": 0, "get_media_item_year": 5, "get_media_titl": 5, "get_new_or_modified_media": 5, "get_or_create_bucket": 4, "get_page_fold": 0, "get_pkg_resourc": 4, "get_png": 0, "get_s3_client": [2, 4], "get_s3_connect": 4, "get_s3_object_s": 4, "get_s3_resourc": 4, "get_s3_vers": 4, "get_s3_versions_cli": 4, "get_scale_factor": 0, "get_storage_opt": 4, "get_tif": 0, "ggregat": 5, "gigabyt": 4, "git": [3, 5], "git_commit_push": 5, "git_repo": 5, "github": [2, 3, 5], "given": [0, 2, 3, 4, 5], "glob": 4, "goal": 5, "gr": 3, "granular": 5, "group": 3, "groupbi": 5, "guidelin": 5, "h": 0, "ha": [3, 5], "hand": 0, "handl": [1, 4, 5], "handler": 0, "happen": 0, "has_title_year_kei": 5, "has_valu": 5, "have": [0, 3, 4, 5], "height": 0, "heigth": 0, "help": 4, "helper": 1, "henc": 2, "here": [0, 4, 5], "highest": 0, "highli": 1, "hopefulli": 0, "host": 4, "host_url": 4, "how": [2, 5], "howto": 3, "http": [0, 2, 3, 4, 5], "i": [0, 1, 3, 4, 5], "i0002": 2, "id": [0, 2, 3, 4, 5], "id2issuedir": 2, "ideal": 5, "ident": 5, "identif": 5, "identifi": 0, "ignor": 0, "iiif": [0, 3, 4], "iiif_endpoint": 4, 
"iiif_impresso": [0, 4], "iiif_link": 4, "iiif_manifest_uri": 0, "iiif_map": 4, "imag": [1, 2, 3, 5], "image_data": 0, "images_resolut": 0, "img_dest_path": 0, "img_source_nam": 0, "img_source_path": 0, "img_util": 0, "imp": 4, "implement": [2, 5], "import": [2, 3], "impresso": [0, 2, 3, 4, 5], "impresso_common": [0, 2, 3, 4, 5], "impresso_iter_bucket": 2, "includ": [3, 5], "include_count": 5, "include_np": 5, "include_np_yr": 5, "inconsist": [3, 5], "increas": 5, "increment": 5, "increment_vers": 5, "indeplux": 5, "index": [3, 5], "indic": [0, 5], "individu": [2, 5], "info": [0, 3, 4, 5], "inform": [3, 4, 5], "infrastructur": 3, "ingest": [3, 5], "init": 5, "init_count": 5, "init_log": 3, "init_logg": [3, 4], "init_media_info": 5, "init_yearly_count_dict": 5, "initi": [4, 5], "initialis": [3, 4], "inject": 3, "inp_dir": 2, "input": [1, 3, 4, 5], "input_box": 0, "input_bucket": [2, 3], "input_data": 5, "insert": 3, "insert_whitespac": 3, "insid": 5, "instanc": [2, 4], "instanti": [4, 5], "instead": [4, 5], "int": [2, 3, 4, 5], "integ": 5, "integr": 5, "interfac": 5, "interv": 2, "invalid": 2, "ioerror": 5, "iption": 5, "is_git_repo": 5, "is_patch": 5, "isfil": 2, "isn": 5, "issu": [0, 2, 3, 4, 5], "issue1": 2, "issue2": 2, "issue_dir_path": 0, "issue_id": 4, "issue_json": 3, "issue_list1": 2, "issue_list2": 2, "issuedir": [2, 3, 4], "issuedirectori": 2, "item": [2, 3, 4, 5], "item_typ": 2, "iter": 2, "iter_bucket": 4, "its": [2, 3, 5], "jdg": 2, "journal": 2, "journal_filt": 2, "jp2": 0, "jpg_highest": 0, "jpg_uniq": 0, "json": [2, 3, 4, 5], "json_fil": [3, 4], "json_mft": 5, "json_to_valid": 4, "jsonl": [2, 4, 5], "justifi": 5, "k": [2, 4], "keep": [3, 4], "keep_ful": 4, "kei": [2, 3, 4, 5], "kept": 4, "key_batch": 4, "key_nam": [2, 4], "key_suffix": 2, "keyerror": [4, 5], "kilobyt": 4, "know": 5, "lang": [3, 5], "lang_fd": 5, "langid": 5, "languag": 3, "last": [2, 4, 5], "last_modif_d": 5, "later": 5, "latest": 5, "layer": 4, "lazi": 5, "leakag": 5, "leav": 4, 
"left": 0, "len": 5, "level": [3, 4, 5], "lf": 5, "lg": 3, "lib": 4, "librari": [0, 4], "like": 0, "limit": 4, "line": [3, 4, 5], "line_by_lin": 4, "lingproc": 5, "linguistic_process": 5, "link": [3, 4], "list": [2, 3, 4, 5], "list_fil": 2, "list_newspap": 2, "ll": [3, 5], "load": [2, 4, 5], "loader": 1, "local": [3, 4, 5], "local_f": 4, "local_path": 4, "locat": 2, "log": [2, 3, 4, 5], "logger": [3, 4], "logic": 5, "longer": 4, "look": [0, 5], "lower": [0, 5], "lsit": 5, "m": [4, 5], "main": [3, 4, 5], "major": 5, "make": 5, "manag": 4, "manifest": [1, 3, 4], "manifest_config": 5, "manifest_data": 5, "manifest_json": 5, "manifest_s3_path": 5, "manifest_summari": 5, "map": [4, 5], "markdown": 5, "master": [2, 5], "match": 5, "maximum": 2, "mb": 5, "md": [2, 3, 5], "mean": [2, 5], "media": 5, "media_dict": 5, "media_items_year": 5, "media_list": 5, "media_list_from_mft_json": 5, "media_titl": 5, "megabyt": 4, "member": [3, 5], "memori": 4, "merg": 3, "messag": [2, 5], "metadata": [3, 4], "method": [4, 5], "minor": 5, "miss": [3, 5], "mnf_json": 5, "modif": 5, "modif_d": 5, "modifi": [4, 5], "modified_media_item_2": 5, "modul": [0, 1, 5], "more": [3, 4, 5], "move": [0, 5], "mssing": 5, "mybucket": 2, "mysql": 5, "mysql_ci": 5, "n": [0, 4], "name": [0, 2, 3, 4, 5], "name_check": 0, "name_or_path": 5, "nb": [2, 3, 4], "nb_partit": 4, "nbpart": 4, "ne": [4, 5], "ne_ent": 5, "ne_ment": 5, "necessari": 5, "need": [0, 3, 4, 5], "neg": 2, "neither": 5, "new": [0, 4, 5], "new_count": 5, "new_manifest": 5, "new_media": 5, "new_media_item_1": 5, "new_or_modifi": 5, "new_vers": 5, "newli": [4, 5], "newspap": [1, 2, 3, 4], "newspaper_prefix": 4, "newspapers_filt": 2, "newspaperstatist": 5, "next_t": 3, "non": 5, "none": [0, 2, 3, 4, 5], "nor": 5, "normal": 0, "note": 5, "notimplementederror": 2, "now": 4, "np": 5, "nps_stat": 5, "number": [2, 3, 4, 5], "number_partit": 4, "nuniqu": 5, "nw": [3, 5], "nworker": [3, 5], "o": [1, 4], "ob": 3, "object": [1, 2, 3, 4, 5], "occur": 5, 
"ocr": [0, 4], "ocrqa": 5, "od": 3, "offset": [3, 4], "old_media_list": 5, "oliv": 1, "olive_box": 0, "onc": [4, 5], "one": [2, 3, 4, 5], "ones": [0, 5], "onli": [2, 4, 5], "only_count": 5, "open": [2, 3], "oper": 5, "option": [2, 3, 4, 5], "orient": 4, "origin": [2, 5], "other": [0, 5], "other_stat": 5, "otherwis": [3, 4, 5], "our": [0, 2, 3, 4], "out": 4, "outp_dir": 3, "output": [1, 3, 4, 5], "output_bucket_nam": 5, "output_dir": [3, 4], "output_mft_s3_path": 5, "outsid": 5, "over": 2, "overal": 5, "overall_stat": 5, "overrid": 5, "overriden": 5, "overwrit": 5, "own": 5, "p": [4, 5], "p0001": [0, 2, 4], "paassim": 5, "packag": [4, 5], "pad": 4, "page": [0, 2, 3, 4, 5], "page_digit": 0, "page_id": 0, "page_kei": 3, "page_nam": 4, "page_s": 2, "page_width": 0, "page_xml": 0, "pages": 2, "pages_to_articl": 3, "pagin": 2, "pair": [2, 5], "pair_issu": 2, "paragraph": 3, "parallel": [3, 4], "param": [2, 3, 4], "paramet": [0, 2, 3, 4, 5], "pars": 2, "parse_canonical_filenam": 2, "parse_json": 4, "part": [3, 5], "parti": 4, "partial": 5, "particular": 5, "partit": [2, 4, 5], "partition": 4, "partition_nam": 4, "partition_s": 2, "partitionerconfig": 4, "passim": [3, 5], "patch": 5, "patched_field": 5, "path": [0, 2, 3, 4, 5], "path_check": 0, "path_f": 2, "path_img_on": 0, "path_img_thre": 0, "path_img_two": 0, "path_in_repo": 5, "path_s3": 2, "path_to_schema": 4, "path_typ": 2, "path_within_bucket": 4, "pathlib": 4, "pct": [0, 4], "pct_coordin": 4, "per": [4, 5], "perform": 5, "petabyt": 4, "pipelin": 5, "place": 5, "pleas": 4, "pluck": 4, "pm": 3, "png_highest": 0, "png_uniq": 0, "point": [4, 5], "portion": 5, "posit": 2, "posixpath": 4, "possibl": [0, 2, 4, 5], "possible_count_kei": 5, "possibli": 2, "potenti": 5, "precis": 5, "prefix": [2, 4], "prepar": 4, "preprocess": 5, "present": [0, 2, 3, 5], "pretti": 5, "pretty_print": 5, "prev": 3, "prev_t": 3, "prev_vers": 5, "prev_version_year": 5, "previou": [3, 5], "previous": 5, "previous_manifest": 5, 
"previous_mft_json": 5, "previous_mft_path": 5, "print": [4, 5], "prior": 4, "priorit": 4, "problem": 5, "process": [3, 4, 5], "processed_year": 5, "produc": 4, "progress": 5, "project": 5, "proper": [0, 4], "properli": 3, "properti": 5, "provid": [0, 4, 5], "pure": 4, "purpos": 3, "push": 5, "push_to_git": 5, "py": [2, 3, 4, 5], "pyimag": 4, "python": 1, "qualnam": [0, 5], "r": 2, "radio": 5, "radiostatist": 5, "rais": [2, 4, 5], "ratio": 0, "re": [0, 5], "read": [0, 2, 3, 4, 5], "read_issu": 3, "read_issue_pag": 3, "read_jsonlin": 4, "read_manifest_from_s3": 5, "read_manifest_from_s3_path": 5, "read_pag": 3, "read_s3_issu": 2, "read_text": 4, "readi": 5, "readtext_jsonlin": 4, "reason": [4, 5], "rebuild": 1, "rebuild_articl": 3, "rebuild_for_passim": 3, "rebuild_for_solr": 3, "rebuild_issu": 3, "rebuild_text": 3, "rebuild_text_passim": 3, "rebuilt": [3, 4, 5], "rebuilt2xmi": 4, "rebuilt_articl": 5, "rebuilt_ci": 5, "rebuilt_mft_json": 5, "rebuilt_mft_path": 5, "rebuilt_vers": 2, "reconstruct_iiif_link": 3, "reduc": 5, "regex": 5, "region": 3, "rejoin_articl": 3, "rel": [0, 5], "relat": [4, 5], "releas": [3, 5], "relev": 5, "reli": 0, "remain": 2, "remov": [3, 5], "remove_media_in_manifest": 5, "replac": 5, "replace_by_ci_id": 5, "replace_by_title_year": 5, "repo": [3, 5], "repo_nam": 5, "repositori": [2, 3, 4, 5], "repres": [2, 5], "represent": [3, 5], "request": 4, "requir": 5, "reset": 5, "resolut": 0, "resourc": [3, 4], "respec": 5, "respect": 5, "result": [0, 2, 4, 5], "retain": 5, "retriev": [3, 4, 5], "return": [0, 2, 3, 4, 5], "return_value_str": 5, "reus": [4, 5], "reusabl": [1, 4], "right": 0, "root": [2, 3, 4], "rootlogg": [3, 4], "rubric": 2, "run": [3, 5], "run_cmd": 0, "runai": 3, "rytp": 3, "s3": [1, 3, 5], "s3_bucket_partit": 4, "s3_bucket_process": 4, "s3_bucket_rebuilt": 4, "s3_canonical_issu": 5, "s3_client": [2, 3], "s3_credenti": 4, "s3_data": 2, "s3_entiti": 5, "s3_filter_arch": 2, "s3_get_articl": 4, "s3_get_pag": 4, "s3_host": 4, 
"s3_input_bucket": 5, "s3_iter_bucket": 2, "s3_kei": [4, 5], "s3_langid": 5, "s3_output_bucket": 5, "s3_solr_text": 5, "s3_version": 3, "s3contentitem": 2, "s3f": 4, "s3r": 4, "same": [0, 5], "same_count": 5, "sanity_check": 2, "sanitycheck": 2, "scale": [0, 3], "scale_factor": 0, "sch": [3, 5], "schedul": [3, 5], "schema": [3, 4, 5], "script": [1, 3], "se_access_kei": 4, "se_secret_kei": 4, "second": 2, "see": [0, 2, 3], "seem": [0, 2, 4], "select": 2, "select_issu": 2, "self": 5, "separ": [0, 2, 4, 5], "sequenc": 2, "serial": [3, 5], "serv": 4, "server": 4, "serviceresourc": [3, 4], "set": [2, 3, 4, 5], "sever": 5, "shell": 0, "short": [2, 4], "should": [0, 3, 4, 5], "show": 5, "shuffl": 4, "signatur": 3, "singl": 3, "size": [2, 4, 5], "skip": 2, "so": [3, 5], "solr": [3, 5], "solr_cor": 4, "solr_emb": 5, "solr_ent": 5, "solr_serv": 4, "solr_text": 5, "solv": 4, "some": [3, 5], "sort": 3, "sort_kei": 3, "sourc": [0, 4], "space": 0, "span": 4, "specif": 5, "specifi": [2, 3, 4], "ssh": 5, "stage": 5, "start": [0, 4, 5], "stat": 5, "statisit": 5, "statist": 1, "stats_as_dict": 5, "stderr": 0, "stdout": 0, "step": 5, "still": 5, "storag": 3, "store": [3, 4, 5], "str": [0, 2, 3, 4, 5], "straightforward": 5, "strategi": 0, "strenum": 5, "string": [0, 2, 3, 4, 5], "structur": [2, 5], "subset": 5, "success": [3, 4, 5], "successfulli": 3, "suffix": 2, "suit": 5, "summari": [4, 5], "support": [0, 2, 4], "switch": 4, "switchengin": 2, "system": [1, 5], "t": [4, 5], "tag": 0, "take": [0, 2, 4], "target": [4, 5], "task": 4, "tell": [3, 5], "temp": 3, "temp_dir": 5, "temporari": 3, "terabyt": 4, "test": [0, 5], "testb": 4, "text": [1, 4, 5], "text_apply_break": 3, "text_reus": 5, "text_reuse_clust": 5, "text_reuse_passag": 5, "textual": 4, "than": 4, "thei": 5, "them": 5, "therefor": 0, "thi": [0, 3, 4, 5], "thing": [0, 5], "third": 4, "thought": 3, "three": 5, "through": 5, "time": [2, 5], "titl": 5, "title_level_stat": 5, "title_stat": 5, "to_dict": 4, "to_start": 5, 
"to_unit": 4, "todo": [2, 5], "togeth": 5, "token": 3, "took": [0, 5], "topic": [4, 5], "tp": 3, "track": 5, "transform": 3, "transpar": 5, "tree": 2, "true": [2, 4, 5], "tupl": [2, 3, 4, 5], "turn": 4, "two": [4, 5], "txt": [0, 2], "type": [0, 2, 3, 4, 5], "typeerror": 5, "typesystem": 4, "typesystem_path": 4, "typic": 5, "uima": 1, "unabl": 4, "undesir": 2, "union": 5, "unit": [0, 4], "until": 4, "updat": 5, "update_level": 5, "update_media_stat": 5, "update_typ": 5, "updated_field": 5, "updated_year": 5, "upload": [3, 4, 5], "upload_success": 3, "upload_to_s3": 4, "upper": 0, "url": [0, 4, 5], "us": [0, 2, 3, 4, 5], "usag": [2, 3, 4, 5], "user": 5, "usual": 0, "util": [1, 3, 5], "v": [2, 5], "valid": [2, 4, 5], "validate_against_schema": 4, "validate_and_export_manifest": 5, "validate_config": 5, "validate_granular": 5, "validate_stag": 5, "validate_vers": 5, "valu": [0, 2, 4, 5], "valueerror": 5, "variabl": 4, "variat": 3, "variou": [3, 4, 5], "verbos": [3, 4, 5], "verifi": 5, "verison": 5, "version": [1, 2, 4], "version_as_list": 5, "versison": 5, "vesion": 5, "view": 4, "visual": 3, "vm": 5, "w": 0, "wa": [0, 3, 5], "wai": 5, "want": 4, "warn": [2, 4], "we": 0, "well": 4, "were": [0, 5], "what": [4, 5], "whatev": 5, "when": 5, "where": [2, 3, 4, 5], "whether": [2, 3, 4, 5], "which": [0, 3, 4, 5], "white_list": 5, "whitelist": 5, "whitespac": 3, "whose": 4, "width": 0, "within": [1, 2, 4, 5], "without": 4, "won": 5, "work": 3, "worker": [3, 4, 5], "write": [3, 4, 5], "write_and_push_to_git": 5, "write_dump_to_f": 5, "writen": 5, "written": 5, "x": 0, "xmi": 1, "xml": 0, "y": 0, "year": [2, 3, 4, 5], "yearli": [4, 5], "yearly_stat": 5, "yet": 4, "yield": 4, "you": 4, "zhdk": 4, "zip": 0, "zipfil": 0}, "titles": ["Image handling", "Welcome to Impresso PyCommons\u2019s documentation!", "Input/Output", "Text Rebuild", "Utilities", "Data Versioning"], "titleterms": {"": 1, "1": 0, "2": 0, "3": 0, "4": 0, "apach": 4, "background": 0, "basic": 4, "box": 0, "case": 0, 
"comput": 5, "config": [3, 4], "content": 1, "dask": 4, "data": 5, "document": 1, "exampl": 3, "file": [2, 3, 4], "from": 2, "function": [3, 4], "gener": 2, "handl": 0, "helper": [3, 5], "i": 2, "imag": 0, "impresso": 1, "inform": 0, "input": 2, "jpg": 0, "loader": 4, "manifest": 5, "newspap": 5, "o": 2, "oliv": 0, "one": 0, "onli": 0, "output": 2, "png": 0, "pycommon": 1, "rebuild": 3, "s3": [2, 4], "script": 5, "sever": 0, "statist": 5, "system": 2, "text": 3, "tif": 0, "uima": 4, "util": [0, 4], "version": 5, "welcom": 1, "xmi": 4}})
\ No newline at end of file
+Search.setIndex({"docnames": ["images", "index", "io", "rebuild", "utils", "versioning"], "filenames": ["images.rst", "index.rst", "io.rst", "rebuild.rst", "utils.rst", "versioning.rst"], "titles": ["Image handling", "Welcome to Impresso PyCommons\u2019s documentation!", "Input/Output", "Text Rebuild", "Utilities", "Data Versioning"], "terms": {"class": [0, 2, 4, 5], "impresso_common": [0, 2, 3, 4, 5], "img_util": 0, "boxstrategi": 0, "valu": [0, 2, 4, 5], "name": [0, 2, 3, 4, 5], "none": [0, 2, 3, 4, 5], "modul": [0, 1, 5], "qualnam": [0, 5], "type": [0, 2, 3, 4, 5], "start": [0, 4, 5], "boundari": [0, 5], "base": [0, 2, 3, 4, 5], "enum": [0, 5], "jpg_highest": 0, "jpg_uniq": 0, "png_highest": 0, "png_uniq": 0, "compos": 0, "path_img_on": 0, "path_img_two": 0, "path_img_thre": 0, "get_img_from_arch": 0, "archiv": [0, 3, 4], "path_check": 0, "ext_check": 0, "name_check": 0, "get_imgdimens": 0, "image_data": 0, "return": [0, 2, 3, 4, 5], "height": 0, "width": 0, "get_jpg": 0, "page_digit": 0, "get_page_fold": 0, "get_png": 0, "get_tif": 0, "run_cmd": 0, "cmd": 0, "execut": 0, "shell": 0, "result": [0, 2, 4, 5], "stdout": 0, "stderr": 0, "function": [0, 1, 5], "support": [0, 2, 4], "re": [0, 5], "comput": [0, 1, 2, 4], "coordin": [0, 3, 4], "olive_box": 0, "compute_box": 0, "scale_factor": 0, "input_box": 0, "iiif": [0, 3, 4], "rel": [0, 5], "paramet": [0, 2, 3, 4, 5], "float": [0, 4, 5], "ratio": 0, "between": [0, 5], "differ": [0, 3, 4, 5], "dimens": 0, "str": [0, 2, 3, 4, 5], "string": [0, 2, 3, 4, 5], "separ": [0, 2, 4, 5], "space": 0, "new": [0, 4, 5], "compute_scale_factor": 0, "img_source_path": 0, "img_dest_path": 0, "x": 0, "scale": [0, 3], "factor": 0, "bewteen": 0, "full": [0, 4, 5], "path": [0, 2, 3, 4, 5], "sourc": [0, 4], "destin": 0, "convert_box": 0, "convert": [0, 4, 5], "y": 0, "w": 0, "h": 0, "upper": 0, "left": 0, "lower": [0, 5], "right": 0, "get_iiif_url": 0, "page_id": 0, "http": [0, 2, 3, 4, 5], "dhlabsrv17": [0, 4], "epfl": [0, 3, 4], "ch": 
[0, 4], "iiif_impresso": [0, 4], "iiif_manifest_uri": 0, "pct": [0, 4], "bool": [0, 2, 3, 4, 5], "fals": [0, 2, 4, 5], "impresso": [0, 2, 3, 4, 5], "url": [0, 4, 5], "given": [0, 2, 3, 4, 5], "page": [0, 2, 3, 4, 5], "id": [0, 2, 3, 4, 5], "e": [0, 2, 3, 4, 5], "g": [0, 2, 3, 4], "exp": 0, "1930": 0, "06": 0, "10": [0, 2, 4, 5], "p0001": [0, 2, 4], "blank": [0, 2], "get_scale_factor": 0, "issue_dir_path": 0, "page_xml": 0, "box_strategi": 0, "img_source_nam": 0, "context": [0, 4], "strategi": 0, "choos": 0, "issu": [0, 2, 3, 4, 5], "zipfil": 0, "zip": 0, "byte": [0, 4], "xml": 0, "handler": 0, "found": [0, 4, 5], "info": [0, 3, 4, 5], "txt": [0, 2, 4], "from": [0, 1, 3, 4, 5], "jp2": 0, "folder": 0, "hopefulli": 0, "correct": 0, "librari": [0, 4], "take": [0, 2, 4], "best": 0, "avail": 0, "highest": 0, "were": [0, 5], "accord": [0, 2], "an": [0, 2, 3, 4, 5], "which": [0, 3, 4, 5], "we": 0, "have": [0, 3, 4, 5], "identifi": 0, "among": 0, "format": [0, 3, 4, 5], "coverag": 0, "i": [0, 1, 3, 4, 5], "devis": 0, "The": [0, 2, 3, 4, 5], "present": [0, 2, 3, 5], "file": [0, 1, 5], "wa": [0, 3, 5], "dest": 0, "can": [0, 3, 4, 5], "therefor": 0, "us": [0, 2, 3, 4, 5], "need": [0, 3, 4, 5], "read": [0, 2, 3, 4, 5], "normal": 0, "In": [0, 2, 3, 5], "thi": [0, 3, 4, 5], "acquir": 0, "It": [0, 3, 4, 5], "look": [0, 5], "also": [0, 5], "took": [0, 5], "ocr": [0, 4], "possibl": [0, 2, 4, 5], "reli": 0, "resolut": 0, "indic": [0, 5], "should": [0, 3, 4, 5], "same": [0, 5], "our": [0, 2, 3, 4], "n": [0, 4], "b": [0, 2, 3, 4], "heigth": 0, "do": [0, 5], "correspond": [0, 2, 4, 5], "usual": 0, "discrep": [0, 4], "tag": 0, "images_resolut": 0, "hand": 0, "page_width": 0, "other": [0, 5], "seem": [0, 2, 4], "ignor": 0, "current": [0, 3, 5], "here": [0, 4, 5], "ar": [0, 2, 3, 4, 5], "equal": 0, "To": 0, "check": [0, 2, 4, 5], "happen": 0, "choic": 0, "acquisit": [0, 3], "addit": [0, 3, 5], "see": [0, 2, 3], "ones": [0, 5], "danger": 0, "anoth": [0, 5], "did": 0, "provid": [0, 4, 5], 
"thing": [0, 5], "fit": 0, "like": 0, "test": [0, 5], "move": [0, 5], "proper": [0, 4], "unit": [0, 4], "python": 1, "bit": 1, "code": [1, 2], "object": [1, 2, 3, 4, 5], "highli": 1, "reusabl": [1, 4], "within": [1, 2, 4, 5], "input": [1, 3, 4, 5], "output": [1, 3, 4, 5], "gener": [1, 3, 4, 5], "o": [1, 4], "system": [1, 5], "s3": [1, 3, 5], "text": [1, 4, 5], "rebuild": 1, "helper": 1, "config": [1, 2, 5], "exampl": [1, 2, 4, 5], "util": [1, 3, 5], "basic": [1, 3, 5], "dask": [1, 2, 3, 5], "apach": 1, "uima": 1, "xmi": 1, "loader": 1, "imag": [1, 2, 3, 5], "handl": [1, 4, 5], "oliv": 1, "box": [1, 4], "data": [1, 2, 3, 4], "version": [1, 2, 4], "statist": 1, "newspap": [1, 2, 3, 4], "manifest": [1, 3, 4], "script": [1, 3], "id2issuedir": 2, "todo": [2, 5], "document": [2, 3, 5], "parse_canonical_filenam": 2, "filenam": [2, 4, 5], "pars": 2, "canon": [2, 3, 4, 5], "its": [2, 3, 4, 5], "compon": 2, "tupl": [2, 3, 4, 5], "gdl": [2, 3, 4], "1950": 2, "01": 2, "02": 2, "i0002": 2, "2": [2, 3, 4, 5], "": [2, 3, 4, 5], "directori": [2, 3, 4, 5], "structur": [2, 5], "path_f": 2, "contentitem": [2, 4], "alia": 2, "item": [2, 3, 4, 5], "issuedir": [2, 3, 4], "journal": 2, "date": [2, 4, 5], "edit": 2, "field": [2, 5], "number": [2, 3, 4, 5], "1": [2, 3, 4, 5], "0": [2, 3, 4, 5], "3": [2, 5], "canonical_path": 2, "dir": [2, 3, 5], "extens": [2, 4, 5], "path_typ": 2, "creat": [2, 3, 4, 5], "repres": [2, 5], "onli": [2, 4, 5], "build": 2, "check_filenam": 2, "file_basenam": 2, "whether": [2, 3, 4, 5], "compli": 2, "convent": 2, "1900": [2, 3], "detect_canonical_issu": 2, "base_dir": 2, "detect": [2, 4], "import": [2, 3], "nb": [2, 3, 4], "invalid": 2, "skip": 2, "warn": [2, 4], "messag": [2, 5], "log": [2, 3, 4, 5], "root": [2, 3, 4], "list": [2, 3, 4, 5], "consid": [2, 4, 5], "acronym": 2, "instanc": [2, 4], "detect_issu": 2, "journal_filt": 2, "exclud": [2, 5], "basestr": 2, "set": [2, 3, 4, 5], "filter": [2, 3, 4], "posit": 2, "neg": 2, "boolean": [2, 3, 4], 
"detect_journal_issu": 2, "get_issueshortpath": 2, "short": [2, 4], "pair_issu": 2, "issue_list1": 2, "issue_list2": 2, "associ": [2, 5], "pair": [2, 5], "origin": [2, 5], "repositori": [2, 3, 4, 5], "arrai": 2, "contain": [2, 3, 4, 5], "issue1": 2, "issue2": 2, "select_issu": 2, "config_dict": [2, 4, 5], "inp_dir": 2, "configur": [2, 3, 4, 5], "select": 2, "md": [2, 3, 5], "explan": 2, "usag": [2, 3, 4, 5], "config_fil": 2, "isfil": 2, "open": [2, 3], "r": 2, "f": [2, 3, 4], "json": [2, 3, 4, 5], "load": [2, 4, 5], "els": [2, 3], "dict": [2, 3, 4, 5], "dit": 2, "where": [2, 3, 4, 5], "get": [2, 3, 4, 5], "path_s3": 2, "issuedirectori": 2, "fetch_fil": 2, "bucket_nam": [2, 3, 4, 5], "true": [2, 4, 5], "file_typ": 2, "newspapers_filt": 2, "bag": [2, 4, 5], "fetch": [2, 3, 5], "bucket": [2, 3, 4, 5], "If": [2, 3, 4, 5], "content": [2, 3, 4, 5], "all": [2, 3, 4, 5], "specifi": [2, 3, 4], "remain": 2, "distribut": [2, 4], "both": [2, 5], "alwai": [2, 5], "first": [2, 3], "element": [2, 4, 5], "second": 2, "henc": 2, "entri": 2, "undesir": 2, "adapt": [2, 3], "github": [2, 3, 5], "com": [2, 3], "sanitycheck": 2, "tree": 2, "master": [2, 5], "sanity_check": 2, "s3_data": 2, "py": [2, 3, 4, 5], "form": [2, 5], "option": [2, 3, 4, 5], "default": [2, 3, 4, 5], "rais": [2, 4, 5], "notimplementederror": 2, "one": [2, 3, 4, 5], "db": [2, 4, 5], "core": [2, 4, 5], "impresso_iter_bucket": 2, "item_typ": 2, "prefix": [2, 4], "filter_config": 2, "partition_s": 2, "15": [2, 4], "iter": 2, "over": 2, "possibli": 2, "either": [2, 5], "valid": [2, 4, 5], "individu": [2, 5], "articl": [2, 3, 4], "param": [2, 3, 4], "kei": [2, 3, 4, 5], "exclus": 2, "year": [2, 3, 4, 5], "interv": 2, "1960": 2, "jdg": 2, "1890": 2, "last": [2, 4, 5], "partit": [2, 4, 5], "size": [2, 4, 5], "list_fil": 2, "locat": 2, "list_newspap": 2, "s3_client": [2, 3], "botocor": 2, "client": [2, 3, 4, 5], "page_s": 2, "int": [2, 3, 4, 5], "10000": 2, "25": 2, "000": 2, "maximum": 2, "pages": 2, "switchengin": 2, 
"implement": [2, 5], "ceph": 2, "copi": 2, "get_s3_client": [2, 4], "pagin": 2, "alias": 2, "read_s3_issu": 2, "input_bucket": [2, 3], "s3contentitem": 2, "key_nam": [2, 4], "doc_typ": 2, "rebuilt_vers": 2, "canonical_vers": 2, "s3_filter_arch": 2, "suffix": [2, 4], "jsonl": [2, 4, 5], "bz2": [2, 4, 5], "k": [2, 4], "v": [2, 5], "time": [2, 5], "rubric": 2, "1970": 2, "empti": [2, 5], "mean": [2, 5], "1798": 2, "1999": [2, 3], "each": [2, 4, 5], "10th": 2, "sequenc": 2, "key_suffix": 2, "end": [2, 5], "s3_iter_bucket": 2, "get_bucket": [2, 4], "mybucket": 2, "begin": 2, "how": [2, 5], "A": [3, 4, 5], "transform": 3, "purpos": 3, "cli": 3, "For": [3, 5], "member": [3, 5], "run": [3, 5], "runai": 3, "infrastructur": 3, "blob": 3, "main": [3, 4, 5], "howto": 3, "rebuild_articl": 3, "od": 3, "fc": 3, "fo": 3, "schedul": [3, 5], "sch": [3, 5], "ob": 3, "verbos": [3, 4, 5], "clear": 3, "languag": 3, "lg": 3, "nworker": [3, 5], "nw": [3, 5], "git": [3, 5], "repo": [3, 5], "gr": 3, "temp": 3, "tp": 3, "prev": 3, "pm": 3, "rebuilt": [3, 4, 5], "upload": [3, 4, 5], "otherwis": [3, 4, 5], "tell": [3, 5], "exist": [3, 4, 5], "ll": [3, 5], "level": [3, 4, 5], "debug": [3, 5], "remov": [3, 5], "befor": 3, "after": [3, 4, 5], "solr": [3, 5], "passim": [3, 5], "worker": [3, 4, 5], "local": [3, 4, 5], "includ": [3, 5], "temporari": 3, "clone": [3, 5], "releas": [3, 5], "previou": [3, 5], "cleanup": 3, "upload_success": 3, "filepath": [3, 5], "ha": [3, 5], "been": [3, 5], "successfulli": 3, "success": [3, 4, 5], "compress": 3, "json_fil": [3, 4], "output_dir": [3, 4], "merg": 3, "line": [3, 4, 5], "singl": 3, "signatur": 3, "write": [3, 4, 5], "sort": 3, "serial": [3, 5], "rytp": 3, "sort_kei": 3, "expect": [3, 5], "concaten": 3, "init_log": 3, "initialis": [3, 4], "logger": [3, 4], "desir": [3, 4, 5], "rootlogg": [3, 4], "duplic": 3, "init_logg": [3, 4], "could": [3, 4, 5], "work": [3, 4], "properli": 3, "so": [3, 5], "keep": [3, 4], "rebuild_for_passim": 3, "content_item": 3, 
"ani": [3, 4, 5], "metadata": [3, 4], "built": 3, "rebuild_for_solr": 3, "thought": 3, "especi": 3, "ingest": [3, 5], "index": [3, 5], "follow": [3, 4, 5], "schema": [3, 4, 5], "rebuild_issu": 3, "dask_client": 3, "_format": 3, "filter_languag": 3, "outp_dir": 3, "store": [3, 4, 5], "rebuild_text": 3, "append": [3, 5], "conform": 3, "being": 3, "fulltext": 3, "offset": [3, 4], "token": 3, "region": 3, "rebuild_text_passim": 3, "group": 3, "get_iiif_and_coord": 3, "ci": [3, 4, 5], "link": [3, 4], "variou": [3, 4, 5], "case": [3, 4, 5], "117": 3, "retriev": [3, 4, 5], "inform": [3, 4, 5], "part": [3, 5], "miss": [3, 5], "insert_whitespac": 3, "next_t": 3, "prev_t": 3, "lang": [3, 5], "determin": [3, 5], "whitespac": 3, "insert": 3, "pages_to_articl": 3, "belong": 3, "read_issu": 3, "inject": 3, "s3_version": 3, "boto3": [3, 4], "resourc": [3, 4], "factori": [3, 4], "serviceresourc": [3, 4], "connect": [3, 4], "storag": 3, "represent": [3, 5], "read_issue_pag": 3, "issue_json": 3, "parallel": [3, 4], "read_pag": 3, "page_kei": 3, "reconstruct_iiif_link": 3, "construct": [3, 5], "api": 3, "endpoint": [3, 4], "process": [3, 4, 5], "some": [3, 5], "inconsist": [3, 5], "variat": 3, "more": [3, 4, 5], "detail": [3, 4], "area": 3, "rejoin_articl": 3, "text_apply_break": 3, "break": [3, 4], "appli": [3, 5], "visual": 3, "charact": 3, "paragraph": 3, "etc": [3, 5], "chunk": [3, 4, 5], "cluster": 3, "bytes_to": 4, "bytes_nb": 4, "to_unit": 4, "bsize": 4, "1024": 4, "target": [4, 5], "kilobyt": 4, "m": [4, 5], "megabyt": 4, "gigabyt": 4, "t": [4, 5], "terabyt": 4, "p": [4, 5], "petabyt": 4, "exabyt": 4, "convers": 4, "keyerror": [4, 5], "chunksiz": 4, "yield": 4, "get_list_intersect": 4, "list1": 4, "list2": 4, "get_pkg_resourc": 4, "file_manag": 4, "exitstack": 4, "packag": [4, 5], "posixpath": 4, "manag": 4, "instanti": [4, 5], "prior": 4, "call": [4, 5], "close": 4, "onc": [4, 5], "longer": 4, "contextlib": 4, "pathlib": 4, "glob_with_s": 4, "file_suffix": 4, "mb": [4, 5], 
"search": 4, "match": [4, 5], "round": 4, "six": 4, "decim": 4, "place": [4, 5], "20": 4, "_description_": [4, 5], "list_local_directori": 4, "parse_json": 4, "validate_against_schema": 4, "json_to_valid": 4, "path_to_schema": 4, "against": [4, 5], "drive": 4, "boto": 4, "kept": 4, "until": 4, "third": 4, "parti": 4, "lib": 4, "depend": 4, "solv": 4, "alternative_read_text": 4, "s3_kei": [4, 5], "s3_credenti": 4, "line_by_lin": 4, "reason": [4, 5], "bug": 4, "read_text": 4, "1000": 4, "filenotfounderror": 4, "fixed_s3fs_glob": [4, 5], "boto3_bucket": 4, "benoit": 4, "pyimag": 4, "custom": 4, "glob": 4, "s3f": 4, "unabl": 4, "than": 4, "switch": 4, "get_boto3_bucket": 4, "request": 4, "ask": 4, "doe": 4, "turn": 4, "newli": [4, 5], "testb": 4, "depreci": 4, "pleas": 4, "priorit": 4, "get_or_create_bucket": 4, "instead": [4, 5], "yet": 4, "get_bucket_boto3": 4, "host_url": 4, "zhdk": 4, "cloud": 4, "get_s3_connect": 4, "host": 4, "assum": 4, "two": [4, 5], "environ": 4, "variabl": 4, "se_access_kei": 4, "se_secret_kei": 4, "get_s3_resourc": 4, "get_s3_object_s": 4, "whose": 4, "you": 4, "want": 4, "doesn": 4, "relat": [4, 5], "get_s3_vers": 4, "modifi": [4, 5], "datetim": 4, "get_s3_versions_cli": 4, "get_storage_opt": 4, "list_s3_directori": 4, "media": [4, 5], "titl": [4, 5], "read_jsonlin": 4, "point": [4, 5], "extract": [4, 5], "doc": 4, "per": [4, 5], "from_sequ": 4, "s3r": 4, "print": [4, 5], "count": [4, 5], "map": [4, 5], "pluck": 4, "ft": 4, "without": 4, "readtext_jsonlin": 4, "limit": 4, "textual": 4, "leav": 4, "out": 4, "serv": 4, "pure": 4, "ne": [4, 5], "reus": [4, 5], "topic": [4, 5], "s3_get_articl": 4, "ad": [4, 5], "advertis": 4, "iter_bucket": 4, "cpu": 4, "dictionari": [4, 5], "s3_get_pag": 4, "issue_id": 4, "page_nam": 4, "imp": 4, "1990": 4, "03": 4, "s3_glob_with_s": 4, "pattern": 4, "around": 4, "directli": 4, "wildcard": 4, "partition_nam": 4, "newspaper_prefix": 4, "upload_to_s3": 4, "local_path": 4, "path_within_bucket": 4, "help": 4, 
"prepar": 4, "view": 4, "orient": 4, "daskutil": 4, "cf": [4, 5], "argument": [4, 5], "create_even_partit": 4, "config_newspap": 4, "local_f": 4, "keep_ful": 4, "nb_partit": 4, "500": 4, "yearli": [4, 5], "even": [4, 5], "enabl": 4, "effici": 4, "bypass": 4, "shuffl": 4, "well": 4, "decid": 4, "what": [4, 5], "memori": 4, "span": 4, "classic": 4, "produc": 4, "arg": 4, "partition": 4, "save": 4, "export": [4, 5], "compute_image_link": 4, "pad": 4, "iiif_endpoint": 4, "iiif_link": 4, "summari": [4, 5], "descript": 4, "get_iiif_link": 4, "canonical_bucket": 4, "rebuilt2xmi": 4, "typesystem_path": 4, "iiif_map": 4, "pct_coordin": 4, "typesystem": 4, "defit": 4, "annot": 4, "layer": 4, "task": 4, "config_load": 4, "initi": [4, 5], "method": [4, 5], "check_bucket": 4, "attribut": [4, 5], "check_param": 4, "classmethod": [4, 5], "from_json": 4, "to_dict": 4, "partitionerconfig": 4, "As": [4, 5], "now": 4, "solr_serv": 4, "server": 4, "solr_cor": 4, "s3_host": 4, "s3_bucket_rebuilt": 4, "s3_bucket_partit": 4, "s3_bucket_process": 4, "key_batch": 4, "batch": 4, "number_partit": 4, "1991": 4, "1998": 4, "sever": 5, "allow": 5, "stage": 5, "pipelin": 5, "goal": 5, "approach": 5, "track": 5, "everi": 5, "ensur": 5, "consisteni": 5, "eas": 5, "consist": 5, "across": 5, "justifi": 5, "through": 5, "identif": 5, "leakag": 5, "partial": 5, "updat": 5, "subset": 5, "know": 5, "step": 5, "necessari": 5, "when": 5, "collect": 5, "arriv": 5, "patch": 5, "transpar": 5, "citat": 5, "dataset": 5, "straightforward": 5, "user": 5, "interfac": 5, "exactli": 5, "thei": 5, "abl": 5, "consult": 5, "precis": 5, "them": 5, "definit": 5, "datastatst": 5, "dure": 5, "preprocess": 5, "augment": 5, "project": 5, "progress": 5, "data_statist": 5, "datastatist": 5, "data_stag": 5, "datastag": 5, "granular": 5, "abc": 5, "specif": 5, "portion": 5, "stat": 5, "respect": 5, "count_kei": 5, "frequenc": 5, "add_count": 5, "new_count": 5, "replac": 5, "add": 5, "init_count": 5, "defin": 5, "pretty_print": 
5, "modif_d": 5, "include_count": 5, "These": 5, "agnost": 5, "self": 5, "child": 5, "modif": 5, "about": 5, "abstract": 5, "same_count": 5, "other_stat": 5, "newspaperstatist": 5, "possible_count_kei": 5, "content_items_out": 5, "ft_token": 5, "content_items_in": 5, "ne_ment": 5, "ne_ent": 5, "embeddings_el": 5, "lang_fd": 5, "text_reuse_clust": 5, "text_reuse_passag": 5, "nps_stat": 5, "union": 5, "pretti": 5, "data_manifest": 5, "datamanifest": 5, "s3_output_bucket": 5, "git_repo": 5, "temp_dir": 5, "s3_input_bucket": 5, "new_vers": 5, "is_patch": 5, "patched_field": 5, "previous_mft_path": 5, "only_count": 5, "note": 5, "push_to_git": 5, "add_by_ci_id": 5, "ci_id": 5, "add_by_title_year": 5, "add_count_list_by_title_year": 5, "all_count": 5, "lsit": 5, "aggregate_stats_for_titl": 5, "media_dict": 5, "aggreg": 5, "radio": 5, "radiostatist": 5, "don": 5, "displai": 5, "show": 5, "final": 5, "append_to_not": 5, "to_start": 5, "export_to_git_and_s3": 5, "commit_msg": 5, "perform": 5, "logic": 5, "lazi": 5, "behavior": 5, "readi": 5, "access": 5, "particular": 5, "_processing_stat": 5, "crystal": 5, "_generation_d": 5, "corpu": 5, "manifest_data": 5, "dump": 5, "validate_and_export_manifest": 5, "commit": 5, "define_update_info_for_titl": 5, "processed_year": 5, "prev_version_year": 5, "four": 5, "eg": 5, "alreadi": 5, "generate_media_dict": 5, "old_media_list": 5, "conclud": 5, "increas": 5, "flag": 5, "conduct": 5, "major": 5, "verison": 5, "get_count_kei": 5, "integr": 5, "init": 5, "has_title_year_kei": 5, "verifi": 5, "init_yearly_count_dict": 5, "new_media": 5, "By": 5, "update_typ": 5, "update_level": 5, "updated_year": 5, "updated_field": 5, "properti": 5, "output_mft_s3_path": 5, "versison": 5, "cannot": 5, "overall_stat": 5, "title_stat": 5, "overal": 5, "replace_by_ci_id": 5, "oper": 5, "overwrit": 5, "isn": 5, "better": 5, "suit": 5, "replace_by_title_year": 5, "title_level_stat": 5, "media_list": 5, "stats_as_dict": 5, "update_media_stat": 5, 
"yearly_stat": 5, "actual": 5, "chang": 5, "statisit": 5, "potenti": 5, "output_bucket_nam": 5, "overriden": 5, "problem": 5, "occur": 5, "push": 5, "critic": 5, "won": 5, "alter": 5, "overrid": 5, "strenum": 5, "requir": 5, "accordingli": 5, "exact": 5, "embed": 5, "entiti": 5, "langid": 5, "linguistic_process": 5, "lingproc": 5, "mysql_ci": 5, "mysql": 5, "ocrqa": 5, "solr_emb": 5, "emb": 5, "solr_ent": 5, "solr_text": 5, "text_reus": 5, "has_valu": 5, "cl": 5, "agg": 5, "reduc": 5, "ggregat": 5, "np": 5, "nuniqu": 5, "clone_git_repo": 5, "repo_nam": 5, "branch": 5, "ideal": 5, "absolut": 5, "appear": 5, "fail": 5, "ssh": 5, "compute_stats_in_canonical_bag": 5, "s3_canonical_issu": 5, "compute_stats_in_entities_bag": 5, "s3_entiti": 5, "compute_stats_in_langident_bag": 5, "s3_langid": 5, "compute_stats_in_rebuilt_bag": 5, "rebuilt_articl": 5, "include_np": 5, "fly": 5, "groupbi": 5, "paassim": 5, "compute_stats_in_solr_text_bag": 5, "s3_solr_text": 5, "counts_for_canonical_issu": 5, "include_np_yr": 5, "later": 5, "counts_for_rebuilt": 5, "rebuilt_ci": 5, "extract_vers": 5, "name_or_path": 5, "as_int": 5, "_vm": 5, "filter_new_or_modified_media": 5, "rebuilt_mft_json": 5, "previous_mft_json": 5, "compar": 5, "typic": 5, "atom": 5, "ident": 5, "rebuilt_mft_path": 5, "new_or_modifi": 5, "get_new_or_modified_media": 5, "new_manifest": 5, "previous_manifest": 5, "media_titl": 5, "new_media_item_1": 5, "last_modif_d": 5, "2024": 5, "04": 5, "04t12": 5, "00": 5, "00z": 5, "modified_media_item_2": 5, "03t12": 5, "agg_tu": 5, "find_s3_data_manifest_path": 5, "find": 5, "latest": 5, "On": 5, "wai": 5, "enrich": 5, "own": 5, "insid": 5, "get_head_commit_url": 5, "three": 5, "alreadai": 5, "outsid": 5, "previous": 5, "activ": 5, "get_media_item_year": 5, "mnf_json": 5, "media_items_year": 5, "get_media_titl": 5, "input_data": 5, "ex": 5, "typeerror": 5, "git_commit_push": 5, "full_git_filepath": 5, "make": 5, "non": 5, "increment_vers": 5, "prev_vers": 5, "increment": 5, 
"accod": 5, "minor": 5, "reset": 5, "vesion": 5, "init_media_info": 5, "full_titl": 5, "relev": 5, "comparison": 5, "is_git_repo": 5, "manifest_summari": 5, "extended_summari": 5, "extend": 5, "manifest_json": 5, "8": 5, "5": 5, "media_list_from_mft_json": 5, "json_mft": 5, "still": 5, "along": 5, "read_manifest_from_s3": 5, "read_manifest_from_s3_path": 5, "manifest_s3_path": 5, "arbitrari": 5, "remove_media_in_manifest": 5, "white_list": 5, "whitelist": 5, "whatev": 5, "retain": 5, "validate_granular": 5, "valueerror": 5, "validate_stag": 5, "return_value_str": 5, "neither": 5, "nor": 5, "validate_vers": 5, "regex": 5, "9": 5, "vm": 5, "integ": 5, "version_as_list": 5, "len": 5, "respec": 5, "write_and_push_to_git": 5, "file_cont": 5, "path_in_repo": 5, "write_dump_to_f": 5, "abs_path": 5, "filesystem": 5, "written": 5, "writen": 5, "ioerror": 5, "command": 5, "compute_manifest": 5, "lf": 5, "compute_stats_for_stag": 5, "files_bag": 5, "create_manifest": 5, "togeth": 5, "iption": 5, "further": 5, "markdown": 5, "manifest_config": 5, "guidelin": 5, "extract_np_kei": 5, "31": 5, "indeplux": 5, "1889": 5, "get_files_to_consid": 5, "file_extens": 5, "validate_config": 5, "mssing": 5}, "objects": {"impresso_commons.images": [[0, 0, 0, "-", "img_utils"], [0, 0, 0, "-", "olive_boxes"]], "impresso_commons.images.img_utils": [[0, 1, 1, "", "BoxStrategy"], [0, 3, 1, "", "compose"], [0, 3, 1, "", "get_img_from_archive"], [0, 3, 1, "", "get_imgdimensions"], [0, 3, 1, "", "get_jpg"], [0, 3, 1, "", "get_page_folders"], [0, 3, 1, "", "get_png"], [0, 3, 1, "", "get_tif"], [0, 3, 1, "", "run_cmd"]], "impresso_commons.images.img_utils.BoxStrategy": [[0, 2, 1, "", "jpg_highest"], [0, 2, 1, "", "jpg_uniq"], [0, 2, 1, "", "png_highest"], [0, 2, 1, "", "png_uniq"], [0, 2, 1, "", "tif"]], "impresso_commons.images.olive_boxes": [[0, 3, 1, "", "compute_box"], [0, 3, 1, "", "compute_scale_factor"], [0, 3, 1, "", "convert_box"], [0, 3, 1, "", "get_iiif_url"], [0, 3, 1, "", 
"get_scale_factor"], [0, 3, 1, "", "test"]], "impresso_commons": [[2, 0, 0, "-", "path"]], "impresso_commons.path": [[2, 3, 1, "", "id2IssueDir"], [2, 3, 1, "", "parse_canonical_filename"], [2, 0, 0, "-", "path_fs"], [2, 0, 0, "-", "path_s3"]], "impresso_commons.path.path_fs": [[2, 2, 1, "", "ContentItem"], [2, 1, 1, "", "IssueDir"], [2, 3, 1, "", "canonical_path"], [2, 3, 1, "", "check_filenaming"], [2, 3, 1, "", "detect_canonical_issues"], [2, 3, 1, "", "detect_issues"], [2, 3, 1, "", "detect_journal_issues"], [2, 3, 1, "", "get_issueshortpath"], [2, 3, 1, "", "pair_issue"], [2, 3, 1, "", "select_issues"]], "impresso_commons.path.path_fs.IssueDir": [[2, 2, 1, "", "date"], [2, 2, 1, "", "edition"], [2, 2, 1, "", "journal"], [2, 2, 1, "", "path"]], "impresso_commons.path.path_s3": [[2, 2, 1, "", "IssueDir"], [2, 3, 1, "", "fetch_files"], [2, 3, 1, "", "impresso_iter_bucket"], [2, 3, 1, "", "list_files"], [2, 3, 1, "", "list_newspapers"], [2, 3, 1, "", "read_s3_issues"], [2, 1, 1, "", "s3ContentItem"], [2, 3, 1, "", "s3_filter_archives"], [2, 3, 1, "", "s3_iter_bucket"]], "impresso_commons.text": [[3, 0, 0, "-", "helpers"], [3, 0, 0, "-", "rebuilder"]], "impresso_commons.text.helpers": [[3, 3, 1, "", "get_iiif_and_coords"], [3, 3, 1, "", "insert_whitespace"], [3, 3, 1, "", "pages_to_article"], [3, 3, 1, "", "read_issue"], [3, 3, 1, "", "read_issue_pages"], [3, 3, 1, "", "read_page"], [3, 3, 1, "", "reconstruct_iiif_link"], [3, 3, 1, "", "rejoin_articles"], [3, 3, 1, "", "text_apply_breaks"]], "impresso_commons.text.rebuilder": [[3, 3, 1, "", "cleanup"], [3, 3, 1, "", "compress"], [3, 3, 1, "", "init_logging"], [3, 3, 1, "", "main"], [3, 3, 1, "", "rebuild_for_passim"], [3, 3, 1, "", "rebuild_for_solr"], [3, 3, 1, "", "rebuild_issues"], [3, 3, 1, "", "rebuild_text"], [3, 3, 1, "", "rebuild_text_passim"], [3, 3, 1, "", "upload"]], "impresso_commons.utils": [[4, 0, 0, "-", "config_loader"], [4, 0, 0, "-", "daskutils"], [4, 0, 0, "-", "s3"], [4, 0, 0, "-", "uima"], [4, 
0, 0, "-", "utils"]], "impresso_commons.utils.config_loader": [[4, 1, 1, "", "Base"], [4, 1, 1, "", "PartitionerConfig"], [4, 3, 1, "", "main"]], "impresso_commons.utils.config_loader.Base": [[4, 4, 1, "", "check_bucket"], [4, 4, 1, "", "check_params"], [4, 4, 1, "", "from_json"], [4, 4, 1, "", "to_dict"]], "impresso_commons.utils.daskutils": [[4, 3, 1, "", "create_even_partitions"], [4, 3, 1, "", "main"], [4, 3, 1, "", "partitioner"]], "impresso_commons.utils.s3": [[4, 3, 1, "", "alternative_read_text"], [4, 3, 1, "", "fixed_s3fs_glob"], [4, 3, 1, "", "get_boto3_bucket"], [4, 3, 1, "", "get_bucket"], [4, 3, 1, "", "get_bucket_boto3"], [4, 3, 1, "", "get_or_create_bucket"], [4, 3, 1, "", "get_s3_client"], [4, 3, 1, "", "get_s3_connection"], [4, 3, 1, "", "get_s3_object_size"], [4, 3, 1, "", "get_s3_resource"], [4, 3, 1, "", "get_s3_versions"], [4, 3, 1, "", "get_s3_versions_client"], [4, 3, 1, "", "get_storage_options"], [4, 3, 1, "", "list_s3_directories"], [4, 3, 1, "", "read_jsonlines"], [4, 3, 1, "", "readtext_jsonlines"], [4, 3, 1, "", "s3_get_articles"], [4, 3, 1, "", "s3_get_pages"], [4, 3, 1, "", "s3_glob_with_size"], [4, 3, 1, "", "upload"], [4, 3, 1, "", "upload_to_s3"]], "impresso_commons.utils.uima": [[4, 3, 1, "", "compute_image_links"], [4, 3, 1, "", "get_iiif_links"], [4, 3, 1, "", "rebuilt2xmi"]], "impresso_commons.utils.utils": [[4, 3, 1, "", "bytes_to"], [4, 3, 1, "", "chunk"], [4, 3, 1, "", "get_list_intersection"], [4, 3, 1, "", "get_pkg_resource"], [4, 3, 1, "", "glob_with_size"], [4, 3, 1, "", "init_logger"], [4, 3, 1, "", "list_local_directories"], [4, 3, 1, "", "parse_json"], [4, 3, 1, "", "validate_against_schema"]], "impresso_commons.versioning": [[5, 0, 0, "-", "compute_manifest"], [5, 0, 0, "-", "data_manifest"], [5, 0, 0, "-", "data_statistics"], [5, 0, 0, "-", "helpers"]], "impresso_commons.versioning.compute_manifest": [[5, 3, 1, "", "compute_stats_for_stage"], [5, 3, 1, "", "create_manifest"], [5, 3, 1, "", "extract_np_key"], [5, 3, 
1, "", "get_files_to_consider"], [5, 3, 1, "", "main"], [5, 3, 1, "", "validate_config"]], "impresso_commons.versioning.data_manifest": [[5, 1, 1, "", "DataManifest"]], "impresso_commons.versioning.data_manifest.DataManifest": [[5, 4, 1, "", "add_by_ci_id"], [5, 4, 1, "", "add_by_title_year"], [5, 4, 1, "", "add_count_list_by_title_year"], [5, 4, 1, "", "aggregate_stats_for_title"], [5, 4, 1, "", "append_to_notes"], [5, 4, 1, "", "compute"], [5, 4, 1, "", "define_update_info_for_title"], [5, 4, 1, "", "generate_media_dict"], [5, 4, 1, "", "get_count_keys"], [5, 4, 1, "", "has_title_year_key"], [5, 4, 1, "", "init_yearly_count_dict"], [5, 4, 1, "", "new_media"], [5, 5, 1, "", "output_mft_s3_path"], [5, 4, 1, "", "overall_stats"], [5, 4, 1, "", "replace_by_ci_id"], [5, 4, 1, "", "replace_by_title_year"], [5, 4, 1, "", "title_level_stats"], [5, 4, 1, "", "update_media_stats"], [5, 4, 1, "", "validate_and_export_manifest"]], "impresso_commons.versioning.data_statistics": [[5, 1, 1, "", "DataStatistics"], [5, 1, 1, "", "NewspaperStatistics"]], "impresso_commons.versioning.data_statistics.DataStatistics": [[5, 4, 1, "", "add_counts"], [5, 2, 1, "", "count_keys"], [5, 2, 1, "", "counts"], [5, 2, 1, "", "element"], [5, 2, 1, "", "granularity"], [5, 4, 1, "", "init_counts"], [5, 4, 1, "", "pretty_print"], [5, 4, 1, "", "same_counts"], [5, 2, 1, "", "stage"]], "impresso_commons.versioning.data_statistics.NewspaperStatistics": [[5, 2, 1, "", "count_keys"], [5, 2, 1, "", "counts"], [5, 2, 1, "", "element"], [5, 2, 1, "", "granularity"], [5, 2, 1, "", "possible_count_keys"], [5, 4, 1, "", "pretty_print"], [5, 4, 1, "", "same_counts"], [5, 2, 1, "", "stage"]], "impresso_commons.versioning.helpers": [[5, 1, 1, "", "DataStage"], [5, 3, 1, "", "agg"], [5, 3, 1, "", "chunk"], [5, 3, 1, "", "clone_git_repo"], [5, 3, 1, "", "compute_stats_in_canonical_bag"], [5, 3, 1, "", "compute_stats_in_entities_bag"], [5, 3, 1, "", "compute_stats_in_langident_bag"], [5, 3, 1, "", 
"compute_stats_in_rebuilt_bag"], [5, 3, 1, "", "compute_stats_in_solr_text_bag"], [5, 3, 1, "", "counts_for_canonical_issue"], [5, 3, 1, "", "counts_for_rebuilt"], [5, 3, 1, "", "extract_version"], [5, 3, 1, "", "filter_new_or_modified_media"], [5, 3, 1, "", "finalize"], [5, 3, 1, "", "find_s3_data_manifest_path"], [5, 3, 1, "", "get_head_commit_url"], [5, 3, 1, "", "get_media_item_years"], [5, 3, 1, "", "get_media_titles"], [5, 3, 1, "", "git_commit_push"], [5, 3, 1, "", "increment_version"], [5, 3, 1, "", "init_media_info"], [5, 3, 1, "", "is_git_repo"], [5, 3, 1, "", "manifest_summary"], [5, 3, 1, "", "media_list_from_mft_json"], [5, 3, 1, "", "read_manifest_from_s3"], [5, 3, 1, "", "read_manifest_from_s3_path"], [5, 3, 1, "", "remove_media_in_manifest"], [5, 3, 1, "", "validate_granularity"], [5, 3, 1, "", "validate_stage"], [5, 3, 1, "", "validate_version"], [5, 3, 1, "", "version_as_list"], [5, 3, 1, "", "write_and_push_to_git"], [5, 3, 1, "", "write_dump_to_fs"]], "impresso_commons.versioning.helpers.DataStage": [[5, 2, 1, "", "CANONICAL"], [5, 2, 1, "", "EMBEDDINGS"], [5, 2, 1, "", "ENTITIES"], [5, 2, 1, "", "EVENIZED"], [5, 2, 1, "", "LANGIDENT"], [5, 2, 1, "", "LINGUISTIC_PROCESSING"], [5, 2, 1, "", "MYSQL_CIS"], [5, 2, 1, "", "OCRQA"], [5, 2, 1, "", "PASSIM"], [5, 2, 1, "", "REBUILT"], [5, 2, 1, "", "SOLR_EMBS"], [5, 2, 1, "", "SOLR_ENTITIES"], [5, 2, 1, "", "SOLR_TEXT"], [5, 2, 1, "", "TEXT_REUSE"], [5, 2, 1, "", "TOPICS"], [5, 4, 1, "", "has_value"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:attribute", "3": "py:function", "4": "py:method", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "attribute", "Python attribute"], "3": ["py", "function", "Python function"], "4": ["py", "method", "Python method"], "5": ["py", "property", "Python property"]}, "titleterms": {"imag": 0, "handl": 0, "util": [0, 4], "oliv": 0, "box": 0, "background": 0, "inform": 0, "case": 0, 
"1": 0, "tif": 0, "2": 0, "sever": 0, "png": 0, "3": 0, "one": 0, "onli": 0, "4": 0, "jpg": 0, "welcom": 1, "impresso": 1, "pycommon": 1, "": 1, "document": 1, "content": 1, "input": 2, "output": 2, "gener": 2, "i": 2, "o": 2, "from": 2, "file": [2, 3, 4], "system": 2, "s3": [2, 4], "text": 3, "rebuild": 3, "function": [3, 4], "helper": [3, 5], "config": [3, 4], "exampl": 3, "basic": 4, "dask": 4, "apach": 4, "uima": 4, "xmi": 4, "loader": 4, "data": 5, "version": 5, "statist": 5, "newspap": 5, "manifest": 5, "comput": 5, "script": 5}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2, "sphinx": 60}, "alltitles": {"Image handling": [[0, "image-handling"]], "Image Utils": [[0, "module-impresso_commons.images.img_utils"]], "Olive Boxes": [[0, "module-impresso_commons.images.olive_boxes"]], "Background information": [[0, "background-information"]], "Case 1: tif": [[0, "case-1-tif"]], "Case 2: several png": [[0, "case-2-several-png"]], "Case 3: one png only": [[0, "case-3-one-png-only"]], "Case 4: one jpg only": [[0, "case-4-one-jpg-only"]], "Welcome to Impresso PyCommons\u2019s documentation!": [[1, "welcome-to-impresso-pycommons-s-documentation"]], "Contents:": [[1, null]], "Input/Output": [[2, "input-output"]], "General": [[2, "module-impresso_commons.path"]], "I/O from file system": [[2, "module-impresso_commons.path.path_fs"]], "I/O from S3": [[2, "module-impresso_commons.path.path_s3"]], "Text Rebuild": [[3, "text-rebuild"]], "Rebuild functions": [[3, "rebuild-functions"]], "Helpers": [[3, "module-impresso_commons.text.helpers"]], "Config file example": [[3, "config-file-example"]], "Utilities": [[4, "utilities"]], "Basic Utils Functions": [[4, "module-impresso_commons.utils.utils"]], "S3 Utils Functions": 
[[4, "module-impresso_commons.utils.s3"]], "Dask Utils Functions": [[4, "module-impresso_commons.utils.daskutils"]], "Apache UIMA XMI Utils Functions": [[4, "module-impresso_commons.utils.uima"]], "Config File Loader": [[4, "module-impresso_commons.utils.config_loader"]], "Data Versioning": [[5, "data-versioning"]], "Data Statistics and Newspaper Statistics": [[5, "module-impresso_commons.versioning.data_statistics"]], "Data Manifest": [[5, "module-impresso_commons.versioning.data_manifest"]], "Versioning Helpers": [[5, "module-impresso_commons.versioning.helpers"]], "Manifest Computing Script": [[5, "module-impresso_commons.versioning.compute_manifest"]]}, "indexentries": {"boxstrategy (class in impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.BoxStrategy"]], "compose() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.compose"]], "compute_box() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.compute_box"]], "compute_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.compute_scale_factor"]], "convert_box() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.convert_box"]], "get_iiif_url() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_iiif_url"]], "get_img_from_archive() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_img_from_archive"]], "get_imgdimensions() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_imgdimensions"]], "get_jpg() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_jpg"]], "get_page_folders() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_page_folders"]], "get_png() (in module impresso_commons.images.img_utils)": [[0, 
"impresso_commons.images.img_utils.get_png"]], "get_scale_factor() (in module impresso_commons.images.olive_boxes)": [[0, "impresso_commons.images.olive_boxes.get_scale_factor"]], "get_tif() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.get_tif"]], "impresso_commons.images.img_utils": [[0, "module-impresso_commons.images.img_utils"]], "impresso_commons.images.olive_boxes": [[0, "module-impresso_commons.images.olive_boxes"]], "jpg_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.jpg_highest"]], "jpg_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.jpg_uniq"]], "module": [[0, "module-impresso_commons.images.img_utils"], [0, "module-impresso_commons.images.olive_boxes"], [2, "module-impresso_commons.path"], [2, "module-impresso_commons.path.path_fs"], [2, "module-impresso_commons.path.path_s3"], [3, "module-impresso_commons.text.helpers"], [3, "module-impresso_commons.text.rebuilder"], [4, "module-impresso_commons.utils.config_loader"], [4, "module-impresso_commons.utils.daskutils"], [4, "module-impresso_commons.utils.s3"], [4, "module-impresso_commons.utils.uima"], [4, "module-impresso_commons.utils.utils"], [5, "module-impresso_commons.versioning.compute_manifest"], [5, "module-impresso_commons.versioning.data_manifest"], [5, "module-impresso_commons.versioning.data_statistics"], [5, "module-impresso_commons.versioning.helpers"]], "png_highest (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_highest"]], "png_uniq (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.png_uniq"]], "run_cmd() (in module impresso_commons.images.img_utils)": [[0, "impresso_commons.images.img_utils.run_cmd"]], "test() (in module impresso_commons.images.olive_boxes)": [[0, 
"impresso_commons.images.olive_boxes.test"]], "tif (impresso_commons.images.img_utils.boxstrategy attribute)": [[0, "impresso_commons.images.img_utils.BoxStrategy.tif"]], "contentitem (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.ContentItem"]], "issuedir (class in impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.IssueDir"]], "issuedir (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.IssueDir"]], "canonical_path() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.canonical_path"]], "check_filenaming() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.check_filenaming"]], "date (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.date"]], "detect_canonical_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_canonical_issues"]], "detect_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_issues"]], "detect_journal_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.detect_journal_issues"]], "edition (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.edition"]], "fetch_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.fetch_files"]], "get_issueshortpath() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.get_issueshortpath"]], "id2issuedir() (in module impresso_commons.path)": [[2, "impresso_commons.path.id2IssueDir"]], "impresso_commons.path": [[2, "module-impresso_commons.path"]], "impresso_commons.path.path_fs": [[2, "module-impresso_commons.path.path_fs"]], "impresso_commons.path.path_s3": [[2, "module-impresso_commons.path.path_s3"]], "impresso_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, 
"impresso_commons.path.path_s3.impresso_iter_bucket"]], "journal (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.journal"]], "list_files() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_files"]], "list_newspapers() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.list_newspapers"]], "pair_issue() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.pair_issue"]], "parse_canonical_filename() (in module impresso_commons.path)": [[2, "impresso_commons.path.parse_canonical_filename"]], "path (impresso_commons.path.path_fs.issuedir attribute)": [[2, "impresso_commons.path.path_fs.IssueDir.path"]], "read_s3_issues() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.read_s3_issues"]], "s3contentitem (class in impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3ContentItem"]], "s3_filter_archives() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3_filter_archives"]], "s3_iter_bucket() (in module impresso_commons.path.path_s3)": [[2, "impresso_commons.path.path_s3.s3_iter_bucket"]], "select_issues() (in module impresso_commons.path.path_fs)": [[2, "impresso_commons.path.path_fs.select_issues"]], "cleanup() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.cleanup"]], "compress() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.compress"]], "get_iiif_and_coords() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.get_iiif_and_coords"]], "impresso_commons.text.helpers": [[3, "module-impresso_commons.text.helpers"]], "impresso_commons.text.rebuilder": [[3, "module-impresso_commons.text.rebuilder"]], "init_logging() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.init_logging"]], "insert_whitespace() (in module 
impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.insert_whitespace"]], "main() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.main"]], "pages_to_article() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.pages_to_article"]], "read_issue() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue"]], "read_issue_pages() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_issue_pages"]], "read_page() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.read_page"]], "rebuild_for_passim() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_passim"]], "rebuild_for_solr() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_for_solr"]], "rebuild_issues() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_issues"]], "rebuild_text() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text"]], "rebuild_text_passim() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.rebuild_text_passim"]], "reconstruct_iiif_link() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.reconstruct_iiif_link"]], "rejoin_articles() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.rejoin_articles"]], "text_apply_breaks() (in module impresso_commons.text.helpers)": [[3, "impresso_commons.text.helpers.text_apply_breaks"]], "upload() (in module impresso_commons.text.rebuilder)": [[3, "impresso_commons.text.rebuilder.upload"]], "base (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.Base"]], "partitionerconfig (class in impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.PartitionerConfig"]], 
"alternative_read_text() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.alternative_read_text"]], "bytes_to() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.bytes_to"]], "check_bucket() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_bucket"]], "check_params() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.check_params"]], "chunk() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.chunk"]], "compute_image_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.compute_image_links"]], "create_even_partitions() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.create_even_partitions"]], "fixed_s3fs_glob() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.fixed_s3fs_glob"]], "from_json() (impresso_commons.utils.config_loader.base class method)": [[4, "impresso_commons.utils.config_loader.Base.from_json"]], "get_boto3_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_boto3_bucket"]], "get_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_bucket"]], "get_bucket_boto3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_bucket_boto3"]], "get_iiif_links() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.get_iiif_links"]], "get_list_intersection() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.get_list_intersection"]], "get_or_create_bucket() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_or_create_bucket"]], "get_pkg_resource() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.get_pkg_resource"]], "get_s3_client() (in module impresso_commons.utils.s3)": [[4, 
"impresso_commons.utils.s3.get_s3_client"]], "get_s3_connection() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_connection"]], "get_s3_object_size() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_object_size"]], "get_s3_resource() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_resource"]], "get_s3_versions() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions"]], "get_s3_versions_client() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_s3_versions_client"]], "get_storage_options() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.get_storage_options"]], "glob_with_size() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.glob_with_size"]], "impresso_commons.utils.config_loader": [[4, "module-impresso_commons.utils.config_loader"]], "impresso_commons.utils.daskutils": [[4, "module-impresso_commons.utils.daskutils"]], "impresso_commons.utils.s3": [[4, "module-impresso_commons.utils.s3"]], "impresso_commons.utils.uima": [[4, "module-impresso_commons.utils.uima"]], "impresso_commons.utils.utils": [[4, "module-impresso_commons.utils.utils"]], "init_logger() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.init_logger"]], "list_local_directories() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.list_local_directories"]], "list_s3_directories() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.list_s3_directories"]], "main() (in module impresso_commons.utils.config_loader)": [[4, "impresso_commons.utils.config_loader.main"]], "main() (in module impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.main"]], "parse_json() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.parse_json"]], "partitioner() (in module 
impresso_commons.utils.daskutils)": [[4, "impresso_commons.utils.daskutils.partitioner"]], "read_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.read_jsonlines"]], "readtext_jsonlines() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.readtext_jsonlines"]], "rebuilt2xmi() (in module impresso_commons.utils.uima)": [[4, "impresso_commons.utils.uima.rebuilt2xmi"]], "s3_get_articles() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_articles"]], "s3_get_pages() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_get_pages"]], "s3_glob_with_size() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.s3_glob_with_size"]], "to_dict() (impresso_commons.utils.config_loader.base method)": [[4, "impresso_commons.utils.config_loader.Base.to_dict"]], "upload() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload"]], "upload_to_s3() (in module impresso_commons.utils.s3)": [[4, "impresso_commons.utils.s3.upload_to_s3"]], "validate_against_schema() (in module impresso_commons.utils.utils)": [[4, "impresso_commons.utils.utils.validate_against_schema"]], "canonical (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.CANONICAL"]], "datamanifest (class in impresso_commons.versioning.data_manifest)": [[5, "impresso_commons.versioning.data_manifest.DataManifest"]], "datastage (class in impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.DataStage"]], "datastatistics (class in impresso_commons.versioning.data_statistics)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics"]], "embeddings (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.EMBEDDINGS"]], "entities (impresso_commons.versioning.helpers.datastage attribute)": [[5, 
"impresso_commons.versioning.helpers.DataStage.ENTITIES"]], "evenized (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.EVENIZED"]], "langident (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.LANGIDENT"]], "linguistic_processing (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.LINGUISTIC_PROCESSING"]], "mysql_cis (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.MYSQL_CIS"]], "newspaperstatistics (class in impresso_commons.versioning.data_statistics)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics"]], "ocrqa (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.OCRQA"]], "passim (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.PASSIM"]], "rebuilt (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.REBUILT"]], "solr_embs (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_EMBS"]], "solr_entities (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_ENTITIES"]], "solr_text (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.SOLR_TEXT"]], "text_reuse (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.TEXT_REUSE"]], "topics (impresso_commons.versioning.helpers.datastage attribute)": [[5, "impresso_commons.versioning.helpers.DataStage.TOPICS"]], "add_by_ci_id() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, 
"impresso_commons.versioning.data_manifest.DataManifest.add_by_ci_id"]], "add_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_by_title_year"]], "add_count_list_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.add_count_list_by_title_year"]], "add_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.add_counts"]], "agg() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.agg"]], "aggregate_stats_for_title() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.aggregate_stats_for_title"]], "append_to_notes() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.append_to_notes"]], "chunk() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.chunk"]], "clone_git_repo() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.clone_git_repo"]], "compute() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.compute"]], "compute_stats_for_stage() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.compute_stats_for_stage"]], "compute_stats_in_canonical_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_canonical_bag"]], "compute_stats_in_entities_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_entities_bag"]], "compute_stats_in_langident_bag() (in module 
impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_langident_bag"]], "compute_stats_in_rebuilt_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_rebuilt_bag"]], "compute_stats_in_solr_text_bag() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.compute_stats_in_solr_text_bag"]], "count_keys (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.count_keys"]], "count_keys (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.count_keys"]], "counts (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.counts"]], "counts (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.counts"]], "counts_for_canonical_issue() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.counts_for_canonical_issue"]], "counts_for_rebuilt() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.counts_for_rebuilt"]], "create_manifest() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.create_manifest"]], "define_update_info_for_title() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.define_update_info_for_title"]], "element (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.element"]], "element (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, 
"impresso_commons.versioning.data_statistics.NewspaperStatistics.element"]], "extract_np_key() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.extract_np_key"]], "extract_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.extract_version"]], "filter_new_or_modified_media() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.filter_new_or_modified_media"]], "finalize() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.finalize"]], "find_s3_data_manifest_path() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.find_s3_data_manifest_path"]], "generate_media_dict() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.generate_media_dict"]], "get_count_keys() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.get_count_keys"]], "get_files_to_consider() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.get_files_to_consider"]], "get_head_commit_url() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_head_commit_url"]], "get_media_item_years() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_media_item_years"]], "get_media_titles() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.get_media_titles"]], "git_commit_push() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.git_commit_push"]], "granularity (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.granularity"]], 
"granularity (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.granularity"]], "has_title_year_key() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.has_title_year_key"]], "has_value() (impresso_commons.versioning.helpers.datastage class method)": [[5, "impresso_commons.versioning.helpers.DataStage.has_value"]], "impresso_commons.versioning.compute_manifest": [[5, "module-impresso_commons.versioning.compute_manifest"]], "impresso_commons.versioning.data_manifest": [[5, "module-impresso_commons.versioning.data_manifest"]], "impresso_commons.versioning.data_statistics": [[5, "module-impresso_commons.versioning.data_statistics"]], "impresso_commons.versioning.helpers": [[5, "module-impresso_commons.versioning.helpers"]], "increment_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.increment_version"]], "init_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.init_counts"]], "init_media_info() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.init_media_info"]], "init_yearly_count_dict() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.init_yearly_count_dict"]], "is_git_repo() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.is_git_repo"]], "main() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.main"]], "manifest_summary() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.manifest_summary"]], "media_list_from_mft_json() (in module impresso_commons.versioning.helpers)": [[5, 
"impresso_commons.versioning.helpers.media_list_from_mft_json"]], "new_media() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.new_media"]], "output_mft_s3_path (impresso_commons.versioning.data_manifest.datamanifest property)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.output_mft_s3_path"]], "overall_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.overall_stats"]], "possible_count_keys (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.possible_count_keys"]], "pretty_print() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.pretty_print"]], "pretty_print() (impresso_commons.versioning.data_statistics.newspaperstatistics method)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.pretty_print"]], "read_manifest_from_s3() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.read_manifest_from_s3"]], "read_manifest_from_s3_path() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.read_manifest_from_s3_path"]], "remove_media_in_manifest() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.remove_media_in_manifest"]], "replace_by_ci_id() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.replace_by_ci_id"]], "replace_by_title_year() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.replace_by_title_year"]], "same_counts() (impresso_commons.versioning.data_statistics.datastatistics method)": [[5, 
"impresso_commons.versioning.data_statistics.DataStatistics.same_counts"]], "same_counts() (impresso_commons.versioning.data_statistics.newspaperstatistics method)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.same_counts"]], "stage (impresso_commons.versioning.data_statistics.datastatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.DataStatistics.stage"]], "stage (impresso_commons.versioning.data_statistics.newspaperstatistics attribute)": [[5, "impresso_commons.versioning.data_statistics.NewspaperStatistics.stage"]], "title_level_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.title_level_stats"]], "update_media_stats() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.update_media_stats"]], "validate_and_export_manifest() (impresso_commons.versioning.data_manifest.datamanifest method)": [[5, "impresso_commons.versioning.data_manifest.DataManifest.validate_and_export_manifest"]], "validate_config() (in module impresso_commons.versioning.compute_manifest)": [[5, "impresso_commons.versioning.compute_manifest.validate_config"]], "validate_granularity() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_granularity"]], "validate_stage() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_stage"]], "validate_version() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.validate_version"]], "version_as_list() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.version_as_list"]], "write_and_push_to_git() (in module impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.write_and_push_to_git"]], "write_dump_to_fs() (in module 
impresso_commons.versioning.helpers)": [[5, "impresso_commons.versioning.helpers.write_dump_to_fs"]]}})
\ No newline at end of file
diff --git a/docs/_build/html/utils.html b/docs/_build/html/utils.html
index 281aa27..d8397c5 100644
--- a/docs/_build/html/utils.html
+++ b/docs/_build/html/utils.html
@@ -1,14 +1,12 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
-  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Utilities &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -16,7 +14,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
@@ -52,8 +50,11 @@
 <li class="toctree-l2"><a class="reference internal" href="#module-impresso_commons.utils.utils">Basic Utils Functions</a><ul>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.bytes_to"><code class="docutils literal notranslate"><span class="pre">bytes_to()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.chunk"><code class="docutils literal notranslate"><span class="pre">chunk()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.get_list_intersection"><code class="docutils literal notranslate"><span class="pre">get_list_intersection()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.get_pkg_resource"><code class="docutils literal notranslate"><span class="pre">get_pkg_resource()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.glob_with_size"><code class="docutils literal notranslate"><span class="pre">glob_with_size()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.init_logger"><code class="docutils literal notranslate"><span class="pre">init_logger()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.list_local_directories"><code class="docutils literal notranslate"><span class="pre">list_local_directories()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.parse_json"><code class="docutils literal notranslate"><span class="pre">parse_json()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.utils.validate_against_schema"><code class="docutils literal notranslate"><span class="pre">validate_against_schema()</span></code></a></li>
 </ul>
@@ -72,10 +73,12 @@
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_versions"><code class="docutils literal notranslate"><span class="pre">get_s3_versions()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_s3_versions_client"><code class="docutils literal notranslate"><span class="pre">get_s3_versions_client()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.get_storage_options"><code class="docutils literal notranslate"><span class="pre">get_storage_options()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.list_s3_directories"><code class="docutils literal notranslate"><span class="pre">list_s3_directories()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.read_jsonlines"><code class="docutils literal notranslate"><span class="pre">read_jsonlines()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.readtext_jsonlines"><code class="docutils literal notranslate"><span class="pre">readtext_jsonlines()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.s3_get_articles"><code class="docutils literal notranslate"><span class="pre">s3_get_articles()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.s3_get_pages"><code class="docutils literal notranslate"><span class="pre">s3_get_pages()</span></code></a></li>
+<li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.s3_glob_with_size"><code class="docutils literal notranslate"><span class="pre">s3_glob_with_size()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.upload"><code class="docutils literal notranslate"><span class="pre">upload()</span></code></a></li>
 <li class="toctree-l3"><a class="reference internal" href="#impresso_commons.utils.s3.upload_to_s3"><code class="docutils literal notranslate"><span class="pre">upload_to_s3()</span></code></a></li>
 </ul>
@@ -172,6 +175,11 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 <dd><p>Yield successive n-sized chunks from list.</p>
 </dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.utils.get_list_intersection">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">get_list_intersection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">list1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">list2</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.utils.get_list_intersection" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.utils.get_pkg_resource">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">get_pkg_resource</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">file_manager</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">ExitStack</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">package</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'impresso_commons'</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">PosixPath</span></span></span><a class="headerlink" href="#impresso_commons.utils.utils.get_pkg_resource" title="Link to this definition"></a></dt>
@@ -199,6 +207,30 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 </dl>
 </dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.utils.glob_with_size">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">glob_with_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">directory</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file_suffix</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></span></span><a class="headerlink" href="#impresso_commons.utils.utils.glob_with_size" title="Link to this definition"></a></dt>
+<dd><p>List all files in a directory with a given suffix and their size in MB.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>directory</strong> (<em>str</em>) – The directory path to search for files.</p></li>
+<li><p><strong>file_suffix</strong> (<em>str</em>) – The file extension or suffix to match.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>A list of tuples, each containing the file path and its</dt><dd><p>size in megabytes, rounded to six decimal places.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list[str]</p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.utils.init_logger">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">init_logger</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">level</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">20</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">RootLogger</span></span></span><a class="headerlink" href="#impresso_commons.utils.utils.init_logger" title="Link to this definition"></a></dt>
@@ -219,6 +251,11 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 </dl>
 </dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.utils.list_local_directories">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">list_local_directories</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.utils.list_local_directories" title="Link to this definition"></a></dt>
+<dd></dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.utils.parse_json">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.utils.</span></span><span class="sig-name descname"><span class="pre">parse_json</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.utils.parse_json" title="Link to this definition"></a></dt>
@@ -460,6 +497,32 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">get_storage_options</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.get_storage_options" title="Link to this definition"></a></dt>
 <dd></dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.s3.list_s3_directories">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">list_s3_directories</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prefix</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.list_s3_directories" title="Link to this definition"></a></dt>
+<dd><p>Retrieve the ‘directory’ names (media titles) in an S3 bucket after a
+given path prefix.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>bucket_name</strong> (<em>str</em>) – The name of the S3 bucket.</p></li>
+<li><p><strong>prefix</strong> (<em>str</em>) – The prefix path within the bucket to search. Default
+is the root (‘’).</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>A list of ‘directory’ names found in the specified bucket</dt><dd><p>and prefix.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list</p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.s3.read_jsonlines">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">read_jsonlines</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">key_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.read_jsonlines" title="Link to this definition"></a></dt>
@@ -531,6 +594,34 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 </dl>
 </dd></dl>
 
+<dl class="py function">
+<dt class="sig sig-object py" id="impresso_commons.utils.s3.s3_glob_with_size">
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">s3_glob_with_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">boto3_bucket</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.s3_glob_with_size" title="Link to this definition"></a></dt>
+<dd><p>Custom glob function to list S3 objects matching a pattern. This function
+works around the 1000-object listing limit in S3 by using boto3 directly.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>path</strong> (<em>str</em>) – The S3 path with a wildcard (*) to match files.
+Example: <cite>s3://bucket_name/path/to/files/*.txt</cite>.</p></li>
+<li><p><strong>boto3_bucket</strong> (<em>boto3.Bucket</em><em>, </em><em>optional</em>) – An optional boto3 Bucket object.
+If not provided, it will be
+created from the path.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p><dl class="simple">
+<dt>A list of tuples containing the full S3 paths of matching files</dt><dd><p>and their sizes in megabytes.</p>
+</dd>
+</dl>
+</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>list</p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.s3.upload">
 <span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.s3.</span></span><span class="sig-name descname"><span class="pre">upload</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">partition_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">newspaper_prefix</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">bucket_name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.s3.upload" title="Link to this definition"></a></dt>
@@ -584,7 +675,7 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 <li><p><strong>bucket</strong> – name of the bucket where the files to partition are</p></li>
 <li><p><strong>config_newspapers</strong> – json dict specifying the sources to consider (name(s) of newspaper(s) and year span(s))</p></li>
 <li><p><strong>output_dir</strong> – classic FS repository where to write the produced partitions</p></li>
-<li><p><strong>local_fs</strong></p></li>
+<li><p><strong>local_fs</strong> – </p></li>
 <li><p><strong>keep_full</strong> – whether to filter out metadata or not (i.e. keeping only text and leaving out coordinates)</p></li>
 <li><p><strong>nb_partition</strong> – number of partitions</p></li>
 </ul>
@@ -602,8 +693,23 @@ <h1>Utilities<a class="headerlink" href="#utilities" title="Link to this heading
 
 <dl class="py function">
 <dt class="sig sig-object py" id="impresso_commons.utils.daskutils.partitioner">
-<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.daskutils.</span></span><span class="sig-name descname"><span class="pre">partitioner</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nbpart</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#impresso_commons.utils.daskutils.partitioner" title="Link to this definition"></a></dt>
-<dd><p>Partition a bag into n partitions and write each partition in a file</p>
+<span class="sig-prename descclassname"><span class="pre">impresso_commons.utils.daskutils.</span></span><span class="sig-name descname"><span class="pre">partitioner</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">bag</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Bag</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nb_partitions</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#impresso_commons.utils.daskutils.partitioner" title="Link to this definition"></a></dt>
+<dd><p>Partition a Dask bag into n partitions and write each to a separate file.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>bag</strong> (<em>dask.bag.Bag</em>) – The Dask bag to be partitioned.</p></li>
+<li><p><strong>path</strong> (<em>str</em>) – Directory path where partitioned files will be saved.</p></li>
+<li><p><strong>nb_partitions</strong> (<em>int</em>) – Number of partitions to create.</p></li>
+</ul>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>The function writes partitioned files to the specified path.</p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p>None</p>
+</dd>
+</dl>
 </dd></dl>
 
 </section>
diff --git a/docs/_build/html/versioning.html b/docs/_build/html/versioning.html
index b1c087d..06c3d43 100644
--- a/docs/_build/html/versioning.html
+++ b/docs/_build/html/versioning.html
@@ -1,14 +1,12 @@
 <!DOCTYPE html>
-<html class="writer-html5" lang="en" data-content_root="./">
+<html class="writer-html5" lang="en" >
 <head>
-  <meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+  <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1.0" />
   <title>Data Versioning &mdash; Impresso PyCommons  documentation</title>
-      <link rel="stylesheet" type="text/css" href="_static/pygments.css?v=80d5e7a1" />
-      <link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
-
-  
+      <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
+      <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
   <!--[if lt IE 9]>
     <script src="_static/js/html5shiv.min.js"></script>
   <![endif]-->
@@ -16,7 +14,7 @@
         <script src="_static/jquery.js?v=5d32c60e"></script>
         <script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
         <script src="_static/documentation_options.js?v=5929fcd5"></script>
-        <script src="_static/doctools.js?v=9a2dae69"></script>
+        <script src="_static/doctools.js?v=888ff710"></script>
         <script src="_static/sphinx_highlight.js?v=dc90522c"></script>
     <script src="_static/js/theme.js"></script>
     <link rel="index" title="Index" href="genindex.html" />
@@ -1529,7 +1527,7 @@ <h1>Data Versioning<a class="headerlink" href="#data-versioning" title="Link to
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>mnf_json</strong> (<em>dict</em>) – A dictionary containing manifest data.</p></li>
 <li><p><strong>extended_summary</strong> (<em>bool</em><em>, </em><em>optional</em>) – Whether to include extended summary</p></li>
-<li><p><strong>False.</strong> (<em>with year statistics. Defaults to</em>)</p></li>
+<li><p><strong>False.</strong> (<em>with year statistics. Defaults to</em>) – </p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns<span class="colon">:</span></dt>
diff --git a/impresso_commons/__init__.py b/impresso_commons/__init__.py
index a82b376..72f26f5 100644
--- a/impresso_commons/__init__.py
+++ b/impresso_commons/__init__.py
@@ -1 +1 @@
-__version__ = "1.1.1"
+__version__ = "1.1.2"
diff --git a/impresso_commons/utils/__init__.py b/impresso_commons/utils/__init__.py
index 6c03d10..7b85070 100644
--- a/impresso_commons/utils/__init__.py
+++ b/impresso_commons/utils/__init__.py
@@ -3,17 +3,17 @@
 # created on 2018.03.27 using PyCharm
 # project impresso-image-acquisition
 
+import datetime
 import logging
+import multiprocessing
 import sys
 import time
-import datetime
 from datetime import timedelta
 
 import dask
-from dask import compute, delayed
+from dask import compute
 from dask.diagnostics import ProgressBar
 from dask.multiprocessing import get as mp_get
-import multiprocessing
 
 logger = logging.getLogger(__name__)
 
@@ -51,7 +51,7 @@ def init_logger(logger, log_level, log_file):
     ch.setFormatter(formatter)
 
     logger.addHandler(ch)
-    logger.info("Logger successfully initialised")
+    logger.info("LOGGER - Logger successfully initialised")
 
     return logger
 
@@ -87,14 +87,18 @@ def user_confirmation(question, default=None):
             sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
 
 
-
 def user_question(variable_to_confirm):
-    answer = user_confirmation(f"Is [{variable_to_confirm}] the correct one to work with?", None)
+    answer = user_confirmation(
+        f"\tIs the following the correct item to work with?\n"
+        f"{variable_to_confirm}",
+        None
+    )
+
     if not answer:
-        logger.info(f"Variable {variable_to_confirm} not confirmed, exiting.")
+        logger.info("Variable not confirmed, exiting.")
         sys.exit()
     else:
-        logger.info(f"Variable {variable_to_confirm} confirmed.")
+        logger.info("Variable confirmed.")
 
 
 def timestamp():
@@ -111,6 +115,7 @@ def timestamp():
 
 class Timer:
     """ Basic timer"""
+
     def __init__(self):
         self.start = time.time()
         self.intermediate = time.time()
diff --git a/impresso_commons/utils/daskutils.py b/impresso_commons/utils/daskutils.py
index a5697d5..d04a006 100644
--- a/impresso_commons/utils/daskutils.py
+++ b/impresso_commons/utils/daskutils.py
@@ -11,29 +11,43 @@
     --config-file=<cf>  json configuration dict specifying various arguments
 """
 
-import os
 import logging
-import docopt
+import os
 
-from dask.diagnostics import ProgressBar
 import dask.bag as db
+import docopt
 import numpy as np
+from dask.bag import Bag
+from dask.diagnostics import ProgressBar
 
-from impresso_commons.utils import init_logger
-from impresso_commons.utils import Timer
 from impresso_commons.path.path_s3 import s3_filter_archives
-from impresso_commons.utils.s3 import get_bucket, read_jsonlines, readtext_jsonlines
-from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT
+from impresso_commons.utils import Timer
+from impresso_commons.utils import init_logger
 from impresso_commons.utils.config_loader import PartitionerConfig
+from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT
+from impresso_commons.utils.s3 import get_bucket, read_jsonlines, readtext_jsonlines
 
 __author__ = "maudehrmann"
 
 logger = logging.getLogger(__name__)
 
 
-def partitioner(bag, path, nbpart):
-    """Partition a bag into n partitions and write each partition in a file"""
-    grouped_items = bag.groupby(lambda x: np.random.randint(500), npartitions=nbpart)
+def partitioner(bag: Bag,
+                path: str,
+                nb_partitions: int) -> None:
+    """
+    Partition a Dask bag into n partitions and write each to a separate file.
+
+    Args:
+        bag (dask.bag.Bag): The Dask bag to be partitioned.
+        path (str): Directory path where partitioned files will be saved.
+        nb_partitions (int): Number of partitions to create.
+
+    Returns:
+        None: The function writes partitioned files to the specified path.
+    """
+    grouped_items = bag.groupby(lambda x: np.random.randint(500),
+                                npartitions=nb_partitions)
     items = grouped_items.map(lambda x: x[1]).flatten()
     path = os.path.join(path, "*.jsonl.bz2")
     with ProgressBar():
@@ -41,12 +55,12 @@ def partitioner(bag, path, nbpart):
 
 
 def create_even_partitions(
-    bucket,
-    config_newspapers,
-    output_dir,
-    local_fs=False,
-    keep_full=False,
-    nb_partition=500,
+        bucket,
+        config_newspapers,
+        output_dir,
+        local_fs=False,
+        keep_full=False,
+        nb_partition=500,
 ):
     """Convert yearly bz2 archives to even bz2 archives, i.e. partitions.
 
diff --git a/impresso_commons/utils/s3.py b/impresso_commons/utils/s3.py
index 5eb9200..2b225ef 100644
--- a/impresso_commons/utils/s3.py
+++ b/impresso_commons/utils/s3.py
@@ -3,19 +3,21 @@
 
 """
 
-import os
-import logging
+import bz2
 import json
+import logging
+import os
 import warnings
-import bz2
 from typing import Union
+
 import boto3
-from smart_open.s3 import iter_bucket
-from smart_open import open as s_open
-from dotenv import load_dotenv
 import botocore
+from dotenv import load_dotenv
+from smart_open import open as s_open
+from smart_open.s3 import iter_bucket
 
 from impresso_commons.utils import _get_cores
+from impresso_commons.utils.utils import bytes_to
 
 logger = logging.getLogger(__name__)
 
@@ -45,7 +47,6 @@ def get_storage_options():
 
 
 def get_s3_client(host_url="https://os.zhdk.cloud.switch.ch/"):
-
     # load environment variables from local .env files
     load_dotenv()
     if host_url is None:
@@ -324,7 +325,6 @@ def get_s3_versions(bucket_name, key_name):
 
 
 def get_s3_versions_client(client, bucket_name, key_name):
-
     versions = client.Bucket(bucket_name).object_versions.filter(Prefix=key_name)
 
     version_ids = [
@@ -397,7 +397,6 @@ def readtext_jsonlines(key_name, bucket_name):
 
 
 def upload(partition_name, newspaper_prefix=None, bucket_name=None):
-
     if newspaper_prefix is not None:
         key_name = os.path.join("/", newspaper_prefix, partition_name.split("/")[-1])
     else:
@@ -461,6 +460,7 @@ def fixed_s3fs_glob(path: str, boto3_bucket=None):
         bucket_name = boto3_bucket.name
         base_path = path
     base_path, suffix_path = base_path.split("*")
+
     filenames = [
         "s3://"
         + os.path.join(
@@ -469,6 +469,44 @@ def fixed_s3fs_glob(path: str, boto3_bucket=None):
         for o in boto3_bucket.objects.filter(Prefix=base_path)
         if o.key.endswith(suffix_path)
     ]
+
+    return filenames
+
+
+def s3_glob_with_size(path: str, boto3_bucket=None):
+    """
+    Custom glob function to list S3 objects matching a pattern. This function
+    works around the 1000-object listing limit in S3 by using boto3 directly.
+
+    Args:
+        path (str): The S3 path with a wildcard (*) to match files.
+                    Example: `s3://bucket_name/path/to/files/*.txt`.
+        boto3_bucket (boto3.Bucket, optional): An optional boto3 Bucket object.
+                                               If not provided, it will be
+                                               created from the path.
+
+    Returns:
+        list: A list of tuples containing the full S3 path of each matching
+        file and its size in megabytes, rounded to six decimal places.
+    """
+    if boto3_bucket is None:
+        if path.startswith("s3://"):
+            path = path[len("s3://") :]
+        bucket_name = path.split("/")[0]
+        base_path = "/".join(path.split("/")[1:])  # Remove bucket name
+        boto3_bucket = get_boto3_bucket(bucket_name)
+    else:
+        bucket_name = boto3_bucket.name
+        base_path = path
+
+    base_path, suffix_path = base_path.split("*")
+
+    filenames = [
+        ("s3://" + os.path.join(bucket_name, o.key), round(bytes_to(o.size, "m"), 6))
+        for o in boto3_bucket.objects.filter(Prefix=base_path)
+        if o.key.endswith(suffix_path)
+    ]
+
     return filenames
 
 
@@ -502,6 +540,33 @@ def alternative_read_text(
     return text
 
 
+def list_s3_directories(bucket_name, prefix=""):
+    """
+    Retrieve the 'directory' names (media titles) in an S3 bucket after a
+    given path prefix.
+
+    Args:
+        bucket_name (str): The name of the S3 bucket.
+        prefix (str): The prefix path within the bucket to search. Default
+                      is the root ('').
+
+    Returns:
+        list: A list of 'directory' names found in the specified bucket
+        and prefix.
+    """
+    logger.info(f"Listing 'folders' of '{bucket_name}' under prefix '{prefix}'")
+    s3 = get_s3_client()
+    result = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter="/")
+
+    directories = []
+    if "CommonPrefixes" in result:
+        directories = [
+            prefix["Prefix"][:-1].split("/")[-1] for prefix in result["CommonPrefixes"]
+        ]
+    logger.info(f"Returning {len(directories)} directories.")
+    return directories
+
+
 def get_s3_object_size(bucket_name, key):
     """
     Get the size of an object (key) in an S3 bucket.
diff --git a/impresso_commons/utils/utils.py b/impresso_commons/utils/utils.py
index 137950e..94a4129 100644
--- a/impresso_commons/utils/utils.py
+++ b/impresso_commons/utils/utils.py
@@ -9,6 +9,7 @@
 from typing import Any, Optional
 import jsonschema
 import importlib_resources
+import glob
 
 logger = logging.getLogger(__name__)
 
@@ -128,3 +129,39 @@ def bytes_to(bytes_nb: int, to_unit: str, bsize: int = 1024) -> float:
     """
     units = {"k": 1, "m": 2, "g": 3, "t": 4, "p": 5, "e": 6}
     return float(bytes_nb) / (bsize ** units[to_unit])
+
+
+def glob_with_size(directory: str, file_suffix: str) -> list[tuple[str, float]]:
+    """
+    List all files in a directory with a given suffix and their size in MB.
+
+    Args:
+        directory (str): The directory path to search for files.
+        file_suffix (str): The file extension or suffix to match.
+
+    Returns:
+        list[tuple[str, float]]: A list of tuples, each containing the file
+        path and its size in megabytes, rounded to six decimal places.
+    """
+    file_paths = glob.glob(
+        os.path.join(directory, "*"),
+        include_hidden=False
+    )
+    files = [
+        (
+            path,
+            round(bytes_to(os.path.getsize(path), "m"), 6)
+        )
+        for path in file_paths
+        if path.endswith(file_suffix)
+    ]
+
+    return files
+
+
+def list_local_directories(path):
+    return [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
+
+
+def get_list_intersection(list1, list2):
+    return list(set(list1).intersection(list2))