diff --git a/docs/_build/doctrees/environment.pickle b/docs/_build/doctrees/environment.pickle index fbd2236..fc072a8 100644 Binary files a/docs/_build/doctrees/environment.pickle and b/docs/_build/doctrees/environment.pickle differ diff --git a/docs/_build/doctrees/images.doctree b/docs/_build/doctrees/images.doctree index ade1033..a2c17a4 100644 Binary files a/docs/_build/doctrees/images.doctree and b/docs/_build/doctrees/images.doctree differ diff --git a/docs/_build/doctrees/index.doctree b/docs/_build/doctrees/index.doctree index 3ac58e9..1590498 100644 Binary files a/docs/_build/doctrees/index.doctree and b/docs/_build/doctrees/index.doctree differ diff --git a/docs/_build/doctrees/io.doctree b/docs/_build/doctrees/io.doctree index 7873d90..d2c1131 100644 Binary files a/docs/_build/doctrees/io.doctree and b/docs/_build/doctrees/io.doctree differ diff --git a/docs/_build/doctrees/rebuild.doctree b/docs/_build/doctrees/rebuild.doctree index d9bb08f..dc4e806 100644 Binary files a/docs/_build/doctrees/rebuild.doctree and b/docs/_build/doctrees/rebuild.doctree differ diff --git a/docs/_build/doctrees/utils.doctree b/docs/_build/doctrees/utils.doctree index 4245d73..0cc5e7c 100644 Binary files a/docs/_build/doctrees/utils.doctree and b/docs/_build/doctrees/utils.doctree differ diff --git a/docs/_build/doctrees/versioning.doctree b/docs/_build/doctrees/versioning.doctree index 3f1affe..34cb70f 100644 Binary files a/docs/_build/doctrees/versioning.doctree and b/docs/_build/doctrees/versioning.doctree differ diff --git a/docs/_build/html/.buildinfo b/docs/_build/html/.buildinfo index 9501185..d9e8c4b 100644 --- a/docs/_build/html/.buildinfo +++ b/docs/_build/html/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 
-config: 22fc4b3edf789b51746d48473d45c93f +config: 8c31fa3be0c6ebef3824e4e08997d35b tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/_build/html/_static/basic.css b/docs/_build/html/_static/basic.css index f316efc..30fee9d 100644 --- a/docs/_build/html/_static/basic.css +++ b/docs/_build/html/_static/basic.css @@ -4,7 +4,7 @@ * * Sphinx stylesheet -- basic theme. * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ diff --git a/docs/_build/html/_static/doctools.js b/docs/_build/html/_static/doctools.js index 4d67807..d06a71d 100644 --- a/docs/_build/html/_static/doctools.js +++ b/docs/_build/html/_static/doctools.js @@ -4,7 +4,7 @@ * * Base JavaScript utilities for all Sphinx HTML documentation. * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ diff --git a/docs/_build/html/_static/language_data.js b/docs/_build/html/_static/language_data.js index 367b8ed..250f566 100644 --- a/docs/_build/html/_static/language_data.js +++ b/docs/_build/html/_static/language_data.js @@ -5,7 +5,7 @@ * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. 
* */ @@ -13,7 +13,7 @@ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; -/* Non-minified version is copied as a separate JS file, if available */ +/* Non-minified version is copied as a separate JS file, is available */ /** * Porter Stemmer diff --git a/docs/_build/html/_static/searchtools.js b/docs/_build/html/_static/searchtools.js index b08d58c..7918c3f 100644 --- a/docs/_build/html/_static/searchtools.js +++ b/docs/_build/html/_static/searchtools.js @@ -4,7 +4,7 @@ * * Sphinx JavaScript utilities for the full-text search. * - * :copyright: Copyright 2007-2024 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ @@ -99,7 +99,7 @@ const _displayItem = (item, searchTerms, highlightTerms) => { .then((data) => { if (data) listItem.appendChild( - Search.makeSearchSummary(data, searchTerms, anchor) + Search.makeSearchSummary(data, searchTerms) ); // highlight search terms in the summary if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js @@ -116,8 +116,8 @@ const _finishSearch = (resultCount) => { ); else Search.status.innerText = _( - "Search finished, found ${resultCount} page(s) matching the search query." - ).replace('${resultCount}', resultCount); + `Search finished, found ${resultCount} page(s) matching the search query.` + ); }; const _displayNextItem = ( results, @@ -137,22 +137,6 @@ const _displayNextItem = ( // search finished, update title and status message else _finishSearch(resultCount); }; -// Helper function used by query() to order search results. -// Each input is an array of [docname, title, anchor, descr, score, filename]. 
-// Order the results by score (in opposite order of appearance, since the -// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. -const _orderResultsByScoreThenName = (a, b) => { - const leftScore = a[4]; - const rightScore = b[4]; - if (leftScore === rightScore) { - // same score: sort alphabetically - const leftTitle = a[1].toLowerCase(); - const rightTitle = b[1].toLowerCase(); - if (leftTitle === rightTitle) return 0; - return leftTitle > rightTitle ? -1 : 1; // inverted is intentional - } - return leftScore > rightScore ? 1 : -1; -}; /** * Default splitQuery function. Can be overridden in ``sphinx.search`` with a @@ -176,26 +160,13 @@ const Search = { _queued_query: null, _pulse_status: -1, - htmlToText: (htmlString, anchor) => { + htmlToText: (htmlString) => { const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); - for (const removalQuery of [".headerlink", "script", "style"]) { - htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); - } - if (anchor) { - const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); - if (anchorContent) return anchorContent.textContent; - - console.warn( - `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` - ); - } - - // if anchor not specified or not found, fall back to main content + htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); const docContent = htmlElement.querySelector('[role="main"]'); - if (docContent) return docContent.textContent; - + if (docContent !== undefined) return docContent.textContent; console.warn( - "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." + "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." 
); return ""; }, @@ -268,7 +239,16 @@ const Search = { else Search.deferQuery(query); }, - _parseQuery: (query) => { + /** + * execute search (requires search index to be loaded) + */ + query: (query) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + // stem the search terms and add them to the correct list const stemmer = new Stemmer(); const searchTerms = new Set(); @@ -304,38 +284,21 @@ const Search = { // console.info("required: ", [...searchTerms]); // console.info("excluded: ", [...excludedTerms]); - return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; - }, - - /** - * execute search (requires search index to be loaded) - */ - _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { - const filenames = Search._index.filenames; - const docNames = Search._index.docnames; - const titles = Search._index.titles; - const allTitles = Search._index.alltitles; - const indexEntries = Search._index.indexentries; - - // Collect multiple result groups to be sorted separately and then ordered. - // Each is an array of [docname, title, anchor, descr, score, filename]. 
- const normalResults = []; - const nonMainIndexResults = []; - + // array of [docname, title, anchor, descr, score, filename] + let results = []; _removeChildren(document.getElementById("search-progress")); - const queryLower = query.toLowerCase().trim(); + const queryLower = query.toLowerCase(); for (const [title, foundTitles] of Object.entries(allTitles)) { - if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { + if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { for (const [file, id] of foundTitles) { - const score = Math.round(Scorer.title * queryLower.length / title.length); - const boost = titles[file] === title ? 1 : 0; // add a boost for document titles - normalResults.push([ + let score = Math.round(100 * queryLower.length / title.length) + results.push([ docNames[file], titles[file] !== title ? `${titles[file]} > ${title}` : title, id !== null ? "#" + id : "", null, - score + boost, + score, filenames[file], ]); } @@ -345,47 +308,46 @@ const Search = { // search for explicit entries in index directives for (const [entry, foundEntries] of Object.entries(indexEntries)) { if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { - for (const [file, id, isMain] of foundEntries) { - const score = Math.round(100 * queryLower.length / entry.length); - const result = [ + for (const [file, id] of foundEntries) { + let score = Math.round(100 * queryLower.length / entry.length) + results.push([ docNames[file], titles[file], id ? 
"#" + id : "", null, score, filenames[file], - ]; - if (isMain) { - normalResults.push(result); - } else { - nonMainIndexResults.push(result); - } + ]); } } } // lookup as object objectTerms.forEach((term) => - normalResults.push(...Search.performObjectSearch(term, objectTerms)) + results.push(...Search.performObjectSearch(term, objectTerms)) ); // lookup as search terms in fulltext - normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); // let the scorer override scores with a custom scoring function - if (Scorer.score) { - normalResults.forEach((item) => (item[4] = Scorer.score(item))); - nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); - } - - // Sort each group of results by score and then alphabetically by name. - normalResults.sort(_orderResultsByScoreThenName); - nonMainIndexResults.sort(_orderResultsByScoreThenName); - - // Combine the result groups in (reverse) order. - // Non-main index entries are typically arbitrary cross-references, - // so display them after other results. - let results = [...nonMainIndexResults, ...normalResults]; + if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); + + // now sort the results by score (in opposite order of appearance, since the + // display function below uses pop() to retrieve items) and then + // alphabetically + results.sort((a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 
1 : -1; + }); // remove duplicate search results // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept @@ -399,12 +361,7 @@ const Search = { return acc; }, []); - return results.reverse(); - }, - - query: (query) => { - const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); - const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); + results = results.reverse(); // for debugging //Search.lastresults = results.slice(); // a copy @@ -509,18 +466,14 @@ const Search = { // add support for partial matches if (word.length > 2) { const escapedWord = _escapeRegExp(word); - if (!terms.hasOwnProperty(word)) { - Object.keys(terms).forEach((term) => { - if (term.match(escapedWord)) - arr.push({ files: terms[term], score: Scorer.partialTerm }); - }); - } - if (!titleTerms.hasOwnProperty(word)) { - Object.keys(titleTerms).forEach((term) => { - if (term.match(escapedWord)) - arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); - }); - } + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord) && !terms[word]) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord) && !titleTerms[word]) + arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); + }); } // no match but word was a required one @@ -543,8 +496,9 @@ const Search = { // create the mapping files.forEach((file) => { - if (!fileMap.has(file)) fileMap.set(file, [word]); - else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); + if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1) + fileMap.get(file).push(word); + else fileMap.set(file, [word]); }); }); @@ -595,8 +549,8 @@ const Search = { * search summary for a given text. keywords is a list * of stemmed words. 
*/ - makeSearchSummary: (htmlText, keywords, anchor) => { - const text = Search.htmlToText(htmlText, anchor); + makeSearchSummary: (htmlText, keywords) => { + const text = Search.htmlToText(htmlText); if (text === "") return null; const textLower = text.toLowerCase(); diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html index 58a0354..398032e 100644 --- a/docs/_build/html/genindex.html +++ b/docs/_build/html/genindex.html @@ -1,13 +1,11 @@ - + Index — Impresso PyCommons documentation - - - - + + @@ -15,7 +13,7 @@ - + @@ -314,6 +312,8 @@

G

  • get_issueshortpath() (in module impresso_commons.path.path_fs)
  • get_jpg() (in module impresso_commons.images.img_utils) +
  • +
  • get_list_intersection() (in module impresso_commons.utils.utils)
  • get_media_item_years() (in module impresso_commons.versioning.helpers)
  • @@ -348,6 +348,8 @@

    G

  • get_tif() (in module impresso_commons.images.img_utils)
  • git_commit_push() (in module impresso_commons.versioning.helpers) +
  • +
  • glob_with_size() (in module impresso_commons.utils.utils)
  • granularity (impresso_commons.versioning.data_statistics.DataStatistics attribute) @@ -536,12 +538,16 @@

    L

  • LANGIDENT (impresso_commons.versioning.helpers.DataStage attribute)
  • LINGUISTIC_PROCESSING (impresso_commons.versioning.helpers.DataStage attribute) +
  • +
  • list_files() (in module impresso_commons.path.path_s3)
  • @@ -727,6 +733,8 @@

    S

  • s3_get_articles() (in module impresso_commons.utils.s3)
  • s3_get_pages() (in module impresso_commons.utils.s3) +
  • +
  • s3_glob_with_size() (in module impresso_commons.utils.s3)
  • s3_iter_bucket() (in module impresso_commons.path.path_s3)
  • diff --git a/docs/_build/html/images.html b/docs/_build/html/images.html index f252ddb..0196e80 100644 --- a/docs/_build/html/images.html +++ b/docs/_build/html/images.html @@ -1,14 +1,12 @@ - + - + Image handling — Impresso PyCommons documentation - - - - + + @@ -16,7 +14,7 @@ - + diff --git a/docs/_build/html/index.html b/docs/_build/html/index.html index 1592d5a..4e07105 100644 --- a/docs/_build/html/index.html +++ b/docs/_build/html/index.html @@ -1,14 +1,12 @@ - + - + Welcome to Impresso PyCommons’s documentation! — Impresso PyCommons documentation - - - - + + @@ -16,7 +14,7 @@ - + diff --git a/docs/_build/html/io.html b/docs/_build/html/io.html index 89339a8..259d623 100644 --- a/docs/_build/html/io.html +++ b/docs/_build/html/io.html @@ -1,14 +1,12 @@ - + - + Input/Output — Impresso PyCommons documentation - - - - + + @@ -16,7 +14,7 @@ - + diff --git a/docs/_build/html/objects.inv b/docs/_build/html/objects.inv index 166a721..e514470 100644 Binary files a/docs/_build/html/objects.inv and b/docs/_build/html/objects.inv differ diff --git a/docs/_build/html/py-modindex.html b/docs/_build/html/py-modindex.html index 5474c92..be06ca2 100644 --- a/docs/_build/html/py-modindex.html +++ b/docs/_build/html/py-modindex.html @@ -1,13 +1,11 @@ - + Python Module Index — Impresso PyCommons documentation - - - - + + @@ -15,7 +13,7 @@ - + diff --git a/docs/_build/html/rebuild.html b/docs/_build/html/rebuild.html index f01c479..7491e7e 100644 --- a/docs/_build/html/rebuild.html +++ b/docs/_build/html/rebuild.html @@ -1,14 +1,12 @@ - + - + Text Rebuild — Impresso PyCommons documentation - - - - + + @@ -16,7 +14,7 @@ - + @@ -204,7 +202,7 @@

    Rebuild functionsParameters:
    • level (int) – desired level of logging (default: logging.INFO)

    • -
    • file (str)

    • +
    • file (str) –

    Returns:
    diff --git a/docs/_build/html/search.html b/docs/_build/html/search.html index d737643..7258c61 100644 --- a/docs/_build/html/search.html +++ b/docs/_build/html/search.html @@ -1,13 +1,11 @@ - + Search — Impresso PyCommons documentation - - - - + + @@ -16,7 +14,7 @@ - + @@ -52,8 +50,11 @@
  • Basic Utils Functions @@ -72,10 +73,12 @@
  • get_s3_versions()
  • get_s3_versions_client()
  • get_storage_options()
  • +
  • list_s3_directories()
  • read_jsonlines()
  • readtext_jsonlines()
  • s3_get_articles()
  • s3_get_pages()
  • +
  • s3_glob_with_size()
  • upload()
  • upload_to_s3()
  • @@ -172,6 +175,11 @@

    Utilities +
    +impresso_commons.utils.utils.get_list_intersection(list1, list2)
    +
    +
    impresso_commons.utils.utils.get_pkg_resource(file_manager: ExitStack, path: str, package: str = 'impresso_commons') PosixPath
    @@ -199,6 +207,30 @@

    Utilities +
    +impresso_commons.utils.utils.glob_with_size(directory: str, file_suffix: str) list[str]
    +

    List all files in a directory with a given suffix and their size in MB.

    +
    +
    Parameters:
    +
      +
    • directory (str) – The directory path to search for files.

    • +
    • file_suffix (str) – The file extension or suffix to match.

    • +
    +
    +
    Returns:
    +

    +
    A list of tuples, each containing the file path and its

    size in megabytes, rounded to six decimal places.

    +
    +
    +

    +
    +
    Return type:
    +

    list[str]

    +
    +
    +

    +
    impresso_commons.utils.utils.init_logger(level: int = 20, file: str | None = None) RootLogger
    @@ -219,6 +251,11 @@

    Utilities +
    +impresso_commons.utils.utils.list_local_directories(path)
    +

    +
    impresso_commons.utils.utils.parse_json(filename)
    @@ -460,6 +497,32 @@

    Utilitiesimpresso_commons.utils.s3.get_storage_options()

    +
    +
    +impresso_commons.utils.s3.list_s3_directories(bucket_name, prefix='')
    +

    Retrieve the ‘directory’ names (media titles) in an S3 bucket after a +given path prefix.

    +
    +
    Parameters:
    +
      +
    • bucket_name (str) – The name of the S3 bucket.

    • +
    • prefix (str) – The prefix path within the bucket to search. Default +is the root (‘’).

    • +
    +
    +
    Returns:
    +

    +
    A list of ‘directory’ names found in the specified bucket

    and prefix.

    +
    +
    +

    +
    +
    Return type:
    +

    list

    +
    +
    +
    +
    impresso_commons.utils.s3.read_jsonlines(key_name, bucket_name)
    @@ -531,6 +594,34 @@

    Utilities +
    +impresso_commons.utils.s3.s3_glob_with_size(path: str, boto3_bucket=None)
    +

    Custom glob function to list S3 objects matching a pattern. This function +works around the 1000-object listing limit in S3 by using boto3 directly.

    +
    +
    Parameters:
    +
      +
    • path (str) – The S3 path with a wildcard (*) to match files. +Example: s3://bucket_name/path/to/files/*.txt.

    • +
    • boto3_bucket (boto3.Bucket, optional) – An optional boto3 Bucket object. +If not provided, it will be +created from the path.

    • +
    +
    +
    Returns:
    +

    +
    A list of tuples containing the full S3 paths of matching files

    and their sizes in megabytes.

    +
    +
    +

    +
    +
    Return type:
    +

    list

    +
    +
    +

    +
    impresso_commons.utils.s3.upload(partition_name, newspaper_prefix=None, bucket_name=None)
    @@ -584,7 +675,7 @@

    Utilities
    -impresso_commons.utils.daskutils.partitioner(bag, path, nbpart)
    -

    Partition a bag into n partitions and write each partition in a file

    +impresso_commons.utils.daskutils.partitioner(bag: Bag, path: str, nb_partitions: int) None +

    Partition a Dask bag into n partitions and write each to a separate file.

    +
    +
    Parameters:
    +
      +
    • bag (dask.bag.Bag) – The Dask bag to be partitioned.

    • +
    • path (str) – Directory path where partitioned files will be saved.

    • +
    • nb_partitions (int) – Number of partitions to create.

    • +
    +
    +
    Returns:
    +

    The function writes partitioned files to the specified path.

    +
    +
    Return type:
    +

    None

    +
    +

    diff --git a/docs/_build/html/versioning.html b/docs/_build/html/versioning.html index b1c087d..06c3d43 100644 --- a/docs/_build/html/versioning.html +++ b/docs/_build/html/versioning.html @@ -1,14 +1,12 @@ - + - + Data Versioning — Impresso PyCommons documentation - - - - + + @@ -16,7 +14,7 @@ - + @@ -1529,7 +1527,7 @@

    Data Versioning
    Returns:
    diff --git a/impresso_commons/__init__.py b/impresso_commons/__init__.py index a82b376..72f26f5 100644 --- a/impresso_commons/__init__.py +++ b/impresso_commons/__init__.py @@ -1 +1 @@ -__version__ = "1.1.1" +__version__ = "1.1.2" diff --git a/impresso_commons/utils/__init__.py b/impresso_commons/utils/__init__.py index 6c03d10..7b85070 100644 --- a/impresso_commons/utils/__init__.py +++ b/impresso_commons/utils/__init__.py @@ -3,17 +3,17 @@ # created on 2018.03.27 using PyCharm # project impresso-image-acquisition +import datetime import logging +import multiprocessing import sys import time -import datetime from datetime import timedelta import dask -from dask import compute, delayed +from dask import compute from dask.diagnostics import ProgressBar from dask.multiprocessing import get as mp_get -import multiprocessing logger = logging.getLogger(__name__) @@ -51,7 +51,7 @@ def init_logger(logger, log_level, log_file): ch.setFormatter(formatter) logger.addHandler(ch) - logger.info("Logger successfully initialised") + logger.info("LOGGER - Logger successfully initialised") return logger @@ -87,14 +87,18 @@ def user_confirmation(question, default=None): sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n") - def user_question(variable_to_confirm): - answer = user_confirmation(f"Is [{variable_to_confirm}] the correct one to work with?", None) + answer = user_confirmation( + f"\tIs the following the correct item to work with?\n" + f"{variable_to_confirm}", + None + ) + if not answer: - logger.info(f"Variable {variable_to_confirm} not confirmed, exiting.") + logger.info("Variable not confirmed, exiting.") sys.exit() else: - logger.info(f"Variable {variable_to_confirm} confirmed.") + logger.info("Variable confirmed.") def timestamp(): @@ -111,6 +115,7 @@ def timestamp(): class Timer: """ Basic timer""" + def __init__(self): self.start = time.time() self.intermediate = time.time() diff --git a/impresso_commons/utils/daskutils.py 
b/impresso_commons/utils/daskutils.py index a5697d5..d04a006 100644 --- a/impresso_commons/utils/daskutils.py +++ b/impresso_commons/utils/daskutils.py @@ -11,29 +11,43 @@ --config-file= json configuration dict specifying various arguments """ -import os import logging -import docopt +import os -from dask.diagnostics import ProgressBar import dask.bag as db +import docopt import numpy as np +from dask.bag import Bag +from dask.diagnostics import ProgressBar -from impresso_commons.utils import init_logger -from impresso_commons.utils import Timer from impresso_commons.path.path_s3 import s3_filter_archives -from impresso_commons.utils.s3 import get_bucket, read_jsonlines, readtext_jsonlines -from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT +from impresso_commons.utils import Timer +from impresso_commons.utils import init_logger from impresso_commons.utils.config_loader import PartitionerConfig +from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT +from impresso_commons.utils.s3 import get_bucket, read_jsonlines, readtext_jsonlines __author__ = "maudehrmann" logger = logging.getLogger(__name__) -def partitioner(bag, path, nbpart): - """Partition a bag into n partitions and write each partition in a file""" - grouped_items = bag.groupby(lambda x: np.random.randint(500), npartitions=nbpart) +def partitioner(bag: Bag, + path: str, + nb_partitions: int) -> None: + """ + Partition a Dask bag into n partitions and write each to a separate file. + + Args: + bag (dask.bag.Bag): The Dask bag to be partitioned. + path (str): Directory path where partitioned files will be saved. + nb_partitions (int): Number of partitions to create. + + Returns: + None: The function writes partitioned files to the specified path. 
+ """ + grouped_items = bag.groupby(lambda x: np.random.randint(500), + npartitions=nb_partitions) items = grouped_items.map(lambda x: x[1]).flatten() path = os.path.join(path, "*.jsonl.bz2") with ProgressBar(): @@ -41,12 +55,12 @@ def partitioner(bag, path, nbpart): def create_even_partitions( - bucket, - config_newspapers, - output_dir, - local_fs=False, - keep_full=False, - nb_partition=500, + bucket, + config_newspapers, + output_dir, + local_fs=False, + keep_full=False, + nb_partition=500, ): """Convert yearly bz2 archives to even bz2 archives, i.e. partitions. diff --git a/impresso_commons/utils/s3.py b/impresso_commons/utils/s3.py index 5eb9200..2b225ef 100644 --- a/impresso_commons/utils/s3.py +++ b/impresso_commons/utils/s3.py @@ -3,19 +3,21 @@ """ -import os -import logging +import bz2 import json +import logging +import os import warnings -import bz2 from typing import Union + import boto3 -from smart_open.s3 import iter_bucket -from smart_open import open as s_open -from dotenv import load_dotenv import botocore +from dotenv import load_dotenv +from smart_open import open as s_open +from smart_open.s3 import iter_bucket from impresso_commons.utils import _get_cores +from impresso_commons.utils.utils import bytes_to logger = logging.getLogger(__name__) @@ -45,7 +47,6 @@ def get_storage_options(): def get_s3_client(host_url="https://os.zhdk.cloud.switch.ch/"): - # load environment variables from local .env files load_dotenv() if host_url is None: @@ -324,7 +325,6 @@ def get_s3_versions(bucket_name, key_name): def get_s3_versions_client(client, bucket_name, key_name): - versions = client.Bucket(bucket_name).object_versions.filter(Prefix=key_name) version_ids = [ @@ -397,7 +397,6 @@ def readtext_jsonlines(key_name, bucket_name): def upload(partition_name, newspaper_prefix=None, bucket_name=None): - if newspaper_prefix is not None: key_name = os.path.join("/", newspaper_prefix, partition_name.split("/")[-1]) else: @@ -461,6 +460,7 @@ def 
fixed_s3fs_glob(path: str, boto3_bucket=None): bucket_name = boto3_bucket.name base_path = path base_path, suffix_path = base_path.split("*") + filenames = [ "s3://" + os.path.join( @@ -469,6 +469,44 @@ def fixed_s3fs_glob(path: str, boto3_bucket=None): for o in boto3_bucket.objects.filter(Prefix=base_path) if o.key.endswith(suffix_path) ] + + return filenames + + +def s3_glob_with_size(path: str, boto3_bucket=None): + """ + Custom glob function to list S3 objects matching a pattern. This function + works around the 1000-object listing limit in S3 by using boto3 directly. + + Args: + path (str): The S3 path with a wildcard (*) to match files. + Example: `s3://bucket_name/path/to/files/*.txt`. + boto3_bucket (boto3.Bucket, optional): An optional boto3 Bucket object. + If not provided, it will be + created from the path. + + Returns: + list: A list of tuples containing the full S3 paths of matching files + and their sizes in megabytes. + """ + if boto3_bucket is None: + if path.startswith("s3://"): + path = path[len("s3://") :] + bucket_name = path.split("/")[0] + base_path = "/".join(path.split("/")[1:]) # Remove bucket name + boto3_bucket = get_boto3_bucket(bucket_name) + else: + bucket_name = boto3_bucket.name + base_path = path + + base_path, suffix_path = base_path.split("*") + + filenames = [ + ("s3://" + os.path.join(bucket_name, o.key), round(bytes_to(o.size, "m"), 6)) + for o in boto3_bucket.objects.filter(Prefix=base_path) + if o.key.endswith(suffix_path) + ] + return filenames @@ -502,6 +540,33 @@ def alternative_read_text( return text +def list_s3_directories(bucket_name, prefix=""): + """ + Retrieve the 'directory' names (media titles) in an S3 bucket after a + given path prefix. + + Args: + bucket_name (str): The name of the S3 bucket. + prefix (str): The prefix path within the bucket to search. Default + is the root (''). + + Returns: + list: A list of 'directory' names found in the specified bucket + and prefix. 
+ """ + logger.info(f"Listing 'folders' of '{bucket_name}' under prefix '{prefix}'") + s3 = get_s3_client() + result = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter="/") + + directories = [] + if "CommonPrefixes" in result: + directories = [ + entry["Prefix"][:-1].split("/")[-1] for entry in result["CommonPrefixes"] + ] + logger.info(f"Returning {len(directories)} directories.") + return directories + + def get_s3_object_size(bucket_name, key): """ Get the size of an object (key) in an S3 bucket. diff --git a/impresso_commons/utils/utils.py b/impresso_commons/utils/utils.py index 137950e..94a4129 100644 --- a/impresso_commons/utils/utils.py +++ b/impresso_commons/utils/utils.py @@ -9,6 +9,7 @@ from typing import Any, Optional import jsonschema import importlib_resources +import glob logger = logging.getLogger(__name__) @@ -128,3 +129,41 @@ def bytes_to(bytes_nb: int, to_unit: str, bsize: int = 1024) -> float: """ units = {"k": 1, "m": 2, "g": 3, "t": 4, "p": 5, "e": 6} return float(bytes_nb) / (bsize ** units[to_unit]) + + +def glob_with_size(directory: str, file_suffix: str) -> list[tuple[str, float]]: + """ + List all files in a directory with a given suffix and their size in MB. + + Args: + directory (str): The directory path to search for files. + file_suffix (str): The file extension or suffix to match. + + Returns: + list[tuple[str, float]]: A list of tuples, each containing the file path and its + size in megabytes, rounded to six decimal places. + """ + file_paths = glob.glob( + os.path.join(directory, "*"), + include_hidden=False + ) + files = [ + ( + path, + round(bytes_to(os.path.getsize(path), "m"), 6) + ) + for path in file_paths + if path.endswith(file_suffix) + ] + + return files + + +def list_local_directories(path): + """Return the names of the sub-directories located directly under `path`.""" + return [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] + + +def get_list_intersection(list1, list2): + """Return the elements present in both lists, de-duplicated and in arbitrary order.""" + return list(set(list1).intersection(list2))