Commit
Simplify index loading (#443)
* simplifies index loading; moves remote index support to downloaders; adds subdirectory option to downloader

* fix builtins

* bump coverage

* patch mypy errors; opened #449

Co-authored-by: Rachel Bittner <[email protected]>
rabitt and Rachel Bittner authored Feb 1, 2021
1 parent b0a6fd9 commit 54cbacb
Showing 43 changed files with 189 additions and 4,573 deletions.
25 changes: 9 additions & 16 deletions docs/source/contributing.rst
@@ -310,27 +310,20 @@ Working with remote indexes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 For the end-user there is no difference between the remote and local indexes. However, indexes can get large when there are a lot of tracks
-in the dataset. In these cases, storing and accessing an index remotely can be convenient.
-
-However, to contribute to the library using remote indexes you have to add in ``utils.LargeData(...)`` the remote_index argument with a
-``download_utils.RemoteFileMetadata`` dictionary with the remote index information.
+in the dataset. In these cases, storing and accessing an index remotely can be convenient. Large indexes can be added to REMOTES,
+and will be downloaded with the rest of the dataset. For example:
 
 .. code-block:: python
 
-    REMOTE_INDEX = {
-        "REMOTE_INDEX": download_utils.RemoteFileMetadata(
-            filename="acousticbrainz_genre_index.json.zip",
-            url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
-            checksum="810f1c003f53cbe58002ba96e6d4d138",
-            destination_dir="",
-        )
-    }
-
-    DATA = utils.LargeData("acousticbrainz_genre_index.json", remote_index=REMOTE_INDEX)
+    "index": download_utils.RemoteFileMetadata(
+        filename="acousticbrainz_genre_index.json.zip",
+        url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
+        checksum="810f1c003f53cbe58002ba96e6d4d138",
+    )
+
+Unlike local indexes, the remote indexes will live in the ``data_home`` directory. When creating the ``Dataset``
+object, specify the ``custom_index_path`` to where the index will be downloaded (as a relative path to ``data_home``).
 
 
 .. _reducing_test_space:
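To make the updated instructions concrete, a loader that needs a remote index now lists it under the "index" key of REMOTES and passes custom_index_path to the Dataset constructor. The sketch below is hypothetical: the dataset name, Zenodo record, and checksum are placeholders, and a real loader would pass its own Track class, BIBTEX, and LICENSE_INFO rather than the stand-ins used here.

from mirdata import core, download_utils

# Hypothetical loader module; filename, url, and checksum are placeholders.
REMOTES = {
    "index": download_utils.RemoteFileMetadata(
        filename="example_index.json.zip",
        url="https://zenodo.org/record/0000000/files/example_index.json.zip?download=1",
        checksum="00000000000000000000000000000000",
    ),
    # ... remotes for the audio and annotation files go here ...
}


class Dataset(core.Dataset):
    """Hypothetical dataset whose index is downloaded rather than bundled."""

    def __init__(self, data_home=None):
        super().__init__(
            data_home,
            name="example",
            track_class=core.Track,  # a real loader passes its own Track subclass
            bibtex="@misc{example2021}",  # placeholder citation
            remotes=REMOTES,
            license_info="See the dataset webpage.",  # placeholder
            # the downloaded index lives under data_home, not in mirdata/datasets/indexes
            custom_index_path="example_index.json",
        )

The acousticbrainz_genre changes further down in this commit follow exactly this pattern.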
3 changes: 0 additions & 3 deletions docs/source/contributing_examples/example.py
@@ -63,8 +63,6 @@
The dataset's license information goes here.
"""

DATA = core.LargeData('example_index.json')


class Track(core.Track):
"""Example track class
@@ -234,7 +232,6 @@ class Dataset(core.Dataset):
def __init__(self, data_home=None):
super().__init__(
data_home,
index=DATA.index,
name=NAME,
track_class=Track,
bibtex=BIBTEX,
2 changes: 0 additions & 2 deletions docs/source/tutorial.rst
@@ -106,13 +106,11 @@ list is passed to the ``download()`` function through the ``partial_download`` v
filename="cante100Meta.xml",
url="https://zenodo.org/record/1322542/files/cante100Meta.xml?download=1",
checksum="6cce186ce77a06541cdb9f0a671afb46", # the md5 checksum
destination_dir=None, # relative path for where to unzip the data, or None
),
"README": download_utils.RemoteFileMetadata(
filename="cante100_README.txt",
url="https://zenodo.org/record/1322542/files/cante100_README.txt?download=1",
checksum="184209b7e7d816fa603f0c7f481c0aae", # the md5 checksum
destination_dir=None, # relative path for where to unzip the data, or None
),
}
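The two deleted destination_dir lines reflect a small API change: destination_dir can now be omitted, and when given it is a relative subdirectory to unpack into (the old inline comment already described it as "relative path for where to unzip the data"). A rough illustration with placeholder files; the second form mirrors the acousticbrainz remotes later in this commit.

from mirdata import download_utils

# destination_dir can simply be omitted when files should land directly in the save directory ...
meta = download_utils.RemoteFileMetadata(
    filename="example_metadata.xml",
    url="https://example.org/example_metadata.xml",
    checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder md5
)

# ... or set to a relative subdirectory, as the acousticbrainz remotes below do.
features = download_utils.RemoteFileMetadata(
    filename="example_features.tar.bz2",
    url="https://example.org/example_features.tar.bz2",
    checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder md5
    destination_dir="example-features",
)

download_utils.downloader("/tmp/mir_datasets/example", remotes={"metadata": meta, "features": features})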
61 changes: 22 additions & 39 deletions mirdata/core.py
@@ -97,30 +97,39 @@ class Dataset(object):
def __init__(
self,
data_home=None,
index=None,
name=None,
track_class=None,
bibtex=None,
remotes=None,
download_info=None,
license_info=None,
custom_index_path=None,
):
"""Dataset init method
Args:
data_home (str or None): path where mirdata will look for the dataset
index (dict or None): the dataset's file index
name (str or None): the identifier of the dataset
track_class (mirdata.core.Track or None): a Track class
bibtex (str or None): dataset citation/s in bibtex format
remotes (dict or None): data to be downloaded
download_info (str or None): download instructions or caveats
license_info (str or None): license of the dataset
custom_index_path (str or None): overwrites the default index path for remote indexes
"""
self.name = name
self.data_home = self.default_path if data_home is None else data_home
self._index = index
if custom_index_path:
self.index_path = os.path.join(self.data_home, custom_index_path)
self.remote_index = True
else:
self.index_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"datasets/indexes",
"{}_index.json".format(self.name),
)
self.remote_index = False
self._track_class = track_class
self.bibtex = bibtex
self.remotes = remotes
@@ -146,6 +155,16 @@ def __repr__(self):

return repr_string

@cached_property
def _index(self):
if self.remote_index and not os.path.exists(self.index_path):
raise FileNotFoundError(
"This dataset's index is not available locally. You may need to first run .download()"
)
with open(self.index_path) as fhandle:
index = json.load(fhandle)
return index

@cached_property
def _metadata(self):
return None
@@ -298,7 +317,6 @@ def __init__(
data_home (str): path where mirdata will look for the dataset
dataset_name (str): the identifier of the dataset
index (dict): the dataset's file index
Typically accessed via the .index attribute of a LargeData object
metadata (dict or None): a dictionary of metadata or None
"""
@@ -474,12 +492,6 @@ def get_mix(self):
return self.get_target(list(self.tracks.keys()))


def load_json_index(filename):
working_dir = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(working_dir, "datasets/indexes", filename)) as f:
return json.load(f)


def none_path_join(partial_path_list):
"""Join a list of partial paths. If any part of the path is None,
returns None.
@@ -495,32 +507,3 @@ def none_path_join(partial_path_list):
return None
else:
return os.path.join(*partial_path_list)


class LargeData(object):
def __init__(self, index_file, remote_index=None):
"""Object which loads and caches large data the first time it's accessed.
Args:
index_file: str
File name of checksum index file to be passed to `load_json_index`
Cached Properties:
index (dict): dataset index
"""
self._metadata = None
self.index_file = index_file
self.remote_index = remote_index

@cached_property
def index(self):
if self.remote_index is not None:
working_dir = os.path.dirname(os.path.realpath(__file__))
path_index_file = os.path.join(
working_dir, "datasets/indexes", self.index_file
)
if not os.path.isfile(path_index_file):
path_indexes = os.path.join(working_dir, "datasets/indexes")
download_utils.downloader(path_indexes, remotes=self.remote_index)
return load_json_index(self.index_file)
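With load_json_index and LargeData removed, the new cached _index property above does the work: bundled indexes are resolved inside mirdata/datasets/indexes, while remote indexes resolve under data_home and raise FileNotFoundError until they have been downloaded. A minimal usage sketch, assuming the acousticbrainz_genre module shown next, a writable data_home, and the standard top-level "tracks" key used by mirdata index files:

from mirdata.datasets import acousticbrainz_genre

dataset = acousticbrainz_genre.Dataset(data_home="/tmp/mir_datasets/acousticbrainz_genre")

# The index is itself a remote for this dataset, so it only exists locally after download.
try:
    dataset._index  # cached_property; nothing is cached if this raises
except FileNotFoundError:
    dataset.download(partial_download=["index"])

# The first successful access reads the JSON file once and caches the result.
print(len(dataset._index["tracks"]))

In ordinary user code the index stays an implementation detail; it is only accessed directly here to show the lazy loading.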
130 changes: 27 additions & 103 deletions mirdata/datasets/acousticbrainz_genre.py
@@ -40,8 +40,6 @@
"""

import json
import os
import shutil

from mirdata import download_utils, core, io
from mirdata import jams_utils
@@ -59,77 +57,82 @@
}
"""
REMOTES = {
"index": download_utils.RemoteFileMetadata(
filename="acousticbrainz_genre_index.json.zip",
url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
checksum="810f1c003f53cbe58002ba96e6d4d138",
),
"validation-01": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-validation-01234567.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-validation-01234567.tar.bz2?download=1",
checksum="f21f9c5e398713139cca9790b656faf9",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-validation",
unpack_directories=["acousticbrainz-mediaeval-validation"],
),
"validation-89": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-validation-89abcdef.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-validation-89abcdef.tar.bz2?download=1",
checksum="34f47394ac6d8face4399f48e2b98ebe",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-validation",
unpack_directories=["acousticbrainz-mediaeval-validation"],
),
"train-01": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features--train-01.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features--train-01.tar.bz2?download=1",
checksum="db7157b5112022d609652dd21c632090",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-23": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-23.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-23.tar.bz2?download=1",
checksum="79581967a1be5c52e83be21261d1ef6c",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-45": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-45.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-45.tar.bz2?download=1",
checksum="0e48fa319fa48e5cf95eea8118d2e882",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-67": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-67.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-67.tar.bz2?download=1",
checksum="22ca7f1fea8a86459b7fda4530f00070",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-89": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-89.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-89.tar.bz2?download=1",
checksum="c6e4a2ef1b0e8ed535197b868f8c7302",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-ab": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-ab.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-ab.tar.bz2?download=1",
checksum="513d5f306dd4f3799c137423ee444051",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-cd": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-cd.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-cd.tar.bz2?download=1",
checksum="422d75d70d583decec0b2761865092a7",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-ef": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-ef.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-ef.tar.bz2?download=1",
checksum="021ab25a5fd1b020521824e7fce9c775",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
}
REMOTE_INDEX = {
"REMOTE_INDEX": download_utils.RemoteFileMetadata(
filename="acousticbrainz_genre_index.json.zip",
url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
checksum="810f1c003f53cbe58002ba96e6d4d138",
destination_dir="",
)
}

DATA = core.LargeData("acousticbrainz_genre_index.json", remote_index=REMOTE_INDEX)

LICENSE_INFO = """
This dataset is composed of 4 subdatasets. Three of them are Creative Commons Attribution
@@ -370,100 +373,21 @@ class Dataset(core.Dataset):
The acousticbrainz genre dataset
"""

def __init__(
self,
data_home=None,
remote_index=None,
remote_index_name=None,
):
if remote_index and remote_index_name:
data = core.LargeData(remote_index_name, remote_index=remote_index)
index = data.index
else:
index = None

def __init__(self, data_home=None):
super().__init__(
data_home,
index=DATA.index if index is None else index,
name=NAME,
track_class=Track,
bibtex=BIBTEX,
remotes=REMOTES,
license_info=LICENSE_INFO,
custom_index_path="acousticbrainz_genre_index.json",
)

@core.copy_docs(load_extractor)
def load_extractor(self, *args, **kwargs):
return load_extractor(*args, **kwargs)

def download(self, partial_download=None, force_overwrite=False, cleanup=False):
"""Download the dataset
Args:
partial_download (list or None):
A list of keys of remotes to partially download.
If None, all data is downloaded
force_overwrite (bool):
If True, existing files are overwritten by the downloaded files.
By default False.
cleanup (bool):
Whether to delete any zip/tar files after extracting.
Raises:
ValueError: if invalid keys are passed to partial_download
IOError: if a downloaded file's checksum is different from expected
"""
if not os.path.exists(self.data_home):
os.makedirs(self.data_home)
# Create these directories if doesn't exist
train = "acousticbrainz-mediaeval-train"
train_dir = os.path.join(self.data_home, train)
if not os.path.isdir(train_dir):
os.mkdir(train_dir)
validate = "acousticbrainz-mediaeval-validation"
validate_dir = os.path.join(self.data_home, validate)
if not os.path.isdir(validate_dir):
os.mkdir(validate_dir)

# start to download
for key, remote in self.remotes.items():
# check overwrite
file_downloaded = False
if not force_overwrite:
fold, first_dir = key.split("-")
first_dir_path = os.path.join(
train_dir if fold == "train" else validate_dir, first_dir
)
if os.path.isdir(first_dir_path):
file_downloaded = True
logging.info(
"File "
+ remote.filename
+ " downloaded. Skip download (force_overwrite=False)."
)
if not file_downloaded:
# if this typical error happend it repeat download
download_utils.downloader(
self.data_home,
remotes={key: remote},
partial_download=None,
info_message=None,
force_overwrite=True,
cleanup=cleanup,
)
# move from a temporary directory to final one
source_dir = os.path.join(
self.data_home, "temp", train if "train" in key else validate
)
target_dir = train_dir if "train" in key else validate_dir
dir_names = os.listdir(source_dir)
for dir_name in dir_names:
shutil.move(
os.path.join(source_dir, dir_name),
os.path.join(target_dir, dir_name),
)

def filter_index(self, search_key):
"""Load from AcousticBrainz genre dataset the indexes that match with search_key.
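Because the custom download() override is gone, the generic downloader now handles what the deleted os.mkdir/shutil.move logic above did by hand, driven by the destination_dir and unpack_directories fields in REMOTES. A sketch of a partial download under the simplified loader; the data_home path is illustrative, and the keys come from the REMOTES dict above.

from mirdata.datasets import acousticbrainz_genre

dataset = acousticbrainz_genre.Dataset(data_home="/tmp/mir_datasets/acousticbrainz_genre")

# Fetch only the remote index and one validation shard; the downloader unpacks the
# shard into acousticbrainz-mediaeval-validation/ under data_home on its own.
dataset.download(
    partial_download=["index", "validation-01"],
    force_overwrite=False,
    cleanup=True,
)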