Commit
Simplify index loading (#443)
* simplifies index loading; moves remote index support to downloaders; adds subdirectory option to downloader

* fix builtins

* bump coverage

* patch mypy errors; opened #449

Co-authored-by: Rachel Bittner <[email protected]>
rabitt and Rachel Bittner authored Feb 1, 2021
1 parent b0a6fd9 commit 54cbacb
Showing 43 changed files with 189 additions and 4,573 deletions.
25 changes: 9 additions & 16 deletions docs/source/contributing.rst
@@ -310,27 +310,20 @@ Working with remote indexes
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 For the end-user there is no difference between the remote and local indexes. However, indexes can get large when there are a lot of tracks
-in the dataset. In these cases, storing and accessing an index remotely can be convenient.
-
-However, to contribute to the library using remote indexes you have to add in ``utils.LargeData(...)`` the remote_index argument with a
-``download_utils.RemoteFileMetadata`` dictionary with the remote index information.
+in the dataset. In these cases, storing and accessing an index remotely can be convenient. Large indexes can be added to REMOTES,
+and will be downloaded with the rest of the dataset. For example:
 
 .. code-block:: python
 
-    REMOTE_INDEX = {
-        "REMOTE_INDEX": download_utils.RemoteFileMetadata(
-            filename="acousticbrainz_genre_index.json.zip",
-            url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
-            checksum="810f1c003f53cbe58002ba96e6d4d138",
-            destination_dir="",
-        )
-    }
-
-    DATA = utils.LargeData("acousticbrainz_genre_index.json", remote_index=REMOTE_INDEX)
+    "index": download_utils.RemoteFileMetadata(
+        filename="acousticbrainz_genre_index.json.zip",
+        url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
+        checksum="810f1c003f53cbe58002ba96e6d4d138",
+    )
+
+Unlike local indexes, the remote indexes will live in the ``data_home`` directory. When creating the ``Dataset``
+object, specify the ``custom_index_path`` to where the index will be downloaded (as a relative path to ``data_home``).
 
 
 .. _reducing_test_space:
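To make the updated instructions concrete, a loader that needs a remote index now lists it under the "index" key of REMOTES and passes custom_index_path to the Dataset constructor. The sketch below is hypothetical: the dataset name, Zenodo record, and checksum are placeholders, and a real loader would pass its own Track class, BIBTEX, and LICENSE_INFO rather than the stand-ins used here.

from mirdata import core, download_utils

# Hypothetical loader module; filename, url, and checksum are placeholders.
REMOTES = {
    "index": download_utils.RemoteFileMetadata(
        filename="example_index.json.zip",
        url="https://zenodo.org/record/0000000/files/example_index.json.zip?download=1",
        checksum="00000000000000000000000000000000",
    ),
    # ... remotes for the audio and annotation files go here ...
}


class Dataset(core.Dataset):
    """Hypothetical dataset whose index is downloaded rather than bundled."""

    def __init__(self, data_home=None):
        super().__init__(
            data_home,
            name="example",
            track_class=core.Track,  # a real loader passes its own Track subclass
            bibtex="@misc{example2021}",  # placeholder citation
            remotes=REMOTES,
            license_info="See the dataset webpage.",  # placeholder
            # the downloaded index lives under data_home, not in mirdata/datasets/indexes
            custom_index_path="example_index.json",
        )

The acousticbrainz_genre changes further down in this commit follow exactly this pattern.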
3 changes: 0 additions & 3 deletions docs/source/contributing_examples/example.py
@@ -63,8 +63,6 @@
The dataset's license information goes here.
"""

DATA = core.LargeData('example_index.json')


class Track(core.Track):
"""Example track class
@@ -234,7 +232,6 @@ class Dataset(core.Dataset):
def __init__(self, data_home=None):
super().__init__(
data_home,
index=DATA.index,
name=NAME,
track_class=Track,
bibtex=BIBTEX,
2 changes: 0 additions & 2 deletions docs/source/tutorial.rst
@@ -106,13 +106,11 @@ list is passed to the ``download()`` function through the ``partial_download`` v
filename="cante100Meta.xml",
url="https://zenodo.org/record/1322542/files/cante100Meta.xml?download=1",
checksum="6cce186ce77a06541cdb9f0a671afb46", # the md5 checksum
destination_dir=None, # relative path for where to unzip the data, or None
),
"README": download_utils.RemoteFileMetadata(
filename="cante100_README.txt",
url="https://zenodo.org/record/1322542/files/cante100_README.txt?download=1",
checksum="184209b7e7d816fa603f0c7f481c0aae", # the md5 checksum
destination_dir=None, # relative path for where to unzip the data, or None
),
}
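The two deleted destination_dir lines reflect a small API change: destination_dir can now be omitted, and when given it is a relative subdirectory to unpack into (the old inline comment already described it as "relative path for where to unzip the data"). A rough illustration with placeholder files; the second form mirrors the acousticbrainz remotes later in this commit.

from mirdata import download_utils

# destination_dir can simply be omitted when files should land directly in the save directory ...
meta = download_utils.RemoteFileMetadata(
    filename="example_metadata.xml",
    url="https://example.org/example_metadata.xml",
    checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder md5
)

# ... or set to a relative subdirectory, as the acousticbrainz remotes below do.
features = download_utils.RemoteFileMetadata(
    filename="example_features.tar.bz2",
    url="https://example.org/example_features.tar.bz2",
    checksum="d41d8cd98f00b204e9800998ecf8427e",  # placeholder md5
    destination_dir="example-features",
)

download_utils.downloader("/tmp/mir_datasets/example", remotes={"metadata": meta, "features": features})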
61 changes: 22 additions & 39 deletions mirdata/core.py
@@ -97,30 +97,39 @@ class Dataset(object):
def __init__(
self,
data_home=None,
index=None,
name=None,
track_class=None,
bibtex=None,
remotes=None,
download_info=None,
license_info=None,
custom_index_path=None,
):
"""Dataset init method
Args:
data_home (str or None): path where mirdata will look for the dataset
index (dict or None): the dataset's file index
name (str or None): the identifier of the dataset
track_class (mirdata.core.Track or None): a Track class
bibtex (str or None): dataset citation/s in bibtex format
remotes (dict or None): data to be downloaded
download_info (str or None): download instructions or caveats
license_info (str or None): license of the dataset
custom_index_path (str or None): overwrites the default index path for remote indexes
"""
self.name = name
self.data_home = self.default_path if data_home is None else data_home
self._index = index
if custom_index_path:
self.index_path = os.path.join(self.data_home, custom_index_path)
self.remote_index = True
else:
self.index_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)),
"datasets/indexes",
"{}_index.json".format(self.name),
)
self.remote_index = False
self._track_class = track_class
self.bibtex = bibtex
self.remotes = remotes
@@ -146,6 +155,16 @@ def __repr__(self):

return repr_string

@cached_property
def _index(self):
if self.remote_index and not os.path.exists(self.index_path):
raise FileNotFoundError(
"This dataset's index is not available locally. You may need to first run .download()"
)
with open(self.index_path) as fhandle:
index = json.load(fhandle)
return index

@cached_property
def _metadata(self):
return None
@@ -298,7 +317,6 @@ def __init__(
data_home (str): path where mirdata will look for the dataset
dataset_name (str): the identifier of the dataset
index (dict): the dataset's file index
Typically accessed via the .index attribute of a LargeData object
metadata (dict or None): a dictionary of metadata or None
"""
@@ -474,12 +492,6 @@ def get_mix(self):
return self.get_target(list(self.tracks.keys()))


def load_json_index(filename):
working_dir = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(working_dir, "datasets/indexes", filename)) as f:
return json.load(f)


def none_path_join(partial_path_list):
"""Join a list of partial paths. If any part of the path is None,
returns None.
@@ -495,32 +507,3 @@ def none_path_join(partial_path_list):
return None
else:
return os.path.join(*partial_path_list)


class LargeData(object):
def __init__(self, index_file, remote_index=None):
"""Object which loads and caches large data the first time it's accessed.
Args:
index_file: str
File name of checksum index file to be passed to `load_json_index`
Cached Properties:
index (dict): dataset index
"""
self._metadata = None
self.index_file = index_file
self.remote_index = remote_index

@cached_property
def index(self):
if self.remote_index is not None:
working_dir = os.path.dirname(os.path.realpath(__file__))
path_index_file = os.path.join(
working_dir, "datasets/indexes", self.index_file
)
if not os.path.isfile(path_index_file):
path_indexes = os.path.join(working_dir, "datasets/indexes")
download_utils.downloader(path_indexes, remotes=self.remote_index)
return load_json_index(self.index_file)
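With load_json_index and LargeData removed, the new cached _index property above does the work: bundled indexes are resolved inside mirdata/datasets/indexes, while remote indexes resolve under data_home and raise FileNotFoundError until they have been downloaded. A minimal usage sketch, assuming the acousticbrainz_genre module shown next, a writable data_home, and the standard top-level "tracks" key used by mirdata index files:

from mirdata.datasets import acousticbrainz_genre

dataset = acousticbrainz_genre.Dataset(data_home="/tmp/mir_datasets/acousticbrainz_genre")

# The index is itself a remote for this dataset, so it only exists locally after download.
try:
    dataset._index  # cached_property; nothing is cached if this raises
except FileNotFoundError:
    dataset.download(partial_download=["index"])

# The first successful access reads the JSON file once and caches the result.
print(len(dataset._index["tracks"]))

In ordinary user code the index stays an implementation detail; it is only accessed directly here to show the lazy loading.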
130 changes: 27 additions & 103 deletions mirdata/datasets/acousticbrainz_genre.py
@@ -40,8 +40,6 @@
"""

import json
import os
import shutil

from mirdata import download_utils, core, io
from mirdata import jams_utils
@@ -59,77 +57,82 @@
}
"""
REMOTES = {
"index": download_utils.RemoteFileMetadata(
filename="acousticbrainz_genre_index.json.zip",
url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
checksum="810f1c003f53cbe58002ba96e6d4d138",
),
"validation-01": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-validation-01234567.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-validation-01234567.tar.bz2?download=1",
checksum="f21f9c5e398713139cca9790b656faf9",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-validation",
unpack_directories=["acousticbrainz-mediaeval-validation"],
),
"validation-89": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-validation-89abcdef.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-validation-89abcdef.tar.bz2?download=1",
checksum="34f47394ac6d8face4399f48e2b98ebe",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-validation",
unpack_directories=["acousticbrainz-mediaeval-validation"],
),
"train-01": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features--train-01.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features--train-01.tar.bz2?download=1",
checksum="db7157b5112022d609652dd21c632090",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-23": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-23.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-23.tar.bz2?download=1",
checksum="79581967a1be5c52e83be21261d1ef6c",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-45": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-45.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-45.tar.bz2?download=1",
checksum="0e48fa319fa48e5cf95eea8118d2e882",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-67": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-67.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-67.tar.bz2?download=1",
checksum="22ca7f1fea8a86459b7fda4530f00070",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-89": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-89.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-89.tar.bz2?download=1",
checksum="c6e4a2ef1b0e8ed535197b868f8c7302",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-ab": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-ab.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-ab.tar.bz2?download=1",
checksum="513d5f306dd4f3799c137423ee444051",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-cd": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-cd.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-cd.tar.bz2?download=1",
checksum="422d75d70d583decec0b2761865092a7",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
"train-ef": download_utils.RemoteFileMetadata(
filename="acousticbrainz-mediaeval-features-train-ef.tar.bz2",
url="https://zenodo.org/record/2553414/files/acousticbrainz-mediaeval-features-train-ef.tar.bz2?download=1",
checksum="021ab25a5fd1b020521824e7fce9c775",
destination_dir="temp",
destination_dir="acousticbrainz-mediaeval-train",
unpack_directories=["acousticbrainz-mediaeval-train"],
),
}
REMOTE_INDEX = {
"REMOTE_INDEX": download_utils.RemoteFileMetadata(
filename="acousticbrainz_genre_index.json.zip",
url="https://zenodo.org/record/4298580/files/acousticbrainz_genre_index.json.zip?download=1",
checksum="810f1c003f53cbe58002ba96e6d4d138",
destination_dir="",
)
}

DATA = core.LargeData("acousticbrainz_genre_index.json", remote_index=REMOTE_INDEX)

LICENSE_INFO = """
This dataset is composed of 4 subdatasets. Three of them are Creative Commons Attribution
@@ -370,100 +373,21 @@ class Dataset(core.Dataset):
The acousticbrainz genre dataset
"""

def __init__(
self,
data_home=None,
remote_index=None,
remote_index_name=None,
):
if remote_index and remote_index_name:
data = core.LargeData(remote_index_name, remote_index=remote_index)
index = data.index
else:
index = None

def __init__(self, data_home=None):
super().__init__(
data_home,
index=DATA.index if index is None else index,
name=NAME,
track_class=Track,
bibtex=BIBTEX,
remotes=REMOTES,
license_info=LICENSE_INFO,
custom_index_path="acousticbrainz_genre_index.json",
)

@core.copy_docs(load_extractor)
def load_extractor(self, *args, **kwargs):
return load_extractor(*args, **kwargs)

def download(self, partial_download=None, force_overwrite=False, cleanup=False):
"""Download the dataset
Args:
partial_download (list or None):
A list of keys of remotes to partially download.
If None, all data is downloaded
force_overwrite (bool):
If True, existing files are overwritten by the downloaded files.
By default False.
cleanup (bool):
Whether to delete any zip/tar files after extracting.
Raises:
ValueError: if invalid keys are passed to partial_download
IOError: if a downloaded file's checksum is different from expected
"""
if not os.path.exists(self.data_home):
os.makedirs(self.data_home)
# Create these directories if doesn't exist
train = "acousticbrainz-mediaeval-train"
train_dir = os.path.join(self.data_home, train)
if not os.path.isdir(train_dir):
os.mkdir(train_dir)
validate = "acousticbrainz-mediaeval-validation"
validate_dir = os.path.join(self.data_home, validate)
if not os.path.isdir(validate_dir):
os.mkdir(validate_dir)

# start to download
for key, remote in self.remotes.items():
# check overwrite
file_downloaded = False
if not force_overwrite:
fold, first_dir = key.split("-")
first_dir_path = os.path.join(
train_dir if fold == "train" else validate_dir, first_dir
)
if os.path.isdir(first_dir_path):
file_downloaded = True
logging.info(
"File "
+ remote.filename
+ " downloaded. Skip download (force_overwrite=False)."
)
if not file_downloaded:
# if this typical error happend it repeat download
download_utils.downloader(
self.data_home,
remotes={key: remote},
partial_download=None,
info_message=None,
force_overwrite=True,
cleanup=cleanup,
)
# move from a temporary directory to final one
source_dir = os.path.join(
self.data_home, "temp", train if "train" in key else validate
)
target_dir = train_dir if "train" in key else validate_dir
dir_names = os.listdir(source_dir)
for dir_name in dir_names:
shutil.move(
os.path.join(source_dir, dir_name),
os.path.join(target_dir, dir_name),
)

def filter_index(self, search_key):
"""Load from AcousticBrainz genre dataset the indexes that match with search_key.
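Because the custom download() override is gone, the generic downloader now handles what the deleted os.mkdir/shutil.move logic above did by hand, driven by the destination_dir and unpack_directories fields in REMOTES. A sketch of a partial download under the simplified loader; the data_home path is illustrative, and the keys come from the REMOTES dict above.

from mirdata.datasets import acousticbrainz_genre

dataset = acousticbrainz_genre.Dataset(data_home="/tmp/mir_datasets/acousticbrainz_genre")

# Fetch only the remote index and one validation shard; the downloader unpacks the
# shard into acousticbrainz-mediaeval-validation/ under data_home on its own.
dataset.download(
    partial_download=["index", "validation-01"],
    force_overwrite=False,
    cleanup=True,
)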