Skip to content

Commit

Permalink
Better docs for datasets (#377)
Browse files Browse the repository at this point in the history
* test for docs

* working example for beatles

* update loaders part 1

* convert more loaders

* add inheritance to loader docs

* updated loaders, docs and tests

* remove utils

* update to new top-level api

* load --> initialize

Co-authored-by: Rachel Bittner <[email protected]>
  • Loading branch information
rabitt and Rachel Bittner authored Dec 21, 2020
1 parent cd699ff commit dff8ab9
Show file tree
Hide file tree
Showing 51 changed files with 6,244 additions and 2,283 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pip install mirdata
import mirdata
import random

orchset = mirdata.Dataset('orchset')
orchset = mirdata.initialize('orchset')
orchset.download() # download the dataset
orchset.validate() # validate that all the expected files are there

Expand Down
100 changes: 62 additions & 38 deletions docs/source/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ Here is an example of an index to use as a guideline:
import glob
import json
import os
from mirdata.utils import md5
from mirdata.validate import md5
DATASET_INDEX_PATH = "../mirdata/datasets/indexes/dataset_index.json"
Expand Down Expand Up @@ -169,7 +169,6 @@ Module example
from mirdata import download_utils
from mirdata import jams_utils
from mirdata import core, annotations
from mirdata import utils
# -- Add any relevant citations here
Expand Down Expand Up @@ -219,8 +218,8 @@ Module example
return metadata
DATA = utils.LargeData('example_index.json', _load_metadata)
# DATA = utils.LargeData('example_index.json') ## use this if your dataset has no metadata
DATA = core.LargeData('example_index.json', _load_metadata)
# DATA = core.LargeData('example_index.json') ## use this if your dataset has no metadata
class Track(core.Track):
Expand Down Expand Up @@ -264,7 +263,7 @@ Module example
# -- and saved when someone accesses it. Useful when loading slightly
# -- bigger files or for bigger datasets. By default, we make any time
# -- series data loaded from a file a cached property
@utils.cached_property
@core.cached_property
def annotation(self):
"""output type: description of output"""
return load_annotation(self.annotation_path)
Expand Down Expand Up @@ -322,7 +321,7 @@ Module example
self.annotation_path = ...
# -- multitracks can optionally have mix-level cached properties and properties
@utils.cached_property
@core.cached_property
def annotation(self):
"""output type: description of output"""
return load_annotation(self.annotation_path)
Expand Down Expand Up @@ -362,35 +361,6 @@ Module example
raise IOError("audio_path {} does not exist".format(audio_path))
return librosa.load(audio_path, sr=None, mono=True)
# -- this function is not necessary unless you need very custom download logic
# -- If you need it, it must have this signature.
def _download(
save_dir, remotes, partial_download, info_message, force_overwrite, cleanup
):
"""Download the dataset.
Args:
save_dir (str):
The directory to download the data
remotes (dict or None):
A dictionary of RemoteFileMetadata tuples of data in zip format.
If None, there is no data to download
partial_download (list or None):
A list of keys to partially download the remote objects of the download dict.
If None, all data is downloaded
info_message (str or None):
A string of info to print when this function is called.
If None, no string is printed.
force_overwrite (bool):
If True, existing files are overwritten by the downloaded files.
cleanup (bool):
Whether to delete the zip/tar file after extracting.
"""
# see download_utils.downloader for basic usage - if you only need to call downloader
# once, you do not need this function at all.
# only write a custom function if you need it!
# -- Write any necessary loader functions for loading the dataset's data
def load_annotation(annotation_path):
Expand All @@ -416,6 +386,59 @@ Module example
)
return annotation_data
# -- use this decorator so the docs are complete
@core.docstring_inherit(core.Dataset)
class Dataset(core.Dataset):
"""The Example dataset
"""
def __init__(self, data_home=None):
super().__init__(
data_home,
index=DATA.index,
name="Example",
track_object=Track,
bibtex=BIBTEX,
remotes=REMOTES,
download_info=DOWNLOAD_INFO,
)
# -- Copy any loaders you wrote that should be part of the Dataset object
# -- use this core.copy_docs decorator to copy the docs from the original
# -- load_ function
@core.copy_docs(load_audio)
def load_audio(self, *args, **kwargs):
return load_audio(*args, **kwargs)
@core.copy_docs(load_annotation)
def load_annotation(self, *args, **kwargs):
return load_annotation(*args, **kwargs)
# -- if your dataset needs to overwrite the default download logic, do it here.
# -- this function is usually not necessary unless you need very custom download logic
def download(
self, partial_download=None, force_overwrite=False, cleanup=True
):
"""Download the dataset
Args:
partial_download (list or None):
A list of keys of remotes to partially download.
If None, all data is downloaded
force_overwrite (bool):
If True, existing files are overwritten by the downloaded files.
By default False.
cleanup (bool):
Whether to delete any zip/tar files after extracting.
Raises:
ValueError: if invalid keys are passed to partial_download
IOError: if a downloaded file's checksum is different from expected
"""
# see download_utils.downloader for basic usage - if you only need to call downloader
# once, you do not need this function at all.
# only write a custom function if you need it!
.. _add_tests:
Expand Down Expand Up @@ -449,7 +472,6 @@ Test file example
import numpy as np
from mirdata.datasets import dataset
from mirdata import utils
from tests.test_utils import run_track_tests
Expand Down Expand Up @@ -563,8 +585,10 @@ Submit your loader

Before you submit your loader make sure to:

1. Add your module to ``docs/source/mirdata.rst`` (you can check that this was done correctly by clicking on the readthedocs check when you open a PR)
2. Add the module name to ``DATASETS`` in ``mirdata/__init__.py``
1. Add your module to ``docs/source/mirdata.rst``
2. Add your module to ``docs/source/quick_reference.rst``

(you can check that this was done correctly by clicking on the readthedocs check when you open a PR)

Pull Request template
^^^^^^^^^^^^^^^^^^^^^
Expand Down
12 changes: 6 additions & 6 deletions docs/source/example.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Fortunately, we can download Orchset dataset directly.
:linenos:
import mirdata
orchset = mirdata.Dataset("orchset")
orchset = mirdata.initialize("orchset")
# Download the Orchset Dataset
orchset.download()
Expand Down Expand Up @@ -71,7 +71,7 @@ metadata, we could do the following:
return time_stamps, melody_f0
# Evaluate on the full dataset
orchset = mirdata.Dataset("orchset")
orchset = mirdata.initialize("orchset")
orchset_scores = {}
orchset_data = orchset.load_tracks()
for track_id, track_data in orchset_data.items():
Expand Down Expand Up @@ -138,7 +138,7 @@ For example, to load the melody annotations from Orchset into memory, we can sim
import mirdata
# get the orchset dataset
orchset = mirdata.Dataset("orchset")
orchset = mirdata.initialize("orchset")
# Load a specific track
track = orchset.track('Beethoven-S3-I-ex1')
Expand All @@ -156,10 +156,10 @@ However, if your data lives somewhere else, accessing the annotation will return
import mirdata
# get the orchset dataset
orchset = mirdata.Dataset("orchset")
orchset = mirdata.initialize("orchset", data_home='gs://my_custom/remote_path')
# Load a single track, specifying the remote location
track = orchset.track('Beethoven-S3-I-ex1', data_home='gs://my_custom/remote_path')
track = orchset.track('Beethoven-S3-I-ex1')
melody_path = track.melody_path
print(melody_path)
Expand Down Expand Up @@ -199,7 +199,7 @@ The following is a simple example of a generator that can be used to create a te
def orchset_generator():
# using the default data_home
orchset = mirdata.Dataset("orchset")
orchset = mirdata.initialize("orchset")
track_ids = orchset.track_ids()
for track_id in track_ids:
track = orchset.track(track_id)
Expand Down
Loading

0 comments on commit dff8ab9

Please sign in to comment.