Adding loader for CIPI dataset (#599)

* scripts/make, index and track and dataset class. TODO tests * fix docstring * modify the docs * download disclaimer * black * first test * fix metadata * remove embeddings * add more tests * black * modify tests * modify fix.py for adding music21 (optional) * fix bugt with load_scores * fix bugs * from smart_open import open * from smart_open import open * same error than francesco * test fulldataset * test fulldataset * test fulldataset * genis suggestion * replace os.path.exists by try catch * fix plobles with try catch * add cipi to CUSTOM_TEST_TRACKS * modify all the tests * black * smart open test * black * check embeddings * check embeddings * check embeddings * imrpoving codecov * rollback haydn_op20.py * rollback haydn_op20.py * comentario de los embeddings * cante100 -> cipi * baclk * expressiveness * fix make * Done! * Update cipi.py * difficulty annotation * fix docs table * add dataset details and fix error message * now doing the fixes right :) * address problem in table.rst --------- Co-authored-by: PRamoneda <[email protected]> Co-authored-by: Genís Plaja-Roglans <[email protected]> Co-authored-by: Guillem Cortès <[email protected]> Co-authored-by: genisplaja <[email protected]>
mir-dataset-loaders · Nov 2, 2023 · c1dccb0 · c1dccb0
1 parent afbc0c3
commit c1dccb0
Show file tree

Hide file tree

Showing 17 changed files with 126,342 additions and 2 deletions.
diff --git a/docs/source/mirdata.rst b/docs/source/mirdata.rst
@@ -60,6 +60,14 @@ cante100
    :inherited-members:
 
 
+cipi
+^^^^
+
+.. automodule:: mirdata.datasets.cipi
+   :members:
+   :inherited-members:
+
+
 compmusic_carnatic_rhythm
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/docs/source/table.rst b/docs/source/table.rst
@@ -61,6 +61,15 @@
      - 100
      - :cante:`\ `
 
+   * - CIPI
+     - - musicXML: 🔑
+       - embeddings: 🔑
+       - annotations: 🔑
+     - - difficulty levels
+     - 652
+     - .. image:: https://licensebuttons.net/l/by-nc-sa/4.0/80x15.png
+          :target: https://creativecommons.org/licenses/by-nc-sa/4.0
+
    * - .. line-block::
 
         (CompMusic)

diff --git a/mirdata/core.py b/mirdata/core.py
@@ -529,13 +529,12 @@ def __init__(self, track_id, data_home, dataset_name, index, metadata):
             raise ValueError(
                 "{} is not a valid track_id in {}".format(track_id, dataset_name)
             )
-
+        self._metadata = metadata
         self.track_id = track_id
         self._dataset_name = dataset_name
 
         self._data_home = data_home
         self._track_paths = index["tracks"][track_id]
-        self._metadata = metadata
 
     @cached_property
     def _track_metadata(self):

diff --git a/mirdata/datasets/cipi.py b/mirdata/datasets/cipi.py
@@ -0,0 +1,231 @@
+"""Can I play it? (CIPI) Dataset Loader
+
+.. admonition:: Dataset Info
+    :class: dropdown
+
+    The "Can I Play It?" (CIPI) dataset is a specialized collection of 652 classical piano scores, provided in a
+    machine-readable MusicXML format and accompanied by integer-based difficulty levels ranging from 1 to 9, as
+    verified by expert pianists. It also provides embeddings for the fingering and expressiveness of each piece. A
+    piece may have multiple scores corresponding to it. This dataset focuses exclusively on classical piano music,
+    offering a rich resource for music researchers, educators, and students. It was developed at the Music Technology
+    Group in Barcelona by P. Ramoneda et al.
+
+    The CIPI dataset facilitates various applications such as the study of musical complexity, the selection of
+    appropriately leveled pieces for students, and general research in music education. The dataset, alongside
+    embeddings of multiple dimensions of difficulty, has been made publicly available to encourage ongoing innovation
+    and collaboration within the music education and research communities.
+
+    The dataset has been published alongside a paper in Expert Systems with Applications Journal. 
+
+    The dataset is shared under a Creative Commons Attribution Non Commercial Share Alike 4.0 International License, but
+    it needs to be requested. Please request the dataset here: https://zenodo.org/records/8037327. The dataset can only
+    be used for open research purposes.
+"""
+import json
+import logging
+import os
+from typing import Optional, List
+
+from smart_open import open
+
+
+from mirdata import core, jams_utils
+
+try:
+    import music21
+except ImportError as exc:
+    # music21 is required by this loader; log the install hint before
+    # failing the import so the user knows how to fix it.
+    logging.error(
+        "In order to use cipi you must have music21 installed. "
+        "Please reinstall mirdata using `pip install 'mirdata[cipi]'`"
+    )
+    # Still an ImportError for callers, but chained to the original failure.
+    raise ImportError from exc
+
+# Citation for the paper the dataset was published with.
+BIBTEX = """
+@article{Ramoneda2024,
+  author    = {Pedro Ramoneda and Dasaem Jeong and Vsevolod Eremenko and Nazif Can Tamer and Marius Miron and Xavier Serra},
+  title     = {Combining Piano Performance Dimensions for Score Difficulty Classification},
+  journal   = {Expert Systems with Applications},
+  volume    = {238},
+  pages     = {121776},
+  year      = {2024},
+  doi       = {10.1016/j.eswa.2023.121776},
+  url       = {https://doi.org/10.1016/j.eswa.2023.121776}
+}"""
+
+# Both the "default" and "test" aliases resolve to version 1.0 of the index.
+INDEXES = {
+    "default": "1.0",
+    "test": "1.0",
+    "1.0": core.Index(filename="cipi_index_1.0.json"),
+}
+
+LICENSE_INFO = (
+    "Creative Commons Attribution Non Commercial Share Alike 4.0 International."
+)
+
+# Manual download instructions; the single "{}" placeholder is left for the
+# caller to fill in with the destination folder.
+DOWNLOAD_INFO = """
+    Unfortunately the files of the CIPI dataset are only available
+    for download upon request here: https://zenodo.org/records/8037327.
+    After requesting the dataset, you will receive a link to download the
+    dataset. You must download scores.zip, embeddings.zip and index.json
+    and copy the files into the folder:
+        > cipi/
+            > index.json
+            > embeddings.zip
+            > scores.zip
+    unzip embeddings.zip and scores.zip and copy the CIPI folder to {}
+"""
+
+
+class Track(core.Track):
+    """Can I play it? (CIPI) track class
+
+    Metadata-backed attributes (title, book, URI, composer,
+    difficulty_annotation) are None when the corresponding key is missing
+    from the track metadata; musicxml_paths is an empty list in that case.
+
+    Args:
+        track_id (str): track id of the track
+
+    Attributes:
+        title (str): title of the track
+        book (str): book of the track
+        URI (str): URI of the track
+        composer (str): name of the author of the track
+        track_id (str): track id
+        musicxml_paths (list): paths to the MusicXML scores. If the music piece contains multiple movements the list will contain multiple paths.
+        difficulty_annotation (int): annotated difficulty
+        fingering_path (tuple): Paths of fingering features from technique dimension computed with ArGNN fingering model. Tuple of two paths: the right hand first, then the left hand. Use torch.load(...) for loading the embeddings.
+        expressiveness_path (str): Path of expressiveness features from sound dimension computed with virtuosoNet model. Use torch.load(...) for loading the embeddings.
+        notes_path (str): Path of note features from notation dimension. Use torch.load(...) for loading the embeddings.
+
+    Cached Properties:
+        scores (list[music21.stream.Score]): music21 scores. If the work is split in several movements the list will contain multiple scores.
+    """
+
+    def __init__(self, track_id, data_home, dataset_name, index, metadata):
+        # The base class already stores data_home as self._data_home.
+        super().__init__(track_id, data_home, dataset_name, index, metadata)
+        self.fingering_path = (
+            self.get_path("rh_fingering"),
+            self.get_path("lh_fingering"),
+        )
+        self.expressiveness_path = self.get_path("expressiveness")
+        self.notes_path = self.get_path("notes")
+
+    @property
+    def title(self) -> Optional[str]:
+        # The title is stored under the "work_name" metadata key.
+        return self._track_metadata.get("work_name")
+
+    @property
+    def book(self) -> Optional[str]:
+        return self._track_metadata.get("book")
+
+    @property
+    def URI(self) -> Optional[str]:
+        return self._track_metadata.get("URI")
+
+    @property
+    def composer(self) -> Optional[str]:
+        return self._track_metadata.get("composer")
+
+    @property
+    def musicxml_paths(self) -> List[str]:
+        # "path" maps to a dict whose values are the score paths.
+        return list(self._track_metadata.get("path", {}).values())
+
+    @property
+    def difficulty_annotation(self) -> Optional[int]:
+        # The difficulty is stored under the "henle" metadata key.
+        return self._track_metadata.get("henle")
+
+    @core.cached_property
+    def scores(self) -> list:
+        try:
+            return [load_score(path, self._data_home) for path in self.musicxml_paths]
+        except FileNotFoundError as exc:
+            # Re-raise with the track id so the user knows which piece failed.
+            raise FileNotFoundError(
+                "Some MusicXML files for track id {} not found. "
+                "Did you request, download, and store the files as indicated?".format(
+                    self.track_id
+                )
+            ) from exc
+
+    def to_jams(self):
+        """Get the track's data in jams format
+
+        Returns:
+            jams.JAMS: the track's data in jams format
+
+        """
+        return jams_utils.jams_converter(
+            metadata={
+                "title": self.title,
+                "artist": self.composer,
+                "duration": 0.0,
+                "book": self.book,
+                "URI": self.URI,
+                "composer": self.composer,
+                "track_id": self.track_id,
+                "musicxml_paths": self.musicxml_paths,
+                "difficulty_annotation": self.difficulty_annotation,
+            }
+        )
+
+
+def load_score(
+    fhandle: str, data_home: str = "tests/resources/mir_datasets/cipi"
+) -> music21.stream.Score:
+    """Load cipi score in music21 stream
+
+    Args:
+        fhandle (str): path to MusicXML score; it is joined onto data_home
+        data_home (str): path to cipi dataset
+
+    Returns:
+        music21.stream.Score: score in music21 format
+
+    Raises:
+        FileNotFoundError: if music21 raises any error while parsing the
+            joined path; the original error is attached as the cause
+    """
+    try:
+        score = music21.converter.parse(os.path.join(data_home, fhandle))
+    except Exception as exc:
+        # Catch Exception rather than using a bare except so that
+        # KeyboardInterrupt / SystemExit are not turned into FileNotFoundError.
+        raise FileNotFoundError("File {} not found.".format(fhandle)) from exc
+    return score
+
+
+@core.docstring_inherit(core.Dataset)
+class Dataset(core.Dataset):
+    """
+    The Can I play it? (CIPI) dataset
+    """
+
+    def __init__(self, data_home=None, version="default"):
+        # Hand the module-level constants of this loader to the generic Dataset.
+        super().__init__(
+            data_home,
+            version,
+            name="cipi",
+            track_class=Track,
+            bibtex=BIBTEX,
+            indexes=INDEXES,
+            license_info=LICENSE_INFO,
+            download_info=DOWNLOAD_INFO,
+        )
+
+    @core.cached_property
+    def _metadata(self):
+        # The metadata is read from index.json in the dataset folder.
+        index_file = os.path.join(self.data_home, "index.json")
+        try:
+            with open(index_file, "r") as stream:
+                loaded = json.load(stream)
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                f"Metadata {index_file} not found. Did you download the files?"
+            )
+        else:
+            return dict(loaded)