cherry pick main

Signed-off-by: Nikolay Karpov <[email protected]>
NVIDIA · Dec 18, 2023 · b98f858 · b98f858
1 parent 7017b69
commit b98f858
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 11 deletions.
diff --git a/scripts/speech_recognition/convert_hf_dataset_to_nemo.py b/scripts/speech_recognition/convert_hf_dataset_to_nemo.py
@@ -361,7 +361,7 @@ def main(cfg: HFDatasetConversionConfig):
             split=cfg.split,
             cache_dir=None,
             streaming=cfg.streaming,
-            use_auth_token=cfg.use_auth_token,
+            token=cfg.use_auth_token,
         )
 
     except Exception as e:

diff --git a/tutorials/asr/Multilang_ASR.ipynb b/tutorials/asr/Multilang_ASR.ipynb
@@ -240,7 +240,21 @@
     "id": "ATf7lapcbAxB"
    },
    "source": [
-    "Now, let's download the Mozilla CommonVoice Spanish dataset. We will ignore the larger train file and get just the test and dev parts for the purposes of this tutorial. For good results, you will need to get the train files and likely other datasets too, bringing the total to over 1k hours. "
+    "Now, let's download the Mozilla CommonVoice Spanish dataset. We will ignore the larger train file and get just the test part for the purposes of this tutorial. For good results, you will need to get the train files and likely other datasets too, bringing the total to over 1k hours. \n",
+    "\n",
+    "Website steps:\n",
+    "- Visit https://huggingface.co/settings/profile\n",
+    "- Select \"Access Tokens\" from the list of items.\n",
+    "- Create a new token - provide a name for the token; \"read\" access is sufficient.\n",
+    "  - PRESERVE THAT TOKEN API KEY. You can copy that key for the next step.\n",
+    "- Visit the [HuggingFace Dataset page for Mozilla Common Voice 3.0](https://huggingface.co/datasets/mozilla-foundation/common_voice_3_0)\n",
+    "  - There should be a section that asks you for your approval.\n",
+    "  - Make sure you are logged in and then read that agreement.\n",
+    "  - If and only if you agree to the text, then accept the terms.\n",
+    "\n",
+    "Code steps:\n",
+    "- Now below, run `login()` \n",
+    "- Paste your preserved HF TOKEN API KEY into the text box."
    ]
   },
   {
@@ -261,7 +275,8 @@
    },
    "outputs": [],
    "source": [
-    "!mkdir -p datasets/mcv3"
+    "from huggingface_hub import login\n",
+    "login()"
    ]
   },
   {
@@ -270,7 +285,7 @@
     "id": "YpZNMYfKde9n"
    },
    "source": [
-    "We will use the `get_commonvoice_data.py` script located in the nemo/scripts/dataset_processing dir if you cloned NeMo repo"
+    "We will use the `convert_hf_dataset_to_nemo.py` script located in the nemo/scripts/speech_recognition dir if you cloned the NeMo repo"
    ]
   },
   {
@@ -295,8 +310,8 @@
    },
    "outputs": [],
    "source": [
-    "if not os.path.exists(\"get_commonvoice_data.py\"):\n",
-    "    !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/dataset_processing/get_commonvoice_data.py"
+    "if not os.path.exists(\"convert_hf_dataset_to_nemo.py\"):\n",
+    "    !wget https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/speech_recognition/convert_hf_dataset_to_nemo.py"
    ]
   },
   {
@@ -321,7 +336,10 @@
    },
    "outputs": [],
    "source": [
-    "!python get_commonvoice_data.py --data_root \"datasets/mcv3\" --language es --cleanup --version cv-corpus-3"
+    "! python convert_hf_dataset_to_nemo.py \\\n",
+    "    path=\"mozilla-foundation/common_voice_3_0\" \\\n",
+    "    output_dir=\"datasets\" name=\"es\" split='test' \\\n",
+    "    use_auth_token=True"
    ]
   },
   {
@@ -351,9 +369,8 @@
    },
    "outputs": [],
    "source": [
-    "!head -1000 commonvoice_dev_manifest.json > commonvoice_dev_manifest_1000.json\n",
-    "!cat commonvoice_test_manifest.json >> commonvoice_train_manifest.json\n",
-    "!tail -1617 commonvoice_dev_manifest.json >> commonvoice_train_manifest.json"
+    "!head -1000 datasets/mozilla-foundation/common_voice_3_0/es/test/test_mozilla-foundation_common_voice_3_0_manifest.json > commonvoice_dev_manifest_1000.json\n",
+    "!tail -1729 datasets/mozilla-foundation/common_voice_3_0/es/test/test_mozilla-foundation_common_voice_3_0_manifest.json > commonvoice_train_manifest.json"
    ]
   },
   {
@@ -616,7 +633,7 @@
    },
    "outputs": [],
    "source": [
-    "es_files = ['datasets/mcv3/dev/wav/common_voice_es_18309780.wav', 'datasets/mcv3/dev/wav/common_voice_es_18311421.wav']"
+    "es_files = ['datasets/mozilla-foundation/common_voice_3_0/es/test/clips/common_voice_es_18481930.wav', 'datasets/mozilla-foundation/common_voice_3_0/es/test/clips/common_voice_es_18481932.wav']"
    ]
   },
   {