modify audio_special_token

modelscope · Nov 23, 2023 · 539f099
1 parent fc98733
commit 539f099
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 20 deletions.
diff --git a/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py b/tools/multimodal/data_juicer_format_to_target_format/dj_to_wavcaps.py
@@ -3,8 +3,8 @@
 #
 # Data-Juicer format:
 # {'audios': ['./path/to/audio/2219.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: An airplane is landing. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'An airplane is landing. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Airplane Landing Airport',
@@ -19,8 +19,8 @@
 #       'category': '',
 #       'tags': '' }}
 # {'audios': ['./path/to/audio/2218.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: Someone is ringing a bell. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'Someone is ringing a bell. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Service Bell Help',

diff --git a/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py b/tools/multimodal/source_format_to_data_juicer_format/wavcaps_to_dj.py
@@ -29,8 +29,8 @@
 #
 # Corresponding Data-Juicer format:
 # {'audios': ['./path/to/audio/2219.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: An airplane is landing. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'An airplane is landing. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Airplane Landing Airport',
@@ -45,8 +45,8 @@
 #       'category': '',
 #       'tags': '' }}
 # {'audios': ['./path/to/audio/2218.flac'],
-#  'text': '<audio>\n'
-#          '[[caption]]: Someone is ringing a bell. <|__dj__eoc|>',
+#  'text': '<__dj__audio>\n'
+#          'Someone is ringing a bell. <|__dj__eoc|>',
 #  '__dj__meta__': {
 #       'num_captions_per_audio': 1,
 #       'title': 'Service Bell Help',
@@ -110,9 +110,9 @@ def main(
     target_ds_path: str,
     target_field: Union[str, List[str]] = 'caption',
     eoc_special_token: str = SpecialTokens.eoc,
-    audio_special_token: str = '<audio>',
+    audio_special_token: str = SpecialTokens.audio,
     add_eoc_at_last: bool = True,
-    add_target_field_token: bool = True,
+    add_target_field_token: bool = False,
     sent_seperator: str = '\n',
 ):
     """
@@ -128,10 +128,7 @@ def main(
         to split conversation chunks explicitly. Default: <|__dj__eoc|> (from
         Data-Juicer).
     :param audio_special_token: the special token for audios. It's used to
-        locate the audios in the text. In typical WavCaps-like datasets,
-        this token always be "<audio>". You can change it to align with your
-        own WavCaps-like datasets but should be careful of possible
-        compatibility problems that come from this change. Default: <audio>.
+        locate the audios in the text. Default: SpecialTokens.audio.
     :param add_eoc_at_last: whether to add an extra eoc_special_token at the
         end of text. Default: True.
     :param add_target_field_token: whether to add an extra target_field_token
@@ -171,12 +168,6 @@ def main(
                     f'for the target dataset.')
         os.makedirs(os.path.dirname(target_ds_path))
 
-    # check if the default audio special token is changed
-    if audio_special_token != '<audio>':
-        logger.warning('The audio_special_token used in the original WavCaps '
-                       'dataset is "<audio>". It\'s better to align the this '
-                       'token. There might be some compatibility problem if '
-                       'you change it.')
     # check whether to add the eoc special token at last
     if not add_eoc_at_last:
         logger.warning('You choose not to add special eoc token at the last, '