-
Notifications
You must be signed in to change notification settings - Fork 3
/
main.py
274 lines (264 loc) · 8.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A main file executing the end-to-end dubbing process of the Ariel package from the Google EMEA gTech Ads Data Science."""
import ast
from typing import Sequence
from absl import app
from absl import flags
from ariel.dubbing import Dubber
from ariel.dubbing import get_safety_settings
# Handle to absl's global flag registry.
FLAGS = flags.FLAGS

# --- Mandatory flags: each is declared with `required=True`. ---
_INPUT_FILE = flags.DEFINE_string(
    name="input_file",
    default=None,
    help="Path to the input video or audio file.",
    required=True,
)
_OUTPUT_DIRECTORY = flags.DEFINE_string(
    name="output_directory",
    default=None,
    help="Directory to save output files.",
    required=True,
)
_ADVERTISER_NAME = flags.DEFINE_string(
    name="advertiser_name",
    default=None,
    help="Name of the advertiser.",
    required=True,
)
_ORIGINAL_LANGUAGE = flags.DEFINE_string(
    name="original_language",
    default=None,
    help="Original language of the ad (ISO 3166-1 alpha-2 country code).",
    required=True,
)
_TARGET_LANGUAGE = flags.DEFINE_string(
    name="target_language",
    default=None,
    help="Target language for dubbing (ISO 3166-1 alpha-2 country code).",
    required=True,
)
_GCP_PROJECT_ID = flags.DEFINE_string(
    name="gcp_project_id",
    default=None,
    help=(
        "Google Cloud Platform (GCP) project ID for Gemini model access and "
        "Google Text-To-Speech API (if this method is picked)"
    ),
    required=True,
)
# Optional flag with a default region. It must NOT be declared `required=True`:
# absl raises ValueError at definition time for a required flag whose default
# is not None, which made this module fail as soon as it was loaded.
# NOTE(review): "europe_west_2" does not look like a GCP region ID (those are
# normally spelled like "europe-west2") -- confirm what Dubber expects.
_GCP_REGION = flags.DEFINE_string(
    "gcp_region",
    "europe_west_2",
    "GCP region to use when making API calls and where a temporary"
    " bucket will be created for Gemini to analyze the video / audio ad."
    " The bucket with all its contents will be removed immediately afterwards.",
)
# --- Optional flags: speakers and API credentials. ---
_NUMBER_OF_SPEAKERS = flags.DEFINE_integer(
    name="number_of_speakers",
    default=1,
    help="Number of speakers in the ad.",
)
_GEMINI_TOKEN = flags.DEFINE_string(
    name="gemini_token",
    default=None,
    help="Gemini API token.",
)
_HUGGING_FACE_TOKEN = flags.DEFINE_string(
    name="hugging_face_token",
    default=None,
    help="Hugging Face API token.",
)

# --- Optional flags: transcription / translation behaviour. ---
_NO_DUBBING_PHRASES = flags.DEFINE_list(
    name="no_dubbing_phrases",
    default=[],
    help=(
        "Phrases to exclude in the dubbing process, they orignal utterance "
        "will be used instead."
    ),
)
_DIARIZATION_INSTRUCTIONS = flags.DEFINE_string(
    name="diarization_instructions",
    default=None,
    help="Specific instructions for speaker diarization.",
)
_TRANSLATION_INSTRUCTIONS = flags.DEFINE_string(
    name="translation_instructions",
    default=None,
    help="Specific instructions for translation.",
)
_MERGE_UTTERANCES = flags.DEFINE_bool(
    name="merge_utterances",
    default=True,
    help="Merge utterances with timestamps closer than the threshold.",
)
_MINIMUM_MERGE_THRESHOLD = flags.DEFINE_float(
    name="minimum_merge_threshold",
    default=0.001,
    help="Threshold for merging utterances in seconds.",
)

# --- Optional flags: voice selection. ---
_PREFERRED_VOICES = flags.DEFINE_list(
    name="preferred_voices",
    default=[],
    help=(
        "Preferred voice names for text-to-speech (e.g., ['Wavenet'] for "
        "Google's TTS or ['Calllum'] for ElevenLabs)."
    ),
)
# Passed on the command line as text; `main` is responsible for parsing it.
_ASSIGNED_VOICES_OVERRIDE = flags.DEFINE_string(
    name="assigned_voices_override",
    default=None,
    help=(
        "A mapping between unique speaker IDs and the full name of their "
        "assigned voices. E.g. {'speaker_01': 'en-US-Casual-K'} or "
        "{'speaker_01': 'Charlie'}."
    ),
)
_KEEP_VOICE_ASSIGNMENTS = flags.DEFINE_bool(
    name="keep_voice_assignments",
    default=False,
    help=(
        "Whether the voices assigned on the first run should be used again "
        "when utilizing the same class instance. It helps prevents "
        "repetitive voice assignment and cloning."
    ),
)

# --- Optional flags: audio post-processing. ---
_ADJUST_SPEED = flags.DEFINE_bool(
    name="adjust_speed",
    default=False,
    help=(
        "Whether to adjust the duration of the dubbed audio files to match "
        "the duration of the source audio files."
    ),
)
_VOCALS_VOLUME_ADJUSTMENT = flags.DEFINE_float(
    name="vocals_volume_adjustment",
    default=5.0,
    help="By how much the vocals audio volume should be adjusted",
)
_BACKGROUND_VOLUME_ADJUSTMENT = flags.DEFINE_float(
    name="background_volume_adjustment",
    default=0.0,
    help="By how much the background audio volume should be adjusted.",
)
# A number of processing rounds is a whole-number count, so the flag is an
# integer. With DEFINE_float the default 2 was delivered to Dubber as 2.0.
_VOICE_SEPARATION_ROUNDS = flags.DEFINE_integer(
    "voice_separation_rounds",
    2,
    "The number of times the background audio file"
    " should be processed for voice detection and removal. It helps with"
    " the old voice artifacts being present in the dubbed ad.",
)
_CLEAN_UP = flags.DEFINE_bool(
    name="clean_up",
    default=False,
    help="Delete intermediate files after dubbing.",
)

# --- Optional flags: Gemini model selection and sampling parameters. ---
_GEMINI_MODEL_NAME = flags.DEFINE_string(
    name="gemini_model_name",
    default="gemini-1.5-flash",
    help="Name of the Gemini model to use.",
)
_TEMPERATURE = flags.DEFINE_float(
    name="temperature",
    default=1.0,
    help="Controls randomness in generation.",
)
_TOP_P = flags.DEFINE_float(
    name="top_p",
    default=0.95,
    help="Nucleus sampling threshold.",
)
_TOP_K = flags.DEFINE_integer(
    name="top_k",
    default=40,
    help="Top-k sampling parameter.",
)
# The flag is registered under the name "_safety_settings" (leading
# underscore), so it is only reachable as `--_safety_settings`. That name is
# kept for backward compatibility; the alias below additionally accepts the
# conventional `--safety_settings` spelling.
_SAFETY_SETTINGS = flags.DEFINE_string(
    "_safety_settings",
    "Medium",
    "The indicator of what kind of Gemini safety settings should"
    " be used in the dubbing process. Can be"
    " 'Low', 'Medium', 'High', or 'None'",
)
flags.DEFINE_alias("safety_settings", "_safety_settings")
_MAX_OUTPUT_TOKENS = flags.DEFINE_integer(
    name="max_output_tokens",
    default=8192,
    help="Maximum number of tokens in the generated response.",
)

# --- Optional flags: ElevenLabs text-to-speech. ---
_ELEVENLABS_TOKEN = flags.DEFINE_string(
    name="elevenlabs_token",
    default=None,
    help="ElevenLabs API token.",
)
_ELEVENLABS_CLONE_VOICES = flags.DEFINE_bool(
    name="elevenlabs_clone_voices",
    default=False,
    help="Whether to clone source voices. It requires using ElevenLabs API.",
)
_ELEVENLABS_MODEL = flags.DEFINE_string(
    name="elevenlabs_model",
    default="eleven_multilingual_v2",
    help="The ElevenLabs model to use in the Text-To-Speech process.",
)
_ELEVENLABS_REMOVE_CLONED_VOICES = flags.DEFINE_bool(
    name="elevenlabs_remove_cloned_voices",
    default=False,
    help=(
        "Whether to remove all the voices that were cloned with ELevenLabs "
        "during the dubbing process."
    ),
)

# --- Optional flag: verification of utterance metadata. ---
_WITH_VERIFICATION = flags.DEFINE_bool(
    name="with_verification",
    default=True,
    help=(
        "Verify, and optionally edit, the utterance metadata in the dubbing "
        "process."
    ),
)
def main(argv: Sequence[str]) -> None:
    """Build a `Dubber` from the parsed command-line flags and run `dub_ad()`.

    Args:
        argv: Positional command-line arguments; anything beyond the first
            element is rejected.

    Raises:
        app.UsageError: If extra positional arguments were supplied, or if
            --assigned_voices_override is not a valid Python dict literal.
    """
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    # The override arrives as text (e.g. "{'speaker_01': 'Charlie'}"). Parse
    # the flag's *value* -- not the flag holder object -- and only when the
    # flag was actually supplied, since its default is None.
    # NOTE(review): presumably Dubber treats None as "no override" -- confirm.
    override_text = _ASSIGNED_VOICES_OVERRIDE.value
    assigned_voices_override = None
    if override_text:
        try:
            assigned_voices_override = ast.literal_eval(override_text)
        except (ValueError, SyntaxError) as err:
            raise app.UsageError(
                "--assigned_voices_override must be a Python dict literal,"
                f" got: {override_text!r}"
            ) from err
        if not isinstance(assigned_voices_override, dict):
            raise app.UsageError(
                "--assigned_voices_override must be a Python dict literal,"
                f" got: {override_text!r}"
            )

    # ElevenLabs is enabled only when a non-blank token was supplied. The
    # flag defaults to None, so comparing against "" alone would wrongly
    # enable ElevenLabs whenever the flag is omitted.
    elevenlabs_token = _ELEVENLABS_TOKEN.value
    use_elevenlabs = bool(elevenlabs_token and elevenlabs_token.strip())

    dubber = Dubber(
        input_file=_INPUT_FILE.value,
        output_directory=_OUTPUT_DIRECTORY.value,
        advertiser_name=_ADVERTISER_NAME.value,
        original_language=_ORIGINAL_LANGUAGE.value,
        target_language=_TARGET_LANGUAGE.value,
        gcp_project_id=_GCP_PROJECT_ID.value,
        gcp_region=_GCP_REGION.value,
        number_of_speakers=_NUMBER_OF_SPEAKERS.value,
        gemini_token=_GEMINI_TOKEN.value,
        hugging_face_token=_HUGGING_FACE_TOKEN.value,
        no_dubbing_phrases=_NO_DUBBING_PHRASES.value,
        diarization_instructions=_DIARIZATION_INSTRUCTIONS.value,
        translation_instructions=_TRANSLATION_INSTRUCTIONS.value,
        merge_utterances=_MERGE_UTTERANCES.value,
        minimum_merge_threshold=_MINIMUM_MERGE_THRESHOLD.value,
        preferred_voices=_PREFERRED_VOICES.value,
        assigned_voices_override=assigned_voices_override,
        keep_voice_assignments=_KEEP_VOICE_ASSIGNMENTS.value,
        adjust_speed=_ADJUST_SPEED.value,
        vocals_volume_adjustment=_VOCALS_VOLUME_ADJUSTMENT.value,
        background_volume_adjustment=_BACKGROUND_VOLUME_ADJUSTMENT.value,
        voice_separation_rounds=_VOICE_SEPARATION_ROUNDS.value,
        clean_up=_CLEAN_UP.value,
        gemini_model_name=_GEMINI_MODEL_NAME.value,
        temperature=_TEMPERATURE.value,
        top_p=_TOP_P.value,
        top_k=_TOP_K.value,
        max_output_tokens=_MAX_OUTPUT_TOKENS.value,
        safety_settings=get_safety_settings(_SAFETY_SETTINGS.value),
        use_elevenlabs=use_elevenlabs,
        elevenlabs_token=elevenlabs_token,
        elevenlabs_clone_voices=_ELEVENLABS_CLONE_VOICES.value,
        elevenlabs_model=_ELEVENLABS_MODEL.value,
        elevenlabs_remove_cloned_voices=_ELEVENLABS_REMOVE_CLONED_VOICES.value,
        with_verification=_WITH_VERIFICATION.value,
    )
    dubber.dub_ad()
if __name__ == "__main__":
    # Mark the mandatory flags as required (same names, same order as before)
    # and then hand control to absl, which invokes `main`.
    for _required_flag in (
        "input_file",
        "output_directory",
        "advertiser_name",
        "original_language",
        "target_language",
        "gcp_project_id",
    ):
        flags.mark_flag_as_required(_required_flag)
    app.run(main)