-
Notifications
You must be signed in to change notification settings - Fork 0
/
randchoicedataformodel_V9.py
114 lines (97 loc) · 3.8 KB
/
randchoicedataformodel_V9.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import sys
import random
def write_new_base_file(currentPath, processFileName, textContext):
    """Rewrite a "<utt-id> <value>" file so it keeps only ids found in textContext.

    The filtered content is first written to ``<processFileName>_new`` inside
    ``currentPath``.  If the number of kept lines equals the number of keys in
    ``textContext`` the temporary file replaces the original; otherwise a
    message is printed and the process exits with status 1 (the ``_new`` file
    is left on disk for inspection).

    Args:
        currentPath: Directory containing the file to filter.
        processFileName: Name of the file inside ``currentPath``.
        textContext: Mapping whose keys are the utterance ids to keep.

    Raises:
        ValueError: If a non-blank line has an id but no value.
        SystemExit: With code 1 when the kept line count differs from
            ``len(textContext)``.
    """
    tmpRead = os.path.join(currentPath, processFileName)
    tmpSave = os.path.join(currentPath, processFileName + "_new")
    num = 0
    # Both handles are managed by ``with`` so they are closed before the
    # replace below.  UTF-8 is explicit so the result does not depend on the
    # platform's locale encoding.
    with open(tmpRead, "rt", encoding="utf-8") as src, \
            open(tmpSave, "wt", encoding="utf-8") as dst:
        for line in src:
            line = line.strip()
            if not line:
                # Blank lines carry no entry; skip them instead of crashing.
                continue
            # Split on the first whitespace run only, so a value that itself
            # contains spaces is kept intact.
            uttid, context = line.split(None, 1)
            if uttid in textContext:
                dst.write("%s %s\n" % (uttid, context))
                num += 1
    print("%s write done." % (tmpSave))
    if num != len(textContext):
        print("%s dont have same lines with text." % (tmpRead))
        sys.exit(1)
    # os.replace overwrites the destination in one step, so there is no
    # window in which the original file has been removed but not yet replaced.
    os.replace(tmpSave, tmpRead)
def write_output_file(savePath, saveName, indexList, fileKeys, contextDict):
    """Write the selected "<key> <value>" lines to ``savePath/saveName``.

    Args:
        savePath: Directory the output file is written to.
        saveName: Name of the output file.
        indexList: 1-based positions into ``fileKeys`` selecting which keys
            to emit, in the order given.
        fileKeys: Sequence of keys; position ``i`` in ``indexList`` refers to
            ``fileKeys[i - 1]``.
        contextDict: Mapping from key to the value written next to it.

    Raises:
        IndexError: If an index is outside ``1..len(fileKeys)``.
        KeyError: If a selected key is missing from ``contextDict``.
    """
    keyCount = len(fileKeys)
    rows = []
    # Build every row before touching the filesystem so a bad index or a
    # missing key cannot leave a partially written output file behind.
    for i in indexList:
        if not 1 <= i <= keyCount:
            # Without this check 0 and negative values would silently wrap
            # around to the end of fileKeys.
            raise IndexError("index %s is outside 1..%s" % (i, keyCount))
        key = fileKeys[i - 1]
        try:
            value = contextDict[key]
        except KeyError:
            # A plain .get() here would write the literal text "None".
            raise KeyError("%s has no entry for %s" % (saveName, key)) from None
        rows.append("%s %s\n" % (key, value))
    with open(os.path.join(savePath, saveName), "wt", encoding="utf-8") as f:
        f.writelines(rows)
def _read_pairs(filePath):
    """Load a UTF-8 "<key> <value>" per-line file into a dict, in file order."""
    pairs = {}
    with open(filePath, "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Split on the first whitespace run only so values containing
            # spaces stay intact.
            key, value = line.split(None, 1)
            pairs[key] = value
    return pairs


def main():
    """Sample a fixed fraction of a data directory into "*_with_noise" files.

    Reads ``text``, ``wav.scp`` and ``utt2spk`` from the directory given as
    the first command-line argument, picks ``percent`` of the ``wav.scp``
    entries with a fixed random seed, and writes the matching lines of all
    three files into the parent directory of the input directory.

    If ``text`` contains lines that cannot be split into "<utt> <text>", the
    file is rewritten without them and ``utt2spk`` / ``wav.scp`` are filtered
    to the remaining utterances via ``write_new_base_file``.
    """
    if len(sys.argv) < 2:
        print("usage: %s <data_dir>" % (sys.argv[0]), file=sys.stderr)
        sys.exit(1)
    currentPath = sys.argv[1]

    percent = 0.1  # new_file_num = old_file_num * percent
    seed = 1235
    suffix = "_with_noise"  # suffix appended to the saved file names

    # Drop one trailing separator so dirname() yields the parent directory.
    if currentPath.endswith(("/", "\\")):
        currentPath = currentPath[:-1]
    savePath = os.path.dirname(currentPath)

    # Build the utt -> transcript mapping, remembering whether any line had
    # to be dropped because it lacked the "<utt> <text>" shape.
    textPath = os.path.join(currentPath, "text")
    textContext = {}
    empty = False
    with open(textPath, "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            try:
                utt, text = line.split(" ", 1)
            except ValueError:
                # Unpacking fails only when the line has no space at all
                # (blank line or an id without a transcript).
                empty = True
                print(line)
                continue
            textContext[utt] = text

    # Repair step: only runs when malformed lines were found in ``text``.
    if empty:
        tmpSave = os.path.join(currentPath, "text_new")
        with open(tmpSave, "wt", encoding="utf-8") as f:
            for key, value in textContext.items():
                f.write("%s %s\n" % (key, value))
        print("Blank line occurs!!!")
        os.replace(tmpSave, textPath)
        write_new_base_file(currentPath, "utt2spk", textContext)
        write_new_base_file(currentPath, "wav.scp", textContext)

    # Regular route: load the remaining two mappings.
    fileContext = _read_pairs(os.path.join(currentPath, "wav.scp"))
    utt2spkContext = _read_pairs(os.path.join(currentPath, "utt2spk"))

    fileKeys = list(fileContext.keys())
    originCount = len(fileContext)
    outCount = int(originCount * percent)
    random.seed(seed)
    indexList = random.sample(range(1, originCount + 1), outCount)
    indexList.sort()  # list of the required (1-based) indices

    write_output_file(
        savePath, "wav" + suffix + ".scp", indexList, fileKeys, fileContext)
    write_output_file(
        savePath, "utt2spk" + suffix, indexList, fileKeys, utt2spkContext)
    write_output_file(
        savePath, "text" + suffix, indexList, fileKeys, textContext)
    print("done!")


if __name__ == "__main__":
    main()