-
Notifications
You must be signed in to change notification settings - Fork 52
/
clean_lexica.py
64 lines (49 loc) · 1.91 KB
/
clean_lexica.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import glob
import argparse
def _extract_prompt(message):
    """Return the prompt text contained in one raw message, or None to skip it.

    A message is skipped when it is help/usage output, consists only of
    command-line flags, contains no dream command, or has nothing left once
    quotes and trailing flags are removed.
    """
    # Help/usage output, or a message that starts with a bare flag ("-X value",
    # "--X value"). Slices are used instead of indexes so that very short
    # messages such as "-h" or "--a" cannot raise IndexError.
    if (
        '[-h]' in message
        or (message[:1] == '-' and message[2:3] == ' ')
        or (message[:2] == '--' and message[3:4] == ' ')
    ):
        return None

    # Keep only the text that follows the last dream command. Messages without
    # a command are skipped explicitly; previously they fell through with an
    # unbound (or stale, from the prior row) value.
    if '!dream' in message:
        text = message.split('!dream')[-1].strip()
    elif '! dream' in message:
        text = message.split('! dream')[-1].strip()
    else:
        return None

    # Remove accidental pasting of just params
    if not text or text in ('-h', '--help'):
        return None

    # Strip quotes: keep what is inside the first quoted span
    if text[0] == '"':
        text = text.split('"')[1].strip()
    elif text[0] == '“':
        text = text.split('”')[0].replace('“', '').strip()

    # Remove more params: everything from the first " -" onwards
    text = text.split(' -')[0].strip()

    # Nothing usable left (e.g. an empty pair of quotes)
    return text or None


def clean_prompts(csv_dir):
    """Collect the unique cleaned prompts from every CSV file in a directory.

    Each ``*.csv`` file directly inside ``csv_dir`` is read with pandas and
    must have a ``Content`` column holding the raw messages. Every kept prompt
    is printed as it is found.

    Args:
        csv_dir: Path of the directory containing the CSV files.

    Returns:
        A list of cleaned prompt strings without duplicates, in first-seen
        order (files are visited in sorted path order).

    Raises:
        KeyError: If a CSV file has no ``Content`` column.
    """
    cleaned_prompts = []
    # Sorted so the result does not depend on directory listing order.
    for filepath in sorted(glob.glob(f'{csv_dir}/*.csv')):
        df = pd.read_csv(filepath)
        for raw in df['Content']:
            # str() turns missing cells (NaN) into 'nan', which is then
            # skipped because it contains no dream command.
            cleaned_prompt = _extract_prompt(str(raw))
            if cleaned_prompt is None:
                continue
            print(cleaned_prompt)
            cleaned_prompts.append(cleaned_prompt)
    # Remove duplicates while keeping a stable, reproducible order
    return list(dict.fromkeys(cleaned_prompts))
def main():
    """Command-line entry point: clean a CSV dump and save the prompts.

    Reads the required ``--lexica_dir_path`` / ``-p`` argument, passes it to
    ``clean_prompts`` and writes the returned prompts, one per line, to
    ``lexica_prompts.txt`` in the current working directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lexica_dir_path",
        "-p",
        type=str,
        required=True,
        help="Path to Lexica folder with csvs of dump"
    )
    args = parser.parse_args()
    csv_dir = args.lexica_dir_path
    cleaned_prompts = clean_prompts(csv_dir)
    # Write to file. UTF-8 is set explicitly because prompts may contain
    # non-ASCII characters (e.g. curly quotes) that a locale-dependent default
    # encoding might not be able to encode.
    with open('lexica_prompts.txt', 'w', encoding='utf-8') as f:
        for p in cleaned_prompts:
            # Flatten embedded line breaks so each prompt stays on one line.
            f.write(' '.join(p.splitlines()) + '\n')
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()