"""
extract_first_and_second_client_utt.py
Any long set of conversations (for instance ABCD) that you want to take just the first or second client utterances.
Notes:
Input files should be in HF json format with first_client_utt,second_client_utt,idx metadata fields
These metadata are automatically available if the input data files are produced using csv_to_json_unlabelled.py script
"""
# *********************************************************************************************************************
# standard imports
import os
import json

# third party imports
import click
import pandas

# custom imports
import back_to_hf_unlabelled


@click.command()
@click.option('-i', '--input_location', type=str, required=True,
              help='A file or a directory containing conversations in HF JSON format')
@click.option('-h', '--sample', type=int, required=False, default=0,
              help='Sample only the first N conversations (0 = no sampling)')
@click.option('-s', '--sep', type=str, required=False, default="-",
              help='Separator used when flattening nested JSON fields into column names')
@click.option('-f', '--file_num', type=int, required=False, default=0,
              help='Stop after processing this many files (0 = process all)')
@click.option('-r', '--remove_list', type=str, required=False, default="WELCOME,NO_RESPONSE,NO_RESPONSE_FINAL",
              help='Comma separated list of utterance texts to be excluded from the final dataset')
def main(input_location: str,
         sample: int,
         sep: str,
         file_num: int,
         remove_list: str) -> None:
    """Read conversations, keep only the first and second client utterances and write them back out in HF format"""
    # deal with a single file or a directory of files
    work_list = []
    if os.path.isfile(input_location):
        work_list.append(input_location)
    elif os.path.isdir(input_location):
        candidate_list = os.listdir(input_location)
        for c in candidate_list:
            if c.endswith(".json"):
                work_list.append(os.path.join(input_location, c))
        work_list.sort()
    else:
        raise click.UsageError(f'Input location is neither a file nor a directory: {input_location}')
print("Worklist of files is:")
print(work_list)

    # read all the files into a single dataframe
    df = pandas.DataFrame()
    for i, file_name in enumerate(work_list):
        print(f'{i} Processing: {file_name}')
        df = pandas.concat([df, read_json_to_df(file_name, sep)])
        if file_num > 0 and (i + 1) == file_num:
            break
    print(f'All files read shape: {df.shape}')

    # optionally sample only the first N conversations
    df = sample_dataframe(df, sample)

    # keep only the first and second client utterances and drop unwanted texts
    df = process(df, remove_list)

    # work out the output location
    if os.path.isfile(input_location):
        output_location = input_location.replace(".json", "_output.json")
    else:
        output_location = os.path.join(input_location, "output.json")
    assert input_location != output_location

    # turn the dataframe back into HF unlabelled format and write it out
    back_to_hf_unlabelled.back_to_hf(df, output_location)


def sample_dataframe(df: pandas.DataFrame, sample: int) -> pandas.DataFrame:
"""Sample by conversation ids"""
if sample > 0:
context_ids = df["context-context_id"].unique()
context_ids = context_ids[0:sample]
df = df[df["context-context_id"].isin(context_ids)].copy(deep=True)
df["intidx"] = df["metadata-idx"].astype(int)
df.sort_values(["context-context_id","intidx"],inplace=True,ignore_index=True)
df.drop(columns=["intidx"],inplace=True)
return df


def process(df: pandas.DataFrame, remove_list: str) -> pandas.DataFrame:
"""Extracts first or second client utterances"""
# make it just the first or second customer utterances
df = df[(df["metadata-first_client_utt"]=="True") | (df["metadata-second_client_utt"]=="True")]
print(df[["text","context-context_id","metadata-idx","metadata-first_client_utt","metadata-second_client_utt"]])
    # show a frequency count of each distinct utterance text
print(df[["text","metadata-idx"]].groupby("text").count().sort_values("metadata-idx",ascending=False))
# eliminate the utterances from the remove list
if remove_list != "":
remove_list = remove_list.split(",")
for t in remove_list:
df = strip_string(df,t)
return df


def strip_string(df: pandas.DataFrame, strip_this: str) -> pandas.DataFrame:
"""Strips any utterances matching exactly string"""
print(f'Before stripping {strip_this}')
print(df.shape)
print(df.columns)
df = df[~(df["text"]==strip_this)]
print(f'After stripping {strip_this}')
print(df.shape)
return df


def read_json_to_df(file_name: str, sep: str) -> pandas.DataFrame:
    """Read a single HF JSON file and flatten its examples into a dataframe"""
    with open(file_name, mode="r", encoding="utf8") as file_handle:
        data = json.load(file_handle)
    data = data["examples"]
    df = pandas.json_normalize(data=data, sep=sep)
    return df
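# Note on the flattening above (illustrative, values made up): with sep="-", pandas.json_normalize turns a
# nested record such as {"text": "hi", "context": {"context_id": "c1"}, "metadata": {"idx": "0"}} into a row
# with the columns "text", "context-context_id" and "metadata-idx", which are the column names this script
# filters and sorts on.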


if __name__ == '__main__':
main() # pylint: disable=no-value-for-parameter