-
Notifications
You must be signed in to change notification settings - Fork 0
/
format.py
68 lines (52 loc) · 1.75 KB
/
format.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import csv
import pandas
# Minimum count a term needs to be kept; get_filtered_frequencies drops
# every row whose count is below this value.
MIN_COUNT = 2
def convert_to_tscan_format(input_file, output_file):
    '''
    Read a CSV file of term frequencies and write it in t-scan's format.

    The input is parsed with pandas.read_csv and must provide the columns
    'Term' and 'Count'.

    The output is a tab-separated text file without a header line. Every
    line holds, in this order:
    - the term
    - its absolute frequency
    - the cumulative absolute frequency
    - the cumulative frequency as a percentage

    Terms with a count below MIN_COUNT are left out, and the remaining
    lines are ordered from the highest to the lowest frequency.
    '''
    frequencies = pandas.read_csv(input_file)
    save_data(prepare_data(frequencies), output_file)
def get_filtered_frequencies(df: pandas.DataFrame):
    '''
    Select the frequent terms from the raw frequency table.

    Takes the 'Term' and 'Count' columns of `df`, exposes them as 'term'
    and 'freq', keeps only the rows whose 'freq' is at least MIN_COUNT and
    returns them ordered by descending 'freq'.
    '''
    renamed = pandas.DataFrame({'term': df['Term'], 'freq': df['Count']})
    is_frequent = renamed['freq'] >= MIN_COUNT
    frequent = renamed[is_frequent]
    return frequent.sort_values(by='freq', ascending=False)
def add_relative_and_cumulative(df: pandas.DataFrame):
    '''
    Extend a term/frequency table with relative and cumulative columns.

    `df` must provide the columns 'term' and 'freq'. The returned frame
    has the columns:
    - 'term': copied from the input
    - 'freq_abs': the input 'freq'
    - 'freq_rel': 'freq' as a percentage of the sum of all 'freq' values
    - 'freq_abs_cum': running total of 'freq', in row order
    - 'freq_rel_cum': running total of 'freq_rel', in row order
    '''
    absolute = df['freq']
    relative = 100 * absolute / absolute.sum()
    return pandas.DataFrame({
        'term': df['term'],
        'freq_abs': absolute,
        'freq_rel': relative,
        'freq_abs_cum': absolute.cumsum(),
        'freq_rel_cum': relative.cumsum(),
    })
def save_data(df: pandas.DataFrame, output_path: str):
    '''
    Write the frequency table to a tab-separated text file without header.

    `df` must provide the columns 'term', 'freq_abs', 'freq_abs_cum' and
    'freq_rel_cum'. Each row becomes one line holding those four values,
    in that order, separated by tabs. Other columns (such as 'freq_rel')
    are not written. An existing file at `output_path` is overwritten.

    The file is written as UTF-8.
    '''
    columns = ('term', 'freq_abs', 'freq_abs_cum', 'freq_rel_cum')
    # Walk the columns in lockstep instead of using iterrows(): iterrows()
    # packs every row into a single Series, which turns integer counts into
    # floats (3 -> 3.0) whenever all columns happen to be numeric.
    rows = zip(*(df[name] for name in columns))
    # Explicit encoding so that non-ASCII terms are written the same way on
    # every platform instead of depending on the locale's default.
    with open(output_path, 'w', encoding='utf-8') as output_io:
        output_io.writelines(
            '{}\t{}\t{}\t{}\n'.format(*row) for row in rows
        )
def prepare_data(df: pandas.DataFrame):
    '''
    Turn the raw frequency table into the table that is written to disk.

    Filters and sorts `df` with get_filtered_frequencies, then passes the
    result through add_relative_and_cumulative and returns its output.
    '''
    return add_relative_and_cumulative(get_filtered_frequencies(df))