stylometry_validation_performance.py
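# Evaluates the character-trigram and semantic-trigram stylometry classifiers on a
# held-out test split (the last 20% of every author's articles) and prints their
# accuracy, true/false positive rates and the share of "unsure" predictions.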
from definitions import STYLOMETRY_DIR, CHAR_MACHINE_CONFIDENCE, CHAR_HUMAN_CONFIDENCE, SEM_MACHINE_CONFIDENCE, SEM_HUMAN_CONFIDENCE
from database.mock_database import DatabaseAuthorship, DatabaseGenArticles
from stylometry.char_trigrams import char_trigrams
from stylometry.semantic_trigrams import sem_trigrams
from stylometry.logistic_regression import fixed_trigram_distribution
from nltk.parse.corenlp import CoreNLPDependencyParser
import pandas as pd
import pickle, os, argparse
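# Authors included in the evaluation, mapped to their class ("ai" or "human").
# Guardian profile URLs are stored with "_" in place of "/" so they can double as
# file-name fragments; the slashes are restored before querying the database.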
used_authors = {
    "gpt2": "ai",
    "gpt3": "ai",
    "gpt3-phrase": "ai",
    "grover": "ai",
    "https:__www.theguardian.com_profile_hannah-ellis-petersen": "human",
    "https:__www.theguardian.com_profile_leyland-cecco": "human",
    "https:__www.theguardian.com_profile_martin-chulov": "human",
    "https:__www.theguardian.com_profile_julianborger": "human",
    "https:__www.theguardian.com_profile_helen-sullivan": "human"
}
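# --nfeatures selects which pre-computed distributions and per-author models to load;
# the trailing "_2" matches the suffix used in their file names (e.g. char_distribution100_2.csv).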
parser = argparse.ArgumentParser()
parser.add_argument("--nfeatures", action="store", required=False, type=int, default=100, help="number of char trigram & semantic trigram features used in the distribution")
args = parser.parse_args()
nfeatures = str(args.nfeatures) + "_2"
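# Build the test-set trigram distributions (the last 20% of each author's articles),
# using the same feature columns as the stored training distributions, and write them
# to test_char_distribution*.csv and test_sem_distribution*.csv.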
def write_test_distributions():
    parser = CoreNLPDependencyParser(url='http://localhost:9000')
    char_features = list(pd.read_csv(os.path.join(STYLOMETRY_DIR, f"char_distribution{nfeatures}.csv")).columns)[1:]
    # semantic feature columns were written as strings; eval() turns them back into trigram keys
    sem_features = [eval(feature) for feature in list(pd.read_csv(os.path.join(STYLOMETRY_DIR, f"sem_distribution{nfeatures}.csv")).columns)[1:]]
    author_frame = pd.DataFrame({"author": []})
    char_frames = []
    sem_frames = []
    authors = used_authors.keys()
    for author in authors:
        print(f"working on author {author}")
        if used_authors[author] == "human":
            full_article_list = [(article["text"], author) for article in DatabaseAuthorship.get_articles_by_author(author.replace("_", "/"))]
        elif used_authors[author] == "ai":
            full_article_list = [(article["text"], author) for article in DatabaseGenArticles.get_articles_by_author(author.replace("_", "/"))]
        # the last 20% of each author's articles form the held-out test set
        test_data = full_article_list[int(len(full_article_list) * 0.8):]
        print("creating char trigrams")
        char_grams = [char_trigrams(article_tuple[0]) for article_tuple in test_data]
        print("creating sem trigrams")
        sem_grams = [sem_trigrams(article_tuple[0], parser) for article_tuple in test_data]
        char_distribution = fixed_trigram_distribution(char_grams, char_features)
        sem_distribution = fixed_trigram_distribution(sem_grams, sem_features)
        author_frame = pd.concat([author_frame, pd.DataFrame({"author": [author] * len(test_data)})])
        char_frames.append(char_distribution)
        sem_frames.append(sem_distribution)
    full_char_distribution = pd.concat(char_frames)
    full_sem_distribution = pd.concat(sem_frames)
    full_char_distribution.insert(0, "author", author_frame["author"].to_list())
    full_sem_distribution.insert(0, "author", author_frame["author"].to_list())
    full_char_distribution.to_csv(os.path.join(STYLOMETRY_DIR, f"test_char_distribution{nfeatures}.csv"))
    full_sem_distribution.to_csv(os.path.join(STYLOMETRY_DIR, f"test_sem_distribution{nfeatures}.csv"))
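# Classify each row of a character-trigram distribution with the pickled per-author
# models and collapse the per-author confidences into a single label:
# 1 = machine, -1 = human, 0 = unsure.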
def char_model_prediction(inp):
    authors = used_authors.keys()
    models = {}
    for author in authors:
        with open(os.path.join(STYLOMETRY_DIR, f"{author}_char{nfeatures}.pickle"), "rb") as fp:
            models[author] = pickle.load(fp)
    confidence_values = {}
    for author in authors:
        confidence_values[author] = models[author].predict_proba(inp)
    print(confidence_values)
    final_predictions = []
    raw_predictions = []
    for i in range(inp.shape[0]):
        machine = any(confidence_values[author][i][1] > CHAR_MACHINE_CONFIDENCE for author in authors if used_authors[author] == "ai")
        human = any(confidence_values[author][i][1] > CHAR_HUMAN_CONFIDENCE for author in authors if used_authors[author] == "human")
        # unsure (0) when both classes or neither class exceeds its threshold
        if (machine and human) or (not human and not machine):
            final_predictions.append(0)
        elif machine:
            final_predictions.append(1)
        elif human:
            final_predictions.append(-1)
        # highest per-class confidences, collected for inspection but not returned
        machine = max(confidence_values[author][i][1] for author in authors if used_authors[author] == "ai")
        human = max(confidence_values[author][i][1] for author in authors if used_authors[author] == "human")
        raw_predictions.append((machine, human))
    return final_predictions
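# Same decision logic as char_model_prediction, but with the semantic-trigram models
# and the SEM_* confidence thresholds.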
def sem_model_prediction(inp):
    authors = used_authors.keys()
    models = {}
    for author in authors:
        with open(os.path.join(STYLOMETRY_DIR, f"{author}_sem{nfeatures}.pickle"), "rb") as fp:
            models[author] = pickle.load(fp)
    confidence_values = {}
    for author in authors:
        confidence_values[author] = models[author].predict_proba(inp)
    final_predictions = []
    raw_predictions = []
    for i in range(inp.shape[0]):
        machine = any(confidence_values[author][i][1] > SEM_MACHINE_CONFIDENCE for author in authors if used_authors[author] == "ai")
        human = any(confidence_values[author][i][1] > SEM_HUMAN_CONFIDENCE for author in authors if used_authors[author] == "human")
        # unsure (0) when both classes or neither class exceeds its threshold
        if (machine and human) or (not human and not machine):
            final_predictions.append(0)
        elif machine:
            final_predictions.append(1)
        elif human:
            final_predictions.append(-1)
        # highest per-class confidences, collected for inspection but not returned
        machine = max(confidence_values[author][i][1] for author in authors if used_authors[author] == "ai")
        human = max(confidence_values[author][i][1] for author in authors if used_authors[author] == "human")
        raw_predictions.append((machine, human))
    return final_predictions
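# Evaluate the character-trigram classifier on the held-out distribution: accuracy,
# per-class true/false positive rates and the share of "unsure" verdicts.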
def char_performance():
    test_dataframe = pd.read_csv(os.path.join(STYLOMETRY_DIR, f"test_char_distribution{nfeatures}.csv"))
    correct_class = []
    for i in range(test_dataframe.shape[0]):
        if used_authors[test_dataframe.iloc[i]["author"]] == "ai":
            correct_class.append(1)
        elif used_authors[test_dataframe.iloc[i]["author"]] == "human":
            correct_class.append(-1)
        else:
            correct_class.append(0)
    # drop the label column and the CSV index column before prediction
    predictions = char_model_prediction(test_dataframe.drop(["author", "Unnamed: 0"], axis=1))
    print(predictions)
    accuracy = sum([1 if prediction == correct_class[i] else 0 for i, prediction in enumerate(predictions)]) / len(correct_class)
    # avoid division by zero if a class is missing from the test set
    count_ai = max(correct_class.count(1), 1)
    count_human = max(correct_class.count(-1), 1)
    true_ai = sum([1 if prediction == correct_class[i] and prediction == 1 else 0 for i, prediction in enumerate(predictions)]) / count_ai
    false_ai = sum([1 if prediction != correct_class[i] and prediction == 1 else 0 for i, prediction in enumerate(predictions)]) / count_human
    true_human = sum([1 if prediction == correct_class[i] and prediction == -1 else 0 for i, prediction in enumerate(predictions)]) / count_human
    false_human = sum([1 if prediction != correct_class[i] and prediction == -1 else 0 for i, prediction in enumerate(predictions)]) / count_ai
    unsure_ai = sum([1 if prediction == 0 and correct_class[i] == 1 else 0 for i, prediction in enumerate(predictions)]) / count_ai
    unsure_human = sum([1 if prediction == 0 and correct_class[i] == -1 else 0 for i, prediction in enumerate(predictions)]) / count_human
    unsure_total = sum([1 if prediction == 0 else 0 for prediction in predictions]) / len(predictions)
    print([true_ai, false_ai, true_human, false_human, unsure_ai, unsure_human])
    return {"accuracy": accuracy, "ai_true_positives": true_ai, "ai_false_positives": false_ai, "unsure": unsure_total}
def sem_performance():
    test_dataframe = pd.read_csv(os.path.join(STYLOMETRY_DIR, f"test_sem_distribution{nfeatures}.csv"))
    correct_class = []
    for i in range(test_dataframe.shape[0]):
        if used_authors[test_dataframe.iloc[i]["author"]] == "ai":
            correct_class.append(1)
        elif used_authors[test_dataframe.iloc[i]["author"]] == "human":
            correct_class.append(-1)
        else:
            correct_class.append(0)
    # drop the label column and the CSV index column before prediction
    predictions = sem_model_prediction(test_dataframe.drop(["author", "Unnamed: 0"], axis=1))
    print(predictions)
    accuracy = sum([1 if prediction == correct_class[i] else 0 for i, prediction in enumerate(predictions)]) / len(correct_class)
    # avoid division by zero if a class is missing from the test set
    count_ai = max(correct_class.count(1), 1)
    count_human = max(correct_class.count(-1), 1)
    true_ai = sum([1 if prediction == correct_class[i] and prediction == 1 else 0 for i, prediction in enumerate(predictions)]) / count_ai
    false_ai = sum([1 if prediction != correct_class[i] and prediction == 1 else 0 for i, prediction in enumerate(predictions)]) / count_human
    true_human = sum([1 if prediction == correct_class[i] and prediction == -1 else 0 for i, prediction in enumerate(predictions)]) / count_human
    false_human = sum([1 if prediction != correct_class[i] and prediction == -1 else 0 for i, prediction in enumerate(predictions)]) / count_ai
    unsure_ai = sum([1 if prediction == 0 and correct_class[i] == 1 else 0 for i, prediction in enumerate(predictions)]) / count_ai
    unsure_human = sum([1 if prediction == 0 and correct_class[i] == -1 else 0 for i, prediction in enumerate(predictions)]) / count_human
    unsure_total = sum([1 if prediction == 0 else 0 for prediction in predictions]) / len(predictions)
    print([true_ai, false_ai, true_human, false_human, unsure_ai, unsure_human])
    return {"accuracy": accuracy, "ai_true_positives": true_ai, "ai_false_positives": false_ai, "unsure": unsure_total}
if __name__ == "__main__":
    write_test_distributions()
    print(char_performance())
    print(sem_performance())