unifying_raw_dataset.py
"""
This file unifies the fixes and the analysis made by Roslynator into single datapoints.
"""
import xml.etree.ElementTree as ET
import json
import os
import urllib.request
from unidiff import PatchSet, PatchedFile
import pandas as pd
import hashlib
import copy
from parsing_diffs import parse_hunk
diff_dir = "raw_dataset/diffs"
analysis_dir = "raw_dataset/analysis_files"
unified_dataset_dir = "unified_dataset"
repositories_dir = "submodule_repos_to_analyze"
df_repos = pd.read_csv("github_repos.csv")
analysis_files = [f.name for f in os.scandir(
    analysis_dir) if f.is_file() and not f.name == ".DS_Store"]
unified_data_files = [f.name.split("-")[0] for f in os.scandir(
    unified_dataset_dir) if f.is_file()]
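# unified_data_files holds the hashed filename prefixes of datapoints that already exist in
# unified_dataset/; it is used further down to skip diffs that have already been unified
# (see the check on unified_data_filename_hash).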
# Instantiate this multiple times later:
unified_data_sample = {
    "Repo": "",
    "RepoURL": "",
    "SolutionFile": "",
    "FilePath": "",
    "NumberFileLines": "",
    "Commit": "",
    "FileURL": "",
    "DiagnosticID": "",
    "AnalyzerNuGet": "",
    "Severity": "",
    "RequiredLinesStart": None,
    "RequiredLinesEnd": None,
    "DiagnosticOccurances": [
        # {
        #     "Message": "",
        #     "Line": 0,
        #     "Character": 0
        # },
    ],
    "ParsedDiff": {
        "ReplacedLines": [
            # {
            #     "SourceLocations": [4, 5],
            #     "TargetLines": ["static int i = 0;"]
            # }
        ],
        "RemovedLines": [],
        "AddedLines": [
            # {
            #     "TargetStartLocation": 34,
            #     "TargetLines": ["static int i = 0;"]
            # }
        ]
    },
    "FileContextStart": None,
    "FileContext": [],
}
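# Note: per datapoint, "ParsedDiff" is overwritten below with
# {"ActionType": "REPLACE"|"ADD"|"REMOVE", "Action": <one diff-batch dict>}.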


def hash_filename(filename):
    """Hash the filename to avoid OS errors caused by over-long filenames.
    Returns the SHA-256 digest reduced modulo 10**8, as a decimal string."""
    return str(int(hashlib.sha256(filename.encode('utf-8')).hexdigest(), 16) % 10**8)


def filter_analysis_files(repo_name, solution_filename, nuget_full_name):
    def filter_func(analysis_filename):
        """
        $ANALYSIS_FILEPATH = "${Using:REPO_NAME}__${SOLUTION_FILENAME}__${Using:LAST_COMMIT}__${NUGET_FULL_NAME}.xml"
        Sample ANALYSIS_FILEPATH:
        raw_dataset\analysis_files\runtime__Common.Tests.sln__dcf816579c616e1d172d587301a0a769aa1c0771__AkzenteIT.Analyzers.1.0.6304.37642.xml
        Disregarding LAST_COMMIT for now.
        """
        REPO_NAME, SOLUTION_FILENAME, LAST_COMMIT, NUGET_FULL_NAME = analysis_filename[:-len(
            '.xml')].split("__")
        return (repo_name == REPO_NAME
                and solution_filename == SOLUTION_FILENAME
                and nuget_full_name == NUGET_FULL_NAME)
    return filter_func
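# Illustrative use (values hypothetical): filter_analysis_files("runtime", "Common.Tests.sln",
# "AkzenteIT.Analyzers.1.0.6304.37642") returns a predicate that keeps exactly the matching
# analysis filenames, regardless of their commit hash.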


def filter_diagnostic_occurance(new_occurance_dict):
    """Return a predicate matching occurrences identical in Message, Line and Character."""
    def filter_func(existing_occurance_dict):
        if (
            existing_occurance_dict["Message"] == new_occurance_dict["Message"] and
            existing_occurance_dict["Line"] == new_occurance_dict["Line"] and
            existing_occurance_dict["Character"] == new_occurance_dict["Character"]
        ):
            return True
        return False
    return filter_func


def match_diff_batches_to_diagnostics(diagnostics, all_added_lines, all_removed_lines, all_replaced_lines):
    """
    To which diff batch does each diagnostic correspond?
    In some cases, multiple diagnostic occurrences may have generated one
    "diff batch". An example would be two occurrences on the same line but at
    different characters: if that line was deleted in the diff, we cannot tell
    which occurrence caused the deletion, so the occurrences are bundled.
    Conversely, each diagnostic occurrence is assumed to have generated at most
    one diff batch, located "at the same line" as the occurrence. This breaks
    down for multi-line diffs, which end up split across datapoints.
    WARNING: Very hacky, speculative alignment of diff batches with diagnostics.
    Diagnostics that produce multiple diff batches are not handled well; the
    batches are split up, since one datapoint can only hold one diff batch.
    In practice, one diagnostic producing multiple diff batches is actually the
    more plausible direction, not the other way around.
    """
    add_batch_is_above_diagnostic_position = {}
    diff_batch_to_diagnostics = {}
    for diagnostic_occurance in diagnostics:
        diff_key = None
        for count, value in enumerate(all_replaced_lines):
            if diagnostic_occurance["Line"] in value["SourceLocations"]:
                diff_key = f"REPLACE-{count}"
                break
        if not diff_key:
            for count, value in enumerate(all_removed_lines):
                if (diagnostic_occurance["Line"] >= value["SourceLocationStart"] and
                        diagnostic_occurance["Line"] <= value["SourceLocationEnd"]):
                    diff_key = f"REMOVE-{count}"
                    break
        # Do added_lines last: intuitively, deleted/replaced lines are easier to match,
        # since the diagnostic will probably sit inside one of them. Added lines, on the
        # other hand, are equally likely to appear before or after the diagnostic. Here we
        # prioritise ADD batches that sit above the diagnostic, unless they already have a
        # diagnostic above them.
        if not diff_key:
            for count, value in enumerate(all_added_lines):
                # If added lines are directly above the diagnostic
                if (diagnostic_occurance["Line"] - 1) == value["PreviousSourceLocation"]:
                    diff_key = f"ADD-{count}"
                    # If an ADD diff batch already has a diagnostic above it, the correct
                    # diff batch more likely lies underneath the current diagnostic.
                    if diff_key in add_batch_is_above_diagnostic_position:
                        if not add_batch_is_above_diagnostic_position[diff_key]:
                            diff_key = None
                            continue
                    else:
                        add_batch_is_above_diagnostic_position[diff_key] = True
                    break
        if not diff_key:
            for count, value in enumerate(all_added_lines):
                # If added lines are directly beneath the diagnostic
                if diagnostic_occurance["Line"] == value["PreviousSourceLocation"]:
                    diff_key = f"ADD-{count}"
                    add_batch_is_above_diagnostic_position[diff_key] = False
                    break
        # Diagnostic occurrence leads to no obvious diff batch
        if not diff_key:
            continue
        if diff_key not in diff_batch_to_diagnostics:
            diff_batch_to_diagnostics[diff_key] = []
        diff_batch_to_diagnostics[diff_key].append(diagnostic_occurance)
    return diff_batch_to_diagnostics
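

# Main loop: one pass per diff file produced by the fix-application pipeline.
# Each diff filename encodes repo, solution, commit, analyzer NuGet package and diagnostic ID.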
diff_files = [f.name for f in os.scandir(diff_dir) if f.is_file()]
for diff_file in diff_files:
    print("diff_file: ", diff_file)
    # $ANALYSIS_FILEPATH = "${Using:REPO_NAME}__${SOLUTION_FILENAME}__${Using:LAST_COMMIT}__${NUGET_FULL_NAME}__${DIAGNOSTIC_ID}.diff"
    REPO_NAME, SOLUTION_FILENAME, LAST_COMMIT, NUGET_FULL_NAME, DIAGNOSTIC_ID = diff_file[:-len(
        '.diff')].split("__")

    # Prepare XML analysis file
    analysis_file_for_diff = list(filter(filter_analysis_files(
        REPO_NAME, SOLUTION_FILENAME, NUGET_FULL_NAME), analysis_files))
    print("analysis_file_for_diff: ", analysis_file_for_diff)
    if len(analysis_file_for_diff) != 1:
        print(
            f"Expected exactly one analysis_file_for_diff, got {len(analysis_file_for_diff)}!")
        # input("Press Enter to continue...")
    analysis_file_for_diff = analysis_file_for_diff[0]

    # Generated by Roslynator as 'Filepath'
    analyzed_file_prefix = f"C:\\Users\\vlohse\\Desktop\\neural-repair-static-analysis\\submodule_repos_to_analyze\\{REPO_NAME}\\"
    tree = ET.parse(f"{analysis_dir}/{analysis_file_for_diff}")
    root = tree.getroot()  # <Roslynator></Roslynator> is root
    projects_analysed = root.find('CodeAnalysis').find('Projects')

    patch_set = PatchSet.from_filename(f"{diff_dir}/{diff_file}")
    repo_dir = f"{repositories_dir}/{REPO_NAME}"
    # One patch per file
    for patched_file in patch_set:
        if patched_file.is_added_file or patched_file.is_removed_file:
            continue
        print("patched_file.path: ", patched_file.path)
        parsed_file_path = patched_file.path.replace(
            "/", "--").replace("\\", "--")
        unified_data_filename = f"{diff_file[:-len('.diff')]}__{parsed_file_path}"
        unified_data_filename_hash = hash_filename(unified_data_filename)
        if unified_data_filename_hash in unified_data_files:
            print(
                f"unified_data_filename_hash already exists! file: {unified_data_filename}")
            # Appending unified_data_filename_hash to unified_data_files later on
            continue

        unified_data_file = copy.deepcopy(unified_data_sample)
        repo_row = df_repos.loc[df_repos['RepoName'] == REPO_NAME].iloc[0]
        unified_data_file["Repo"] = REPO_NAME
        unified_data_file["RepoURL"] = repo_row["RepoURL"]
        unified_data_file["SolutionFile"] = SOLUTION_FILENAME
        unified_data_file["FilePath"] = patched_file.path
        unified_data_file["Commit"] = LAST_COMMIT
        unified_data_file["DiagnosticID"] = DIAGNOSTIC_ID
        unified_data_file["AnalyzerNuGet"] = NUGET_FULL_NAME
        repo_url = repo_row["RepoURL"]
        if "https://github.com" in repo_url:
            repo_url = repo_url[:-len('.git')] if repo_url.endswith('.git') else repo_url
            unified_data_file["FileURL"] = f"{repo_url}/blob/{LAST_COMMIT}/{patched_file.path}"
        try:
            with open(f"{repo_dir}/{patched_file.path}") as f:
                unified_data_file["NumberFileLines"] = len(list(f))
        except UnicodeDecodeError as e:
            print(
                f"Error reading file: {repo_dir}/{patched_file.path}; Error: {e}")
            continue

        all_replaced_lines = []
        all_added_lines = []
        all_removed_lines = []
        for hunk in patched_file:
            replaced_lines, added_lines, removed_lines = parse_hunk(hunk)
            all_replaced_lines += replaced_lines
            all_added_lines += added_lines
            all_removed_lines += removed_lines
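        # parse_hunk (from parsing_diffs) is expected to yield diff-batch dicts shaped like the
        # placeholders in unified_data_sample above: replaced lines carry "SourceLocations" and
        # "TargetLines", removed lines carry "SourceLocationStart"/"SourceLocationEnd", and added
        # lines carry at least "PreviousSourceLocation".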

        count = 0
        project_filepaths = []
        unique_diagnostic_occurances = []
        for xml_project in projects_analysed:
            # TODO: Check whether some analysis is based on target framework
            # Occasionally, Roslynator may perform the same analysis multiple times
            # (e.g. due to multiple target frameworks in the .sln file).
            cs_proj_path = xml_project.get('FilePath')
            if cs_proj_path in project_filepaths:
                print(
                    f"This .csproj has been analysed multiple times! cs_proj_path: {cs_proj_path}")
                continue
            else:
                project_filepaths.append(cs_proj_path)
            for xml_diagnostic in xml_project.find('Diagnostics'):
                if not hasattr(xml_diagnostic.find('FilePath'), 'text'):
                    # Happens rarely. Example:
                    # CA9998; FxCopAnalyzers package has been deprecated in favor of 'Microsoft.CodeAnalysis.NetAnalyzers'
                    # Cannot be sure that this diagnostic led to a fix in our file
                    continue
                analyzed_file_filepath = xml_diagnostic.find(
                    'FilePath').text[len(analyzed_file_prefix):].replace("\\", "/")
                if xml_diagnostic.get('Id') != DIAGNOSTIC_ID or analyzed_file_filepath != patched_file.path:
                    continue
                # Only record the severity once
                if count == 0:
                    unified_data_file["Severity"] = xml_diagnostic.find(
                        'Severity').text
                count += 1
                new_occurance_dict = {
                    "Message": xml_diagnostic.find('Message').text,
                    "Line": int(xml_diagnostic.find('Location').get('Line')),
                    "Character": int(xml_diagnostic.find('Location').get('Character'))
                }
                # Even though .csproj duplicates are filtered out above, one file may still be
                # referenced by multiple different projects.
                # Example: SA1642 for <Location Line="55" Character="16" /> in analysis file
                # Druntime__Microsoft.Bcl.AsyncInterfaces.sln__e98d043d7d293c88a346b632d8fc12564a8ef0ce__Documentation.Analyser.1.1.1.xml
                occurance_duplicates = filter(filter_diagnostic_occurance(
                    new_occurance_dict), unique_diagnostic_occurances)
                if len(list(occurance_duplicates)) != 0:
                    print(
                        f"Duplicate DiagnosticOccurance! new_occurance_dict: {new_occurance_dict}")
                    # input("Press Enter to continue...")
                    continue
                unique_diagnostic_occurances.append(new_occurance_dict)

        num_diff_batches_in_file = len(
            all_replaced_lines) + len(all_removed_lines) + len(all_added_lines)
        num_diagnostics = len(unique_diagnostic_occurances)
        if num_diff_batches_in_file != num_diagnostics:
            print(
                f"Num diff batches ({num_diff_batches_in_file}) != diagnostics ({num_diagnostics})")

        diff_batch_to_diagnostic_occurances_dict = match_diff_batches_to_diagnostics(
            unique_diagnostic_occurances, all_added_lines, all_removed_lines, all_replaced_lines)

        num_diff_datapoint = 0
        # Creating one datapoint per diff action (add/delete/replace)
        for key, value in diff_batch_to_diagnostic_occurances_dict.items():
            # TODO: Comment this out for the full dataset.
            # Capping datapoints per file makes for a more heterogeneous dataset.
            if num_diff_datapoint > 3:
                continue
            unified_data = copy.deepcopy(unified_data_file)
            unified_data["DiagnosticOccurances"] = value
            diff_action, action_num = key.split("-")
            unified_data["ParsedDiff"] = {}
            unified_data["ParsedDiff"]["ActionType"] = diff_action
            if diff_action == "REPLACE":
                unified_data["ParsedDiff"]["Action"] = all_replaced_lines[int(
                    action_num)]
            elif diff_action == "ADD":
                unified_data["ParsedDiff"]["Action"] = all_added_lines[int(
                    action_num)]
            elif diff_action == "REMOVE":
                unified_data["ParsedDiff"]["Action"] = all_removed_lines[int(
                    action_num)]

            # Find range of lines that are required to be inside FileContext
            diag_occurance_lines = [diag_occurance["Line"]
                                    for diag_occurance in value]
            # Roslynator & diff hunks start at index 1
            first_diag_line = min(diag_occurance_lines)
            last_diag_line = max(diag_occurance_lines)

            # Take into account that all deleted lines have to be in FileContext as well
            first_diff_line = None
            last_diff_line = None
            actionType = unified_data["ParsedDiff"]["ActionType"]
            if actionType == "REPLACE":
                first_diff_line = unified_data["ParsedDiff"]["Action"]["SourceLocations"][0]
                last_diff_line = unified_data["ParsedDiff"]["Action"]["SourceLocations"][-1]
            elif actionType == "ADD":
                first_diff_line = unified_data["ParsedDiff"]["Action"]["PreviousSourceLocation"]
            else:  # actionType == "REMOVE"
                first_diff_line = unified_data["ParsedDiff"]["Action"]["SourceLocationStart"]
                last_diff_line = unified_data["ParsedDiff"]["Action"]["SourceLocationEnd"]

            first_required_line = min(first_diag_line, first_diff_line)
            last_required_line = max(
                last_diag_line, last_diff_line) if last_diff_line else last_diag_line
            unified_data["RequiredLinesStart"] = first_required_line
            unified_data["RequiredLinesEnd"] = last_required_line

            # Add context around required lines. This is only for intermediate readability,
            # since context is added later as a fixed amount of tokens.
            LINE_DELTA = 3
            if first_required_line > LINE_DELTA:
                starting_line = first_required_line - LINE_DELTA
            else:
                starting_line = 1
            if last_required_line < unified_data_file["NumberFileLines"] - LINE_DELTA:
                ending_line = last_required_line + LINE_DELTA
            else:
                ending_line = unified_data_file["NumberFileLines"]
with open(f"{repo_dir}/{patched_file.path}") as f:
file_list = list(f)
# Also want to include ending_line
unified_data["FileContext"] = file_list[starting_line - 1:ending_line]
unified_data["FileContextStart"] = starting_line
with open(f"{unified_dataset_dir}/{unified_data_filename_hash}-{num_diff_datapoint}.json", 'w', encoding='utf-8') as f:
json.dump(unified_data, f, ensure_ascii=False, indent=2)
print("Created unified_data_filename: ", unified_data_filename)
unified_data_files.append(unified_data_filename_hash)
num_diff_datapoint += 1
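
# Illustrative shape of one resulting datapoint JSON (hypothetical values):
# {
#   "Repo": "runtime", "SolutionFile": "Common.Tests.sln", "DiagnosticID": "SA1642",
#   "Severity": "Info", "RequiredLinesStart": 12, "RequiredLinesEnd": 13,
#   "DiagnosticOccurances": [{"Message": "...", "Line": 12, "Character": 5}],
#   "ParsedDiff": {"ActionType": "REPLACE",
#                  "Action": {"SourceLocations": [12, 13], "TargetLines": ["..."]}},
#   "FileContextStart": 9, "FileContext": ["...surrounding source lines..."]
# }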