Skip to content
This repository has been archived by the owner on Feb 26, 2019. It is now read-only.

Commit

Permalink
add aggregated data to make learning faster, update requirements, also fix empty line fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Marvin Bornstein committed Jan 30, 2017
1 parent 56c31d9 commit 9e637af
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 7 deletions.
16 changes: 11 additions & 5 deletions importer/datasetImporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,18 @@

class DatasetImporter:

def __init__(self, filename):
content = open(filename, 'r').readlines()
self.repos, self.target = zip(*[line.strip().split(',') for line in content])
def __init__(self, filename, complete_set=False):
if complete_set:
df = pd.read_csv(filename)
self.repos = df['repo']
self.target = df['y']
self.data = df.iloc[:,3:]
else:
content = open(filename, 'r').readlines()
self.repos, self.target = zip(*[line.strip().split(',') for line in content])

self.target = np.array(self.target)
self.data = self.get_data(self.repos)
self.target = np.array(self.target)
self.data = self.get_data(self.repos)

@staticmethod
def get_data(repo_links):
Expand Down
4 changes: 3 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,10 +161,12 @@ def trainAndPredict(repos):
])

#train the classifier
importer = DatasetImporter('data/testset.csv')
#importer = DatasetImporter('data/testset.csv')
importer = DatasetImporter('enriched_data.csv', complete_set=True)
classifier.fit(logarithmitize(importer.data), importer.target)

# predict given repositories
repos = [repo.strip() for repo in repos if repo.strip() != '']
prediction = classifier.predict(logarithmitize(DatasetImporter.get_data(repos)))
for repo, category in zip(repos, prediction):
print(repo + ', ' + category)
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
pygithub
numpy
pandas
scipy
scikit-learn
entropy
scikit-learn

0 comments on commit 9e637af

Please sign in to comment.