Skip to content

Commit

Permalink
Adjust variable names, update project report
Browse files Browse the repository at this point in the history
Signed-off-by: Vartan Benohanian <[email protected]>
  • Loading branch information
vartanbeno committed Dec 4, 2018
1 parent e513319 commit 6d34ce5
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 73 deletions.
Binary file modified Project Report.pdf
Binary file not shown.
52 changes: 26 additions & 26 deletions src/classes/document_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from helpers import afinn, pages, url, content, totals, total_documents, total_tokens, total_afinn, avg_tokens, avg_afinn
from helpers import afinn, PAGES, URL, CONTENT, TOTALS, TOTAL_DOCUMENTS, TOTAL_TOKENS, TOTAL_AFINN, AVG_TOKENS, AVG_AFINN

import json

Expand All @@ -16,7 +16,7 @@ def __init__(self, file_to_parse):
:param file_to_parse: file with the crawler's output
"""
self.file_to_parse = file_to_parse
self.stats = {pages: {}, totals: {}}
self.stats = {PAGES: {}, TOTALS: {}}

def construct_stats(self):
"""
Expand All @@ -33,22 +33,22 @@ def construct_stats(self):

for result in results:

doc_terms = len(result[content])
doc_afinn = afinn.score(" ".join(result[content]))
doc_terms = len(result[CONTENT])
doc_afinn = afinn.score(" ".join(result[CONTENT]))

total_num_tokens += doc_terms
total_num_afinn += doc_afinn

self.stats[pages][result[url]] = {
total_tokens: len(result[content]),
total_afinn: afinn.score(" ".join(result[content]))
self.stats[PAGES][result[URL]] = {
TOTAL_TOKENS: len(result[CONTENT]),
TOTAL_AFINN: afinn.score(" ".join(result[CONTENT]))
}

self.stats[totals][total_documents] = len(self.stats[pages])
self.stats[totals][total_tokens] = total_num_tokens
self.stats[totals][avg_tokens] = total_num_tokens / len(self.stats[pages])
self.stats[totals][total_afinn] = total_num_afinn
self.stats[totals][avg_afinn] = total_num_afinn / len(self.stats[pages])
self.stats[TOTALS][TOTAL_DOCUMENTS] = len(self.stats[PAGES])
self.stats[TOTALS][TOTAL_TOKENS] = total_num_tokens
self.stats[TOTALS][AVG_TOKENS] = total_num_tokens / len(self.stats[PAGES])
self.stats[TOTALS][TOTAL_AFINN] = total_num_afinn
self.stats[TOTALS][AVG_AFINN] = total_num_afinn / len(self.stats[PAGES])

self.write_to_file(self.stats)

Expand All @@ -65,16 +65,16 @@ def write_to_file(self, stats):

with open(self.stats_file, "w", encoding="utf-8") as stats_file:

for page, page_info in stats[pages].items():
for page, page_info in stats[PAGES].items():

total_num_tokens += page_info[total_tokens]
total_num_afinn += page_info[total_afinn]
total_num_tokens += page_info[TOTAL_TOKENS]
total_num_afinn += page_info[TOTAL_AFINN]

stats_file.write("{} {} {}\n".format(page, page_info[total_tokens], page_info[total_afinn]))
stats_file.write("{} {} {}\n".format(page, page_info[TOTAL_TOKENS], page_info[TOTAL_AFINN]))

stats_file.write(
"\n{} {} document(s): {} total tokens, {} average tokens, {} total Afinn score, {} average Afinn score\n"
.format(self.summary, len(stats[pages]), total_num_tokens, round(total_num_tokens / len(stats[pages]), 3), total_num_afinn, round(total_num_afinn / len(stats[pages]), 3))
.format(self.summary, len(stats[PAGES]), total_num_tokens, round(total_num_tokens / len(stats[PAGES]), 3), total_num_afinn, round(total_num_afinn / len(stats[PAGES]), 3))
)

print("Document stats available at {}, showcasing:\n\t"
Expand All @@ -98,7 +98,7 @@ def build_stats_from_file():
:return: statistics dictionary of documents
"""

stats = {pages: {}, totals: {}}
stats = {PAGES: {}, TOTALS: {}}

with open(DocumentParser.stats_file) as stats_file:

Expand All @@ -110,14 +110,14 @@ def build_stats_from_file():
continue

if elements[0] == DocumentParser.summary:
stats[totals][total_documents] = int(elements[1])
stats[totals][total_tokens] = int(elements[3])
stats[totals][avg_tokens] = float(elements[6])
stats[totals][total_afinn] = float(elements[9])
stats[totals][avg_afinn] = float(elements[13])
stats[TOTALS][TOTAL_DOCUMENTS] = int(elements[1])
stats[TOTALS][TOTAL_TOKENS] = int(elements[3])
stats[TOTALS][AVG_TOKENS] = float(elements[6])
stats[TOTALS][TOTAL_AFINN] = float(elements[9])
stats[TOTALS][AVG_AFINN] = float(elements[13])
else:
stats[pages][elements[0]] = {}
stats[pages][elements[0]][total_tokens] = int(elements[1])
stats[pages][elements[0]][total_afinn] = float(elements[2])
stats[PAGES][elements[0]] = {}
stats[PAGES][elements[0]][TOTAL_TOKENS] = int(elements[1])
stats[PAGES][elements[0]][TOTAL_AFINN] = float(elements[2])

return stats
50 changes: 25 additions & 25 deletions src/classes/index_builder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from helpers import afinn, log10, sentiment, pages
from helpers import afinn, SENTIMENT, PAGES, TF, CFT, DFT, IDF, TF_IDF
from classes.tf_idf import TFIDF

import json
Expand Down Expand Up @@ -43,22 +43,22 @@ def construct_index(self):
for term in terms:
if term not in self.index:
self.index[term] = {}
self.index[term]["cft"] = 0
self.index[term][sentiment] = afinn.score(term)
self.index[term][pages] = {}
self.index[term]["cft"] += 1
if url not in self.index[term][pages]:
self.index[term][pages][url] = {}
self.index[term][pages][url]["tf"] = 1
self.index[term][CFT] = 0
self.index[term][SENTIMENT] = afinn.score(term)
self.index[term][PAGES] = {}
self.index[term][CFT] += 1
if url not in self.index[term][PAGES]:
self.index[term][PAGES][url] = {}
self.index[term][PAGES][url][TF] = 1
else:
self.index[term][pages][url]["tf"] += 1
self.index[term][PAGES][url][TF] += 1

for term in self.index:
self.index[term]["dft"] = self.tfidf.dft(term)
self.index[term]["idf"] = self.tfidf.idf(term)
term_urls = list(self.index[term][pages])
self.index[term][DFT] = self.tfidf.dft(term)
self.index[term][IDF] = self.tfidf.idf(term)
term_urls = list(self.index[term][PAGES])
for url in term_urls:
self.index[term][pages][url]["tf-idf"] = self.tfidf.tf_idf(term, url)
self.index[term][PAGES][url][TF_IDF] = self.tfidf.tf_idf(term, url)

print("Index created. There's a total of {} distinct terms.".format(len(self.index)))
self.write_to_file(self.index)
Expand All @@ -74,9 +74,9 @@ def write_to_file(self, index):
with open(self.index_file, "w", encoding="utf-8") as index_file:
for term in sorted(index):
try:
index_file.write("{} {} {} {} {}".format(term, index[term]["cft"], index[term]["dft"], index[term]["idf"], index[term][sentiment]))
for url, stats in index[term][pages].items():
index_file.write(" {} {} {}".format(url, stats["tf"], stats["tf-idf"]))
index_file.write("{} {} {} {} {}".format(term, index[term][CFT], index[term][DFT], index[term][IDF], index[term][SENTIMENT]))
for url, stats in index[term][PAGES].items():
index_file.write(" {} {} {}".format(url, stats[TF], stats[TF_IDF]))
index_file.write("\n")
except UnicodeEncodeError:
pass
Expand All @@ -92,7 +92,7 @@ def get_index(self):
def build_index_from_file():
"""
Build the inverted index from the file.
:return: inverted index
:return: the inverted index
"""

index = {}
Expand All @@ -104,16 +104,16 @@ def build_index_from_file():
elements = line.split()

index[elements[0]] = {}
index[elements[0]]["cft"] = int(elements[1])
index[elements[0]]["dft"] = int(elements[2])
index[elements[0]]["idf"] = float(elements[3])
index[elements[0]][sentiment] = float(elements[4])
index[elements[0]][CFT] = int(elements[1])
index[elements[0]][DFT] = int(elements[2])
index[elements[0]][IDF] = float(elements[3])
index[elements[0]][SENTIMENT] = float(elements[4])

index[elements[0]][pages] = {}
index[elements[0]][PAGES] = {}

for i in range (5, len(elements), 3):
index[elements[0]][pages][elements[i]] = {}
index[elements[0]][pages][elements[i]]["tf"] = int(elements[i+1])
index[elements[0]][pages][elements[i]]["tf-idf"] = float(elements[i+2])
index[elements[0]][PAGES][elements[i]] = {}
index[elements[0]][PAGES][elements[i]][TF] = int(elements[i + 1])
index[elements[0]][PAGES][elements[i]][TF_IDF] = float(elements[i + 2])

return index
16 changes: 8 additions & 8 deletions src/classes/query.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from helpers import clean_terms, afinn, pages, sqrt, total_afinn, sentiment
from helpers import clean_terms, afinn, sqrt, PAGES, TOTAL_AFINN, SENTIMENT, COSINE_SIMILARITY, AFINN_SCORE, URL
from classes.tf_idf import TFIDF

from beautifultable import BeautifulTable
Expand Down Expand Up @@ -26,7 +26,7 @@ def __init__(self, index, stats):
self.results_with_cosine_similarity = {}

self.table = BeautifulTable(max_width=140, default_alignment=BeautifulTable.ALIGN_LEFT)
self.table.column_headers = ["cosine similarity", "Afinn score", "URL"]
self.table.column_headers = [COSINE_SIMILARITY, AFINN_SCORE, URL]
self.table.numeric_precision = 10

@staticmethod
Expand All @@ -50,7 +50,7 @@ def get_pages(self):

for term in self.terms:
try:
results[term] = self.index[term][pages]
results[term] = self.index[term][PAGES]
except KeyError:
results[term] = []

Expand Down Expand Up @@ -111,8 +111,8 @@ def get_cosine_similarities(self):
cosine_similarity = 0.0

self.results_with_cosine_similarity[url] = {}
self.results_with_cosine_similarity[url]["cos"] = cosine_similarity
self.results_with_cosine_similarity[url][sentiment] = self.stats[pages][url][total_afinn]
self.results_with_cosine_similarity[url][COSINE_SIMILARITY] = cosine_similarity
self.results_with_cosine_similarity[url][SENTIMENT] = self.stats[PAGES][url][TOTAL_AFINN]

def generate_results_table(self, rows):
"""
Expand All @@ -134,8 +134,8 @@ def execute(self, terms):
:param terms: the user's query.
:return: None
"""
if self.__class__.__name__ == "Query":
print("You are conducting a query using the %s class." % self.__class__.__name__)
if self.__class__.__name__ == Query.__class__.__name__:
print("You are conducting a query using the %s class." % Query.__class__.__name__)
print("Make sure to use either AndQuery or OrQuery.\n")
return

Expand Down Expand Up @@ -163,7 +163,7 @@ def print_results(self):

rows = []
for url, cos_and_score in self.results_with_cosine_similarity.items():
row = [cos_and_score["cos"], cos_and_score[sentiment], url]
row = [cos_and_score[COSINE_SIMILARITY], cos_and_score[SENTIMENT], url]
rows.append(row)

# sort rows by cosine similarity, ascending
Expand Down
8 changes: 4 additions & 4 deletions src/classes/tf_idf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from helpers import pages, log10, totals, total_documents
from helpers import PAGES, log10, TOTALS, TOTAL_DOCUMENTS, TF


class TFIDF:
Expand All @@ -13,7 +13,7 @@ def __init__(self, index, stats):
self.index = index
self.stats = stats

self.N = self.stats[totals][total_documents]
self.N = self.stats[TOTALS][TOTAL_DOCUMENTS]

def get_documents_of_term(self, term):
"""
Expand All @@ -22,7 +22,7 @@ def get_documents_of_term(self, term):
:return: list of web pages
"""
try:
return list(self.index[term][pages].keys())
return list(self.index[term][PAGES].keys())
except KeyError:
return []

Expand All @@ -45,7 +45,7 @@ def get_term_frequency_in_document(self, term, url):
:return: number of times the term appears in the web page
"""
try:
return self.index[term][pages][url]["tf"]
return self.index[term][PAGES][url][TF]
except KeyError:
return 0

Expand Down
29 changes: 19 additions & 10 deletions src/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,22 @@ def clean_terms(text):
return [term for term in terms if not re.fullmatch("[" + string.punctuation + "–—‘’“”…•‹›«»]+", term)]


sentiment = "sentiment"
pages = "pages"
url = "url"
content = "content"
totals = "totals"
total_documents = "total_documents"
total_tokens = "total_tokens"
total_afinn = "total_afinn"
avg_tokens = "avg_tokens"
avg_afinn = "avg_afinn"
# Dictionary keys shared across the project: URL/CONTENT index each crawler
# result, the TOTAL_*/AVG_* names index the document statistics, and
# SENTIMENT/PAGES are also used as keys in the inverted index.
SENTIMENT = "sentiment"
PAGES = "pages"
URL = "url"
CONTENT = "content"
TOTALS = "totals"
TOTAL_DOCUMENTS = "total_documents"
TOTAL_TOKENS = "total_tokens"
TOTAL_AFINN = "total_afinn"
AVG_TOKENS = "avg_tokens"
AVG_AFINN = "avg_afinn"

# Keys of the per-term (CFT, DFT, IDF) and per-page (TF, TF_IDF) entries
# stored in the inverted index.
TF = "tf"
CFT = "cft"
DFT = "dft"
IDF = "idf"
TF_IDF = "tf-idf"

# Column headers of the query results table; COSINE_SIMILARITY is also the
# key under which each result's cosine similarity is stored.
COSINE_SIMILARITY = "cosine similarity"
AFINN_SCORE = "Afinn score"

0 comments on commit 6d34ce5

Please sign in to comment.