diff --git a/Project Report.pdf b/Project Report.pdf index 7113591..4e97899 100644 Binary files a/Project Report.pdf and b/Project Report.pdf differ diff --git a/src/classes/document_parser.py b/src/classes/document_parser.py index 915219a..c154f26 100644 --- a/src/classes/document_parser.py +++ b/src/classes/document_parser.py @@ -1,4 +1,4 @@ -from helpers import afinn, pages, url, content, totals, total_documents, total_tokens, total_afinn, avg_tokens, avg_afinn +from helpers import afinn, PAGES, URL, CONTENT, TOTALS, TOTAL_DOCUMENTS, TOTAL_TOKENS, TOTAL_AFINN, AVG_TOKENS, AVG_AFINN import json @@ -16,7 +16,7 @@ def __init__(self, file_to_parse): :param file_to_parse: file with the crawler's output """ self.file_to_parse = file_to_parse - self.stats = {pages: {}, totals: {}} + self.stats = {PAGES: {}, TOTALS: {}} def construct_stats(self): """ @@ -33,22 +33,22 @@ def construct_stats(self): for result in results: - doc_terms = len(result[content]) - doc_afinn = afinn.score(" ".join(result[content])) + doc_terms = len(result[CONTENT]) + doc_afinn = afinn.score(" ".join(result[CONTENT])) total_num_tokens += doc_terms total_num_afinn += doc_afinn - self.stats[pages][result[url]] = { - total_tokens: len(result[content]), - total_afinn: afinn.score(" ".join(result[content])) + self.stats[PAGES][result[URL]] = { + TOTAL_TOKENS: len(result[CONTENT]), + TOTAL_AFINN: afinn.score(" ".join(result[CONTENT])) } - self.stats[totals][total_documents] = len(self.stats[pages]) - self.stats[totals][total_tokens] = total_num_tokens - self.stats[totals][avg_tokens] = total_num_tokens / len(self.stats[pages]) - self.stats[totals][total_afinn] = total_num_afinn - self.stats[totals][avg_afinn] = total_num_afinn / len(self.stats[pages]) + self.stats[TOTALS][TOTAL_DOCUMENTS] = len(self.stats[PAGES]) + self.stats[TOTALS][TOTAL_TOKENS] = total_num_tokens + self.stats[TOTALS][AVG_TOKENS] = total_num_tokens / len(self.stats[PAGES]) + self.stats[TOTALS][TOTAL_AFINN] = total_num_afinn + 
self.stats[TOTALS][AVG_AFINN] = total_num_afinn / len(self.stats[PAGES]) self.write_to_file(self.stats) @@ -65,16 +65,16 @@ def write_to_file(self, stats): with open(self.stats_file, "w", encoding="utf-8") as stats_file: - for page, page_info in stats[pages].items(): + for page, page_info in stats[PAGES].items(): - total_num_tokens += page_info[total_tokens] - total_num_afinn += page_info[total_afinn] + total_num_tokens += page_info[TOTAL_TOKENS] + total_num_afinn += page_info[TOTAL_AFINN] - stats_file.write("{} {} {}\n".format(page, page_info[total_tokens], page_info[total_afinn])) + stats_file.write("{} {} {}\n".format(page, page_info[TOTAL_TOKENS], page_info[TOTAL_AFINN])) stats_file.write( "\n{} {} document(s): {} total tokens, {} average tokens, {} total Afinn score, {} average Afinn score\n" - .format(self.summary, len(stats[pages]), total_num_tokens, round(total_num_tokens / len(stats[pages]), 3), total_num_afinn, round(total_num_afinn / len(stats[pages]), 3)) + .format(self.summary, len(stats[PAGES]), total_num_tokens, round(total_num_tokens / len(stats[PAGES]), 3), total_num_afinn, round(total_num_afinn / len(stats[PAGES]), 3)) ) print("Document stats available at {}, showcasing:\n\t" @@ -98,7 +98,7 @@ def build_stats_from_file(): :return: statistics dictionary of documents """ - stats = {pages: {}, totals: {}} + stats = {PAGES: {}, TOTALS: {}} with open(DocumentParser.stats_file) as stats_file: @@ -110,14 +110,14 @@ def build_stats_from_file(): continue if elements[0] == DocumentParser.summary: - stats[totals][total_documents] = int(elements[1]) - stats[totals][total_tokens] = int(elements[3]) - stats[totals][avg_tokens] = float(elements[6]) - stats[totals][total_afinn] = float(elements[9]) - stats[totals][avg_afinn] = float(elements[13]) + stats[TOTALS][TOTAL_DOCUMENTS] = int(elements[1]) + stats[TOTALS][TOTAL_TOKENS] = int(elements[3]) + stats[TOTALS][AVG_TOKENS] = float(elements[6]) + stats[TOTALS][TOTAL_AFINN] = float(elements[9]) + 
stats[TOTALS][AVG_AFINN] = float(elements[13]) else: - stats[pages][elements[0]] = {} - stats[pages][elements[0]][total_tokens] = int(elements[1]) - stats[pages][elements[0]][total_afinn] = float(elements[2]) + stats[PAGES][elements[0]] = {} + stats[PAGES][elements[0]][TOTAL_TOKENS] = int(elements[1]) + stats[PAGES][elements[0]][TOTAL_AFINN] = float(elements[2]) return stats diff --git a/src/classes/index_builder.py b/src/classes/index_builder.py index 61ac5f5..71bd315 100644 --- a/src/classes/index_builder.py +++ b/src/classes/index_builder.py @@ -1,4 +1,4 @@ -from helpers import afinn, log10, sentiment, pages +from helpers import afinn, SENTIMENT, PAGES, TF, CFT, DFT, IDF, TF_IDF from classes.tf_idf import TFIDF import json @@ -43,22 +43,22 @@ def construct_index(self): for term in terms: if term not in self.index: self.index[term] = {} - self.index[term]["cft"] = 0 - self.index[term][sentiment] = afinn.score(term) - self.index[term][pages] = {} - self.index[term]["cft"] += 1 - if url not in self.index[term][pages]: - self.index[term][pages][url] = {} - self.index[term][pages][url]["tf"] = 1 + self.index[term][CFT] = 0 + self.index[term][SENTIMENT] = afinn.score(term) + self.index[term][PAGES] = {} + self.index[term][CFT] += 1 + if url not in self.index[term][PAGES]: + self.index[term][PAGES][url] = {} + self.index[term][PAGES][url][TF] = 1 else: - self.index[term][pages][url]["tf"] += 1 + self.index[term][PAGES][url][TF] += 1 for term in self.index: - self.index[term]["dft"] = self.tfidf.dft(term) - self.index[term]["idf"] = self.tfidf.idf(term) - term_urls = list(self.index[term][pages]) + self.index[term][DFT] = self.tfidf.dft(term) + self.index[term][IDF] = self.tfidf.idf(term) + term_urls = list(self.index[term][PAGES]) for url in term_urls: - self.index[term][pages][url]["tf-idf"] = self.tfidf.tf_idf(term, url) + self.index[term][PAGES][url][TF_IDF] = self.tfidf.tf_idf(term, url) print("Index created. 
There's a total of {} distinct terms.".format(len(self.index))) self.write_to_file(self.index) @@ -74,9 +74,9 @@ def write_to_file(self, index): with open(self.index_file, "w", encoding="utf-8") as index_file: for term in sorted(index): try: - index_file.write("{} {} {} {} {}".format(term, index[term]["cft"], index[term]["dft"], index[term]["idf"], index[term][sentiment])) - for url, stats in index[term][pages].items(): - index_file.write(" {} {} {}".format(url, stats["tf"], stats["tf-idf"])) + index_file.write("{} {} {} {} {}".format(term, index[term][CFT], index[term][DFT], index[term][IDF], index[term][SENTIMENT])) + for url, stats in index[term][PAGES].items(): + index_file.write(" {} {} {}".format(url, stats[TF], stats[TF_IDF])) index_file.write("\n") except UnicodeEncodeError: pass @@ -92,7 +92,7 @@ def get_index(self): def build_index_from_file(): """ Build the inverted index from the file. - :return: inverted index + :return: the inverted index """ index = {} @@ -104,16 +104,16 @@ def build_index_from_file(): elements = line.split() index[elements[0]] = {} - index[elements[0]]["cft"] = int(elements[1]) - index[elements[0]]["dft"] = int(elements[2]) - index[elements[0]]["idf"] = float(elements[3]) - index[elements[0]][sentiment] = float(elements[4]) + index[elements[0]][CFT] = int(elements[1]) + index[elements[0]][DFT] = int(elements[2]) + index[elements[0]][IDF] = float(elements[3]) + index[elements[0]][SENTIMENT] = float(elements[4]) - index[elements[0]][pages] = {} + index[elements[0]][PAGES] = {} for i in range (5, len(elements), 3): - index[elements[0]][pages][elements[i]] = {} - index[elements[0]][pages][elements[i]]["tf"] = int(elements[i+1]) - index[elements[0]][pages][elements[i]]["tf-idf"] = float(elements[i+2]) + index[elements[0]][PAGES][elements[i]] = {} + index[elements[0]][PAGES][elements[i]][TF] = int(elements[i + 1]) + index[elements[0]][PAGES][elements[i]][TF_IDF] = float(elements[i + 2]) return index diff --git a/src/classes/query.py 
b/src/classes/query.py index 483e712..fb4cc47 100644 --- a/src/classes/query.py +++ b/src/classes/query.py @@ -1,4 +1,4 @@ -from helpers import clean_terms, afinn, pages, sqrt, total_afinn, sentiment +from helpers import clean_terms, afinn, sqrt, PAGES, TOTAL_AFINN, SENTIMENT, COSINE_SIMILARITY, AFINN_SCORE, URL from classes.tf_idf import TFIDF from beautifultable import BeautifulTable @@ -26,7 +26,7 @@ def __init__(self, index, stats): self.results_with_cosine_similarity = {} self.table = BeautifulTable(max_width=140, default_alignment=BeautifulTable.ALIGN_LEFT) - self.table.column_headers = ["cosine similarity", "Afinn score", "URL"] + self.table.column_headers = [COSINE_SIMILARITY, AFINN_SCORE, URL] self.table.numeric_precision = 10 @staticmethod @@ -50,7 +50,7 @@ def get_pages(self): for term in self.terms: try: - results[term] = self.index[term][pages] + results[term] = self.index[term][PAGES] except KeyError: results[term] = [] @@ -111,8 +111,8 @@ def get_cosine_similarities(self): cosine_similarity = 0.0 self.results_with_cosine_similarity[url] = {} - self.results_with_cosine_similarity[url]["cos"] = cosine_similarity - self.results_with_cosine_similarity[url][sentiment] = self.stats[pages][url][total_afinn] + self.results_with_cosine_similarity[url][COSINE_SIMILARITY] = cosine_similarity + self.results_with_cosine_similarity[url][SENTIMENT] = self.stats[PAGES][url][TOTAL_AFINN] def generate_results_table(self, rows): """ @@ -134,8 +134,8 @@ def execute(self, terms): :param terms: the user's query. :return: None """ - if self.__class__.__name__ == "Query": - print("You are conducting a query using the %s class." % self.__class__.__name__) + if self.__class__.__name__ == Query.__name__: + print("You are conducting a query using the %s class." 
% Query.__name__) print("Make sure to use either AndQuery or OrQuery.\n") return @@ -163,7 +163,7 @@ def print_results(self): rows = [] for url, cos_and_score in self.results_with_cosine_similarity.items(): - row = [cos_and_score["cos"], cos_and_score[sentiment], url] + row = [cos_and_score[COSINE_SIMILARITY], cos_and_score[SENTIMENT], url] rows.append(row) # sort rows by cosine similarity, ascending diff --git a/src/classes/tf_idf.py b/src/classes/tf_idf.py index 918fcf1..230a88d 100644 --- a/src/classes/tf_idf.py +++ b/src/classes/tf_idf.py @@ -1,4 +1,4 @@ -from helpers import pages, log10, totals, total_documents +from helpers import PAGES, log10, TOTALS, TOTAL_DOCUMENTS, TF class TFIDF: @@ -13,7 +13,7 @@ def __init__(self, index, stats): self.index = index self.stats = stats - self.N = self.stats[totals][total_documents] + self.N = self.stats[TOTALS][TOTAL_DOCUMENTS] def get_documents_of_term(self, term): """ @@ -22,7 +22,7 @@ def get_documents_of_term(self, term): :return: list of web pages """ try: - return list(self.index[term][pages].keys()) + return list(self.index[term][PAGES].keys()) except KeyError: return [] @@ -45,7 +45,7 @@ def get_term_frequency_in_document(self, term, url): :return: number of times the term appears in the web page """ try: - return self.index[term][pages][url]["tf"] + return self.index[term][PAGES][url][TF] except KeyError: return 0 diff --git a/src/helpers.py b/src/helpers.py index 9342f2d..ab76df6 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -27,13 +27,22 @@ def clean_terms(text): return [term for term in terms if not re.fullmatch("[" + string.punctuation + "–—‘’“”…•‹›«»]+", term)] -sentiment = "sentiment" -pages = "pages" -url = "url" -content = "content" -totals = "totals" -total_documents = "total_documents" -total_tokens = "total_tokens" -total_afinn = "total_afinn" -avg_tokens = "avg_tokens" -avg_afinn = "avg_afinn" +SENTIMENT = "sentiment" +PAGES = "pages" +URL = "url" +CONTENT = "content" +TOTALS = 
"totals" +TOTAL_DOCUMENTS = "total_documents" +TOTAL_TOKENS = "total_tokens" +TOTAL_AFINN = "total_afinn" +AVG_TOKENS = "avg_tokens" +AVG_AFINN = "avg_afinn" + +TF = "tf" +CFT = "cft" +DFT = "dft" +IDF = "idf" +TF_IDF = "tf-idf" + +COSINE_SIMILARITY = "cosine similarity" +AFINN_SCORE = "Afinn score"