Add option to remove stopwords [-rs|--remove-stopwords]
Also updated project report to reflect these changes.

Signed-off-by: Vartan Benohanian <[email protected]>
vartanbeno committed Dec 4, 2018
1 parent 6d34ce5 commit ae6d891
Showing 6 changed files with 52 additions and 28 deletions.
Binary file modified Project Report.pdf
12 changes: 7 additions & 5 deletions README.md
@@ -35,22 +35,24 @@ This will take you to an interactive Bash terminal, from which you can [run](#ru
The file to run is in the root directory of the project.

```
- python main.py [-url "START_URL"]
- [-ign]
- [-m MAX]
+ python main.py [-url|--start-url <"START_URL">]
+ [-ign|--ignore-robots]
+ [-m|--max <MAX>]
+ [-rs|--remove-stopwords]
```

The arguments are the following:

1. `-url` or `--start-url`: URL the crawler will begin scraping links from. Surround it with quotes in the command line for best results. Default is the [about page](https://www.concordia.ca/about.html) of the Concordia University website.
2. `-ign` or `--ignore-robots`: websites' robots.txt will be ignored. Default is false.
3. `-m` or `--max`: maximum number of links to scrape. Default is 10.
- 4. `-skip` or `--skip-crawl`: spider won't be run, and index/stats will be built from current results file. Default is false.
+ 4. `-rs` or `--remove-stopwords`: stopwords will be removed from the index and ignored in queries. Default is false.
+ 5. `-skip` or `--skip-crawl`: spider won't crawl; index/stats will be built from current files. Default is false.

If you intend to use that last one, no need to specify the others. You would obviously need to have run the crawler first, to generate a data set. Simply run:

```
- python main.py -skip
+ python main.py [-skip|--skip-crawl]
```
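All of the flags can be combined in a single run. For illustration, a hypothetical invocation that ignores robots.txt, crawls up to 50 pages, and strips stopwords would look like this (the URL is just the documented default):

```
python main.py -url "https://www.concordia.ca/about.html" -ign -m 50 -rs
```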

## Authors
22 changes: 15 additions & 7 deletions src/classes/query.py
@@ -8,14 +8,16 @@

class Query:

- def __init__(self, index, stats):
+ def __init__(self, index, stats, remove_stopwords=False):
"""
Query constructor.
:param index: dictionary generated by the crawler
:param stats: dictionary of pages scraped, with total number of terms, and Afinn score, for each page
+ :param remove_stopwords: whether or not stopwords in queries will be ignored
"""
self.index = index
self.stats = stats
+ self.remove_stopwords = remove_stopwords

self.tf_idf = TFIDF(self.index, self.stats)
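For context, the constructor above is what `conduct_queries()` in `src/main.py` feeds. A minimal usage sketch of the new parameter, with assumed import paths and an illustrative query string:

```python
# Sketch only: module paths and the query string are assumptions; the call pattern mirrors src/main.py.
from classes.query import AndQuery
from classes.document_parser import DocumentParser
from classes.index_builder import IndexBuilder

stats = DocumentParser.build_stats_from_file()
index = IndexBuilder.build_index_from_file()

query = AndQuery(index, stats, remove_stopwords=True)  # stopwords now ignored in query terms
query.execute("computer science department")
query.print_results()
```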

@@ -184,8 +186,10 @@ def print_results(self):

class AndQuery(Query):

- def __init__(self, index, stats):
- Query.__init__(self, index, stats)
+ def __init__(self, index, stats, remove_stopwords=False):
+ Query.__init__(self, index, stats, remove_stopwords)
+
+ self.remove_stopwords = remove_stopwords

def execute(self, terms):
"""
@@ -194,7 +198,9 @@ def execute(self, terms):
:return: list of pages containing all of the terms in the query (AND).
"""
self.original_terms = terms
- self.terms = clean_terms(terms)
+ print(self.original_terms)
+ self.terms = clean_terms(terms, self.remove_stopwords)
+ print(self.terms)
self.results_with_cosine_similarity = {}

lists_of_pages = self.get_pages()
@@ -210,8 +216,10 @@

class OrQuery(Query):

- def __init__(self, index, stats):
- Query.__init__(self, index, stats)
+ def __init__(self, index, stats, remove_stopwords=False):
+ Query.__init__(self, index, stats, remove_stopwords)
+
+ self.remove_stopwords = remove_stopwords

def execute(self, terms):
"""
@@ -220,7 +228,7 @@
:return: list of pages containing at least one of the terms in the query (OR).
"""
self.original_terms = terms
- self.terms = clean_terms(terms)
+ self.terms = clean_terms(terms, self.remove_stopwords)
self.results_with_cosine_similarity = {}

lists_of_pages = self.get_pages()
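The part of `execute` that actually combines the per-term page lists is collapsed in this view; judging by the docstrings, the AND/OR distinction presumably amounts to intersecting versus uniting those lists. A minimal sketch of that idea, not the project's actual code:

```python
# Hypothetical shape of the data: one set of pages per query term.
pages_per_term = [{"a.html", "b.html"}, {"b.html", "c.html"}]

and_result = set.intersection(*pages_per_term)  # pages containing all terms: {"b.html"}
or_result = set.union(*pages_per_term)          # pages containing any term: {"a.html", "b.html", "c.html"}
```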
10 changes: 7 additions & 3 deletions src/classes/spider.py
@@ -35,6 +35,8 @@ class ConcordiaSpider(CrawlSpider):

scraped_links = []

+ remove_stopwords = False
+
def parse_item(self, response):
"""
This method parses the response object.
@@ -52,10 +54,10 @@
content = []

title = response.xpath("//title//text()").extract_first()
- content.extend(clean_terms(title))
+ content.extend(clean_terms(title, self.remove_stopwords))

for text in response.xpath(self.tags).extract():
- content.extend(clean_terms(text))
+ content.extend(clean_terms(text, self.remove_stopwords))

yield {
"url": url,
@@ -87,7 +89,7 @@ def get_process():
}
})

- def crawl(self, start_url="https://www.concordia.ca/about.html", obey_robots=True, max=10):
+ def crawl(self, start_url="https://www.concordia.ca/about.html", obey_robots=True, max=10, remove_stopwords=False):
"""
First, we define the start URL of the crawler by appending it to its start_urls attribute, which is currently
just an empty list.
@@ -101,9 +103,11 @@ def crawl(self, start_url="https://www.concordia.ca/about.html", obey_robots=Tru
:param start_url: URL the crawler will start scraping links from
:param obey_robots: whether or not the crawler will obey websites' robots.txt
:param max: maximum number of pages to be crawled
+ :param remove_stopwords: whether or not stopwords will be removed from scraped content
:return: None
"""
ConcordiaSpider.start_urls = [start_url]
+ ConcordiaSpider.remove_stopwords = remove_stopwords

process = ConcordiaSpider.get_process()
process.settings.set("ROBOTSTXT_OBEY", obey_robots)
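Note that the flag ends up as a class attribute (`ConcordiaSpider.remove_stopwords`), presumably because Scrapy's crawler process constructs its own spider instance when the crawl starts. A hypothetical call mirroring `run_spider()` in `src/main.py`:

```python
# Sketch only: the import path is an assumption and the values are illustrative.
from classes.spider import ConcordiaSpider

spider = ConcordiaSpider()
spider.crawl(
    start_url="https://www.concordia.ca/about.html",  # documented default
    obey_robots=False,
    max=25,
    remove_stopwords=True,  # scraped terms for every page will have stopwords stripped
)
```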
8 changes: 6 additions & 2 deletions src/helpers.py
@@ -16,15 +16,19 @@
sqrt = sqrt


- def clean_terms(text):
+ def clean_terms(text, remove_stopwords=False):
"""
:param text: string of text (could be one word, a sentence, a whole article, etc.) to be tokenized and casefolded
+ :param remove_stopwords: whether or not stopwords will be removed from the string of text
:return: list of terms without strings that are just punctuation, and without stopwords
"""
terms = word_tokenize(text)
terms = [term.casefold() for term in terms]
""" includes the Em dash (a long hyphen), another dash, a kind of single/double quotes, and other punctuation """
- return [term for term in terms if not re.fullmatch("[" + string.punctuation + "–—‘’“”…•‹›«»]+", term)]
+ terms = [term for term in terms if not re.fullmatch("[" + string.punctuation + "–—‘’“”…•‹›«»]+", term)]
+ if remove_stopwords:
+ terms = [term for term in terms if term not in stopwords]
+ return terms
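A quick illustration of what the new flag changes (hypothetical call; this assumes `stopwords` is the usual NLTK English list, which is not shown in the diff):

```python
# Assumed import path for the helper module.
from helpers import clean_terms

clean_terms("The crawler visits the About page!")
# -> ['the', 'crawler', 'visits', 'the', 'about', 'page']

clean_terms("The crawler visits the About page!", remove_stopwords=True)
# -> ['crawler', 'visits', 'page']
```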


SENTIMENT = "sentiment"
28 changes: 17 additions & 11 deletions src/main.py
@@ -14,7 +14,8 @@
parser.add_argument("-url", "--start-url", type=str, help="page where we start crawling for links", default="https://www.concordia.ca/about.html")
parser.add_argument("-ign", "--ignore-robots", action="store_true", help="ignore websites' robots.txt", default=False)
parser.add_argument("-m", "--max", type=int, help="maximum number of pages to crawl", default=10)
parser.add_argument("-skip", "--skip-crawl", action="store_true", help="skip crawler, build index and stats from current results.json", default=False)
parser.add_argument("-rs", "--remove-stopwords", action="store_true", help="remove stopwords from scraped content and queries", default=False)
parser.add_argument("-skip", "--skip-crawl", action="store_true", help="skip crawler, build index and stats from current files", default=False)

args = parser.parse_args()

@@ -24,7 +25,7 @@ def delete_results():
os.remove(output_file)


- def run_spider():
+ def run_spider(remove_stopwords=False):

"""
First, delete the results.json file if it exists. The crawler will recreate it and populate it with data.
@@ -36,7 +37,12 @@ def run_spider():
delete_results()

spider = ConcordiaSpider()
- spider.crawl(start_url=args.start_url, obey_robots=not args.ignore_robots, max=args.max)
+ spider.crawl(
+ start_url=args.start_url,
+ obey_robots=not args.ignore_robots,
+ max=args.max,
+ remove_stopwords=remove_stopwords
+ )

document_parser = DocumentParser(output_file)
document_parser.construct_stats()
@@ -48,21 +54,21 @@

index = index_builder.get_index()

- conduct_queries(index, stats)
+ conduct_queries(index, stats, remove_stopwords)


- def build_stats_and_index():
+ def build_stats_and_index(remove_stopwords=False):

stats = DocumentParser.build_stats_from_file()
index = IndexBuilder.build_index_from_file()

- conduct_queries(index, stats)
+ conduct_queries(index, stats, remove_stopwords)


- def conduct_queries(index, stats):
+ def conduct_queries(index, stats, remove_stopwords=False):

- and_query = AndQuery(index, stats)
- or_query = OrQuery(index, stats)
+ and_query = AndQuery(index, stats, remove_stopwords)
+ or_query = OrQuery(index, stats, remove_stopwords)

while True:
user_input = input("Would you like to conduct an AND query or an OR query? Hit enter for no. [and/or] ")
@@ -80,7 +86,7 @@

if not args.skip_crawl:

- run_spider()
+ run_spider(args.remove_stopwords)

else:

@@ -95,6 +101,6 @@

else:

- build_stats_and_index()
+ build_stats_and_index(args.remove_stopwords)

print("Bye!")
