Add option to remove stopwords [-rs|--remove-stopwords]
Also updated project report to reflect these changes.

Signed-off-by: Vartan Benohanian <[email protected]>
vartanbeno committed Dec 4, 2018
1 parent 6d34ce5 commit ae6d891
Showing 6 changed files with 52 additions and 28 deletions.
Binary file modified Project Report.pdf
12 changes: 7 additions & 5 deletions README.md
@@ -35,22 +35,24 @@ This will take you to an interactive Bash terminal, from which you can [run](#ru
The file to run is in the root directory of the project.

```
- python main.py [-url "START_URL"]
- [-ign]
- [-m MAX]
+ python main.py [-url|--start-url <"START_URL">]
+ [-ign|--ignore-robots]
+ [-m|--max <MAX>]
+ [-rs|--remove-stopwords]
```

The arguments are the following:

1. `-url` or `--start-url`: URL the crawler will begin scraping links from. Surround it with quotes in the command line for best results. Default is the [about page](https://www.concordia.ca/about.html) of the Concordia University website.
2. `-ign` or `--ignore-robots`: websites' robots.txt will be ignored. Default is false.
3. `-m` or `--max`: maximum number of links to scrape. Default is 10.
- 4. `-skip` or `--skip-crawl`: spider won't be run, and index/stats will be built from current results file. Default is false.
+ 4. `-rs` or `--remove-stopwords`: stopwords will be removed from the index and ignored in queries. Default is false.
+ 5. `-skip` or `--skip-crawl`: spider won't crawl; index/stats will be built from current files. Default is false.

If you intend to use that last one, no need to specify the others. You would obviously need to have run the crawler first, to generate a data set. Simply run:

```
- python main.py -skip
+ python main.py [-skip|--skip-crawl]
```
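All of the flags can be combined in a single run. For illustration, a hypothetical invocation that ignores robots.txt, crawls up to 50 pages, and strips stopwords would look like this (the URL is just the documented default):

```
python main.py -url "https://www.concordia.ca/about.html" -ign -m 50 -rs
```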

## Authors
22 changes: 15 additions & 7 deletions src/classes/query.py
@@ -8,14 +8,16 @@

class Query:

- def __init__(self, index, stats):
+ def __init__(self, index, stats, remove_stopwords=False):
"""
Query constructor.
:param index: dictionary generated by the crawler
:param stats: dictionary of pages scraped, with total number of terms, and Afinn score, for each page
+ :param remove_stopwords: whether or not stopwords in queries will be ignored
"""
self.index = index
self.stats = stats
+ self.remove_stopwords = remove_stopwords

self.tf_idf = TFIDF(self.index, self.stats)
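For context, the constructor above is what `conduct_queries()` in `src/main.py` feeds. A minimal usage sketch of the new parameter, with assumed import paths and an illustrative query string:

```python
# Sketch only: module paths and the query string are assumptions; the call pattern mirrors src/main.py.
from classes.query import AndQuery
from classes.document_parser import DocumentParser
from classes.index_builder import IndexBuilder

stats = DocumentParser.build_stats_from_file()
index = IndexBuilder.build_index_from_file()

query = AndQuery(index, stats, remove_stopwords=True)  # stopwords now ignored in query terms
query.execute("computer science department")
query.print_results()
```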

@@ -184,8 +186,10 @@ def print_results(self):

class AndQuery(Query):

- def __init__(self, index, stats):
- Query.__init__(self, index, stats)
+ def __init__(self, index, stats, remove_stopwords=False):
+ Query.__init__(self, index, stats, remove_stopwords)
+
+ self.remove_stopwords = remove_stopwords

def execute(self, terms):
"""
@@ -194,7 +198,9 @@ def execute(self, terms):
:return: list of pages containing all of the terms in the query (AND).
"""
self.original_terms = terms
- self.terms = clean_terms(terms)
+ print(self.original_terms)
+ self.terms = clean_terms(terms, self.remove_stopwords)
+ print(self.terms)
self.results_with_cosine_similarity = {}

lists_of_pages = self.get_pages()
@@ -210,8 +216,10 @@

class OrQuery(Query):

- def __init__(self, index, stats):
- Query.__init__(self, index, stats)
+ def __init__(self, index, stats, remove_stopwords=False):
+ Query.__init__(self, index, stats, remove_stopwords)
+
+ self.remove_stopwords = remove_stopwords

def execute(self, terms):
"""
@@ -220,7 +228,7 @@
:return: list of pages containing at least one of the terms in the query (OR).
"""
self.original_terms = terms
- self.terms = clean_terms(terms)
+ self.terms = clean_terms(terms, self.remove_stopwords)
self.results_with_cosine_similarity = {}

lists_of_pages = self.get_pages()
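The part of `execute` that actually combines the per-term page lists is collapsed in this view; judging by the docstrings, the AND/OR distinction presumably amounts to intersecting versus uniting those lists. A minimal sketch of that idea, not the project's actual code:

```python
# Hypothetical shape of the data: one set of pages per query term.
pages_per_term = [{"a.html", "b.html"}, {"b.html", "c.html"}]

and_result = set.intersection(*pages_per_term)  # pages containing all terms: {"b.html"}
or_result = set.union(*pages_per_term)          # pages containing any term: {"a.html", "b.html", "c.html"}
```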
10 changes: 7 additions & 3 deletions src/classes/spider.py
@@ -35,6 +35,8 @@ class ConcordiaSpider(CrawlSpider):

scraped_links = []

+ remove_stopwords = False
+
def parse_item(self, response):
"""
This method parses the response object.
@@ -52,10 +54,10 @@
content = []

title = response.xpath("//title//text()").extract_first()
- content.extend(clean_terms(title))
+ content.extend(clean_terms(title, self.remove_stopwords))

for text in response.xpath(self.tags).extract():
- content.extend(clean_terms(text))
+ content.extend(clean_terms(text, self.remove_stopwords))

yield {
"url": url,
@@ -87,7 +89,7 @@ def get_process():
}
})

- def crawl(self, start_url="https://www.concordia.ca/about.html", obey_robots=True, max=10):
+ def crawl(self, start_url="https://www.concordia.ca/about.html", obey_robots=True, max=10, remove_stopwords=False):
"""
First, we define the start URL of the crawler by appending it to its start_urls attribute, which is currently
just an empty list.
@@ -101,9 +103,11 @@ def crawl(self, start_url="https://www.concordia.ca/about.html", obey_robots=Tru
:param start_url: URL the crawler will start scraping links from
:param obey_robots: whether or not the crawler will obey websites' robots.txt
:param max: maximum number of pages to be crawled
+ :param remove_stopwords: whether or not stopwords will be removed from scraped content
:return: None
"""
ConcordiaSpider.start_urls = [start_url]
+ ConcordiaSpider.remove_stopwords = remove_stopwords

process = ConcordiaSpider.get_process()
process.settings.set("ROBOTSTXT_OBEY", obey_robots)
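Note that the flag ends up as a class attribute (`ConcordiaSpider.remove_stopwords`), presumably because Scrapy's crawler process constructs its own spider instance when the crawl starts. A hypothetical call mirroring `run_spider()` in `src/main.py`:

```python
# Sketch only: the import path is an assumption and the values are illustrative.
from classes.spider import ConcordiaSpider

spider = ConcordiaSpider()
spider.crawl(
    start_url="https://www.concordia.ca/about.html",  # documented default
    obey_robots=False,
    max=25,
    remove_stopwords=True,  # scraped terms for every page will have stopwords stripped
)
```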
8 changes: 6 additions & 2 deletions src/helpers.py
@@ -16,15 +16,19 @@
sqrt = sqrt


- def clean_terms(text):
+ def clean_terms(text, remove_stopwords=False):
"""
:param text: string of text (could be one word, a sentence, a whole article, etc.) to be tokenized and casefolded
+ :param remove_stopwords: whether or not stopwords will be removed from the string of text
:return: list of terms without strings that are just punctuation, and without stopwords
"""
terms = word_tokenize(text)
terms = [term.casefold() for term in terms]
""" includes the Em dash (a long hyphen), another dash, a kind of single/double quotes, and other punctuation """
- return [term for term in terms if not re.fullmatch("[" + string.punctuation + "–—‘’“”…•‹›«»]+", term)]
+ terms = [term for term in terms if not re.fullmatch("[" + string.punctuation + "–—‘’“”…•‹›«»]+", term)]
+ if remove_stopwords:
+ terms = [term for term in terms if term not in stopwords]
+ return terms
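A quick illustration of what the new flag changes (hypothetical call; this assumes `stopwords` is the usual NLTK English list, which is not shown in the diff):

```python
# Assumed import path for the helper module.
from helpers import clean_terms

clean_terms("The crawler visits the About page!")
# -> ['the', 'crawler', 'visits', 'the', 'about', 'page']

clean_terms("The crawler visits the About page!", remove_stopwords=True)
# -> ['crawler', 'visits', 'page']
```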


SENTIMENT = "sentiment"
28 changes: 17 additions & 11 deletions src/main.py
@@ -14,7 +14,8 @@
parser.add_argument("-url", "--start-url", type=str, help="page where we start crawling for links", default="https://www.concordia.ca/about.html")
parser.add_argument("-ign", "--ignore-robots", action="store_true", help="ignore websites' robots.txt", default=False)
parser.add_argument("-m", "--max", type=int, help="maximum number of pages to crawl", default=10)
parser.add_argument("-skip", "--skip-crawl", action="store_true", help="skip crawler, build index and stats from current results.json", default=False)
parser.add_argument("-rs", "--remove-stopwords", action="store_true", help="remove stopwords from scraped content and queries", default=False)
parser.add_argument("-skip", "--skip-crawl", action="store_true", help="skip crawler, build index and stats from current files", default=False)

args = parser.parse_args()

@@ -24,7 +25,7 @@ def delete_results():
os.remove(output_file)


- def run_spider():
+ def run_spider(remove_stopwords=False):

"""
First, delete the results.json file if it exists. The crawler will recreate it and populate it with data.
@@ -36,7 +37,12 @@ def run_spider():
delete_results()

spider = ConcordiaSpider()
- spider.crawl(start_url=args.start_url, obey_robots=not args.ignore_robots, max=args.max)
+ spider.crawl(
+ start_url=args.start_url,
+ obey_robots=not args.ignore_robots,
+ max=args.max,
+ remove_stopwords=remove_stopwords
+ )

document_parser = DocumentParser(output_file)
document_parser.construct_stats()
@@ -48,21 +54,21 @@

index = index_builder.get_index()

- conduct_queries(index, stats)
+ conduct_queries(index, stats, remove_stopwords)


- def build_stats_and_index():
+ def build_stats_and_index(remove_stopwords=False):

stats = DocumentParser.build_stats_from_file()
index = IndexBuilder.build_index_from_file()

- conduct_queries(index, stats)
+ conduct_queries(index, stats, remove_stopwords)


- def conduct_queries(index, stats):
+ def conduct_queries(index, stats, remove_stopwords=False):

- and_query = AndQuery(index, stats)
- or_query = OrQuery(index, stats)
+ and_query = AndQuery(index, stats, remove_stopwords)
+ or_query = OrQuery(index, stats, remove_stopwords)

while True:
user_input = input("Would you like to conduct an AND query or an OR query? Hit enter for no. [and/or] ")
@@ -80,7 +86,7 @@

if not args.skip_crawl:

- run_spider()
+ run_spider(args.remove_stopwords)

else:

@@ -95,6 +101,6 @@

else:

- build_stats_and_index()
+ build_stats_and_index(args.remove_stopwords)

print("Bye!")
