Commit 9d3410f

Update webcrawler.py
nimay-gupta authored Apr 8, 2021 · 1 parent adef10b
Showing 1 changed file with 4 additions and 4 deletions.
Lab7-Spark/webcrawler.py (4 additions, 4 deletions)

@@ -5,19 +5,19 @@
 from operator import add
 
 def crawl(url):
-    url = "https://hari1500.github.io/CS387-lab7-crawler-website/" + url
+    url = "https://hari1500.github.io/CS387-lab7-crawler-website" + url
     # skip downloading if not html
     if "html" not in requests.head(url).headers.get('content-type'):
         return []
     # filter local URLs (remove those starting with http)
-    return [x.split('"')[1] for x in re.findall('<a[ ]+href[ ]*=[ ]*"[^{http}].*[{.html}]?">', requests.get(url).text)]
+    return ["/"+x.split('"')[1] for x in re.findall('<a[ ]+href[ ]*=[ ]*"[^{http}].*[{.html}]?">', requests.get(url).text)]
 
 if __name__ == "__main__":
     # create Spark context with necessary configuration
     spark = SparkContext("local", "Web Crawler")
 
     # create RDD with starting website
-    start_url = "1.html"
+    start_url = "/1.html"
     rdd = spark.parallelize([start_url])
     new = rdd
 
@@ -29,7 +29,7 @@ def crawl(url):
         new = new.subtract(old).distinct()
 
     # count the indegree of each URL
-    rdd = rdd.map(lambda x: ("/"+x, 1)).reduceByKey(add)
+    rdd = rdd.map(lambda x: (x, 1)).reduceByKey(add)
 
     # save the indegree to output
     rdd.saveAsTextFile("./webcrawler/")
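
In effect, the four changed lines move URL normalization into one place: crawl() now prefixes every extracted link with "/" (and start_url carries the same prefix), so the base URL drops its trailing slash and the indegree map no longer rewrites its keys. Below is a minimal, self-contained sketch of the script as it stands after this commit. The expansion loop between the two hunks is hidden by the diff, so the "while not new.isEmpty()" loop here is an assumed reconstruction of its shape, not the committed code; the link regex is reproduced verbatim from the commit.

import re
import requests
from operator import add
from pyspark import SparkContext

def crawl(url):
    # every path now arrives "/"-prefixed (e.g. "/1.html"), so the base
    # URL no longer ends with a slash
    url = "https://hari1500.github.io/CS387-lab7-crawler-website" + url
    # skip downloading if the response is not HTML; the "" default is an
    # added guard for responses without a content-type header
    if "html" not in requests.head(url).headers.get("content-type", ""):
        return []
    # extract local links (regex reproduced verbatim from the commit) and
    # prefix each path with "/" at the source
    return ["/" + x.split('"')[1]
            for x in re.findall('<a[ ]+href[ ]*=[ ]*"[^{http}].*[{.html}]?">',
                                requests.get(url).text)]

if __name__ == "__main__":
    # create Spark context with necessary configuration
    spark = SparkContext("local", "Web Crawler")

    # create RDD with the starting website, already "/"-prefixed
    start_url = "/1.html"
    rdd = spark.parallelize([start_url])
    new = rdd

    # assumed expansion loop: follow links from the frontier until no
    # unseen URL remains
    while not new.isEmpty():
        old = rdd
        new = new.flatMap(crawl)              # one entry per incoming link
        rdd = rdd.union(new)                  # keep duplicates for indegree
        new = new.subtract(old).distinct()    # line shown in the diff context

    # count the indegree of each URL; keys already match start_url's format
    rdd = rdd.map(lambda x: (x, 1)).reduceByKey(add)

    # save the indegree to output
    rdd.saveAsTextFile("./webcrawler/")

Because the "/" is now applied where links are extracted, the keys reaching reduceByKey are byte-for-byte identical to start_url's format, which is exactly the consistency the earlier ("/"+x) rewrite in the map step was compensating for.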
