From 9d3410f15c87136ce2c7e23c23b23c787b1a15f5 Mon Sep 17 00:00:00 2001
From: Nimay Gupta <49588368+nimay-gupta@users.noreply.github.com>
Date: Fri, 9 Apr 2021 01:20:09 +0530
Subject: [PATCH] Update webcrawler.py to use leading-slash URL paths
---
Lab7-Spark/webcrawler.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/Lab7-Spark/webcrawler.py b/Lab7-Spark/webcrawler.py
index a10c06c..e91d78e 100644
--- a/Lab7-Spark/webcrawler.py
+++ b/Lab7-Spark/webcrawler.py
@@ -5,19 +5,19 @@
from operator import add
def crawl(url):
- url = "https://hari1500.github.io/CS387-lab7-crawler-website/" + url
+ url = "https://hari1500.github.io/CS387-lab7-crawler-website" + url
# skip downloading if not html
if "html" not in requests.head(url).headers.get('content-type'):
return []
# filter local URLs (remove those starting with http)
- return [x.split('"')[1] for x in re.findall('', requests.get(url).text)]
+ return ["/"+x.split('"')[1] for x in re.findall('', requests.get(url).text)]
if __name__ == "__main__":
# create Spark context with necessary configuration
spark = SparkContext("local", "Web Crawler")
# create RDD with starting website
- start_url = "1.html"
+ start_url = "/1.html"
rdd = spark.parallelize([start_url])
new = rdd
@@ -29,7 +29,7 @@ def crawl(url):
new = new.subtract(old).distinct()
# count the indegree of each URL
- rdd = rdd.map(lambda x: ("/"+x, 1)).reduceByKey(add)
+ rdd = rdd.map(lambda x: (x, 1)).reduceByKey(add)
# save the indegree to output
rdd.saveAsTextFile("./webcrawler/")