# main.py
import functions as f
import random
import numpy as np
import matplotlib.pyplot as plt
import json
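
# The helper module `functions` is not included in this file. Judging from how it is
# used below, its interface is assumed (not confirmed by the source) to be roughly:
#   f.getDataFromUrl(url)          -> (tags, links): the page's tag list and the list of
#                                      recommended-video URLs, or (None, None) on failure.
#   f.getRelevance(tags_a, tags_b) -> a numeric similarity score between two tag lists.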
def crawl2(url, depth, first_call=0, home_tags=None):
    '''
    Crawl the YouTube recommendation graph starting from url and return a list
    of relevance scores, one per depth level, measured against the root video's tags.
    '''
    print("---\tat depth = {}".format(depth))
    if depth == 0:
        return []
    # Fetch tags and recommended links for the current page, retrying on parse failures.
    tags = home_tags
    links = None
    attempt = 5
    while (tags is None or links is None) and attempt > 0:
        attempt -= 1
        tags, links = f.getDataFromUrl(url)
        if tags is None or links is None:
            print("---\t---\tpage issues...retrying")
            continue
    if tags is None or links is None:
        print("---\t---\tran out of attempts")
        return []
    # Pick a random recommended link and fetch its tags, skipping pages that fail.
    temp_tags = None
    attempt = 10
    while temp_tags is None and attempt > 0:
        attempt -= 1
        link = random.choice(links)
        temp_tags, _ = f.getDataFromUrl(link)
        if temp_tags is None:
            print("---\t---\tpage issues...skipping page")
            continue
    if temp_tags is None:
        print("---\t---\tran out of attempts")
        return []
    # On recursive calls, compare against the root video's tags so every score
    # is relative to the starting video rather than the previous hop.
    if first_call == 0:
        tags = home_tags
    relevance = f.getRelevance(tags, temp_tags)
    relevance_list = [relevance]
    relevance_list += crawl2(link, depth - 1, 0, tags)
    return relevance_list
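
# Rough usage sketch (the URL below is a placeholder, not from the original script):
#   scores = crawl2("https://www.youtube.com/watch?v=...", 5, first_call=1)
# `scores` holds one relevance value per hop; a downward trend suggests the
# recommendation chain is drifting away from the root video's topic.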
def visualize_results(results):
    # Stack the per-URL relevance lists into a 2D array: rows are crawls, columns are depths.
    y = np.array([np.array(x) for x in results])
    y_avg = np.average(y, axis=0)
    # Plot each individual crawl faintly in blue and the average across crawls in red.
    for i in range(y.shape[0]):
        plt.plot(y[i], marker='o', color='b', alpha=0.2)
    plt.plot(y_avg, marker='o', color='r')
    plt.xlabel("Depth of rabbit hole")
    plt.ylabel("Relevance to root video")
    fig = plt.gcf()
    fig.set_size_inches(10, 7)
    plt.show()
# Start URLs
urls = []
with open("links.txt", "r") as file:
    # One URL per line; strip newlines and drop blanks and duplicates.
    urls = [line.strip() for line in file if line.strip()]
urls = list(set(urls))

# Each item in results_for_urls is a list of relevance scores indexed by depth,
# i.e. results_for_urls[i][d] is the relevance of the i-th crawl at depth d.
results_for_urls = []
depth_range = 16
num_links = len(urls)
# Sampling all URLs just shuffles the crawl order.
lim_urls = random.sample(urls, num_links)
results_for_json = {}

for index, url in enumerate(lim_urls):
    print("FOR \turl#{}/{}".format(index + 1, len(lim_urls)))
    try:
        result = crawl2(url, depth_range, 1)
    except Exception:
        continue
    # Keep only crawls that reached the full depth, and persist progress after each one.
    if len(result) == depth_range:
        results_for_json[url] = result
        with open("results_appended_new.json", "w") as file:
            file.write(json.dumps(results_for_json))
        # links_done.add(url)
        results_for_urls.append(result)

visualize_results(results_for_urls)