#!/usr/bin/env python
# coding: utf-8
# In[43]:
import re

from crawl import *  # local helpers: crawl_website_links, get_soup, crawl_imginn, df_from_dict, df_from_imginn, df_from_subtitles
from tqdm import tqdm
# # An example of crawling a website
# Collect all links in the domain whose URL contains the feature '/article/'.
hsms_links = crawl_website_links("https://handstopmouthstop.com", feature="/article/")
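# crawl_website_links comes from the local crawl module, so its implementation
# is not shown in this demo. As a rough sketch of what such a helper could look
# like (an illustration only, assuming requests + BeautifulSoup, with no
# pagination, deduplication across pages, or rate limiting):
import requests
from bs4 import BeautifulSoup

def crawl_website_links_sketch(domain_url, feature):
    """Collect links on the landing page whose URL contains `feature`."""
    soup = BeautifulSoup(requests.get(domain_url).text, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/"):  # resolve site-relative links
            href = domain_url.rstrip("/") + href
        if feature in href:
            links.add(href)
    return sorted(links)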
# Get the article title and content from each article link and save the data
# in the dict 'hsms_docs'. This step has to be customised for every website.
hsms_docs = {}
for i, link in enumerate(tqdm(hsms_links)):
    art_soup = get_soup(link)
    if art_soup is not None:
        # get the article title; fall back to an empty string if there is no <h1>
        try:
            title = art_soup.h1.get_text(strip=True)
        except AttributeError:
            title = ""
        # get the article content (words only)
        content = []
        body = art_soup.find("div", class_=re.compile("^entry-content"))
        for line in body or []:  # skip pages without a content div
            text = line.get_text()
            # replace non-breaking spaces (\xa0) with ordinary spaces
            text = text.replace("\xa0", " ")
            # collapse runs of newlines
            text = re.sub(r"\n+", "\n", text)
            # one string may hold several lines, separated by \n
            for s in text.split("\n"):
                # drop empty lines
                if re.match(r"^\s*$", s):
                    continue
                content.append(s)
        # save everything in a dictionary
        hsms_docs[i] = {"title": title, "url": link, "content": content}
# # An example of crawling an Instagram account via imginn.org
# crawl posts from the Instagram account 'nownewshk'
data, url = crawl_imginn("nownewshk")
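# crawl_imginn (another crawl helper) scrapes a public profile through the
# imginn.org mirror and returns the post data together with the profile URL.
# Purely as an illustration of the pattern -- imginn's markup changes often,
# and the CSS selector below is an assumption, not the real one:
def crawl_imginn_sketch(username):
    """Return (captions, profile_url) scraped from an imginn profile page."""
    profile_url = "https://imginn.org/" + username + "/"
    soup = get_soup(profile_url)  # reuse the module's fetch helper
    captions = []
    if soup is not None:
        for item in soup.find_all("div", class_="item"):  # assumed selector
            captions.append(item.get_text(" ", strip=True))
    return captions, profile_url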
# # Save the data to DataFrames
df_hsms = df_from_dict(hsms_docs)
df_nownewshk = df_from_imginn(data, retain_comments=True)
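# df_from_dict and df_from_imginn are crawl helpers as well. For the nested
# dict built above, the conversion plausibly reduces to a pandas one-liner
# like this (a sketch; the real helper may clean or reorder the columns):
import pandas as pd

def df_from_dict_sketch(docs):
    """One row per document, with the inner keys (title/url/content) as columns."""
    return pd.DataFrame.from_dict(docs, orient="index")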
# # An example of crawling subtitles from a list of YouTube videos
# Collect all links containing the feature 'youtube.com' from the listing page.
domain = "https://sites.google.com/view/lihkg-kongjisubtitles/drama?authuser=0"
kjs_links = crawl_website_links(domain_url=domain, feature='youtube.com',
                                domain_name="https://sites.google.com/", exclude_feature="#h")
# Build a DataFrame per video, matching any Cantonese-Mandarin parallel subtitles.
df_kjs = df_from_subtitles(kjs_links, parallel=True)
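# df_from_subtitles is likewise defined in crawl. One plausible way to pull
# per-video subtitles is the youtube_transcript_api package -- an assumption
# about the approach, not necessarily what crawl.py uses (written against that
# package's classic get_transcript interface):
from urllib.parse import parse_qs, urlparse
from youtube_transcript_api import YouTubeTranscriptApi

def fetch_subtitles_sketch(youtube_url, languages=("zh-HK",)):
    """Return the subtitle lines of one video, or [] if none are available."""
    video_id = parse_qs(urlparse(youtube_url).query).get("v", [""])[0]
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=list(languages))
    except Exception:
        return []
    return [entry["text"] for entry in transcript]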