-
Notifications
You must be signed in to change notification settings - Fork 0
/
islamicbook_scrape.py
113 lines (100 loc) · 3.31 KB
/
islamicbook_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import urllib.request
from bs4 import BeautifulSoup
import basic as bs
def scrape_page(parent, page, writer):
    """Scrape one book page and keep following its "next" links.

    Fetches ``parent + page``, removes the ``<h1>``/``<h6>`` headings and
    all navigation anchors from ``<div id="content">``, converts ``<br>``
    tags to newlines, and appends the remaining text to *writer*.  The
    anchor that appears immediately after the anchor pointing back at
    *page* is taken to be the "next page" link; pages are followed until
    no such link is found.

    Fixes relative to the original:
    * iterates instead of recursing once per page (long books could blow
      the recursion limit);
    * closes each HTTP response via ``with`` instead of leaking it;
    * tolerates pages missing the content div or the headings.

    :param parent: base URL of the book, ending with "/".
    :param page: page filename relative to *parent*.
    :param writer: open text-mode file object the text is appended to.
    """
    while page:
        with urllib.request.urlopen(parent + page) as rep:
            soup = BeautifulSoup(rep, "lxml")
        container = soup.find("div", id="content")
        if container is None:
            # Unexpected page layout — nothing to extract, stop following.
            return
        for tag_name in ("h1", "h6"):
            heading = container.find(tag_name)
            if heading:
                heading.extract()
        next_link = ""
        take_next = False
        for anchor in container.find_all("a"):
            if take_next:
                take_next = False
                next_link = anchor.get("href")
            if anchor.get("href") == page:
                # The anchor right after the self-link is the "next" link.
                take_next = True
            # Strip navigation links from the extracted text.
            anchor.extract()
        for br in container.find_all("br"):
            br.replace_with("\n")
        writer.write(str(container.get_text()) + " ")
        page = next_link
# scrape_page(parent,"aaian-alasr-002.html",None)
def scrapeIslamic(parent, page, type="تاريخ", limit=-1):
    """Scrape every book listed on one islamicbook.ws index page.

    Walks the ``<tbody>`` table of the index: column 1 holds the link to
    the book's first page, column 2 the author, column 3 the title.  For
    each new book a file path is obtained from ``bs.getFilePath`` and the
    book text is written there via :func:`scrape_page`.  When a pager
    anchor containing "التالية" ("next") exists, the next index page is
    processed recursively.

    Bug fixes relative to the original:
    * ``limit`` is now forwarded to the next index page (it previously
      reset to the default -1, making the limit ineffective);
    * the last allowed book is scraped before stopping (previously an
      empty file was created and left open when the limit was hit);
    * output files are closed via ``with`` even on error.

    :param parent: base URL of the category, ending with "/".
    :param page: index page relative to *parent* ("" for the first page).
    :param type: category label forwarded to ``bs.getFilePath``.  The
        name shadows the builtin but is kept for caller compatibility.
    :param limit: stop after this many books; -1 (or any value that never
        reaches 0 by decrements) means no limit.
    """
    with urllib.request.urlopen(parent + page) as rep:
        soup = BeautifulSoup(rep, "html.parser")
    nextLink = ""
    for node in soup.find_all("a"):
        if "التالية" in node.get_text():
            nextLink = node.get("href")
    body = soup.find("tbody")
    books = bs.loadListOfBooksByEras()
    for tr in body.find_all("tr"):
        link = ""
        author = ""
        book = ""
        skip_row = False
        for i, td in enumerate(tr.find_all("td")):
            if not i:
                continue  # column 0 is a row number
            if i == 1:
                n = td.find("a")
                if n:
                    link = n.get("href")
                else:
                    # No link for this book: nothing to scrape.
                    skip_row = True
                    break
            if i == 2:
                author = td.text
            if i == 3:
                book = td.text
        if skip_row:
            continue
        era = bs.getEraFromAuthor(author)
        if era == 'unknown':
            continue
        if bs.bookExists(book, books):
            # Already scraped: still counts against the limit.
            limit -= 1
            if not limit:
                return
            continue
        filename = bs.getFilePath(book, era, type, author)
        if filename is None:
            print('filename is None')
            print('era is: ' + str(era))
            continue
        with open(filename, encoding="utf-8", mode="w") as writer:
            print('file created:')
            print(filename)
            limit -= 1
            scrape_page(parent, link, writer)
        if not limit:
            return
    if nextLink != "":
        # Forward the remaining limit (original bug: it was dropped here).
        scrapeIslamic(parent, nextLink, type, limit=limit)
def scrape_all(limit=-1):
    """Scrape every category of islamicbook.ws into local text files.

    :param limit: per-category cap on the number of books fetched;
        a positive value also enables "light mode", which visits only
        the first of the religion sub-categories.
    """
    scrapeIslamic("http://www.islamicbook.ws/tarekh/", "", limit=limit)
    religion_sections = (
        "http://www.islamicbook.ws/qbook/",
        "http://www.islamicbook.ws/ageda/",
        "http://www.islamicbook.ws/hadeth/",
        "http://www.islamicbook.ws/asol/",
    )
    for url in religion_sections:
        scrapeIslamic(url, "", "دين", limit=limit)
        if limit > 0:
            # light mode selected — stop after the first section
            break
    scrapeIslamic("http://www.islamicbook.ws/adab/", "", "أدب", limit=limit)
    scrapeIslamic("http://www.islamicbook.ws/amma/", "", "متنوعة", limit=limit)
# Script entry point: scrape every category with the default (no) limit.
if __name__ == "__main__":
    scrape_all()