import re
import urllib.request

from bs4 import BeautifulSoup

import basic as bs  # local helper module: bs.eras, getFilePath, bookExists, loadListOfBooksByEras
""" procedure :
website -> jahili -> {
cha3ir1 -> diwan -> {
chi3r1 ,
chi3r2 ,
....
},
cha3ir2 -> diwan -> {
chi3r1 ,
chi3r2 ,
...
}
...
} """
def scrape_all(limit=-1):
    """Crawl aldiwan.net era by era and poet by poet, saving each poem to
    its own file. A positive limit stops each era after roughly that many
    poems; the default of -1 never reaches zero, so the crawl is unbounded."""
    rep = urllib.request.urlopen("https://www.aldiwan.net/")
    soup = BeautifulSoup(rep, "lxml")
    i = 0
    # Map the site's Arabic era labels onto the coarser buckets in bs.eras;
    # several historical eras share one bucket.
    mapEras = {
        "العصر الجاهلي": bs.eras[0],
        "عصر المخضرمون": bs.eras[1],
        "العصر الإسلامي": bs.eras[1],
        "العصر الاموي": bs.eras[2],
        "العصر العباسي": bs.eras[3],
        "العصر الايوبي": bs.eras[4],
        "العصر العثماني": bs.eras[4],
        "العصر المملوكي": bs.eras[4],
        "العصر الأندلسي": bs.eras[4],
        "العصر الحديث": bs.eras[5],
    }
    books = bs.loadListOfBooksByEras()
    exceptions = open("exceptions.txt", encoding="utf-8", mode="w")  # log of files that failed to open
    for eras in soup.find_all("div", {"class": "col-md-4"}):
        for node in eras.find_all("a"):
            if node.text in mapEras:  # follow only era links we know how to bucket
                setLimit = limit
                rep = urllib.request.urlopen("https://www.aldiwan.net/" + node.get("href"))
                soup1 = BeautifulSoup(rep, "lxml")  # the era's list of poets (kept separate from the outer soup)
                for node1 in soup1.find_all("a", {"class": "s-button"}):  # each poet's diwan
                    rep = urllib.request.urlopen("https://www.aldiwan.net/" + node1.get("href"))
                    soup2 = BeautifulSoup(rep, "lxml")
                    for node3 in soup2.find_all("a", {"class": "pull-right"}):  # each poem in the diwan
                        # one file per poem, path built by bs.getFilePath
                        print("getting poem", i)
                        i += 1
                        cEra = mapEras[node.text]
                        if not cEra:  # defensive: empty bucket in bs.eras
                            print("no era found for it")
                            continue
                        if bs.bookExists(node3.text, books):
                            print("book already exists")
                            setLimit -= 1
                            if not setLimit:
                                break
                            continue
                        filename = bs.getFilePath(node3.text, cEra, "شعر", node1.text)
                        if filename is None:
                            print("filename is None")
                            print("era is: " + str(cEra))
                            continue
                        # (An earlier revision branched on the era label here to build an
                        # English-named path per era, e.g. "Jahiliy" or "Abbasid"; the
                        # mapEras table above replaced those branches.)
                        try:
                            file = open(filename, encoding="utf-8", mode="w")
                            print("file created")
                            print(filename)
                            rep = urllib.request.urlopen("https://www.aldiwan.net/" + node3.get("href"))
                            soup3 = BeautifulSoup(rep, "lxml")
                            for main_text in soup3.find_all("div", {"class": "bet-1"}):  # the poem body
                                chatr = 1  # hemistich counter; a verse is two hemistichs
                                for prt in main_text.find_all("h3"):
                                    if chatr % 2 == 0:
                                        # second hemistich: keep its newline to close the verse
                                        file.write(prt.text)
                                    else:
                                        # first hemistich: join to the second with a tab
                                        file.write(re.sub("\n", "\t", prt.text))
                                    chatr = chatr + 1
                                break  # only the first bet-1 block holds the poem
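                            # Illustration on made-up data: h3 texts "first half\n" and
                            # "second half\n" are written as "first half\tsecond half\n",
                            # i.e. one tab-joined verse per output line.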
                            file.close()
                            print("file closed")
                            setLimit -= 1
                            if not setLimit:
                                break
                        except IOError:
                            print("could not write " + filename)
                            exceptions.write(filename + "\n")
                    if not setLimit:
                        break  # era limit reached: stop this era's poet loop
                if not setLimit:
                    break
    exceptions.close()


if __name__ == '__main__':
    scrape_all()
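# Usage sketch: pass a positive limit for a quick smoke test, e.g.
# scrape_all(limit=10) stops each era after ten poems (or already-seen
# books). With the default limit of -1, setLimit only decrements further
# below zero, so `if not setLimit` never fires and the crawl is unbounded.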