hyperionherbsHandler.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import requests
import csv
import sys
import time


class HyperionherbsHelper():
    def __init__(self):
        self.url = 'https://www.hyperionherbs.com/blog/'
        # Selenium 4 takes the driver path via a Service object; the old
        # executable_path keyword argument has been removed.
        service = Service(executable_path='../utility/chromedriver.exe')
        self.driver = webdriver.Chrome(service=service)
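        # A minimal variant (an assumption, not part of the original file):
        # the scraper can run without opening a browser window by passing
        # Chrome options, e.g.:
        #   options = webdriver.ChromeOptions()
        #   options.add_argument('--headless=new')
        #   self.driver = webdriver.Chrome(service=service, options=options)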
    # Walk the blog's paginated listing and collect every <article> element.
    def getItems(self):
        items = []
        # load the first listing page once; each click on the "next" link
        # advances the browser, so reloading self.url inside the loop would
        # send the crawl back to page one on every pass
        self.driver.get(self.url)
        while True:
            try:
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                container = soup.find('div', attrs={'class': 'et_pb_salvattore_content'})
                articles = container.find_all('article')
                items.extend(articles)
                # find_elements returns an empty list (instead of raising)
                # when there is no next-page link, so the loop can end cleanly
                nxt_btn = self.driver.find_elements(By.XPATH, "//div[@class='pagination clearfix']/div/a")
                if nxt_btn:
                    nxt_btn[0].click()
                    time.sleep(3)  # give the next page time to load
                else:
                    break
            except Exception:
                break
        # close the browser session once every listing page has been visited
        self.driver.quit()
        return items
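    # A possible refinement (assumed, not in the original): the fixed
    # time.sleep(3) above could become an explicit wait on page readiness:
    #   from selenium.webdriver.support.ui import WebDriverWait
    #   WebDriverWait(self.driver, 10).until(
    #       lambda d: d.execute_script('return document.readyState') == 'complete')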
    # Fetch one article page and pull out its title, author, date and body.
    def getItemDetail(self, item):
        data = {}
        try:
            article_url = item.find('a')['href']
            print("article_url : " + article_url)
            res_article = requests.get(article_url)
            soup_article = BeautifulSoup(res_article.text, 'html.parser')
            # .text is already a str in Python 3; encoding it to bytes here
            # would end up writing b'...' literals into the CSV
            try:
                article_title = soup_article.find('h1', attrs={'class': 'entry-title'}).text.strip()
            except Exception:
                article_title = None
            try:
                article_author = soup_article.find('span', attrs={'class': 'author vcard'}).text.strip()
            except Exception:
                article_author = None
            try:
                article_date = soup_article.find('span', attrs={'class': 'published'}).text.strip()
            except Exception:
                article_date = None
            try:
                description = soup_article.find('div', attrs={'class': 'entry-content'}).text.strip()
            except Exception:
                description = None
            data.update({'title': article_title})
            data.update({'author': article_author})
            data.update({'date': article_date})
            data.update({'description': description})
        except Exception:
            pass
        return data
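    # A small hardening idea (an assumption, not in the original): requests.get
    # accepts a timeout, so one stalled article fetch cannot hang the crawl:
    #   res_article = requests.get(article_url, timeout=10)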
    # Write the collected article data to sample.csv.
    def writeCSVFile(self, data):
        try:
            # newline='' prevents blank rows on Windows, and the with block
            # closes the file, so no explicit close() call is needed
            with open('sample.csv', mode='w', newline='', encoding='utf-8') as csv_file:
                fieldnames = ['title', 'author', 'date', 'description']
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                writer.writeheader()
                for d in data:
                    # each dict's keys match fieldnames, so it can be written as-is
                    writer.writerow(d)
            print("File written successfully.")
        except Exception:
            print(sys.exc_info())
    # Crawl the listing, fetch each article's details, then write the CSV.
    def start(self):
        items_data = []
        items = self.getItems()
        for item in items:
            data = self.getItemDetail(item)
            if len(data):
                items_data.append(data)
        # only write the csv file when at least one article was scraped
        if len(items_data):
            self.writeCSVFile(items_data)
# main entry point
if __name__ == "__main__":
    # objHH is an instance of HyperionherbsHelper
    objHH = HyperionherbsHelper()
    objHH.start()
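# A hedged usage sketch (not part of the original script): after a run,
# sample.csv can be read back with the standard csv module for a quick check.
#   import csv
#   with open('sample.csv', newline='', encoding='utf-8') as f:
#       for row in csv.DictReader(f):
#           print(row['title'], row['date'])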