scrapeFuncts.py
from bs4 import BeautifulSoup
from selenium import webdriver
import urlList
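# urlList is assumed to be a local module that defines a single Python list named
# urlList (e.g. "urlList = []" on a fresh run); this script appends to that list and
# rewrites urlList.py at the end to record which article URLs have already been scraped.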
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import time
import dateparser
import re
# Gathers URLs for DTN Retail Fertilizer Trends articles, then extracts data from the price table within those pages.
# chromedriver.exe will need to be replaced with the appropriate driver for your operating system.
# returns the href that follows each occurrence of the specified string
def hrefSearch(searchStr, source):
    soupStr = str(source)
    results = []
    index = 0
    while index != -1:
        index = soupStr.find(searchStr, index)
        index = soupStr.find("href=", index) + 6
        # because 6 was added above, a failed find (-1) shows up as 5 (-1 + 6 = 5)
        if index == 5:
            return results
        endIndex = soupStr.find('"', index)
        results.append("https://www.dtnpf.com" + soupStr[index:endIndex])
        index = endIndex + 1
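# Illustrative example only (the article paths are hypothetical, not taken from a live page):
#   hrefSearch("DTN Retail Fertilizer Trends", soup)
#   -> ["https://www.dtnpf.com/agriculture/web/ag/news/crops/article/...", ...]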
# returns data from the table on the requested URL
def tableReturn(source):
    data = []
    # finds the "DRY" table title, then pulls its parent <table> to read the rows into a list of lists
    table = source.find(text="DRY").find_parent("table")
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # get rid of empty values
    return data
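# Illustrative shape of the return value (the actual cell text depends on the DTN page):
# each row is a list of cell strings; the DRY rows come first and a single-cell
# ['LIQUID'] (or ['Liquid']) row marks where the liquid section starts, which the
# __main__ block below relies on when splitting the table.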
def gspreadAuth():
    scope = ['https://spreadsheets.google.com/feeds',
             'https://www.googleapis.com/auth/drive']
    credentials = ServiceAccountCredentials.from_json_keyfile_name('creds.json', scope)
    gc = gspread.authorize(credentials)
    wks = gc.open_by_url("https://docs.google.com/spreadsheets/d/1Bfj6mn4O4v7_wU7M3W3SYX3Sv_lOoefmSeoIiJmLSAM/edit#gid=0")
    ws = wks.get_worksheet(0)
    return ws
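# Note: creds.json is expected to be a Google service-account JSON key, and the target
# spreadsheet must be shared with that service account's client_email for gspread to open it.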
# returns the number of filled cells in column 1 (the next empty row is this value + 1)
def next_available_row(sheet):
    str_list = list(filter(None, sheet.col_values(1)))  # fastest
    return len(str_list)
def fixDate(dateStr):
    # x[0] original string, x[1] start date, x[2] end date
    x = [dateStr, '', '']
    splitStr = dateStr.split('-')
    x[1] = dateparser.parse(splitStr[0] + ' ' + dateStr.split(' ')[-1]).strftime("%m/%d/%Y")
    if re.search('[a-zA-Z]', splitStr[1]) is None:
        x[2] = dateparser.parse(splitStr[0].split(' ')[0] + ' ' + splitStr[1]).strftime("%m/%d/%Y")
    else:
        x[2] = dateparser.parse(splitStr[1]).strftime("%m/%d/%Y")
    return x
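# Illustrative examples, assuming the table's date cell looks like "Mar 27-31 2023"
# (the exact format on the DTN page may differ):
#   fixDate("Mar 27-31 2023")     -> ["Mar 27-31 2023", "03/27/2023", "03/31/2023"]
#   fixDate("Mar 27-Apr 2 2023")  -> ["Mar 27-Apr 2 2023", "03/27/2023", "04/02/2023"]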
# the scraped table contains both DRY and LIQUID sections; split the list in two so both halves can be uploaded to the sheet across a single row
if __name__ == "__main__":
    # declare webdriver and get the base webpage
    driver = webdriver.Chrome()
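    # If chromedriver is not on PATH, point Selenium at it explicitly. Sketch only,
    # assuming a Selenium 3-style constructor ("./chromedriver" is a hypothetical path;
    # Selenium 4 passes a Service object instead):
    #   driver = webdriver.Chrome(executable_path="./chromedriver")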
    url = "https://www.dtnpf.com/agriculture/web/ag/news/crops/more"
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    ws = gspreadAuth()
    totals = 0
    links = hrefSearch("DTN Retail Fertilizer Trends", soup)  # find article links
    for link in links:  # iterate through links and skip any that have already been used
        count = 0
        if link not in urlList.urlList:
            urlList.urlList.append(link)  # add link to the list
            driver.get(link)
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")
            data = tableReturn(soup)  # get data from the link
            # assumes the table always contains a 'LIQUID' (or 'Liquid') marker row
            if ['LIQUID'] in data:
                split = data.index(['LIQUID'])
            elif ['Liquid'] in data:
                split = data.index(['Liquid'])
            dryData = data[:split]
            liquidData = data[split:]
            dateCol = ws.col_values(1)
            for x, y in zip(dryData, liquidData):
                if len(x[0]) > 11 and x[0] not in dateCol:
                    count = count + 1
                    date = fixDate(x[0])
                    ws.append_row([date[0], date[1], date[2], x[1], x[2], x[3], x[4], y[1], y[2], y[3], y[4]], value_input_option='USER_ENTERED')
        else:
            print(link, "in URL list, skipping")
        print("Added", str(count), "values from", link)
        totals = totals + count
        time.sleep(5)  # rate limit
    # write back the URL list
    with open('urlList.py', 'w') as file:
        file.write('urlList = ')
        file.write(str(urlList.urlList))
    # write the run time to the sheet
    ws.update_acell('N4', time.strftime("%Y-%m-%d %H:%M"))
    print("Program complete, added", totals, "values")