main.py
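"""Scrape basketball-reference.com's player index pages (one page per
last-name initial) and dump every player's profile URL and bio fields to a
SQLite database, a JSON file, and a CSV file."""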
import csv
import json
import os
import sqlite3
import string
import sys

import requests
from bs4 import BeautifulSoup
# Remove leftovers from a previous run; ignore files that do not exist.
for leftover in ('temp.db', 'temp.json'):
    try:
        os.remove(leftover)
    except FileNotFoundError:
        pass
# newline='' stops the csv module from writing blank rows on Windows.
csv_file = open('temp.csv', 'w', encoding='utf-8', newline='')
fieldnames = ['url', 'name', 'start_year', 'end_year', 'position', 'height',
              'weight', 'dob', 'college']
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
# The connection is opened once and kept for the whole run instead of being
# wrapped in a context manager, so it is not reopened on every loop iteration.
conn = sqlite3.connect('temp.db')
cur = conn.cursor()
temp_dict = {}
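# temp_dict maps each letter to the list of player dicts scraped for it; the
# whole mapping is serialized to temp.json once main() finishes.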
def main():
    base_url = 'https://www.basketball-reference.com/'
    ses = requests.Session()
    # The site indexes players by the first letter of their last name, one
    # page per letter.
    sub_url = 'players/'
    # Iterate over the letters and scrape each index page.
    for i in string.ascii_lowercase:
        table = f"""CREATE TABLE {i} (
            url CHAR(255),
            name CHAR(255),
            start_year INT,
            end_year INT,
            pos CHAR(10),
            height CHAR(10),
            weight CHAR(10),
            dob CHAR(25),
            college CHAR(40)
        );"""
        cur.execute(table)
        conn.commit()
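        # Design note: one table per initial mirrors the site's index layout;
        # a single table with a letter column would work just as well.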
        print(f'Collecting data of NBA players with last names starting with {i.capitalize()}')
        temp_dict[i] = []
        url = base_url + sub_url + i
        resp = ses.get(url=url)
        soup = BeautifulSoup(resp.text, 'lxml')
        # Every player row's header cell carries the player's slug in its
        # data-append-csv attribute, so matching on that attribute finds
        # exactly one <th> per player.
        players = soup.find_all('th', {'data-append-csv': True})
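        # NOTE (assumption, not part of the original script): the site
        # throttles rapid scraping; a short time.sleep() between letter pages
        # is a polite safeguard if it starts returning HTTP 429 responses.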
        for p in players:
            tr = p.parent
            # The name cell holds either a plain <a>, or an <a> wrapped in
            # <strong> for active players.
            for tag in p.children:
                if tag.name == 'a':
                    p_url = tag['href']
                    name = tag.text
                elif tag.name == 'strong':
                    tag = tag.a
                    p_url = tag['href']
                    name = tag.text
            start = tr.contents[1].text
            end = tr.contents[2].text
            pos = tr.contents[3].text
            height = tr.contents[4].text
            weight = tr.contents[5].text
            dob = tr.contents[6].text
            college = tr.contents[7].text
            if college == '':
                college = 'None'
            # p_url is site-relative (starts with '/'), so strip the trailing
            # slash from base_url before joining.
            profile_url = base_url.rstrip('/') + p_url
            cur.execute(
                f"INSERT INTO {i} (url, name, start_year, end_year, pos, height, weight, dob, college) "
                "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (profile_url, name, start, end, pos, height, weight, dob, college))
            conn.commit()
            player_dict = {'url': profile_url, 'name': name, 'start_year': start,
                           'end_year': end, 'position': pos, 'height': height,
                           'weight': weight, 'dob': dob, 'college': college}
            temp_dict[i].append(player_dict)
            writer.writerow(player_dict)
if __name__ == "__main__":
    main()
    with open('temp.json', 'w') as f:
        json.dump(temp_dict, f)
    csv_file.close()
    conn.close()
    sys.exit("Thanks for using Amogh's mini webscraper!")
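# Example (assumption: a run has completed): the per-letter tables can be
# queried back with sqlite3, e.g.
#   import sqlite3
#   conn = sqlite3.connect('temp.db')
#   print(conn.execute("SELECT name, college FROM a LIMIT 5").fetchall())
# Table names are the single letters 'a'..'z', matching the index pages.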