Skip to content

Commit

Permalink
scrape correct csv file remove scraping webpage
Browse files Browse the repository at this point in the history
  • Loading branch information
bzhangjma committed Nov 4, 2024
1 parent b85cd51 commit 0b80560
Showing 1 changed file with 5 additions and 50 deletions.
55 changes: 5 additions & 50 deletions ca_on_guelph/people.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,7 @@
from utils import CanadianPerson as Person
from utils import CanadianScraper
from utils import CSVScraper

# Pages scraped by GuelphPersonScraper(CanadianScraper): COUNCIL_PAGE is parsed
# for the councillor listings and recorded as their source; MAYOR_PAGE is
# passed to scrape_mayor() and recorded as the mayor's source.
COUNCIL_PAGE = "https://guelph.ca/city-hall/mayor-and-council/city-council/"
MAYOR_PAGE = "https://guelph.ca/city-hall/mayor-and-council/mayors-office/"


class GuelphPersonScraper(CanadianScraper):
    """Scrape Guelph's councillors and mayor from the city's HTML pages."""

    def scrape(self):
        """Yield one Person per councillor on COUNCIL_PAGE, then the mayor.

        Raises:
            AssertionError: if no councillor groups are found on the page.
            IndexError: if an expected element (heading, phone paragraph,
                image) is missing from a councillor entry.
        """
        page = self.lxmlize(COUNCIL_PAGE)

        # The first "thumbnail" div is skipped — presumably it is not a
        # ward listing; TODO confirm against the live page markup.
        councillor_nodes = page.xpath('.//div[@class="thumbnail"]')[1:]
        assert len(councillor_nodes), "No councillors found"

        for councillor_node in councillor_nodes:
            # The group heading carries the district name followed by the
            # word "Councillors"; keep only the part before it.
            district = councillor_node.xpath(".//h2/text()")[0].split("Councillors")[0].strip()

            for councillor in councillor_node.xpath(".//div/div"):
                headings = councillor.xpath(".//h3/text()")
                if not headings:
                    # Nested divs without an <h3> are not councillor entries.
                    continue

                # The heading is "<role> <name>"; split on the first space only.
                role, name = headings[0].split(" ", 1)
                phone = councillor.xpath(".//p/text()")[1].strip()
                email = self.get_email(councillor)
                image = councillor.xpath(".//img/@src")[0]

                p = Person(primary_org="legislature", name=name, district=district, role=role, image=image)
                p.add_contact("email", email)
                if phone:
                    p.add_contact("voice", phone, "legislature")
                p.add_source(COUNCIL_PAGE)
                # Emit the councillor; without this only the mayor was ever
                # produced by the scraper.
                yield p

        yield self.scrape_mayor(MAYOR_PAGE)

    def scrape_mayor(self, url):
        """Build and return the mayor's Person from the page at *url*.

        Args:
            url: address of the mayor's office page to parse.

        Returns:
            A Person with role "Mayor" and district "Guelph", carrying the
            phone, email and image found on the page.

        Raises:
            IndexError: if the expected paragraph, "Mayor " prefix or image
                is not present on the page.
        """
        page = self.lxmlize(url)

        # Contact details are read from the last paragraph of the entry content.
        mayor_node = page.xpath('.//div[@class="entry-content"]/p')[-1]
        name = mayor_node.xpath(".//text()")[0].strip().split("Mayor ")[1]
        phone = self.get_phone(mayor_node)
        email = self.get_email(mayor_node)
        # The leading "//" makes this query search the whole document rather
        # than only below mayor_node.
        image = mayor_node.xpath('//img[contains(@alt, "Mayor")]/@src')[0]

        p = Person(primary_org="legislature", name=name, district="Guelph", role="Mayor", image=image)
        p.add_contact("voice", phone, "legislature")
        p.add_contact("email", email)
        p.add_source(MAYOR_PAGE)

        return p
class GuelphPersonScraper(CSVScraper):
    """Guelph council scraper configured through CSVScraper class attributes."""

    # NOTE(review): presumably tells CSVScraper that a single area (ward) can
    # hold more than one post — confirm against the CSVScraper implementation.
    many_posts_per_area = True

    # https://explore.guelph.ca/documents/5ec8d85028c94e83be12a9f01d14eb7f/about
    csv_url = "https://gismaps.guelph.ca/OpenData/guelph-city-council.csv"

0 comments on commit 0b80560

Please sign in to comment.