Skip to content

Commit

Permalink
Merge branch 'master' into montreal_est_scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
samJMA committed Oct 29, 2024
2 parents 6f52701 + a370b4c commit 707e503
Show file tree
Hide file tree
Showing 148 changed files with 1,311 additions and 1,310 deletions.
6 changes: 6 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "daily"
35 changes: 35 additions & 0 deletions .github/workflows/automerge.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# The pull_request_target workflow trigger is dangerous. Do not add unrelated logic to this workflow.
# https://securitylab.github.com/research/github-actions-preventing-pwn-requests/
# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target
name: Auto-merge
on: pull_request_target
permissions:
pull-requests: write # to approve the PR
contents: write # to merge the PR
jobs:
dependabot:
if: ${{ github.event.pull_request.user.login == 'dependabot[bot]' }}
runs-on: ubuntu-latest
steps:
- id: dependabot-metadata
uses: dependabot/fetch-metadata@v2
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: gh pr review --approve ${{ github.event.pull_request.html_url }}
- if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: gh pr merge --auto --squash ${{ github.event.pull_request.html_url }}
precommit:
if: ${{ github.event.pull_request.user.login == 'pre-commit-ci[bot]' }}
runs-on: ubuntu-latest
steps:
- env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: gh pr review --approve ${{ github.event.pull_request.html_url }}
- env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: gh pr merge --auto --squash ${{ github.event.pull_request.html_url }}
23 changes: 12 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
ci:
autoupdate_schedule: quarterly
skip: [pip-compile]
default_language_version:
python: python3.10
repos:
- repo: https://github.com/psf/black
rev: 24.3.0
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.9
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 7.0.0
- id: ruff
- id: ruff-format
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.4.18
hooks:
- id: flake8
additional_dependencies: [flake8-comprehensions]
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
- id: pip-compile
name: pip-compile requirements.in
args: [requirements.in, -o, requirements.txt]
4 changes: 2 additions & 2 deletions ca/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def scrape_people(self, rows, gender):
photo_response = self.get(photo)
if (
photo_response.status_code == 200
and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1
and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1 # noqa: S324 # non-cryptographic
):
m.image = photo

Expand Down Expand Up @@ -119,7 +119,7 @@ def scrape_people(self, rows, gender):
):
note = "constituency"
if i:
note += " ({})".format(i + 1)
note += f" ({i + 1})"

address = constituency_office_el.xpath("./p[1]")[0]
address = address.text_content().strip().splitlines()
Expand Down
14 changes: 7 additions & 7 deletions ca_ab/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@


def get_party(abbr):
"""Return full party name from abbreviation"""
"""Return full party name from abbreviation."""
return PARTIES[abbr]


Expand Down Expand Up @@ -59,8 +59,8 @@ def scrape(self):
field_names = next(reader)
for name in OFFICE_FIELDS:
assert field_names.count(name) == 2
field_names[field_names.index(name)] = "{} 1".format(name)
field_names[field_names.index(name)] = "{} 2".format(name)
field_names[field_names.index(name)] = f"{name} 1"
field_names[field_names.index(name)] = f"{name} 2"
rows = [dict(zip_longest(field_names, row)) for row in reader]
assert len(rows), "No members found"
for mla in rows:
Expand All @@ -76,8 +76,8 @@ def scrape(self):
row_xpath = '//td[normalize-space()="{}"]/..'.format(
mla["Constituency Name"],
)
(detail_url,) = index.xpath("{}//a/@href".format(row_xpath))
(photo_url,) = index.xpath("{}//img/@src".format(row_xpath))
(detail_url,) = index.xpath(f"{row_xpath}//a/@href")
(photo_url,) = index.xpath(f"{row_xpath}//img/@src")
district = mla["Constituency Name"]
if district == "Calgary-Bhullar-McCall":
district = "Calgary-McCall"
Expand Down Expand Up @@ -108,10 +108,10 @@ def scrape(self):

for suffix, note in addresses:
for key, contact_type in (("Phone", "voice"), ("Fax", "fax")):
value = mla["{} Number {}".format(key, suffix)]
value = mla[f"{key} Number {suffix}"]
if value and value != "Pending":
p.add_contact(contact_type, value, note)
address = ", ".join(filter(bool, [mla["{} {}".format(field, suffix)] for field in ADDRESS_FIELDS]))
address = ", ".join(filter(bool, [mla[f"{field} {suffix}"] for field in ADDRESS_FIELDS]))
if address:
p.add_contact("address", address, note)

Expand Down
2 changes: 1 addition & 1 deletion ca_ab_grande_prairie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_organizations(self):
for seat_number in range(1, 9):
organization.add_post(
role="Councillor",
label="{} (seat {})".format(self.division_name, seat_number),
label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)

Expand Down
33 changes: 28 additions & 5 deletions ca_ab_grande_prairie/people.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,30 @@
from utils import CSVScraper
from utils import CanadianPerson as Person
from utils import CanadianScraper

COUNCIL_PAGE = "https://cityofgp.com/city-government/mayor-city-council/council-members"

class GrandePrairiePersonScraper(CSVScraper):
# https://data.cityofgp.com/Community/City-Council-Contact-Information/vcfc-gi78
csv_url = "https://data.cityofgp.com/api/views/vcfc-gi78/rows.csv?accessType=DOWNLOAD"
many_posts_per_area = True

class GrandePrairiePersonScraper(CanadianScraper):
    """Scrape council members from the page at ``COUNCIL_PAGE``."""

    def scrape(self):
        """Yield one ``Person`` for each council entry found on ``COUNCIL_PAGE``.

        Entries whose heading starts with "Councillor" are given numbered
        seats in page order ("Grande Prairie (seat N)"); every other entry is
        assigned the bare district "Grande Prairie".

        Raises:
            AssertionError: if the page contains no council entries.
        """
        seat_number = 1
        page = self.lxmlize(COUNCIL_PAGE)
        councillors = page.xpath('//div[contains(@class, "council-bios")]//div[@class="views-row"]')

        assert len(councillors), "No councillors found"
        for councillor in councillors:
            # The heading is "<role> <name>"; split on the first space only so
            # that multi-word names stay intact.
            role, name = councillor.xpath(".//h3")[0].text_content().split(" ", 1)
            if role == "Councillor":
                district = f"Grande Prairie (seat {seat_number})"
                seat_number += 1
            else:
                # Any non-councillor role (presumably the mayor - confirm
                # against the live page) uses the city name itself. The
                # original literal had a stray leading space here.
                district = "Grande Prairie"
            email = self.get_email(councillor)
            phone = self.get_phone(councillor)
            image = councillor.xpath(".//img/@src")[0]

            p = Person(primary_org="legislature", name=name, district=district, role=role, image=image)
            p.add_contact("email", email)
            p.add_contact("voice", phone, "legislature")
            p.add_source(COUNCIL_PAGE)

            yield p
4 changes: 2 additions & 2 deletions ca_ab_grande_prairie_county_no_1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ def get_organizations(self):
for division_number in range(1, 10):
organization.add_post(
role="Councillor",
label="Division {}".format(division_number),
division_id="{}/division:{}".format(self.division_id, division_number),
label=f"Division {division_number}",
division_id=f"{self.division_id}/division:{division_number}",
)

yield organization
2 changes: 1 addition & 1 deletion ca_ab_lethbridge/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_organizations(self):
for seat_number in range(1, 9):
organization.add_post(
role="Councillor",
label="{} (seat {})".format(self.division_name, seat_number),
label=f"{self.division_name} (seat {seat_number})",
division_id=self.division_id,
)

Expand Down
4 changes: 2 additions & 2 deletions ca_ab_lethbridge/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
class LethbridgePersonScraper(CanadianScraper):
def scrape_mayor(self):
page = self.lxmlize(MAYOR_PAGE)
paragraph = page.xpath("//p[1]")[0].text_content().split()
paragraph = page.xpath("//h4[contains(., 'Mayor')]/following-sibling::p")[0].text_content().split()
name = " ".join([paragraph[0], paragraph[1]])

p = Person(primary_org="legislature", name=name, district="Lethbridge", role="Mayor")
Expand All @@ -24,7 +24,7 @@ def scrape_person(self, url, seat_number):
p = Person(
primary_org="legislature",
name=name,
district="Lethbridge (seat {})".format(seat_number + 1),
district=f"Lethbridge (seat {seat_number + 1})",
role="Councillor",
)

Expand Down
12 changes: 6 additions & 6 deletions ca_ab_wood_buffalo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ def get_organizations(self):
for seat_number in range(1, 7):
organization.add_post(
role="Councillor",
label="Ward 1 (seat {})".format(seat_number),
division_id="{}/ward:1".format(self.division_id),
label=f"Ward 1 (seat {seat_number})",
division_id=f"{self.division_id}/ward:1",
)
for seat_number in range(1, 3):
organization.add_post(
role="Councillor",
label="Ward 2 (seat {})".format(seat_number),
division_id="{}/ward:2".format(self.division_id),
label=f"Ward 2 (seat {seat_number})",
division_id=f"{self.division_id}/ward:2",
)
organization.add_post(role="Councillor", label="Ward 3", division_id="{}/ward:3".format(self.division_id))
organization.add_post(role="Councillor", label="Ward 4", division_id="{}/ward:4".format(self.division_id))
organization.add_post(role="Councillor", label="Ward 3", division_id=f"{self.division_id}/ward:3")
organization.add_post(role="Councillor", label="Ward 4", division_id=f"{self.division_id}/ward:4")

yield organization
4 changes: 2 additions & 2 deletions ca_ab_wood_buffalo/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def scrape(self):
for ward in wards:
area = ward.text_content().split("–", 1)[1].strip()
councillors = ward.xpath("./following-sibling::table[1]/tbody/tr/td/h3")
assert len(councillors), "No councillors found for {}".format(area)
assert len(councillors), f"No councillors found for {area}"
for councillor in councillors:
name = councillor.text_content()

if area in ("Ward 1", "Ward 2"):
seat_numbers[area] += 1
district = "{} (seat {})".format(area, seat_numbers[area])
district = f"{area} (seat {seat_numbers[area]})"
else:
district = area

Expand Down
4 changes: 2 additions & 2 deletions ca_bc_abbotsford/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ def scrape(self):
]

assert len(councillors), "No councillors found"
assert len(councillors) == len(contact_data), "Expected {}, got {}".format(len(councillors), len(contact_data))
assert len(councillors) == len(contact_data), f"Expected {len(councillors)}, got {len(contact_data)}"
for councillor, contact in zip(councillors, contact_data):
text = councillor.xpath(".//h3/a")[0].text_content()
if text.startswith("Councill"):
role = "Councillor"
district = "Abbotsford (seat {})".format(councillor_seat_number)
district = f"Abbotsford (seat {councillor_seat_number})"
councillor_seat_number += 1
else:
role = "Mayor"
Expand Down
18 changes: 3 additions & 15 deletions ca_bc_burnaby/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,39 +12,27 @@ def scrape(self):
councillors = page.xpath("//a[@class='biography__link']/@href")
assert len(councillors), "No councillors found"
for person_url in councillors:

def decode_email(e):
de = ""
k = int(e[:2], 16)

for i in range(2, len(e) - 1, 2):
de += chr(int(e[i : i + 2], 16) ^ k)

return de

page = self.lxmlize(person_url)

role, name = page.xpath("//h1/span")[0].text_content().strip().split(" ", 1)
photo_url = page.xpath('//img[@typeof="foaf:Image"]/@src')[0]

contact_node = page.xpath('//div[@class="contact"]')[0]

email = page.xpath('//div[@class = "contact__detail contact__detail--email"]/a/@href')[0]
decoded_email = decode_email(email.split("#", 1)[1]) # cloudflare encrypts the email data

email = self.get_email(contact_node)
phone = self.get_phone(contact_node, area_codes=[604, 778])

if role == "Mayor":
district = "Burnaby"
else:
district = "Burnaby (seat {})".format(councillor_seat_number)
district = f"Burnaby (seat {councillor_seat_number})"
councillor_seat_number += 1

p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url)
p.add_source(COUNCIL_PAGE)
p.add_source(person_url)
if email:
p.add_contact("email", decoded_email)
p.add_contact("email", email)
if phone:
p.add_contact("voice", phone, "legislature")
yield p
8 changes: 3 additions & 5 deletions ca_bc_coquitlam/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,16 @@


class CoquitlamPersonScraper(CanadianScraper):

def scrape(self):
def build_email(script):
w = re.findall(r'w = "(.*?)"', script)[0]
x = re.findall(r'x = "(.*?)"', script)[0]
email = w + "@" + x
return email
return w + "@" + x

councillor_seat_number = 1

page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0")
councillors = page.xpath('//table[@id="cityDirectoryDepartmentDetails"]/tr')
councillors = page.xpath('//table[contains(@id, "cityDirectoryDepartmentDetails")]/tr')
assert len(councillors), "No councillors found"
for councillor in councillors:
name = " ".join(
Expand All @@ -36,7 +34,7 @@ def build_email(script):
if role == "Mayor":
district = "Coquitlam"
else:
district = "Coquitlam (seat {})".format(councillor_seat_number)
district = f"Coquitlam (seat {councillor_seat_number})"
councillor_seat_number += 1

p = Person(primary_org="legislature", name=name, district=district, role=role)
Expand Down
4 changes: 2 additions & 2 deletions ca_bc_langley/people.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def scrape(self):
page = self.lxmlize(url)
name = page.xpath("//h1")[0].text_content().strip()

district = "Langley (seat {})".format(seat_number)
district = f"Langley (seat {seat_number})"
seat_number += 1
email = self.get_email(page)
phone = self.get_phone(page)
Expand All @@ -34,7 +34,7 @@ def scrape(self):
address_block = page.xpath('//p/a[@rel="noopener noreferrer"]/parent::p')[0].text_content()
line1 = address_block[address_block.find("Facility") + 8 : address_block.find("Langley,")]
line2 = address_block[address_block.find("Langley,") : address_block.find("Phone") - 1]
address = ", ".join([line1, line2])
address = f"{line1}, {line2}"
p = Person(primary_org="legislature", name=name, role="Mayor", district="Langley")
p.add_contact("email", email)
p.add_contact("voice", phone, "legislature")
Expand Down
Loading

0 comments on commit 707e503

Please sign in to comment.