From c8afbc3f5f22b802a4a78dd67975099637d752cb Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 17:08:25 -0400 Subject: [PATCH 01/66] Fix getting emails and phone numbers for Regina --- ca_sk_regina/people.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/ca_sk_regina/people.py b/ca_sk_regina/people.py index adfb1714..13565418 100644 --- a/ca_sk_regina/people.py +++ b/ca_sk_regina/people.py @@ -1,11 +1,10 @@ -import re from urllib.parse import urljoin from utils import CanadianPerson as Person from utils import CanadianScraper COUNCIL_PAGE = "https://www.regina.ca/city-government/city-council" -MAYOR_CONTACT_URL = "https://www.regina.ca/city-government/city-council/mayors-office" +MAYOR_CONTACT_URL = "https://www.regina.ca/city-government/city-council/mayors-office/contact-mayor/" class ReginaPersonScraper(CanadianScraper): @@ -26,7 +25,6 @@ def scrape(self): def councillor_data(self, url, name, ward): page = self.lxmlize(url) - # sadly, email is a form on a separate page photo_url_rel = page.xpath('//div[@class="councillor__image"]//img/@src')[0] photo_url = urljoin(url, photo_url_rel) @@ -34,12 +32,8 @@ def councillor_data(self, url, name, ward): m.add_source(COUNCIL_PAGE) m.add_source(url) - # Scrape and add phone. - phone_path = page.xpath('//div[@class="councillor__contact"]//ul/li/a/@href[contains(., "306")]')[0] - phone_string = phone_path.rsplit("/", 1)[-1] - phone = re.sub("[^0-9]", "", phone_string) - if phone: - m.add_contact("voice", phone, "legislature") + m.add_contact("voice", self.get_phone(page), "legislature") + m.add_contact("email", self.get_email(page)) m.image = photo_url yield m @@ -57,11 +51,8 @@ def mayor_data(self, url): m.add_source(url) m.image = photo_url - # Scrape and add phone. 
- phone_path = page.xpath('//div[@class="councillor__contact"]//ul/li/a/@href[contains(., "306")]')[0] - phone_string = phone_path.rsplit("/", 1)[-1] - phone = re.sub("[^0-9]", "", phone_string) - if phone: - m.add_contact("voice", phone, "legislature") + page = self.lxmlize(MAYOR_CONTACT_URL) + m.add_contact("voice", self.get_phone(page), "legislature") + m.add_contact("email", self.get_email(page)) return m From 019aba6f6e415f5b4cb8984b884baca310895ef7 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 16:57:56 -0400 Subject: [PATCH 02/66] Fix getting emails for Westmount --- ca_qc_westmount/people.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ca_qc_westmount/people.py b/ca_qc_westmount/people.py index a047d864..9ed24012 100644 --- a/ca_qc_westmount/people.py +++ b/ca_qc_westmount/people.py @@ -21,11 +21,14 @@ def scrape(self): role = "Conseiller" district = councillor.xpath(".//li//text()")[0] + email = self.get_email(councillor, error=False) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.image = councillor.xpath(".//@src")[0] p.add_contact("voice", self.get_phone(councillor), "legislature") - p.add_contact("email", self.get_email(councillor)) + if email: + p.add_contact("email", email) yield p From ef684413213ec47b3b97600d2505c5ea2e94b935 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 16:42:23 -0400 Subject: [PATCH 03/66] Fix Trois Rivieres scraper --- ca_qc_trois_rivieres/people.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ca_qc_trois_rivieres/people.py b/ca_qc_trois_rivieres/people.py index f0c2ea31..f8ff9ff9 100644 --- a/ca_qc_trois_rivieres/people.py +++ b/ca_qc_trois_rivieres/people.py @@ -11,15 +11,14 @@ class TroisRivieresPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - members = page.xpath('//div[@class="photos_conseillers"]//figure') + members = 
page.xpath('//div[contains(@class, "photos_conseillers")]//figure') assert len(members), "No councillors found" for member in members: photo_url = member.xpath(".//a//img/@src")[0] url = member.xpath(".//figcaption//a/@href")[0] - email = self.lxmlize(url).xpath('//div[@class="content-page"]//a[starts-with(@href, "mailto:")]/@href')[0] + email = self.get_email(self.lxmlize(url)) - email = re.sub("^mailto:", "", email) name, district = [x.strip() for x in member.xpath(".//figcaption//text()")] district = re.sub(r"\A(?:de|des|du) ", lambda match: match.group(0).lower(), district, flags=re.I) role = "Conseiller" From de58e5c13ec7ff71074135695ca61d28bbf44596 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 15:43:52 -0400 Subject: [PATCH 04/66] Update Sherbrooke scraper --- ca_qc_sherbrooke/people.py | 54 +++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 9048e2d4..bcf5c4a0 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -1,44 +1,66 @@ +import json + +import lxml.html + from utils import CanadianPerson as Person from utils import CanadianScraper, clean_french_prepositions -COUNCIL_PAGE = "http://www.ville.sherbrooke.qc.ca/mairie-et-vie-democratique/conseil-municipal/elus-municipaux/" +COUNCIL_PAGE = "https://www.sherbrooke.ca/fr/vie-municipale/elues-et-elus-municipaux" class SherbrookePersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) + districts = [] + + # The whole site is rendered with Javascript, but has part of the html documents in the scripts + def get_content(url): + page = self.lxmlize(url) + script = page.xpath(".//script[not(@type)]")[0].text_content() + data = script.split(" = ", 1)[1] + data = json.loads(data) + content = data["value"]["selected"]["content"]["fr"] + page = lxml.html.fromstring(content) + return page - councillors = page.xpath('//div[@id="c2087"]//a') + 
page = get_content(COUNCIL_PAGE) + councillors = page.xpath("//a[.//h3]") assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.text_content() - url = councillor.attrib["href"] - page = self.lxmlize(url) + name = councillor.xpath(".//h3")[0].text_content() + role = councillor.xpath('.//div[@class="poste"]')[0].text_content() - if "Maire" in page.xpath("//h2/text()")[0]: - district = "Sherbrooke" + if "Maire" in role: role = "Maire" + district = "Sherbrooke" else: - district = page.xpath('//div[@class="csc-default"]//a[contains(@href, "fileadmin")]/text()')[0] - district = clean_french_prepositions(district).replace("district", "").strip() role = "Conseiller" + district = councillor.xpath('.//div[@class="district"]')[0].text_content() + district = clean_french_prepositions(district).replace("District", "").strip() if district == "Lennoxville": district = "Arrondissement 3" + elif district == "Lac-Magog": + district = "Lac Magog" + districts.append(district) + url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0] + page = get_content(url) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(url) - p.image = page.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0] - parts = page.xpath('//li[contains(text(), "phone")]/text()')[0].split(":") - note = parts[0] - phone = parts[1] - p.add_contact(note, phone, note) - email = self.get_email(page) + image = councillor.xpath(".//@src")[0] + if "https://" not in image: + image = "https://contenu.maruche.ca" + image + p.image = image + phone = self.get_phone(page, error=False) + email = self.get_email(page, error=False) if email: p.add_contact("email", email) + if phone: + p.add_contact("voice", phone, "legislature") if district == "Brompton": p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/brompton/" elif district == "Lennoxville": 
p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/lennoxville/" yield p + print(districts) From d5f2c285cb684142c9f28f6bc991f107bed7cdec Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 14:29:21 -0400 Subject: [PATCH 05/66] Update Senneville scraper --- ca_qc_senneville/people.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ca_qc_senneville/people.py b/ca_qc_senneville/people.py index 6233f7da..2d0d3948 100644 --- a/ca_qc_senneville/people.py +++ b/ca_qc_senneville/people.py @@ -1,20 +1,23 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.villagesenneville.qc.ca/fr/7/conseil-municipal" +COUNCIL_PAGE = "https://www.senneville.ca/municipalite/vie-democratique/conseil-municipal/" class SennevillePersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//section[@class="block text"][./header/h2][position() > 1]') + councillors = page.xpath('//div[@class="wp-block-media-text is-stacked-on-mobile"]') assert len(councillors), "No councillors found" for councillor in councillors: - role_and_district, name = councillor.xpath(".//h2/text()")[0].split("-") - role, district = role_and_district.split(" ", 1) - if role == "Maire": + role_and_district, name = councillor.xpath(".//h2")[0].text_content().split(" – ") + if "Maire" in role_and_district: + role = "Maire" district = "Senneville" + else: + role, district = role_and_district.split(" ", 1) + email = self.get_email(councillor) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) From 0bfd276dba6731793b572af6f9409ec54c77cd5e Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 10:38:56 -0400 Subject: [PATCH 06/66] Add support for cloudflare obfuscated emails in get_email method --- ca_bc_burnaby/people.py | 16 ++-------------- ca_pe_charlottetown/people.py | 19 
++----------------- ca_qc_cote_saint_luc/people.py | 13 +------------ ca_qc_kirkland/people.py | 12 +----------- utils.py | 14 ++++++++++++++ 5 files changed, 20 insertions(+), 54 deletions(-) diff --git a/ca_bc_burnaby/people.py b/ca_bc_burnaby/people.py index 981607ae..07155a92 100644 --- a/ca_bc_burnaby/people.py +++ b/ca_bc_burnaby/people.py @@ -12,16 +12,6 @@ def scrape(self): councillors = page.xpath("//a[@class='biography__link']/@href") assert len(councillors), "No councillors found" for person_url in councillors: - - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(person_url) role, name = page.xpath("//h1/span")[0].text_content().strip().split(" ", 1) @@ -29,9 +19,7 @@ def decode_email(e): contact_node = page.xpath('//div[@class="contact"]')[0] - email = page.xpath('//div[@class = "contact__detail contact__detail--email"]/a/@href')[0] - decoded_email = decode_email(email.split("#", 1)[1]) # cloudflare encrypts the email data - + email = self.get_email(contact_node) phone = self.get_phone(contact_node, area_codes=[604, 778]) if role == "Mayor": @@ -44,7 +32,7 @@ def decode_email(e): p.add_source(COUNCIL_PAGE) p.add_source(person_url) if email: - p.add_contact("email", decoded_email) + p.add_contact("email", email) if phone: p.add_contact("voice", phone, "legislature") yield p diff --git a/ca_pe_charlottetown/people.py b/ca_pe_charlottetown/people.py index 4bdc4c5d..1f66dc69 100644 --- a/ca_pe_charlottetown/people.py +++ b/ca_pe_charlottetown/people.py @@ -8,15 +8,6 @@ class CharlottetownPersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") nodes = page.xpath('//div[@id="ctl00_ContentPlaceHolder1_ctl13_divContent"]/*') @@ -52,14 +43,8 @@ def 
decode_email(e): p.image = image - for node in group: - email_node = node.xpath("//a[span/@data-cfemail]") - if email_node: - email = email_node[0].xpath("./@href")[0].split("#")[1] - break - - decoded_email = decode_email(email).split("?")[0] - p.add_contact("email", decoded_email) + email = self.get_email(para) + p.add_contact("email", email) for text in para.xpath('.//strong[contains(., "Phone")]/following-sibling::text()'): if re.search(r"\d", text): diff --git a/ca_qc_cote_saint_luc/people.py b/ca_qc_cote_saint_luc/people.py index 011b005e..56a1f225 100644 --- a/ca_qc_cote_saint_luc/people.py +++ b/ca_qc_cote_saint_luc/people.py @@ -7,15 +7,6 @@ class CoteSaintLucPersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT) councillors = page.xpath('//div/div[contains(@class, "gb-container gb-container-") and .//img]') assert len(councillors), "No councillors found" @@ -39,13 +30,11 @@ def decode_email(e): blog = councillor.xpath( './/p[contains(.,"Blog")]//@href[not(contains(., "twitter") or contains(., "facebook"))]' ) - encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] - email = decode_email(encrypted_email) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.add_contact("email", email) + p.add_contact("email", self.get_email(councillor)) p.add_contact("voice", self.get_phone(councillor, area_codes=[514]), "legislature") p.image = image if twitter: diff --git a/ca_qc_kirkland/people.py b/ca_qc_kirkland/people.py index 3425e57c..3f0bab4b 100644 --- a/ca_qc_kirkland/people.py +++ b/ca_qc_kirkland/people.py @@ -8,15 +8,6 @@ class KirklandPersonScraper(CanadianScraper): def scrape(self): - def decode_email(e): - de = "" - k = int(e[:2], 16) - - for i in range(2, len(e) - 1, 
2): - de += chr(int(e[i : i + 2], 16) ^ k) - - return de - page = self.lxmlize(COUNCIL_PAGE) councillors = page.xpath('//div[@class="container_content"]//tbody/tr') @@ -39,8 +30,7 @@ def decode_email(e): .replace(".", ",") # correcting a typo .replace(",-#-", " x") ) - encrypted_email = councillor.xpath('.//@href[contains(., "email")]')[0].split("#")[1] - email = decode_email(encrypted_email) + email = self.get_email(councillor) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) diff --git a/utils.py b/utils.py index 93916f03..4f18e78d 100644 --- a/utils.py +++ b/utils.py @@ -124,6 +124,9 @@ def get_email(self, node, expression=".", *, error=True): # e.g. ca_bc has one `href` of `mailto:first.last.mla@leg.bc.ca`. for match in node.xpath('{}//a[contains(@href, "mailto:")]'.format(expression)): matches.append(unquote(match.attrib["href"])) + # some emails are obfuscated by cloudflare + for match in node.xpath('{}//@href[contains(., "cdn-cgi/l/email-protection")]'.format(expression)): + matches.append(self._cloudflare_decode(match)) # If the node has no sub-tags. if not matches: for match in node.xpath('{}//text()[contains(., "@")]'.format(expression)): @@ -138,6 +141,17 @@ def get_email(self, node, expression=".", *, error=True): elif error: raise Exception("No email node in {}".format(etree.tostring(node))) + # Helper function for self.get_email + def _cloudflare_decode(self, link): + hex_email = link.split("#", 1)[1] + decoded_email = "" + key = int(hex_email[:2], 16) + + for i in range(2, len(hex_email) - 1, 2): + decoded_email += chr(int(hex_email[i : i + 2], 16) ^ key) + + return decoded_email + def get_phone(self, node, *, area_codes=[], error=True): """ Don't use if multiple telephone numbers are present, e.g. voice and fax. 
From 4d277796baceb35c8dfcd919a4fc10cb60d32da9 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 5 Jun 2024 15:17:00 -0400 Subject: [PATCH 07/66] Update Montreal Est scraper --- ca_qc_montreal_est/people.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py index 9dd0c650..5409ca97 100644 --- a/ca_qc_montreal_est/people.py +++ b/ca_qc_montreal_est/people.py @@ -1,28 +1,26 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://ville.montreal-est.qc.ca/la-ville/conseil-municipal/conseils-municipaux/" +COUNCIL_PAGE = "https://ville.montreal-est.qc.ca/vie-democratique/conseil-municipal/" class MontrealEstPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - - councillors = page.xpath("//table") + councillors = page.xpath("//div[contains(@id, 'membres-conseil-block_')]") assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.xpath(".//h3")[0].text_content() + name, role_district = councillor.xpath(".//span[@class='h3 d-block']")[0].text_content().split(" – ", 1) - if "maire" in name: - name = name.split(" ", 2)[-1] + if "Maire" in role_district: district = "Montréal-Est" role = "Maire" else: - district = "District {}".format(councillor.xpath(".//h3")[1].text_content()[-1]) + district = "District {}".format(role_district[-1]) role = "Conseiller" p = Person(primary_org="legislature", name=name, district=district, role=role) - p.image = councillor.xpath(".//@src")[0] + p.image = councillor.xpath(".//@data-lazy-src")[0] p.add_contact("email", self.get_email(councillor)) p.add_source(COUNCIL_PAGE) yield p From 28e0bbacf7ac47d5eb881e1d1ca880f5ab8e2f35 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 01:16:08 -0400 Subject: [PATCH 08/66] chore: Titlecase comment --- utils.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/utils.py b/utils.py index 4f18e78d..9de9e5ea 100644 --- a/utils.py +++ b/utils.py @@ -124,7 +124,7 @@ def get_email(self, node, expression=".", *, error=True): # e.g. ca_bc has one `href` of `mailto:first.last.mla@leg.bc.ca`. for match in node.xpath('{}//a[contains(@href, "mailto:")]'.format(expression)): matches.append(unquote(match.attrib["href"])) - # some emails are obfuscated by cloudflare + # Some emails are obfuscated by Cloudflare. for match in node.xpath('{}//@href[contains(., "cdn-cgi/l/email-protection")]'.format(expression)): matches.append(self._cloudflare_decode(match)) # If the node has no sub-tags. From c18d7b33d0958d8aef8d17381bb08b2c81b24d2c Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 01:16:26 -0400 Subject: [PATCH 09/66] ca_on_school_boards_english_public: Delete --- .../__init__.py | 25 ------------------- ca_on_school_boards_english_public/people.py | 18 ------------- 2 files changed, 43 deletions(-) delete mode 100644 ca_on_school_boards_english_public/__init__.py delete mode 100644 ca_on_school_boards_english_public/people.py diff --git a/ca_on_school_boards_english_public/__init__.py b/ca_on_school_boards_english_public/__init__.py deleted file mode 100644 index bc76eb9f..00000000 --- a/ca_on_school_boards_english_public/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from opencivicdata.divisions import Division -from pupa.scrape import Organization - -from utils import CanadianJurisdiction - - -class OntarioEnglishPublicSchoolBoards(CanadianJurisdiction): - classification = "school" # just to avoid clash - division_id = "ocd-division/country:ca/province:on" - division_name = 'Ontario English Public School Board boundary"' - name = "Ontario English Public School Boards" - url = "http://www.edu.gov.on.ca/eng/sbinfo/boardList.html" - - def get_organizations(self): - organization = Organization(self.name, classification="committee") - 
organization.add_source(self.url) - - for division in Division.get(self.division_id).children("school_district"): - organization.add_post(role="Chair", label=division.name, division_id=division.id) - for i in range(0, 22): # XXX made-up number - organization.add_post( - role="Trustee", label="{} (seat {})".format(division.name, i), division_id=division.id - ) - - yield organization diff --git a/ca_on_school_boards_english_public/people.py b/ca_on_school_boards_english_public/people.py deleted file mode 100644 index 2ee7369a..00000000 --- a/ca_on_school_boards_english_public/people.py +++ /dev/null @@ -1,18 +0,0 @@ -from datetime import date - -from utils import CSVScraper - - -class OntarioEnglishPublicSchoolBoardsPersonScraper(CSVScraper): - # CSV source: https://docs.google.com/spreadsheets/d/1smXFR3nB9lovc6bWWcLvr621wb6E5b2TZKqUtxRTUtE/edit#gid=785048945 - csv_url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTbnQN0j_2Ky56MeRQsNTYXnt9Q6f_vFgH_KyAZ3O96QhjLqMK_Fzrjz2lI8ympE1FU0lkKgbGEvjW0/pub?gid=785048945&single=true&output=csv" - updated_at = date(2019, 9, 13) - contact_person = "andrew@newmode.net" - many_posts_per_area = True - unique_roles = ["Chair"] - encoding = "utf-8" - corrections = {"district name": {}} - organization_classification = "committee" - - def is_valid_row(self, row): - return any(row.values()) and row["last name"] and row["first name"] From 8889ea8d9fa17170ba567250e773b9633a8e1e9c Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Tue, 4 Jun 2024 15:14:52 -0400 Subject: [PATCH 10/66] Update Montreal scraper --- ca_qc_montreal/__init__.py | 2 +- ca_qc_montreal/people.py | 21 +++++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/ca_qc_montreal/__init__.py b/ca_qc_montreal/__init__.py index d881c7dd..dfa79166 100644 --- a/ca_qc_montreal/__init__.py +++ b/ca_qc_montreal/__init__.py @@ -17,7 +17,7 @@ class Montreal(CanadianJurisdiction): {"name": "Projet Montréal - Équipe Valérie Plante"}, {"name": "Vrai 
changement pour Montréal"}, {"name": "Équipe Anjou"}, - {"name": "Équipe Barbe Team"}, + {"name": "Équipe LaSalle Team"}, {"name": "Équipe Dauphin Lachine"}, {"name": "Équipe Denis Coderre pour Montréal"}, ] diff --git a/ca_qc_montreal/people.py b/ca_qc_montreal/people.py index 7bf5eeb1..046e95e0 100644 --- a/ca_qc_montreal/people.py +++ b/ca_qc_montreal/people.py @@ -3,15 +3,19 @@ class MontrealPersonScraper(CSVScraper): # http://donnees.ville.montreal.qc.ca/dataset/listes-des-elus-de-la-ville-de-montreal - csv_url = "http://donnees.ville.montreal.qc.ca/dataset/381d74ca-dadd-459f-95c9-db255b5f4480/resource/ce1315a3-50ee-48d0-a0f0-9bcc15f65643/download/listeelusmontreal.csv" + csv_url = "https://donnees.montreal.ca/dataset/381d74ca-dadd-459f-95c9-db255b5f4480/resource/ce1315a3-50ee-48d0-a0f0-9bcc15f65643/download/liste_elus_montreal.csv" encoding = "utf-8" locale = "fr" corrections = { "primary role": { # Normalize to masculine role descriptor. "Conseillère de la ville": "Conseiller de la ville", + "Conseiller(ère) de la ville": "Conseiller de la ville", + "Conseiller(ère) de la Ville": "Conseiller de la ville", "Mairesse d'arrondissement": "Maire d'arrondissement", + "Maire(sse) d'arrondissement": "Maire d'arrondissement", "Mairesse de la Ville de Montréal": "Maire de la Ville de Montréal", + "Maire(sse)": "Maire de la Ville de Montréal", "Mairesse suppl\u00e9ante d'arrondissement": "Conseiller de la ville", }, "arrondissement": { @@ -23,18 +27,23 @@ class MontrealPersonScraper(CSVScraper): "Rivière-des-Prairies - Pointe-aux-Trembles": "Rivière-des-Prairies—Pointe-aux-Trembles", "Rosemont-La Petite-Patrie": "Rosemont—La Petite-Patrie", "Villeray - Saint-Michel - Parc-Extension": "Villeray—Saint-Michel—Parc-Extension", + # Name. 
+ "Ville de Montr\u00e9al": "Montr\u00e9al", }, "district name": { "Champlain—L'Île-des-Sœurs": "Champlain—L'Île-des-Soeurs", "De Lorimier": "DeLorimier", - "Saint-Henri-Est-Petite-Bourgogne-Pointe-Saint-Charles-Griffintown": "Saint-Henri—Petite-Bourgogne—Pointe-Saint-Charles", - "Saint-Paul-Émard-Saint-Henri-Ouest": "Saint-Paul—Émard", + "Saint-Henri-Est–Petite-\nBourgogne–Pointe-Saint-\nCharles–Griffintown": "Saint-Henri—Petite-Bourgogne—Pointe-Saint-Charles", + "Saint-Paul–Émard– \nSaint-Henri-Ouest": "Saint-Paul—Émard", # Hyphens. "Maisonneuve-Longue-Pointe": "Maisonneuve—Longue-Pointe", "Norman McLaren": "Norman-McLaren", + "Saint-Léonard Ouest": "Saint-Léonard-Ouest", + "Saint-Léonard Est": "Saint-Léonard-Est", }, "party name": { "Indépendante": "Indépendant", + "Ind\u00e9pendant(e)": "Indépendant", }, "gender": { "Madame": "female", @@ -58,4 +67,8 @@ def header_converter(self, s): }.get(s, s) def is_valid_row(self, row): - return row["primary role"] not in ("Conseiller d'arrondissement", "Conseillère d'arrondissement") + return row["primary role"] not in ( + "Conseiller d'arrondissement", + "Conseillère d'arrondissement", + "Conseiller(\u00e8re) d'arrondissement", + ) From 19d75a6b96685f83fb78a21e26bedce6cdd06f4e Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Tue, 4 Jun 2024 14:18:51 -0400 Subject: [PATCH 11/66] Update Mercier scraper --- ca_qc_mercier/people.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/ca_qc_mercier/people.py b/ca_qc_mercier/people.py index e66dd79a..1d879f34 100644 --- a/ca_qc_mercier/people.py +++ b/ca_qc_mercier/people.py @@ -1,31 +1,40 @@ -import re - -from utils import CUSTOM_USER_AGENT from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.ville.mercier.qc.ca/02_viedemocratique/default.asp" +COUNCIL_PAGE = "https://www.ville.mercier.qc.ca/affaires-municipales/conseil-municipal/membres-du-conseil/" class 
MercierPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT, encoding="windows-1252") + page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//table[@width="800"]/tr') + councillors = page.xpath('//div[@class="wp-block-team-member"]') assert len(councillors), "No councillors found" for councillor in councillors: - if councillor == councillors[0]: - name = councillor.xpath(".//strong/text()")[0].replace("Monsieur", "").replace("Madame", "").strip() - role = "Maire" - district = "Mercier" - else: - name = councillor.xpath(".//strong/text()")[0].replace("Monsieur", "").replace("Madame", "").strip() - role = "Conseiller" - district = "District {}".format(re.search(r"(\d)", councillor.xpath(".//text()")[3]).group(1)) + name = councillor.xpath(".//h4/text()")[0] + district = councillor.xpath(".//h5/text()")[0].split(" – ")[1] email = self.get_email(councillor) + phone = self.get_phone(councillor) + image = councillor.xpath(".//img/@src")[0] - p = Person(primary_org="legislature", name=name, district=district, role=role) + p = Person(primary_org="legislature", name=name, district=district, role="Conseiller", image=image) p.add_source(COUNCIL_PAGE) p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + yield p + + mayor_node = page.xpath('//div[@class="wp-block-media-text alignwide is-stacked-on-mobile"]')[0] + name = mayor_node.xpath(".//h1")[0].text_content() + + email = self.get_email(mayor_node) + phone = self.get_phone(mayor_node) + image = mayor_node.xpath(".//img/@src")[0] + + p = Person(primary_org="legislature", name=name, district="Mercier", role="Maire", image=image) + p.add_source(COUNCIL_PAGE) + p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + + yield p From ead0eff7246b90eb370038dd2fb7649ef738de78 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Tue, 4 Jun 2024 13:37:19 -0400 Subject: [PATCH 12/66] Get mayor url without hardcoding for 
Windsor --- ca_on_windsor/people.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/ca_on_windsor/people.py b/ca_on_windsor/people.py index 62b21688..707da754 100644 --- a/ca_on_windsor/people.py +++ b/ca_on_windsor/people.py @@ -1,14 +1,29 @@ +import json + +import requests + from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "https://www.citywindsor.ca/mayor-and-council/city-councillors" -MAYOR_PAGE = "https://www.citywindsor.ca/mayor-and-council/mayor-drew-dilkens" +COUNCIL_PAGE = "https://www.citywindsor.ca/mayor-and-council" class WindsorPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - + data_url = page.xpath('//comment()[contains(., "SITE JS")]/following-sibling::script/@src')[0] + data = json.loads(requests.get(data_url).text.split(" = ")[1]) + nav_items = [] + for item in data: + if item["RollupType"] == "SidebarNavigation": + nav_items = item["RollupFields"] + for item in nav_items: + if item["Title"].startswith("Mayor") and item["Parent"] == "Mayor and City Council": + mayor_url = "https://www.citywindsor.ca" + item["RelativeURL"] + if "Councillors" in item["Title"]: + councillors_url = "https://www.citywindsor.ca" + item["RelativeURL"] + + page = self.lxmlize(councillors_url, user_agent="Mozilla/5.0") councillors = page.xpath("//h2") assert len(councillors), "No councillors found" for councillor in councillors: @@ -28,12 +43,12 @@ def scrape(self): yield p - page = self.lxmlize(MAYOR_PAGE) + page = self.lxmlize(mayor_url) title = page.xpath("//h1")[0].text_content() name = title.replace("Mayor ", "") image = page.xpath('//img[contains(./@alt, "Mayor")]/@src')[0] p = Person(primary_org="legislature", name=name, district="Windsor", role="Mayor", image=image) - p.add_source(MAYOR_PAGE) + p.add_source(mayor_url) yield p From 2531e7d1cedc16d7e2b0ef5f56dee50eb477f8c7 Mon Sep 17 00:00:00 2001 From: James McKinney 
<26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 01:26:56 -0400 Subject: [PATCH 13/66] ca_qc_sherbrooke: Remove print() --- ca_qc_sherbrooke/people.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index bcf5c4a0..054bbcb8 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -63,4 +63,3 @@ def get_content(url): elif district == "Lennoxville": p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/lennoxville/" yield p - print(districts) From d55c963af84da8303191321cd5c969eb88117b12 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 11:06:18 -0400 Subject: [PATCH 14/66] Update halifax scraper --- ca_ns_halifax/people.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ca_ns_halifax/people.py b/ca_ns_halifax/people.py index c8cf9e7b..8709c515 100644 --- a/ca_ns_halifax/people.py +++ b/ca_ns_halifax/people.py @@ -10,25 +10,24 @@ class HalifaxPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@id = "block-districtdistrictindex"]/ul/li')[1:] + page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") + councillors = page.xpath('//div[@id = "block-districtdistrictindex"]//ul/li')[1:] assert len(councillors), "No councillors found" for councillor in councillors: photo_div = councillor.xpath("./a/div[1]")[0] info_div = councillor.xpath("./a/div[2]")[0] district = re.sub(r"\s*[–—-]\s*", "—", "—".join(info_div.xpath("./p/text()"))) - # FIXME: we special-case one malformed district name. 
If you're editing this file, - # try removing these lines - if district.startswith("District 16 "): - district = district[len("District 16 ") :] + # District name different than in database + if "Westphal" in district: + district = "Cole Harbour—Westphal" name = info_div.xpath("./strong/p/text()")[0].replace("Councillor ", "").replace("Deputy Mayor ", "") if name != "To be determined": photo = photo_div.xpath(".//img/@src")[0] url = councillor.xpath("./a/@href")[0] - councillor_page = self.lxmlize(url) + councillor_page = self.lxmlize(url, user_agent="Mozilla/5.0") contact_node = councillor_page.xpath('//div[@id = "block-districtdistrictprofile"]')[0] phone = self.get_phone(contact_node, area_codes=[902]) @@ -42,7 +41,7 @@ def scrape(self): p.image = photo yield p - mayor_page = self.lxmlize(MAYOR_PAGE, "iso-8859-1") + mayor_page = self.lxmlize(MAYOR_PAGE, "iso-8859-1", user_agent="Mozilla/5.0") name = " ".join(mayor_page.xpath("//h1/text()")).replace("Mayor", "").strip() contact_div = mayor_page.xpath('//aside[contains(@class, "layout-sidebar-second")]/section/div[1]')[0] phone = self.get_phone(contact_div.xpath("./p[2]")[0]) From 36bdc514eaa72305a8f49a139213d2057afd4853 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 5 Jun 2024 16:59:28 -0400 Subject: [PATCH 15/66] Update Quebec City scraper --- ca_qc_quebec/people.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/ca_qc_quebec/people.py b/ca_qc_quebec/people.py index 129a8f1b..abca0be2 100644 --- a/ca_qc_quebec/people.py +++ b/ca_qc_quebec/people.py @@ -25,18 +25,35 @@ def scrape(self): role = "Maire" else: district = councillor.xpath('./p[@itemprop="jobTitle"]/a/text()')[0] - district = re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.U).group( - 1 + district = ( + re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.U) + .group(1) + .strip() ) role = "Conseiller" - if district == "Saules": + if 
district == "Saules–Les Méandres": district = "Les Saules" + elif district == "Neufch\u00e2tel\u2013Lebourgneuf": + district = "Neufchâtel-Lebourgneuf" + elif district == "Loretteville\u2013Les Ch\u00e2tels": + district = "Loretteville-Les Ch\u00e2tels" else: district = re.sub(r"–", "—", district) # n-dash, m-dash - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - p.image = councillor.xpath("./figure//@src")[0] - p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature") - yield p + districts = [district] + + borough = None + borough_strings = councillor.xpath('.//p[@itemprop = "affiliation"]/text()') + for string in borough_strings: + borough = re.findall(r"Présidente? de l’arrondissement (.*)$", string) + if borough: + borough = borough[0].replace("des", "Les").replace("de ", "") + districts.append(borough) + + for district in districts: + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.image = councillor.xpath("./figure//@src")[0] + p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature") + yield p From 018656639d13d755316feb408d8c71fd097744f4 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 5 Jun 2024 18:40:08 -0400 Subject: [PATCH 16/66] Update Saguenay scraper --- ca_qc_saguenay/people.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/ca_qc_saguenay/people.py b/ca_qc_saguenay/people.py index 5cf7e813..98655ee5 100644 --- a/ca_qc_saguenay/people.py +++ b/ca_qc_saguenay/people.py @@ -10,14 +10,12 @@ class SaguenayPersonScraper(CanadianScraper): def scrape(self): mayor_page = self.lxmlize(MAYOR_PAGE) contact_page = self.lxmlize(CONTACT_PAGE) - - name = mayor_page.xpath('//span/text()[contains(., "maire")]')[0].split(", ", 1)[0] + name = mayor_page.xpath('//a[contains(., "maire")]/span/text()')[0] p = Person(primary_org="legislature", name=name, 
district="Saguenay", role="Maire") p.add_source(MAYOR_PAGE) p.add_source(CONTACT_PAGE) node = contact_page.xpath('//h2[contains(., "Coordonnées du cabinet")]/following-sibling::p')[1] p.add_contact("voice", self.get_phone(node, area_codes=[418]), "legislature") - p.add_contact("email", self.get_email(node)) yield p page = self.lxmlize(COUNCIL_PAGE) @@ -26,6 +24,19 @@ def scrape(self): for councillor in councillors: district = councillor.xpath("./h3/text()")[0].replace("#", "") name = councillor.xpath(".//p/text()")[0] + borough = None + borough_node = councillor.xpath(".//p/strong") + if borough_node: + text = borough_node[0].text_content() + if "Président" in text: + borough = text.replace("Président de l'arrondissement de ", "") + + if borough: + p = Person(primary_org="legislature", name=name, district=borough, role="Conseiller") + p.add_source(COUNCIL_PAGE) + p.add_contact("voice", self.get_phone(councillor), "legislature") + p.add_contact("email", self.get_email(councillor)) + yield p p = Person(primary_org="legislature", name=name, district=district, role="Conseiller") p.add_source(COUNCIL_PAGE) From f8cc83458299357006031610fc8cae86e6be2d75 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 01:49:02 -0400 Subject: [PATCH 17/66] ca_qc_sherbrooke: "Arrondissement 3" shadows later Lennoxville logic --- ca_qc_sherbrooke/people.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 054bbcb8..524aa898 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -37,9 +37,7 @@ def get_content(url): district = councillor.xpath('.//div[@class="district"]')[0].text_content() district = clean_french_prepositions(district).replace("District", "").strip() - if district == "Lennoxville": - district = "Arrondissement 3" - elif district == "Lac-Magog": + if district == "Lac-Magog": district = "Lac Magog" 
districts.append(district) url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0] From b9a0109e29503b21fdb3c994ad137fff8483abc7 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 01:54:39 -0400 Subject: [PATCH 18/66] ca_qc_quebec/ca_qc_saguenay: Set boundary_url --- ca_qc_quebec/people.py | 4 ++++ ca_qc_saguenay/people.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/ca_qc_quebec/people.py b/ca_qc_quebec/people.py index abca0be2..18d22281 100644 --- a/ca_qc_quebec/people.py +++ b/ca_qc_quebec/people.py @@ -1,5 +1,7 @@ import re +from django.template.defaultfilters import slugify + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -56,4 +58,6 @@ def scrape(self): p.add_source(COUNCIL_PAGE) p.image = councillor.xpath("./figure//@src")[0] p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature") + if borough: + p._related[0].extras["boundary_url"] = f"/boundaries/quebec-boroughs/{slugify(borough)}" yield p diff --git a/ca_qc_saguenay/people.py b/ca_qc_saguenay/people.py index 98655ee5..5a0a44fd 100644 --- a/ca_qc_saguenay/people.py +++ b/ca_qc_saguenay/people.py @@ -1,3 +1,5 @@ +from django.template.defaultfilters import slugify + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -36,6 +38,7 @@ def scrape(self): p.add_source(COUNCIL_PAGE) p.add_contact("voice", self.get_phone(councillor), "legislature") p.add_contact("email", self.get_email(councillor)) + p._related[0].extras["boundary_url"] = f"/boundaries/saguenay-boroughs/{slugify(borough)}" yield p p = Person(primary_org="legislature", name=name, district=district, role="Conseiller") From cff980adb28408c579b7f34ed67354adaf3fa9e4 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 02:01:46 -0400 Subject: [PATCH 19/66] fix: Add trailing slash for boroughs --- ca_qc_quebec/people.py | 2 +- 
ca_qc_saguenay/people.py | 2 +- ca_qc_sherbrooke/people.py | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/ca_qc_quebec/people.py b/ca_qc_quebec/people.py index 18d22281..187b153e 100644 --- a/ca_qc_quebec/people.py +++ b/ca_qc_quebec/people.py @@ -59,5 +59,5 @@ def scrape(self): p.image = councillor.xpath("./figure//@src")[0] p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature") if borough: - p._related[0].extras["boundary_url"] = f"/boundaries/quebec-boroughs/{slugify(borough)}" + p._related[0].extras["boundary_url"] = f"/boundaries/quebec-boroughs/{slugify(borough)}/" yield p diff --git a/ca_qc_saguenay/people.py b/ca_qc_saguenay/people.py index 5a0a44fd..2979234c 100644 --- a/ca_qc_saguenay/people.py +++ b/ca_qc_saguenay/people.py @@ -38,7 +38,7 @@ def scrape(self): p.add_source(COUNCIL_PAGE) p.add_contact("voice", self.get_phone(councillor), "legislature") p.add_contact("email", self.get_email(councillor)) - p._related[0].extras["boundary_url"] = f"/boundaries/saguenay-boroughs/{slugify(borough)}" + p._related[0].extras["boundary_url"] = f"/boundaries/saguenay-boroughs/{slugify(borough)}/" yield p p = Person(primary_org="legislature", name=name, district=district, role="Conseiller") diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 524aa898..86bf9f1a 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -37,7 +37,9 @@ def get_content(url): district = councillor.xpath('.//div[@class="district"]')[0].text_content() district = clean_french_prepositions(district).replace("District", "").strip() - if district == "Lac-Magog": + if district == "Lennoxville": + district = "Arrondissement 3" + elif district == "Lac-Magog": district = "Lac Magog" districts.append(district) url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0] @@ -58,6 +60,6 @@ def get_content(url): p.add_contact("voice", phone, "legislature") if district == "Brompton": 
p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/brompton/" - elif district == "Lennoxville": + elif district == "Arrondissement 3": p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/lennoxville/" yield p From a63823eab06b29bc8e43d48b85c8c0eb2692a795 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 02:18:25 -0400 Subject: [PATCH 20/66] ca_qc_quebec: Fix assignment of borough boundary URLs --- ca_qc_quebec/people.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ca_qc_quebec/people.py b/ca_qc_quebec/people.py index 187b153e..fbecc631 100644 --- a/ca_qc_quebec/people.py +++ b/ca_qc_quebec/people.py @@ -53,11 +53,11 @@ def scrape(self): borough = borough[0].replace("des", "Les").replace("de ", "") districts.append(borough) - for district in districts: + for i, district in enumerate(districts): p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.image = councillor.xpath("./figure//@src")[0] p.add_contact("voice", self.get_phone(councillor, area_codes=[418]), "legislature") - if borough: - p._related[0].extras["boundary_url"] = f"/boundaries/quebec-boroughs/{slugify(borough)}/" + if i: + p._related[0].extras["boundary_url"] = f"/boundaries/quebec-boroughs/{slugify(district)}/" yield p From 0900e5c5e61e1ba2f4c70665fad6486acf5c6c81 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 02:19:33 -0400 Subject: [PATCH 21/66] ca_qc_sherbrooke: Remove hardcoding of district councillor to borough president --- ca_qc_sherbrooke/people.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 86bf9f1a..13d2440e 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -37,6 +37,7 @@ def get_content(url): district = 
councillor.xpath('.//div[@class="district"]')[0].text_content() district = clean_french_prepositions(district).replace("District", "").strip() + # The shapefile in represent-canada-data is missing the Lennoxville district. if district == "Lennoxville": district = "Arrondissement 3" elif district == "Lac-Magog": From 55ead89f9787d72798ffaae6ba52b448af96a90a Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 02:37:40 -0400 Subject: [PATCH 22/66] ca_qc_sherbrooke: Don't hardcode borough presidents to specific districts --- ca_qc_sherbrooke/people.py | 9 +-------- country-ca.csv | 8 ++++---- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 13d2440e..fc73d969 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -37,10 +37,7 @@ def get_content(url): district = councillor.xpath('.//div[@class="district"]')[0].text_content() district = clean_french_prepositions(district).replace("District", "").strip() - # The shapefile in represent-canada-data is missing the Lennoxville district. 
- if district == "Lennoxville": - district = "Arrondissement 3" - elif district == "Lac-Magog": + if district == "Lac-Magog": district = "Lac Magog" districts.append(district) url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0] @@ -59,8 +56,4 @@ def get_content(url): p.add_contact("email", email) if phone: p.add_contact("voice", phone, "legislature") - if district == "Brompton": - p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/brompton/" - elif district == "Arrondissement 3": - p._related[0].extras["boundary_url"] = "/boundaries/sherbrooke-boroughs/lennoxville/" yield p diff --git a/country-ca.csv b/country-ca.csv index 3dbb69c6..5e17cf91 100644 --- a/country-ca.csv +++ b/country-ca.csv @@ -2550,10 +2550,10 @@ ocd-division/country:ca/csd:2442110/district:4,District 4,,,,,,,,,,,,,, ocd-division/country:ca/csd:2442110/district:5,District 5,,,,,,,,,,,,,, ocd-division/country:ca/csd:2442110/district:6,District 6,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027,Sherbrooke,,,,,V,Y,Sherbrooke,,Ville de Sherbrooke,,24,,, -ocd-division/country:ca/csd:2443027/borough:1,Arrondissement 1,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2443027/borough:2,Arrondissement 2,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2443027/borough:3,Arrondissement 3,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2443027/borough:4,Arrondissement 4,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2443027/borough:1,Brompton–Rock Forest–Saint-Élie–Deauville,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2443027/borough:2,Fleurimont,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2443027/borough:3,Lennoxville,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2443027/borough:4,Nations,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027/district:1.1,Lac Magog,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027/district:1.2,Rock Forest,,,,,,,,,,,,,, ocd-division/country:ca/csd:2443027/district:1.3,Saint-Élie,,,,,,,,,,,,,, From ecff43c141c3bc4184cef97ee7607d067beb7d57 Mon Sep 17 00:00:00 2001 From: 
Rafe Murray Date: Thu, 6 Jun 2024 14:20:53 -0400 Subject: [PATCH 23/66] Fix getting phone numbers for Saint Jerome --- ca_qc_saint_jerome/people.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ca_qc_saint_jerome/people.py b/ca_qc_saint_jerome/people.py index 9defa48d..23a6f19d 100644 --- a/ca_qc_saint_jerome/people.py +++ b/ca_qc_saint_jerome/people.py @@ -21,13 +21,14 @@ def scrape(self): role = "Conseiller" image = councillor.xpath('.//div[@class="portrait_single"]/img/@data-lazy-src')[0] - contact = councillor.xpath('.//div[contains(@class,"phone")]/text()')[0] + phone = self.get_phone(councillor, error=False) p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.image = image - p.add_contact("voice", contact, "legislature") + if phone: + p.add_contact("voice", phone, "legislature") p.add_contact("email", self.get_email(councillor)) yield p From 366d344242c25c4a60de669ba996ac9fa2b46fd5 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 6 Jun 2024 16:33:41 -0400 Subject: [PATCH 24/66] Update Terrebonne scraper --- ca_qc_terrebonne/people.py | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/ca_qc_terrebonne/people.py b/ca_qc_terrebonne/people.py index 023b0066..40b87c9e 100644 --- a/ca_qc_terrebonne/people.py +++ b/ca_qc_terrebonne/people.py @@ -1,34 +1,33 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.ville.terrebonne.qc.ca/fr/10/Conseil_municipal" +COUNCIL_PAGE = "https://terrebonne.ca/membres-du-conseil-municipal/" class TerrebonnePersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE, "utf-8") - councillors = page.xpath('//div[contains(@class, "member-box member-box--")]') + councillors = page.xpath('//div[contains(@class, "member-card jsBlockLink")]') assert len(councillors), "No councillors found" for councillor in councillors: - name = 
councillor.xpath('.//div[@class="fiche__name"]/text()')[0] - phone = councillor.xpath('.//div[@class="fiche__social"]/span/text()')[0].split("T")[1] - email_mailto = councillor.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href') - photo_url = councillor.xpath(".//img")[0].attrib["src"] - - page = self.lxmlize(councillor.xpath('.//a[@class="member-box__calltoaction"]/@href')[0]) - district = page.xpath('.//div[@class="fiche__category"]/text()')[0] - - if district == "Maire": - district = "Terrebonne" + name = councillor.xpath('.//a[@class="name"]/text()')[0] + district = councillor.xpath('.//p[@class="district"]/text()')[0] + if "Maire" in district: role = "Maire" + district = "Terrebonne" else: - district = "District {}".format(district) role = "Conseiller" + district = district.split(" - ")[0] + + photo_url = councillor.xpath(".//noscript/img/@src")[0] + url = councillor.xpath(".//@href")[0] + + page = self.lxmlize(url) + email = self.get_email(page) + phone = self.get_phone(page) p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url) p.add_source(COUNCIL_PAGE) p.add_contact("voice", phone, "legislature") - if email_mailto: - email = email_mailto[0].split("mailto:")[1] - p.add_contact("email", email) + p.add_contact("email", email) yield p From 95d9dda1636a5bd24e5e31389fa364932e22ce4a Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Fri, 7 Jun 2024 15:44:50 -0400 Subject: [PATCH 25/66] fix: Stop hardcoding Halifax Mayor url --- ca_ns_halifax/people.py | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/ca_ns_halifax/people.py b/ca_ns_halifax/people.py index 8709c515..06064166 100644 --- a/ca_ns_halifax/people.py +++ b/ca_ns_halifax/people.py @@ -4,14 +4,12 @@ from utils import CanadianScraper COUNCIL_PAGE = "https://www.halifax.ca/city-hall/districts-councillors" -MAYOR_PAGE = "https://www.halifax.ca/city-hall/mayor-mike-savage" -MAYOR_CONTACT_URL = 
"http://www.halifax.ca/mayor/contact.php" class HalifaxPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") - councillors = page.xpath('//div[@id = "block-districtdistrictindex"]//ul/li')[1:] + councillors = page.xpath('//div[@id = "block-districtdistrictindex"]//ul/li') assert len(councillors), "No councillors found" for councillor in councillors: @@ -24,31 +22,25 @@ def scrape(self): name = info_div.xpath("./strong/p/text()")[0].replace("Councillor ", "").replace("Deputy Mayor ", "") + if "Mayor" in name: + role = "Mayor" + name = name.replace("Mayor ", "") + district = "Halifax" + else: + role = "Councillor" + if name != "To be determined": photo = photo_div.xpath(".//img/@src")[0] url = councillor.xpath("./a/@href")[0] councillor_page = self.lxmlize(url, user_agent="Mozilla/5.0") - contact_node = councillor_page.xpath('//div[@id = "block-districtdistrictprofile"]')[0] - phone = self.get_phone(contact_node, area_codes=[902]) - email = self.get_email(contact_node) + phone = self.get_phone(councillor_page, area_codes=[902]) + email = self.get_email(councillor_page) - p = Person(primary_org="legislature", name=name, district=district, role="Councillor") + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(url) p.add_contact("voice", phone, "legislature") p.add_contact("email", email) p.image = photo yield p - - mayor_page = self.lxmlize(MAYOR_PAGE, "iso-8859-1", user_agent="Mozilla/5.0") - name = " ".join(mayor_page.xpath("//h1/text()")).replace("Mayor", "").strip() - contact_div = mayor_page.xpath('//aside[contains(@class, "layout-sidebar-second")]/section/div[1]')[0] - phone = self.get_phone(contact_div.xpath("./p[2]")[0]) - email = self.get_email(contact_div.xpath("./p[2]")[0]) - - p = Person(primary_org="legislature", name=name, district="Halifax", role="Mayor") - p.add_source(MAYOR_PAGE) - p.add_contact("email", email) - 
p.add_contact("voice", phone, "legislature") - yield p From 682fc86246e9ca789253a8d58bf6cd8c4f93a603 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Fri, 7 Jun 2024 16:44:47 -0400 Subject: [PATCH 26/66] fix: scrape Presidents of Arrondissements in Sherbrooke --- ca_qc_sherbrooke/people.py | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index fc73d969..875c1e6e 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -26,6 +26,7 @@ def get_content(url): councillors = page.xpath("//a[.//h3]") assert len(councillors), "No councillors found" for councillor in councillors: + districts = [] name = councillor.xpath(".//h3")[0].text_content() role = councillor.xpath('.//div[@class="poste"]')[0].text_content() @@ -33,9 +34,17 @@ def get_content(url): role = "Maire" district = "Sherbrooke" else: - role = "Conseiller" district = councillor.xpath('.//div[@class="district"]')[0].text_content() district = clean_french_prepositions(district).replace("District", "").strip() + if "président" in role: + borough = councillor.xpath('.//div[@class="bloc_bas"]/p')[0].text_content() + borough = clean_french_prepositions(borough).replace("Arrondissement", "").strip() + + if borough == "Brompton-Rock Forest-Saint-\u00c9lie-Deauville": + borough = "Brompton–Rock Forest–Saint-Élie–Deauville" # N-dashes + if borough != district: # Lennoxville + districts.append(borough) + role = "Conseiller" if district == "Lac-Magog": district = "Lac Magog" @@ -43,17 +52,20 @@ def get_content(url): url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0] page = get_content(url) - p = Person(primary_org="legislature", name=name, district=district, role=role) - p.add_source(COUNCIL_PAGE) - p.add_source(url) + phone = self.get_phone(page, error=False) + email = self.get_email(page, error=False) image = councillor.xpath(".//@src")[0] if "https://" not in image: image = 
"https://contenu.maruche.ca" + image - p.image = image - phone = self.get_phone(page, error=False) - email = self.get_email(page, error=False) - if email: - p.add_contact("email", email) - if phone: - p.add_contact("voice", phone, "legislature") - yield p + + for district in districts: + p = Person(primary_org="legislature", name=name, district=district, role=role) + p.add_source(COUNCIL_PAGE) + p.add_source(url) + p.image = image + + if email: + p.add_contact("email", email) + if phone: + p.add_contact("voice", phone, "legislature") + yield p From 170a7597d93f612f75c884396252b3493446ca1e Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 7 Jun 2024 17:25:14 -0400 Subject: [PATCH 27/66] ca_qc_sherbrooke: Add boundary URLs for boroughs --- ca_qc_sherbrooke/people.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 875c1e6e..18ad6b7a 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -1,6 +1,7 @@ import json import lxml.html +from django.template.defaultfilters import slugify from utils import CanadianPerson as Person from utils import CanadianScraper, clean_french_prepositions @@ -34,21 +35,23 @@ def get_content(url): role = "Maire" district = "Sherbrooke" else: + role = "Conseiller" district = councillor.xpath('.//div[@class="district"]')[0].text_content() district = clean_french_prepositions(district).replace("District", "").strip() - if "président" in role: - borough = councillor.xpath('.//div[@class="bloc_bas"]/p')[0].text_content() - borough = clean_french_prepositions(borough).replace("Arrondissement", "").strip() - - if borough == "Brompton-Rock Forest-Saint-\u00c9lie-Deauville": - borough = "Brompton–Rock Forest–Saint-Élie–Deauville" # N-dashes - if borough != district: # Lennoxville - districts.append(borough) - role = "Conseiller" + if district == "Lac-Magog": + district = 
"Lac Magog" - if district == "Lac-Magog": - district = "Lac Magog" districts.append(district) + + if "président" in role: + borough = councillor.xpath('.//div[@class="bloc_bas"]/p')[0].text_content() + borough = clean_french_prepositions(borough).replace("Arrondissement", "").strip() + + if borough == "Brompton-Rock Forest-Saint-\u00c9lie-Deauville": + borough = "Brompton–Rock Forest–Saint-Élie–Deauville" # N-dashes + if borough != district: # Lennoxville + districts.append(borough) + url = "https://www.sherbrooke.ca" + councillor.xpath("./@href")[0] page = get_content(url) @@ -58,7 +61,7 @@ def get_content(url): if "https://" not in image: image = "https://contenu.maruche.ca" + image - for district in districts: + for i, district in enumerate(districts): p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(url) @@ -68,4 +71,6 @@ def get_content(url): p.add_contact("email", email) if phone: p.add_contact("voice", phone, "legislature") + if i: + p._related[0].extras["boundary_url"] = f"/boundaries/sherbrooke-boroughs/{slugify(district)}/" yield p From 0ae367b2b815b1dcbcbf6d88a23fe0923716c78e Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 13 Jun 2024 17:11:15 -0400 Subject: [PATCH 28/66] fix Ottawa scraper and update Ottawa divisions --- ca_on_ottawa/people.py | 31 +++++++++++++++++++++++-------- country-ca.csv | 13 +++++++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/ca_on_ottawa/people.py b/ca_on_ottawa/people.py index 5d1054ee..b5fd3da7 100644 --- a/ca_on_ottawa/people.py +++ b/ca_on_ottawa/people.py @@ -1,12 +1,27 @@ +import csv +from io import BytesIO, StringIO + +import agate + from utils import CSVScraper class OttawaPersonScraper(CSVScraper): - # http://data.ottawa.ca/dataset/elected-officials - csv_url = "http://data.ottawa.ca/dataset/fd26ae83-fe1a-40d8-8951-72df40021c82/resource/3cd1b14d-cb45-4c4d-b22a-a607946e2ec2/download/elected-officials-2018-2022.csv" - 
encoding = "utf-8-sig" - corrections = { - "district name": { - "Orl\u0082ans": "Orléans", - }, - } + # https://open.ottawa.ca/documents/ottawa::elected-officials-2022-2026/about + csv_url = "https://www.arcgis.com/sharing/rest/content/items/a5e9dc2425274bb796d3ded47b0d7b00/data" + fallbacks = {"district name": "ward name"} + + # Workaround for the download link not having the correct extension + def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs): + data = StringIO() + binary = BytesIO(self.get(url).content) + table = agate.Table.from_xls(binary) + table.to_csv(data) + data.seek(0) + if skip_rows: + for _ in range(skip_rows): + data.readline() + if header: + return csv.DictReader(data, delimiter=delimiter) + else: + return csv.reader(data, delimiter=delimiter) diff --git a/country-ca.csv b/country-ca.csv index 5e17cf91..29d158b4 100644 --- a/country-ca.csv +++ b/country-ca.csv @@ -4722,7 +4722,7 @@ ocd-division/country:ca/csd:3502036,Clarence-Rockland,,,,,C,,Clarence-Rockland,, ocd-division/country:ca/csd:3502044,Casselman,,,,,VL,,Casselman,,Village of Casselman,ocd-division/country:ca/cd:3502,,,, ocd-division/country:ca/csd:3502048,Russell,,,,,TP,,Russell,,Township of Russell,ocd-division/country:ca/cd:3502,,,, ocd-division/country:ca/csd:3506008,Ottawa,,,,,CV,,Ottawa,,City of Ottawa,,,,, -ocd-division/country:ca/csd:3506008/ward:1,Orléans,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:1,Orléans East-Cumberland,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:10,Gloucester-Southgate,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:11,Beacon Hill-Cyrville,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:12,Rideau-Vanier,,,,,,,,,,,,,, @@ -4732,13 +4732,14 @@ ocd-division/country:ca/csd:3506008/ward:15,Kitchissippi,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:16,River,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:17,Capital,,,,,,,,,,,,,, 
ocd-division/country:ca/csd:3506008/ward:18,Alta Vista,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:19,Cumberland,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:2,Innes,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:19,Orléans South-Navan,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:2,Orléans West-Innes,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:20,Osgoode,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:21,Rideau-Goulbourn,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:22,Gloucester-South Nepean,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:21,Rideau-Jock,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:22,Riverside South-Findlay Creek,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:23,Kanata South,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3506008/ward:3,Barrhaven,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:24,Barrhaven East,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3506008/ward:3,Barrhaven West,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:4,Kanata North,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:5,West Carleton-March,,,,,,,,,,,,,, ocd-division/country:ca/csd:3506008/ward:6,Stittsville,,,,,,,,,,,,,, From c916cd1c95bd76e243756e0c976fe03d34a8a97a Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Thu, 13 Jun 2024 19:25:27 -0400 Subject: [PATCH 29/66] feat: Add CSVScraper extension class attribute --- ca_on_ottawa/people.py | 21 +-------------------- utils.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/ca_on_ottawa/people.py b/ca_on_ottawa/people.py index b5fd3da7..9db7f422 100644 --- a/ca_on_ottawa/people.py +++ b/ca_on_ottawa/people.py @@ -1,8 +1,3 @@ -import csv -from io import BytesIO, StringIO - -import agate - from utils import CSVScraper @@ -10,18 +5,4 @@ class OttawaPersonScraper(CSVScraper): # 
https://open.ottawa.ca/documents/ottawa::elected-officials-2022-2026/about csv_url = "https://www.arcgis.com/sharing/rest/content/items/a5e9dc2425274bb796d3ded47b0d7b00/data" fallbacks = {"district name": "ward name"} - - # Workaround for the download link not having the correct extension - def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs): - data = StringIO() - binary = BytesIO(self.get(url).content) - table = agate.Table.from_xls(binary) - table.to_csv(data) - data.seek(0) - if skip_rows: - for _ in range(skip_rows): - data.readline() - if header: - return csv.DictReader(data, delimiter=delimiter) - else: - return csv.reader(data, delimiter=delimiter) + extension = ".xls" diff --git a/utils.py b/utils.py index 9de9e5ea..99fef627 100644 --- a/utils.py +++ b/utils.py @@ -248,8 +248,7 @@ def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows= data.readline() if header: return csv.DictReader(data, delimiter=delimiter) - else: - return csv.reader(data, delimiter=delimiter) + return csv.reader(data, delimiter=delimiter) class CSVScraper(CanadianScraper): @@ -266,6 +265,10 @@ class CSVScraper(CanadianScraper): If `csv_url` is a ZIP file, set the compressed file to read. """ filename = None + """ + If `csv_url` is an XLS, XLSX or ZIP file, but has no extension, set the extension (like '.xlsx'). 
+ """ + extension = None # Table flags """ @@ -366,7 +369,10 @@ def is_valid_row(self, row): def scrape(self): seat_numbers = defaultdict(lambda: defaultdict(int)) - extension = os.path.splitext(self.csv_url)[1] + if self.extension: + extension = self.extension + else: + extension = os.path.splitext(self.csv_url)[1] if extension in (".xls", ".xlsx"): data = StringIO() binary = BytesIO(self.get(self.csv_url).content) From 206519b467162028a538cf2745abfac4e632aeb7 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Fri, 21 Jun 2024 17:53:31 -0400 Subject: [PATCH 30/66] fix: ca_on_woolwich --- ca_on_woolwich/__init__.py | 22 ++++++++++++++++++++++ ca_on_woolwich/people.py | 36 ++++++++++++++++++++---------------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/ca_on_woolwich/__init__.py b/ca_on_woolwich/__init__.py index 8a4ba24f..a173b76b 100644 --- a/ca_on_woolwich/__init__.py +++ b/ca_on_woolwich/__init__.py @@ -1,3 +1,5 @@ +from pupa.scrape import Organization + from utils import CanadianJurisdiction @@ -7,3 +9,23 @@ class Woolwich(CanadianJurisdiction): division_name = "Woolwich" name = "Woolwich Township Council" url = "http://www.woolwich.ca" + + def get_organizations(self): + organization = Organization(self.name, classification=self.classification) + + organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) + # Dictionary of ward number to stop index for seats + stop = { + 1: 3, + 2: 2, + 3: 3, + } + for ward_number in range(1, 4): + for seat_number in range(1, stop[ward_number]): + organization.add_post( + role="Councillor", + label="Ward {} (seat {})".format(ward_number, seat_number), + division_id="{}/ward:{}".format(self.division_id, ward_number), + ) + + yield organization diff --git a/ca_on_woolwich/people.py b/ca_on_woolwich/people.py index 66992769..2e728678 100644 --- a/ca_on_woolwich/people.py +++ b/ca_on_woolwich/people.py @@ -1,4 +1,5 @@ import re +from collections import defaultdict from utils 
import CanadianPerson as Person from utils import CanadianScraper @@ -8,29 +9,32 @@ class WoolwichPersonScraper(CanadianScraper): def scrape(self): + seat_numbers = defaultdict(int) page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@id="printArea"]//strong') + councillors = page.xpath('//td[@data-name="accParent"]/h2') assert len(councillors), "No councillors found" for councillor in councillors: - info = councillor.xpath("./parent::p/text()") - if not info: - info = councillor.xpath("./parent::div/text()") - info = [x for x in info if x.strip()] - district = re.sub(r"(?<=Ward \d).+", "", info.pop(0)) - if "Mayor" in district: + role, name = re.split(r"\s", councillor.text_content(), 1) + area = re.search(r"Ward \d", name) + if not area: district = "Woolwich" - role = "Mayor" else: - district = district.replace("Councillor", "").strip() - role = "Councillor" + seat_numbers[area] += 1 + district = area.group(0) + " (seat {})".format(seat_numbers[area]) + if "(" in name: + name = name.split(" (")[0] + info = councillor.xpath("./ancestor::tr[1]/following-sibling::tr")[0].text_content() + office = re.search(r"(?<=Office: )\d{3}-\d{3}-\d{4}", info).group(0) + voice = ( + re.search(r"(?<=Toll Free: )(1-)?\d{3}-\d{3}-\d{4}( extension \d{4})?", info) + .group(0) + .replace("extension", "x") + ) - p = Person(primary_org="legislature", name=councillor.text_content(), district=district, role=role) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.image = councillor.xpath("./img/@src")[0] + p.add_contact("voice", office, "office") + p.add_contact("voice", voice, "legislature") - for contact in info: - note, num = contact.split(":") - num = num.strip().replace("(", "").replace(") ", "-").replace("extension ", "x") - p.add_contact(note, num, note) yield p From f7e1ab8e5510630dd373959fad8848ab28eb1215 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 20 Jun 2024 15:20:49 -0400 Subject: [PATCH 
31/66] fix(ca-on_lasalle): Refactor scraper for new site --- ca_on_lasalle/people.py | 40 ++++++++++++---------------------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/ca_on_lasalle/people.py b/ca_on_lasalle/people.py index 0930fc12..79acea39 100644 --- a/ca_on_lasalle/people.py +++ b/ca_on_lasalle/people.py @@ -3,7 +3,7 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.town.lasalle.on.ca/en/town-hall/LaSalle-Council.asp" +COUNCIL_PAGE = "https://www.lasalle.ca/en/town-hall/town-of-lasalle-council.aspx" class LaSallePersonScraper(CanadianScraper): @@ -12,39 +12,23 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//table[@id="Table1table"]//td/p') + councillors = page.xpath('//div[@class="fbg-row lb-imageBox cm-datacontainer"]') assert len(councillors), "No councillors found" for councillor in councillors: - if not councillor.text_content().strip(): - continue - name = councillor.xpath("./font/b/text()") - if not name: - name = councillor.xpath("./font/text()") - if "email" in name[0]: - name = councillor.xpath("./b/font/text()") - name = name[0] - role = "Councillor" - if "Mayor" in name: - name = name.replace("Mayor", "") - role = "Mayor" + role, name = re.split(r"(?<=Mayor)|(?<=Councillor)", councillor.xpath(".//a/div")[0].text_content(), 1) + if "Mayor" in role: district = "LaSalle" else: district = "LaSalle (seat {})".format(councillor_seat_number) - councillor_seat_number += 1 + image = councillor.xpath(".//img/@src")[0] + voice = re.search(r"\d{3}-\d{3}-\d{4} ext. \d+", councillor.text_content()) + cell = re.search(r"\d{3}-\d{3}-\d{4}(?! 
ext)", councillor.text_content()) - p = Person(primary_org="legislature", name=name, district=district, role=role) + p = Person(primary_org="legislature", name=name, role=role, district=district, image=image) p.add_source(COUNCIL_PAGE) + if voice: + p.add_contact("voice", voice.group(0), "legislature") + if cell: + p.add_contact("cell", cell.group(0), "legislature") - photo_url = councillor.xpath("./parent::td//img/@src")[0] - p.image = photo_url - - email = self.get_email(councillor) - p.add_contact("email", email) - - phone = re.findall(r"(?<=phone:)(.*)(?=home)", councillor.text_content(), flags=re.DOTALL) - if phone: - p.add_contact("voice", phone[0].strip(), "legislature") - - home_phone = re.findall(r"(?<=home phone:)(.*)", councillor.text_content(), flags=re.DOTALL)[0] - p.add_contact("voice", home_phone.strip(), "residence") yield p From 9473b05dcb18ac355b9b0d5e399c00acd32bbdbb Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Fri, 21 Jun 2024 16:50:39 -0400 Subject: [PATCH 32/66] fix(ca-on_wellesley): Get only councillor elements and standardize assertion --- ca_on_wellesley/people.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ca_on_wellesley/people.py b/ca_on_wellesley/people.py index 8d189e6a..44be0158 100644 --- a/ca_on_wellesley/people.py +++ b/ca_on_wellesley/people.py @@ -15,10 +15,10 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) members = [ el - for el in page.xpath('//div[@id="printAreaContent"]//td') + for el in page.xpath('//div//td[@data-name="accChild"]') if el.text_content().strip().lower().split()[0] in ["mayor", "councillor"] - ][1:] - assert len(members) == 5 + ] + assert len(members), "No councillors found" for member in members: position = member.text_content().split()[0] @@ -26,6 +26,7 @@ def scrape(self): name = srch.group(1).strip() district = srch.group(2).strip() phone = self.get_phone(member) + email = self.get_email(member, error=False) if position == "Mayor": district = "Wellesley" else: @@ 
-33,5 +34,7 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role=position) p.add_contact("voice", phone, "legislature") + if email: + p.add_contact("email", email) p.add_source(COUNCIL_PAGE) yield p From 4c48abbafc82275508b73ccfd4265afea67365a8 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 19 Jun 2024 16:25:03 -0400 Subject: [PATCH 33/66] fix(ca_on_clarington): Refactor scraper for new site --- ca_on_clarington/people.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/ca_on_clarington/people.py b/ca_on_clarington/people.py index 036d40c2..f2bc142b 100644 --- a/ca_on_clarington/people.py +++ b/ca_on_clarington/people.py @@ -3,26 +3,33 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.clarington.net/index.php?content=townhall/council" +COUNCIL_PAGE = "https://www.clarington.net/en/town-hall/Meet-Your-Councillors.aspx" +MAYOR_PAGE = "https://www.clarington.net/en/town-hall/mayor.aspx" class ClaringtonPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath("//h2") + councillors = page.xpath("//td[@data-name='accParent']") assert len(councillors), "No councillors found" - for person_header_elem in councillors: - role, name_post = person_header_elem.text.split(" - ") - try: - name, caps_post = re.match(r"(.+) \((.+)\)", name_post).groups() - post = caps_post.title() - except AttributeError: - name = name_post - post = "Clarington" - email = person_header_elem.xpath("./following-sibling::a[1]/@href")[0][len("mailto:") :] - photo_url = person_header_elem.xpath("./following-sibling::img[1]/@src")[0] - p = Person(primary_org="legislature", name=name, district=post, role=role, image=photo_url) + for councillor in councillors: + name, role_district = councillor.text_content().split(" - ") + role, district = re.split(r"(?<=Councillor) ", role_district, 1) + 
content_node = councillor.xpath("../following-sibling::tr")[0] + email = self.get_email(content_node) + photo_url = content_node.xpath(".//img/@src")[0] + p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url) p.add_source(COUNCIL_PAGE) p.add_contact("email", email) yield p + + page = self.lxmlize(MAYOR_PAGE).xpath('//div[@id="mainContent"]')[0] + name = page.xpath(".//img/@alt")[0].replace("Mayor", "").strip() + photo_url = page.xpath(".//img/@src")[0] + email = self.get_email(page) + + p = Person(primary_org="legislature", name=name, district="Clarington", role="Mayor", image=photo_url) + p.add_contact("email", email) + p.add_source(MAYOR_PAGE) + yield p From 746ff65bebf0aa88be95f149dcc552157fed76e3 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 19 Jun 2024 13:22:39 -0400 Subject: [PATCH 34/66] fix(ca_on_grimsby): change xpath arguments --- ca_on_grimsby/people.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ca_on_grimsby/people.py b/ca_on_grimsby/people.py index 65f0572e..bdb30be3 100644 --- a/ca_on_grimsby/people.py +++ b/ca_on_grimsby/people.py @@ -11,12 +11,12 @@ class GrimsbyPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - wards = page.xpath("//div[@id='printAreaContent']//tbody/tr[td/h4]") + wards = page.xpath("//p[@class='tab ']") assert len(wards), "No wards found" for ward in wards: - area = ward.xpath(".//h4")[0].text_content() - councillors_node = ward.xpath("./following-sibling::tr/td")[0] + area = ward.xpath(".//a")[0].text_content().strip() + councillors_node = ward.xpath("./following-sibling::div")[0] for i in range(2): name_node = councillors_node.xpath( @@ -39,8 +39,8 @@ def scrape(self): role, name = page.xpath("//h3")[0].text_content().split(" ", 1) email = self.get_email(page) - phone = self.get_phone(page.xpath("//div[@id='printAreaContent']/p[contains(., '905')]")[0]) - image = page.xpath("//h3//@src")[0] + phone = 
self.get_phone(page.xpath("//div[contains(@class, 'left')]//p[contains(., '905')]")[0]) + image = page.xpath("//p//@src")[0] p = Person(primary_org="legislature", name=name, district="Grimsby", role=role, image=image) p.add_contact("email", email) From d0fb3ff6b50def77ad0f440cfa06725b38c5216b Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 19 Jun 2024 13:05:59 -0400 Subject: [PATCH 35/66] fix(ca_nb_fredericton): Get Wards correctly --- ca_nb_fredericton/people.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ca_nb_fredericton/people.py b/ca_nb_fredericton/people.py index f5977c6c..6ade8cb9 100644 --- a/ca_nb_fredericton/people.py +++ b/ca_nb_fredericton/people.py @@ -1,3 +1,5 @@ +import re + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -17,9 +19,9 @@ def scrape(self): text = councillor.xpath('.//div[@class="views-field views-field-field-councillor-title"]/div')[ 0 ].text_content() - ward_start = text.find("Ward") - if ward_start + 1: - district = text[ward_start : ward_start + 7].strip() + ward = re.findall(r"Ward \d+", text) + if ward: + district = ward[0] role = "Councillor" else: district = "Fredericton" From 4c9bec8e742583110f679ac971dca6491ba5e042 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 19 Jun 2024 12:50:06 -0400 Subject: [PATCH 36/66] fix(ca_ab_lethbridge): Get Mayor's name correctly --- ca_ab_lethbridge/people.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ca_ab_lethbridge/people.py b/ca_ab_lethbridge/people.py index 74f91e0e..5346d0e2 100644 --- a/ca_ab_lethbridge/people.py +++ b/ca_ab_lethbridge/people.py @@ -8,7 +8,7 @@ class LethbridgePersonScraper(CanadianScraper): def scrape_mayor(self): page = self.lxmlize(MAYOR_PAGE) - paragraph = page.xpath("//p[1]")[0].text_content().split() + paragraph = page.xpath("//h4[contains(., 'Mayor')]/following-sibling::p")[0].text_content().split() name = " ".join([paragraph[0], paragraph[1]]) p = 
Person(primary_org="legislature", name=name, district="Lethbridge", role="Mayor") From 49f507ab9eacc2c4208e33b33e77ed90c72192a2 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Tue, 18 Jun 2024 16:14:04 -0400 Subject: [PATCH 37/66] fix: scrape from Langley City's new site --- ca_bc_langley_city/people.py | 89 +++++++++++++----------------------- 1 file changed, 31 insertions(+), 58 deletions(-) diff --git a/ca_bc_langley_city/people.py b/ca_bc_langley_city/people.py index e7d88464..eaf1f3fd 100644 --- a/ca_bc_langley_city/people.py +++ b/ca_bc_langley_city/people.py @@ -1,9 +1,7 @@ -import re - from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.city.langley.bc.ca/index.php/city-hall/city-council" +COUNCIL_PAGE = "https://city.langley.bc.ca/cityhall/city-council/council-members" class LangleyPersonScraper(CanadianScraper): @@ -12,60 +10,35 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[@class="menuitems"]/ul//li/a[contains(text(), "Councillor")]/@href') - mayor = page.xpath('//div[@class="menuitems"]/ul//li/a[contains(text(), "Mayor")]/@href')[0] + councillors = page.xpath( + '//div[@class="field field--name-field-ec-section-title field--type-string field--label-hidden field__item"]' + )[:-1] assert len(councillors), "No councillors found" - for url in councillors: - district = "Langley (seat {})".format(councillor_seat_number) - councillor_seat_number += 1 - yield self.scrape_person(url, district) - - yield self.scrape_mayor(mayor) - - def scrape_person(self, url, district): - infos_page = self.lxmlize(url) - infos = infos_page.xpath('//div[@class="item-page"]')[0] - - name = " ".join(infos.xpath("p[2]/text()")[0].split(" ")[1:3]) - lname = name.lower() - email = lname.split(" ")[0][0] + lname.split(" ")[1] + "@langleycity.ca" - photo_url = infos.xpath("p[1]/img/@src")[0] - - p = Person(primary_org="legislature", name=name, district=district, role="Councillor", 
image=photo_url) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - p.add_contact("email", email) - - personal_infos = infos.xpath("p[last()]/text()") - - if "Residence" in personal_infos[0]: - phone = re.findall(r"(Phone|Res)(:?) (.*)", "\n".join(personal_infos))[0][2] - address = re.findall(r"Address: (.*) (Phone|Res)", " ".join(personal_infos))[0][0] - p.add_contact("address", address, "residence") - p.add_contact("voice", phone, "residence") - - return p - - def scrape_mayor(self, url): - infos_page = self.lxmlize(url) - infos = infos_page.xpath('//div[@class="item-page"]')[0] - - name = " ".join(infos.xpath("p[2]/text()")[0].split(" ")[2:4]) - lname = name.lower() - email = lname.split(" ")[0][0] + lname.split(" ")[1] + "@langleycity.ca" - photo_url = infos.xpath("p[1]/img/@src")[0] - - p = Person(primary_org="legislature", name=name, district="Langley", role="Mayor", image=photo_url) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - p.add_contact("email", email) - - personal_infos = infos.xpath("p[last()]/text()") - - phone = re.findall(r"Phone(:?) 
(.*)", "\n".join(personal_infos))[0][1] - address = re.findall(r"Address: (.*) Phone", " ".join(personal_infos))[0] - p.add_contact("address", address, "office") - p.add_contact("voice", phone, "office") - - return p + for councillor in councillors: + role, name = councillor.text_content().split(" ", 1) + if role == "Mayor": + district = "Langley" + phone_div = councillor.xpath('..//p[contains(., "Phone:")]')[0] + phone = self.get_phone(phone_div) + else: + district = "Langley (seat {})".format(councillor_seat_number) + phone = ( + "604 514 2800" # According to their site, all councillors can be contacted at this phone number + ) + councillor_seat_number += 1 + email = ( + councillor.xpath('..//p[contains(., "Email:")]')[0] + .text_content() + .split("Email:", 1)[1] + .strip() + .replace("(at)", "@") + ) + image = councillor.xpath("..//img/@src")[0] + + p = Person(primary_org="legislature", name=name, district=district, role=role, image=image) + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) + p.add_source(COUNCIL_PAGE) + + yield p From 36ebfc840f6e81588929dd28055a041c0b25a9c4 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Mon, 17 Jun 2024 12:36:22 -0400 Subject: [PATCH 38/66] Fix Brossard scraper --- ca_qc_brossard/people.py | 121 +++++++++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 31 deletions(-) diff --git a/ca_qc_brossard/people.py b/ca_qc_brossard/people.py index 1eff883a..f1b5eaac 100644 --- a/ca_qc_brossard/people.py +++ b/ca_qc_brossard/people.py @@ -1,49 +1,108 @@ +import json import re +import requests + from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.ville.brossard.qc.ca/Ma-ville/conseil-municipal/Municipal-council.aspx?lang=en-ca" -CONTACT_PAGE = "http://www.ville.brossard.qc.ca/Ma-ville/conseil-municipal/Municipal-council/Municipal-council-members-%E2%80%93-Contact-information.aspx" +DATA_PAGE = 
"https://www.brossard.ca/in/rest/public/contentGraphByPath?locale=fr&path=/elus-municipaux&propertyFilter=backup,site" +COUNCIL_PAGE = "https://www.brossard.ca/elus-municipaux" class BrossardPersonScraper(CanadianScraper): def scrape(self): - page = self.lxmlize(COUNCIL_PAGE) - contact_page = self.lxmlize(CONTACT_PAGE) + def indexById(elementList): + result = {} + for element in elementList: + id = element["id"] + result[id] = element + return result - councillors = page.xpath('//a[contains(@class, "slide item-")]') - emails = contact_page.xpath('//a[contains(@href, "mailto:")]') + # Gets the ids of all children elements recursively + def getChildren(parentId, elementDict): + returnList = [] + element = elementDict[parentId] + if element.get("children"): + for child in element.get("children"): + if not re.search(r"^\d+$", child): + continue + returnList.append(child) + if getChildren(child, elementDict): + returnList.extend(getChildren(child, elementDict)) + return returnList - assert len(councillors), "No councillors found" - for councillor in councillors: - name = councillor.xpath('.//div[@class="titre"]/text()')[0] - if name == "Poste vacant": + # The whole page is rendered in javascript and stored as a massive json object + page = requests.get(DATA_PAGE) + page = json.loads(page.content) + containers = page["content"].values() + for container in containers: + if container.get("contentType") != "CMSPage": continue - if name == "Sylvie Desgroseilliers": - name = "Sylvie DesGroseilliers" - - position = councillor.xpath('.//div[@class="poste"]/text()')[0] - role = "Conseiller" - - district = re.search(r"District \d+", position) - if "Mayor" in position: - district = "Brossard" - role = "Maire" - else: - district = district.group(0) + elements = indexById(container["properties"]["content"]["data"]) - photo = re.search(r"url\((.+)\)", councillor.attrib["style"]).group(1) + councillors = [] + for element in elements.values(): + if isinstance(element.get("children"), 
dict) and re.search( + r"DISTRICT \d+\s+[-|]\sSecteur", element.get("children").get("fr") + ): + councillors.append(element) - p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo) - p.add_source(COUNCIL_PAGE) - p.add_source(CONTACT_PAGE) + assert len(councillors), "No councillors found" + for councillor in councillors: + district = re.search(r"DISTRICT (\d+)", councillor["children"]["fr"]).group(0).title() + parent_id = councillor["parent"] + children = getChildren(parent_id, elements) + phone = None + for id in children: + child = elements[id] + if child["tag"] == "Link": + email = child["props"]["link"]["options"]["url"]["fr"].split(":")[1] + elif child["tag"] == "Image": + photo = "https://www.brossard.ca/in/rest/public/AttachmentThumb?id=" + child["children"]["fr"] + elif child["tag"] == "TextBox": + if not isinstance(child["children"], dict) or "DISTRICT" in child["children"]["fr"]: + continue + text = re.search(r"(?<=>).+(?=<)", child["children"]["fr"]).group(0) + if child["parent"] == parent_id and "Conseill" not in text: + name = text.replace(" ", "") + elif not phone: + phone_pattern = re.search(r"\d{3} \d{3}-\d{4}(, poste \d{4})?", text) + if phone_pattern: + phone = phone_pattern.group(0) - index = [i for i, link in enumerate(emails) if name in link.text_content().replace("\u2019", "'")][0] - email = emails[index + 1] - p.add_contact("email", re.match("mailto:(.+@brossard.ca)", email.attrib["href"]).group(1)) - phone = email.xpath('./preceding-sibling::text()[contains(., "450")]') - phone = phone[-1] + p = Person(primary_org="legislature", name=name, district=district, role="Conseiller", image=photo) + p.add_contact("email", email) p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) yield p + + for element in elements.values(): + if ( + isinstance(element.get("children"), dict) + and re.search(r"MAIRE", element.get("children").get("fr")) + and not element.get("children").get("en") + ): + mayor 
= element + parent_id = mayor["parent"] + children = getChildren(parent_id, elements) + phone = None + for id in children: + child = elements[id] + if child["tag"] == "Image": + photo = "https://www.brossard.ca/in/rest/public/AttachmentThumb?id=" + child["children"]["fr"] + elif child["tag"] == "TextBox": + if not isinstance(child["children"], dict) or "MAIRE" in child["children"]["fr"]: + continue + text = re.search(r"(?<=>).+(?=<)", child["children"]["fr"]).group(0) + if child["parent"] == parent_id: + name = text.replace(" ", "") + elif not phone: + phone_pattern = re.search(r"\d{3} \d{3}-\d{4}(, poste \d{4})?", text) + if phone_pattern: + phone = phone_pattern.group(0) + p = Person(primary_org="legislature", name=name, district="Brossard", role="Maire", image=photo) + p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) + yield p From f37fbdacaaec2fc3599195babf1451e3d9ad940f Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 19 Jun 2024 12:57:37 -0400 Subject: [PATCH 39/66] fix(ca_bc_coquitlam): Change councillor table id --- ca_bc_coquitlam/people.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ca_bc_coquitlam/people.py b/ca_bc_coquitlam/people.py index e43370f4..51e98652 100644 --- a/ca_bc_coquitlam/people.py +++ b/ca_bc_coquitlam/people.py @@ -18,7 +18,7 @@ def build_email(script): councillor_seat_number = 1 page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") - councillors = page.xpath('//table[@id="cityDirectoryDepartmentDetails"]/tr') + councillors = page.xpath('//table[contains(@id, "cityDirectoryDepartmentDetails")]/tr') assert len(councillors), "No councillors found" for councillor in councillors: name = " ".join( From 9a3420bb478e88b9d3bf81ee411f60a7b0520056 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 19 Jun 2024 15:18:50 -0400 Subject: [PATCH 40/66] fix(ca_ab_grande_prairie): Replace broken csvscraper with new scraper --- ca_ab_grande_prairie/people.py | 33 
++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/ca_ab_grande_prairie/people.py b/ca_ab_grande_prairie/people.py index fce739e2..1508f5d2 100644 --- a/ca_ab_grande_prairie/people.py +++ b/ca_ab_grande_prairie/people.py @@ -1,7 +1,30 @@ -from utils import CSVScraper +from utils import CanadianPerson as Person +from utils import CanadianScraper +COUNCIL_PAGE = "https://cityofgp.com/city-government/mayor-city-council/council-members" -class GrandePrairiePersonScraper(CSVScraper): - # https://data.cityofgp.com/Community/City-Council-Contact-Information/vcfc-gi78 - csv_url = "https://data.cityofgp.com/api/views/vcfc-gi78/rows.csv?accessType=DOWNLOAD" - many_posts_per_area = True + +class GrandePrairiePersonScraper(CanadianScraper): + def scrape(self): + seat_number = 1 + page = self.lxmlize(COUNCIL_PAGE) + councillors = page.xpath('//div[contains(@class, "council-bios")]//div[@class="views-row"]') + + assert len(councillors), "No councillors found" + for councillor in councillors: + role, name = councillor.xpath(".//h3")[0].text_content().split(" ", 1) + if role == "Councillor": + district = "Grande Prairie (seat {})".format(seat_number) + seat_number += 1 + else: + district = " Grande Prairie" + email = self.get_email(councillor) + phone = self.get_phone(councillor) + image = councillor.xpath(".//img/@src")[0] + + p = Person(primary_org="legislature", name=name, district=district, role=role, image=image) + p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) + + yield p From 83e767322991b659ba1c91a9a5c460dbe9ea9ccc Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 24 Jun 2024 15:50:04 -0400 Subject: [PATCH 41/66] chore(ca_on_woolwich): Align code with other scrapers --- ca_on_woolwich/__init__.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/ca_on_woolwich/__init__.py 
b/ca_on_woolwich/__init__.py index a173b76b..ce33ea11 100644 --- a/ca_on_woolwich/__init__.py +++ b/ca_on_woolwich/__init__.py @@ -14,14 +14,8 @@ def get_organizations(self): organization = Organization(self.name, classification=self.classification) organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) - # Dictionary of ward number to stop index for seats - stop = { - 1: 3, - 2: 2, - 3: 3, - } - for ward_number in range(1, 4): - for seat_number in range(1, stop[ward_number]): + for ward_number, stop in enumerate((3, 2, 3), 1): + for seat_number in range(1, stop): organization.add_post( role="Councillor", label="Ward {} (seat {})".format(ward_number, seat_number), From 9e5a94b1cb48e17bf391f8d78c2e5341b091740b Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 24 Jun 2024 16:12:28 -0400 Subject: [PATCH 42/66] fix(ca_yt): Make jurisdiction name match representative set name --- ca_yt/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ca_yt/__init__.py b/ca_yt/__init__.py index afb558fa..c80defd9 100644 --- a/ca_yt/__init__.py +++ b/ca_yt/__init__.py @@ -5,6 +5,6 @@ class Yukon(CanadianJurisdiction): classification = "legislature" division_id = "ocd-division/country:ca/territory:yt" division_name = "Yukon" - name = "Yukon Legislative Assembly" + name = "Legislative Assembly of Yukon" url = "https://yukonassembly.ca" parties = [{"name": "Yukon Liberal Party"}, {"name": "Yukon Party"}, {"name": "New Democratic Party"}] From 50410a64174575827623b9546a83a49ca7060945 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Mon, 24 Jun 2024 16:31:36 -0400 Subject: [PATCH 43/66] fix(ca-pe_stratford): Change url and refactor code --- ca_pe_stratford/people.py | 60 ++++++++++++--------------------------- 1 file changed, 18 insertions(+), 42 deletions(-) diff --git a/ca_pe_stratford/people.py b/ca_pe_stratford/people.py index 253b8d0a..b1879e1f 100644 --- 
a/ca_pe_stratford/people.py +++ b/ca_pe_stratford/people.py @@ -1,66 +1,42 @@ import re from collections import defaultdict -from utils import CUSTOM_USER_AGENT from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.townofstratford.ca/town-hall/government/town-council/" +COUNCIL_PAGE = "https://www.townofstratford.ca/government/about_our_government/mayor_council" class StratfordPersonScraper(CanadianScraper): def scrape(self): seat_numbers = defaultdict(int) - page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT) + page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") - yield self.scrape_mayor(page) - - councillors = page.xpath( - '//div[@id="street-container"]//strong[contains(text(), "Councillor")]/parent::p|//div[@id="street-container"]//b[contains(text(), "Councillor")]/parent::p' - ) + councillors = page.xpath("//tr") assert len(councillors), "No councillors found" for councillor in councillors: - name = councillor.xpath("./strong/text()|./b/text()")[0].replace("Councillor", "").strip() - post = re.findall(r"(?<=Ward \d, ).*", councillor.text_content())[0].strip() - - seat_numbers[post] += 1 - post = "{} (seat {})".format(post, seat_numbers[post]) - - p = Person(primary_org="legislature", name=name, district=post, role="Councillor") + name = councillor.xpath(".//strong/text()")[0] + if re.search(r"(? 
Date: Tue, 25 Jun 2024 14:51:58 -0400 Subject: [PATCH 44/66] fix(ca_qc_pointe_claire): Get photo url correctly --- ca_qc_pointe_claire/people.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ca_qc_pointe_claire/people.py b/ca_qc_pointe_claire/people.py index c32c6d39..d94b944a 100644 --- a/ca_qc_pointe_claire/people.py +++ b/ca_qc_pointe_claire/people.py @@ -26,7 +26,7 @@ def scrape(self): assert False, "error parsing district" p = Person(primary_org="legislature", name=name, district=district, role=role) - p.image = councillor.xpath(".//@src")[0] + p.image = councillor.xpath(".//@data-src")[0] p.add_contact("email", self.get_email(councillor)) p.add_contact("voice", self.get_phone(councillor, area_codes=[514]), "legislature") p.add_source(COUNCIL_PAGE) From 5a459b68d03cec72df7fa519210081a47d217266 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Tue, 25 Jun 2024 14:47:34 -0400 Subject: [PATCH 45/66] fix(ca-qc_beaconsfield): Refactor scraper for new site --- ca_qc_beaconsfield/people.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/ca_qc_beaconsfield/people.py b/ca_qc_beaconsfield/people.py index b1f303a6..e117f08b 100644 --- a/ca_qc_beaconsfield/people.py +++ b/ca_qc_beaconsfield/people.py @@ -8,24 +8,19 @@ class BeaconsfieldPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//div[contains(@class, "items-row")]') + councillors = page.xpath('//div[contains(@class, "c-rubric-card__header")]') assert len(councillors), "No councillors found" for councillor in councillors: - text = councillor.xpath(".//h2")[0].text_content().strip() - if "," not in text: - continue - - name, role_and_district = text.split(", ", 1) - if role_and_district == "Maire": + name = councillor.xpath(".//h2")[0].text_content().strip() + district = councillor.xpath(".//span")[0].text_content().strip() + if district == "Maire": district = "Beaconsfield" role = "Maire" else: - 
district = role_and_district.split(" - ", 1)[1] role = "Conseiller" p = Person(primary_org="legislature", name=name, district=district, role=role) p.image = councillor.xpath(".//@src")[0] - p.add_contact("email", self.get_email(councillor)) - p.add_contact("voice", self.get_phone(councillor, area_codes=[514]), "legislature") + p.add_contact("email", self.get_email(councillor, "./following-sibling::div")) p.add_source(COUNCIL_PAGE) yield p From 5de8f13549fcdbe4bc2e04bc60561b5b35fd1115 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 20 Jun 2024 16:14:25 -0400 Subject: [PATCH 46/66] fix(ca_on_sault_ste_marie): Add new councillor posts and rewrite scraper for new site --- ca_on_sault_ste_marie/__init__.py | 16 ++++++ ca_on_sault_ste_marie/people.py | 89 ++++++++++++++----------------- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/ca_on_sault_ste_marie/__init__.py b/ca_on_sault_ste_marie/__init__.py index 7bee6834..9cbf35f6 100644 --- a/ca_on_sault_ste_marie/__init__.py +++ b/ca_on_sault_ste_marie/__init__.py @@ -1,3 +1,5 @@ +from pupa.scrape import Organization + from utils import CanadianJurisdiction @@ -7,3 +9,17 @@ class SaultSteMarie(CanadianJurisdiction): division_name = "Sault Ste. Marie" name = "Sault Ste. 
Marie City Council" url = "http://www.city.sault-ste-marie.on.ca" + + def get_organizations(self): + organization = Organization(self.name, classification=self.classification) + + organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) + for ward_number in range(1, 6): + for seat_number in range(1, 3): + organization.add_post( + role="Councillor", + label="Ward {} (seat {})".format(ward_number, seat_number), + division_id="{}/ward:{}".format(self.division_id, ward_number), + ) + + yield organization diff --git a/ca_on_sault_ste_marie/people.py b/ca_on_sault_ste_marie/people.py index ffc890fb..c55237b8 100644 --- a/ca_on_sault_ste_marie/people.py +++ b/ca_on_sault_ste_marie/people.py @@ -1,57 +1,50 @@ -from urllib.parse import urljoin +import re +from collections import defaultdict from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.city.sault-ste-marie.on.ca/Open_Page.aspx?ID=174&deptid=1" - - -def word_to_number(word): - words = ("one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten") - return words.index(word.lower()) + 1 - - -def district_name_using_number(name): - district_split = name.split() - return " ".join([district_split[0], str(word_to_number(district_split[1]))]) +COUNCIL_PAGE = "https://saultstemarie.ca/Government/City-Council.aspx" class SaultSteMariePersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - table_data = page.xpath('//div[@id="litcontentDiv"]//tr') - council_data = table_data[2:-1] - - mayor_row = table_data[0] - - photo_url_rel = mayor_row.xpath("string(.//img/@src)") # can be empty - photo_url = urljoin(COUNCIL_PAGE, photo_url_rel) - contact_node = mayor_row.xpath("./td")[1] - name = contact_node.xpath(".//font[1]/text()")[0] - email = self.get_email(contact_node) - - p = Person(primary_org="legislature", name=name, district="Sault Ste. 
Marie", role="Mayor") - p.add_source(COUNCIL_PAGE) - p.add_contact("email", email) - p.image = photo_url - yield p - - # alternate between a row represneting a ward name and councilors - assert len(council_data), "No councillors found" - for ward_row, data_row in zip(*[iter(council_data)] * 2): - district = ward_row.xpath('.//text()[contains(., "Ward")]')[0] - district_num = district_name_using_number(district) - for councillor_node in data_row.xpath("./td"): - name = councillor_node.xpath(".//strong/text()|.//font[1]/text()")[0] - email = self.get_email(councillor_node) - photo_url_rel = councillor_node.xpath("string(.//img/@src)") # can be empty - photo_url = urljoin(COUNCIL_PAGE, photo_url_rel) - # address and phone are brittle, inconsistent - - p = Person(primary_org="legislature", name=name, district=district_num, role="Councillor") - p.add_source(COUNCIL_PAGE) - if email: - p.add_contact("email", email) - p.image = photo_url - - yield p + seat_numbers = defaultdict(int) + + councillors = page.xpath('//div[@class="mb-2"]//@href') + assert len(councillors), "No councillors found" + + for link in councillors: + page = self.lxmlize(link) + title = page.xpath("//h1")[0].text_content() + if "Mayor" in title: + role = "Mayor" + name = title.replace("Mayor ", "") + district = "Sault Ste. 
Marie" + image = None # No image on the Mayor's page at the moment + contact_node = page.xpath('//div[@id="mainContent_contactUs"]')[0] + phone_numbers = re.findall(r"\d{3}-\d{3}-\d{4}", contact_node.text_content()) + breakpoint() + phone = phone_numbers[0] + fax = phone_numbers[1] + else: + role = "Councillor" + area, name = title.split(" Councillor ") + seat_numbers[area] += 1 + district = "{} (seat {})".format(area, seat_numbers[area]) + image = page.xpath(".//h3/img/@src")[0] + contact_node = page.xpath('//div[@id="mainContent_left"]')[0] + phone = self.get_phone(contact_node) + email = self.get_email(contact_node) + + p = Person(primary_org="legislature", name=name, district=district, role=role) + if image: + p.image = image + if fax: + p.add_contact("fax", fax, "legislature") + p.add_contact("email", email) + p.add_contact("voice", phone, "legislature") + p.add_source(COUNCIL_PAGE) + p.add_source(link) + yield p From 4a67ffd7a1842693f06f2a4e2a865ebcbdedc1ec Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:42:58 -0400 Subject: [PATCH 47/66] fix(ca_on_sault_ste_marie): Remove breakpoint --- ca_on_sault_ste_marie/people.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ca_on_sault_ste_marie/people.py b/ca_on_sault_ste_marie/people.py index c55237b8..f4f3ceb7 100644 --- a/ca_on_sault_ste_marie/people.py +++ b/ca_on_sault_ste_marie/people.py @@ -25,7 +25,6 @@ def scrape(self): image = None # No image on the Mayor's page at the moment contact_node = page.xpath('//div[@id="mainContent_contactUs"]')[0] phone_numbers = re.findall(r"\d{3}-\d{3}-\d{4}", contact_node.text_content()) - breakpoint() phone = phone_numbers[0] fax = phone_numbers[1] else: From 209633927aa5ecd6918b80b5e93bf73205e6c6dd Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 26 Jun 2024 16:37:47 -0400 Subject: [PATCH 48/66] Update country-ca.csv --- country-ca.csv | 38 
+++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/country-ca.csv b/country-ca.csv index 29d158b4..8edabbad 100644 --- a/country-ca.csv +++ b/country-ca.csv @@ -4148,24 +4148,25 @@ ocd-division/country:ca/csd:2480135,Duhamel,,,,,MÉ,N,Duhamel,,,,7,,, ocd-division/country:ca/csd:2480140,Val-des-Bois,,,,,MÉ,N,Val-des-Bois,,,,7,,, ocd-division/country:ca/csd:2480145,Bowman,,,,,MÉ,N,Bowman,,,,7,,, ocd-division/country:ca/csd:2481017,Gatineau,,,,,V,Y,Gatineau,,Ville de Gatineau,,19,,, -ocd-division/country:ca/csd:2481017/district:1,d'Aylmer,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:10,de Touraine,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:11,de Pointe-Gatineau,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:12,du Carrefour-de-l'Hôpital,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:13,du Versant,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:14,de Bellevue,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:15,du Lac-Beauchamp,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:16,de la Rivière-Blanche,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:17,de Masson-Angers,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:18,de Buckingham,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:2,de Lucerne,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:3,de Deschênes,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:4,du Plateau—Manoir-des-Trembles,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:5,de Wright—Parc-de-la-Montagne,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:6,de l'Orée-du-Parc,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:7,de Saint-Raymond—Vanier,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:8,de Hull—Val-Tétreau,,,,,,,,,,,,,, -ocd-division/country:ca/csd:2481017/district:9,de Limbour,,,,,,,,,,,,,, 
+ocd-division/country:ca/csd:2481017/district:1,Aylmer,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:10,Limbour,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:11,Touraine,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:12,Pointe-Gatineau,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:13,Carrefour-de-l'Hôpital,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:14,Versant,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:15,Bellevue,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:16,Lac-Beauchamp,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:17,Rivière-Blanche,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:18,Masson-Angers,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:19,Buckingham,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:2,Lucerne,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:3,Deschênes,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:4,Plateau,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:5,Mitigomijokan,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:6,Manoir-des-Trembles—Val-Tétreau,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:7,Hull–Wright,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:8,Parc-de-la-Montagne—Saint-Raymond,,,,,,,,,,,,,, +ocd-division/country:ca/csd:2481017/district:9,Orée-du-Parc,,,,,,,,,,,,,, ocd-division/country:ca/csd:2482005,L'Ange-Gardien,,,,,MÉ,Y,L'Ange-Gardien,,,,7,,, ocd-division/country:ca/csd:2482005/district:1,du Lièvre,,,,,,,,,,,,,, ocd-division/country:ca/csd:2482005/district:2,Lac Donaldson,,,,,,,,,,,,,, @@ -5565,7 +5566,6 @@ ocd-division/country:ca/csd:3557061/ward:2,Ward 2,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:3,Ward 3,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:4,Ward 4,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:5,Ward 5,,,,,,,,,,,,,, 
-ocd-division/country:ca/csd:3557061/ward:6,Ward 6,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557066,Prince,,,,,TP,,Prince,,Township of Prince,,,,, ocd-division/country:ca/csd:3557071,Sagamok,,,,,IRI,N,Sagamok,,,,,,, ocd-division/country:ca/csd:3557072,Serpent River 7,,,,,IRI,N,Serpent River 7,,,,,,, From 50c59b20d02178da5aeda4f387d21615cf3e8e9a Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 27 Jun 2024 09:50:26 -0400 Subject: [PATCH 49/66] fix: ca_qc_gatineau --- ca_qc_gatineau/people.py | 43 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/ca_qc_gatineau/people.py b/ca_qc_gatineau/people.py index 34a9c91e..60522652 100644 --- a/ca_qc_gatineau/people.py +++ b/ca_qc_gatineau/people.py @@ -4,7 +4,6 @@ from utils import CanadianScraper COUNCIL_PAGE = "http://www.gatineau.ca/portail/default.aspx?p=guichet_municipal%2fconseil_municipal" -MAYOR_CONTACT_PAGE = "http://www.gatineau.ca/portail/default.aspx?p=la_ville/conseil_municipal/maire" class GatineauPersonScraper(CanadianScraper): @@ -12,32 +11,34 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) # it's all javascript rendered on the client... wow. 
- js = page.xpath('string(//div[@id="contenu-principal-centre-contenu-index"]/script[2])') # allow string() - districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js) - names = re.findall(r'arrayMembres\[a.+"(.+)"', js) - urls = re.findall(r'arrayLiens\[a.+"(.+)"', js) - # first item in list is mayor - p = Person(primary_org="legislature", name=names[0], district="Gatineau", role="Maire") - p.add_source(COUNCIL_PAGE) - p.add_source(MAYOR_CONTACT_PAGE) - email = "maire@gatineau.ca" # hardcoded - p.add_contact("email", email) - yield p + js = page.xpath('string(//div[@id="contenu-principal-centre-contenu-index"]/script[1])') # allow string() + roles = re.findall(r'arrayMembres\[.+?"(.+?)"', js) + districts = re.findall(r'arrayMembres\[.+?, "(.*?)"', js) + names = re.findall(r'arrayMembres\[.+?,.+?, "(.*?)"', js) + urls = re.findall(r'arrayMembres\[.+"(.*?)",', js) - councillors = list(zip(districts, names, urls))[1:] + councillors = list(zip(roles, districts, names, urls)) assert len(councillors), "No councillors found" - for raw_district, name, url in councillors: - if name == "Vacant": + for role, raw_district, name, url in councillors: + if name == "Vacant" or "(de " in role: continue - profile_url = COUNCIL_PAGE + "/" + url.split("/")[-1] profile_page = self.lxmlize(profile_url) - photo_url = profile_page.xpath('//div[@class="colonnes-2"]//img/@src')[0] - district = "District " + re.search(r"\d+", raw_district).group(0) - email = self.get_email(profile_page) - p = Person(primary_org="legislature", name=name, district=district, role="Conseiller") + photo_url = profile_page.xpath('//div[@class="colonnes-3"]//img/@src')[0] + if raw_district: + district = "District " + re.search(r"\d+", raw_district).group(0) + role = "Conseiller" + else: + district = "Gatineau" + role = "Maire" + email = self.get_email(profile_page, error=False) + phone = self.get_phone(profile_page, error=False) + p = Person(primary_org="legislature", name=name, district=district, role=role) 
p.add_source(COUNCIL_PAGE) p.add_source(profile_url) p.image = photo_url - p.add_contact("email", email) + if email: + p.add_contact("email", email) + if phone: + p.add_contact("voice", phone, "legislature") yield p From b9a30058f32479dd37c9dd0acbc1ef9a6a165cc1 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 26 Jun 2024 17:38:12 -0400 Subject: [PATCH 50/66] fix(ca-on_kawartha_lakes): Refactor scraper for new site --- ca_on_kawartha_lakes/people.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ca_on_kawartha_lakes/people.py b/ca_on_kawartha_lakes/people.py index 767970f1..ad2d33db 100644 --- a/ca_on_kawartha_lakes/people.py +++ b/ca_on_kawartha_lakes/people.py @@ -3,34 +3,34 @@ from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.city.kawarthalakes.on.ca/city-hall/mayor-council/members-of-council" +COUNCIL_PAGE = "https://www.kawarthalakes.ca/en/municipal-services/contact-a-council-member.aspx" class KawarthaLakesPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//p[@class="WSIndent"]/a') + councillors = page.xpath("//tr[.//h2]") assert len(councillors), "No councillors found" for councillor in councillors: - district = re.findall(r"(Ward [0-9]{1,2})", councillor.text_content()) + district = re.findall(r"(Ward \d)", councillor.text_content()) if district: district = district[0] - name = councillor.text_content().replace(district, "").strip() + name = re.sub(r"Ward \d|Councillor|Deputy Mayor|-", "", councillor.text_content()).strip() role = "Councillor" else: district = "Kawartha Lakes" name = councillor.text_content().replace("Mayor", "").strip() role = "Mayor" - url = councillor.attrib["href"] - page = self.lxmlize(url) - email = self.get_email(page) - image = page.xpath('//img[@class="image-right"]/@src')[0] + info_node = councillor.xpath("./following-sibling::*")[0] + email = self.get_email(info_node) + 
phone = self.get_phone(info_node) + image = info_node.xpath("//img/@src")[0] p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - p.add_source(url) + p.add_contact("voice", phone, "legislature") p.add_contact("email", email) p.image = image yield p From 9bd6fceaa1d61a6225e8b97c71ec5c6987948415 Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Wed, 26 Jun 2024 17:01:03 -0400 Subject: [PATCH 51/66] fix(ca_pe_summerside): Refactor scraper for new site --- ca_pe_summerside/people.py | 98 +++++++++++++++----------------------- 1 file changed, 38 insertions(+), 60 deletions(-) diff --git a/ca_pe_summerside/people.py b/ca_pe_summerside/people.py index 6ff8f2bd..ee516c79 100644 --- a/ca_pe_summerside/people.py +++ b/ca_pe_summerside/people.py @@ -1,76 +1,54 @@ import re -from urllib.parse import urljoin -from utils import CONTACT_DETAIL_TYPE_MAP from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://city.summerside.pe.ca/mayor-and-council/pages/2012/2/councillors/" -MAYOR_PAGE = "http://city.summerside.pe.ca/mayor-and-council/pages/2012/2/mayor/" +COUNCIL_PAGE = "https://summerside.hosted.civiclive.com/mayor_and_council" -class SummersidePersonScraper(CanadianScraper): - def scrape(self): - page = self.lxmlize(COUNCIL_PAGE, "iso-8859-1") +def decode_email(hex_email): + decoded_email = "" + key = int(hex_email[:2], 16) - yield self.scrape_mayor() + for i in range(2, len(hex_email) - 1, 2): + decoded_email += chr(int(hex_email[i : i + 2], 16) ^ key) - councillors = page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]') - assert len(councillors), "No councillors found" - for councillor in councillors: - url = councillor.xpath(".//a")[0].attrib["href"].replace("../", "") - page = self.lxmlize(url, "iso-8859-1") + return decoded_email - name = ( - page.xpath('//div[@class="articletitle"]/h1')[0] - .text_content() - .replace("Councillor", "") - .replace("Deputy 
Mayor", "") - ) - district = "Ward {}".format( - re.sub(r"\D+", "", page.xpath('//div[@class="articlebody-inside"]/p')[0].text_content()) - ) - p = Person(primary_org="legislature", name=name, district=district, role="Councillor") +class SummersidePersonScraper(CanadianScraper): + def scrape(self): + page = self.lxmlize(COUNCIL_PAGE, user_agent="Mozilla/5.0") + + councillors = page.xpath('//ul[@class="sidenav"]//a[contains(., "Mayor") or contains(., "Councillor")]/@href') + assert len(councillors), "No councillors found" + for url in councillors: + page = self.lxmlize(url, user_agent="Mozilla/5.0") + + role, name = page.xpath('//div[@id="pagetitle"]')[0].text_content().split(" /")[0].split(" ", 1) + + if role == "Mayor": + district = "Summerside" + else: + district = re.search( + r"(?<=Ward\s\d:\s).*(?=\n|\s$|)", + page.xpath('//div[contains(@id, "ContentPlaceHolder")]//img/parent::*')[0].text_content(), + ).group(0) + district = ( + district.replace(" -", "-").replace("- ", "-").replace("-", "—").replace("Councillor", "").strip() + ) + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(url) - photo_url_rel = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace("/..", "") - p.image = urljoin(url, photo_url_rel) + photo = page.xpath('//div[contains(@id, "ContentPlaceHolder")]//img/@src')[0] + phone = self.get_phone(page) + hex_email = page.xpath('//div[contains(@id, "ContentPlaceHolder")]//@data-cfemail')[0] + email = decode_email(hex_email) - contacts = ( - page.xpath('//div[@class="articlebody-inside"]/p')[1] - .text_content() - .replace("Biography", "") - .replace("Committees", "") - .split(":") - ) - for i, contact in enumerate(contacts): - if i == 0 or not contact: - continue - contact_type = re.findall(r"([A-Z][a-z]+)", contacts[i - 1])[0] - if contact_type != "Address": - contact = re.split(r"[A-Z]", contact)[0] - contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type] - 
p.add_contact(contact_type, contact, "" if contact_type == "email" else "legislature") - yield p - - def scrape_mayor(self): - page = self.lxmlize(MAYOR_PAGE, "iso-8859-1") - - name = page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace("Mayor", "") - - p = Person(primary_org="legislature", name=name, district="Summerside", role="Mayor") - p.add_source(MAYOR_PAGE) - p.image = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace("..", "") + p.image = photo + p.add_contact("voice", phone, "legislature") + p.add_contact("email", email) + print(email) - info = page.xpath('//div[@class="articlebody-inside"]/p') - phone = re.findall(r"to (.*)", info[1].text_content())[0] - address = info[3].text_content().replace("by mail: ", "") + " " + info[4].text_content() - email = self.get_email(info[5]) - - p.add_contact("voice", phone, "legislature") - p.add_contact("address", address, "legislature") - p.add_contact("email", email) - - return p + yield p From 3bca07fb6eecacd6a34dcf7ad2b6291dcef73a1d Mon Sep 17 00:00:00 2001 From: Rafe Murray Date: Thu, 27 Jun 2024 17:00:28 -0400 Subject: [PATCH 52/66] fix: ca_on_wilmot --- ca_on_wilmot/people.py | 54 +++++++++++------------------------------- 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index 8c5898b6..b72dbbba 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -1,51 +1,28 @@ -import re - from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "http://www.wilmot.ca/current-council.php" +COUNCIL_PAGE = "https://www.wilmot.ca/Modules/contact/search.aspx?s=EFHOVXSi8AOIMKMStZMNvAeQuAleQuAl" class WilmotPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - councillors = page.xpath('//table[@id="Main Content"]//td[@colspan="3"]//td/p/b') + councillors = page.xpath('//table[@class="contactList"]//tr') assert len(councillors), "No councillors found" 
for councillor in councillors: - district, name = councillor.xpath("./text()")[0].split(":") - if "Mayor" in district: + name, role_district = councillor.xpath(".//button/text()")[0].split(" - ", 1) + if "Mayor" in role_district: yield scrape_mayor(councillor, name) continue + role, district = role_district.split(" - ") - p = Person(primary_org="legislature", name=name, district=district, role="Councillor") + p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) - base_info = councillor.xpath("./parent::p/text()") - for info in councillor.xpath("./parent::p/following-sibling::p"): - if info.xpath(".//b"): - break - base_info = base_info + info.xpath("./text()") - - address = "" - complete = False - while not complete: - address = address + " " + base_info.pop(0) - if re.search(r"[A-Z][0-9A-Z][A-Z] \d[A-Z]\d", address): - complete = True - p.add_contact("address", address, "legislature") - - base_info.pop(-1) - base_info = " ".join(base_info).split() - for i, contact in enumerate(base_info): - if re.match(r"[0-9]", contact): - continue - if "fax" in contact: - p.add_contact("fax", base_info[i + 1], "legislature") - else: - p.add_contact(contact, base_info[i + 1], contact) - email = self.get_email(councillor, "./parent::p/following-sibling::p") - p.add_contact("email", email) + phone = self.get_phone(councillor).replace("/", "") + p.add_contact("voice", phone, "legislature") + print(name + ";", role + ";", district) yield p @@ -53,14 +30,11 @@ def scrape_mayor(div, name): p = Person(primary_org="legislature", name=name, district="Wilmot", role="Mayor") p.add_source(COUNCIL_PAGE) - info = div.xpath("./parent::p//text()") - info.pop(0) - address = " ".join(info[:3]) - phone = info[3].split()[1] - fax = info[4].split()[1] - email = info[-1] + address = div.xpath('.//div[@class="contactListAddress"]')[0].text_content() + phone = div.xpath('.//div[@class="contactListMainNumber"]/a/text()')[0] + other_phone = 
div.xpath('.//div[@class="contactListPhNumber"]/a/text()')[0] p.add_contact("address", address, "legislature") p.add_contact("voice", phone, "legislature") - p.add_contact("fax", fax, "legislature") - p.add_contact("email", email) + p.add_contact("voice", other_phone, "office") + return p From efa5296e482cb9796c795d7a0d9511bb2a1cdc00 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 28 Jun 2024 15:33:55 -0400 Subject: [PATCH 53/66] Update country-ca.csv --- country-ca.csv | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/country-ca.csv b/country-ca.csv index 8edabbad..0734bb82 100644 --- a/country-ca.csv +++ b/country-ca.csv @@ -4836,13 +4836,13 @@ ocd-division/country:ca/csd:3515037,North Kawartha,,,,,TP,,North Kawartha,,Towns ocd-division/country:ca/csd:3515044,Trent Lakes,,,,,MU,,Trent Lakes,,Municipality of Trent Lakes,ocd-division/country:ca/cd:3515,,,, ocd-division/country:ca/csd:3516010,Kawartha Lakes,,,,,CY,,Kawartha Lakes,,City of Kawartha Lakes,,,,, ocd-division/country:ca/csd:3516010/ward:1,Ward 1,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:10,Ward 10,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:11,Ward 11,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:12,Ward 12,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:13,Ward 13,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:14,Ward 14,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:15,Ward 15,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:16,Ward 16,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:10,Ward 10,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:11,Ward 11,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:12,Ward 12,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:13,Ward 13,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:14,Ward 14,,2018-12-01,,,,,,,,,,,, 
+ocd-division/country:ca/csd:3516010/ward:15,Ward 15,,2018-12-01,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:16,Ward 16,,2018-12-01,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:2,Ward 2,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:3,Ward 3,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:4,Ward 4,,,,,,,,,,,,,, @@ -4850,7 +4850,7 @@ ocd-division/country:ca/csd:3516010/ward:5,Ward 5,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:6,Ward 6,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:7,Ward 7,,,,,,,,,,,,,, ocd-division/country:ca/csd:3516010/ward:8,Ward 8,,,,,,,,,,,,,, -ocd-division/country:ca/csd:3516010/ward:9,Ward 9,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3516010/ward:9,Ward 9,,2018-12-01,,,,,,,,,,,, ocd-division/country:ca/csd:3518001,Pickering,,,,,CY,,Pickering,,City of Pickering,ocd-division/country:ca/cd:3518,,,, ocd-division/country:ca/csd:3518001/ward:1,Ward 1,,,,,,,,,,,,,, ocd-division/country:ca/csd:3518001/ward:2,Ward 2,,,,,,,,,,,,,, @@ -5566,6 +5566,7 @@ ocd-division/country:ca/csd:3557061/ward:2,Ward 2,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:3,Ward 3,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:4,Ward 4,,,,,,,,,,,,,, ocd-division/country:ca/csd:3557061/ward:5,Ward 5,,,,,,,,,,,,,, +ocd-division/country:ca/csd:3557061/ward:6,Ward 6,,2018-10-21,,,,,,,,,,,, ocd-division/country:ca/csd:3557066,Prince,,,,,TP,,Prince,,Township of Prince,,,,, ocd-division/country:ca/csd:3557071,Sagamok,,,,,IRI,N,Sagamok,,,,,,, ocd-division/country:ca/csd:3557072,Serpent River 7,,,,,IRI,N,Serpent River 7,,,,,,, From 2b4a9f8b31469e0ceb847ad985b1e33c0bef5064 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 2 Jul 2024 11:08:24 -0400 Subject: [PATCH 54/66] ci: Add automerge workflow --- .github/dependabot.yml | 6 ++++++ .github/workflows/automerge.yml | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 
.github/dependabot.yml create mode 100644 .github/workflows/automerge.yml diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..12301490 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,6 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml new file mode 100644 index 00000000..55365732 --- /dev/null +++ b/.github/workflows/automerge.yml @@ -0,0 +1,35 @@ +# The pull_request_target workflow trigger is dangerous. Do not add unrelated logic to this workflow. +# https://securitylab.github.com/research/github-actions-preventing-pwn-requests/ +# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#pull_request_target +name: Auto-merge +on: pull_request_target +permissions: + pull-requests: write # to approve the PR + contents: write # to merge the PR +jobs: + dependabot: + if: ${{ github.event.pull_request.user.login == 'dependabot[bot]' }} + runs-on: ubuntu-latest + steps: + - id: dependabot-metadata + uses: dependabot/fetch-metadata@v2 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + - if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh pr review --approve ${{ github.event.pull_request.html_url }} + - if: ${{ steps.dependabot-metadata.outputs.update-type != 'version-update:semver-major' || steps.dependabot-metadata.outputs.package-ecosystem == 'github_actions' }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh pr merge --auto --squash ${{ github.event.pull_request.html_url }} + precommit: + if: ${{ github.event.pull_request.user.login == 'pre-commit-ci[bot]' }} + runs-on: ubuntu-latest + steps: + - env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh pr review 
--approve ${{ github.event.pull_request.html_url }} + - env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: gh pr merge --auto --squash ${{ github.event.pull_request.html_url }} From 5a7c002c94c735f6daca70546500d392122b96c1 Mon Sep 17 00:00:00 2001 From: Simon Meers Date: Mon, 15 Jul 2024 14:56:34 +1000 Subject: [PATCH 55/66] update Saskatchewan --- ca_sk/people.py | 113 +++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 59 deletions(-) diff --git a/ca_sk/people.py b/ca_sk/people.py index 6f46fe52..b6a584b7 100644 --- a/ca_sk/people.py +++ b/ca_sk/people.py @@ -1,3 +1,5 @@ +import re + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -8,71 +10,64 @@ class SaskatchewanPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) - members = page.xpath('//table[@id="MLAs"]//tr')[1:] + members = page.xpath('//table[@id="mla-table"]//tr')[1:] assert len(members), "No members found" for member in members: - if "Vacant" not in member.xpath("./td")[0].text_content(): - name = member.xpath("./td")[0].text_content().split(". ", 1)[1] - district = member.xpath("./td")[2].text_content() - url = member.xpath("./td[1]/a/@href")[0] - page = self.lxmlize(url) - party = page.xpath('//span[@id="ContentContainer_MainContent_ContentBottom_Property4"]' "/span")[ - 0 - ].text - - p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) - p.add_source(COUNCIL_PAGE) - p.add_source(url) - try: - p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] - except IndexError: - pass + if "Vacant" in member.xpath("./td")[1].text_content(): + continue + name = member.xpath("./td")[0].text_content().split(". 
", 1)[1].strip() + district = member.xpath("./td")[2].text_content().strip() + url = member.xpath("./td[1]/a/@href")[0] + page = self.lxmlize(url) + party = page.xpath('//div[contains(@class, "mla-header")]')[0].text.split(' - ')[1].strip() - contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0] - website = contact.xpath("./div[3]/div[3]/div[2]/a") - if website: - p.add_link(website[0].text_content()) + p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) + p.add_source(COUNCIL_PAGE) + p.add_source(url) + try: + p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] + except IndexError: + pass - def handle_address(lines, address_type): - address_lines = [] - for line in lines: - if line.endswith(":"): # Room:, Phone:, Fax: - break - address_lines.append(line) - if address_lines: - p.add_contact( - "address", - " ".join(address_lines), - address_type, - ) + def handle_address(lines, address_type): + address_lines = [] + for line in lines: + if re.match(r'(Room|Phone|Fax)\:', line): + break + address_lines.append(line) + if address_lines: + p.add_contact( + "address", + " ".join(address_lines), + address_type, + ) - def handle_phone(lines, phone_type): - if "Phone:" in lines: - next_line = lines[lines.index("Phone:") + 1] - if next_line.endswith(":"): - return - number = None - if "/" in next_line: - for fragment in next_line.split("/"): - if fragment.strip().startswith("306-"): - number = fragment.strip() - break - else: - number = next_line - p.add_contact("voice", number, phone_type, area_code=306) + def handle_phone(lines, phone_type): + matches = re.findall(r'Phone\:\s*(306-[\d\-]+)', '\n'.join(lines)) + if len(matches) == 1: + p.add_contact("voice", matches[0], phone_type, area_code=306) - legislature_lines = contact.xpath('.//div[@class="col-md-4"][1]/div//text()') - assert legislature_lines[0] == "Legislative Building Address" - handle_address(legislature_lines[1:], "legislature") - 
handle_phone(legislature_lines[1:], "legislature") + for address in page.xpath('//div[@class="col-md-3"]'): + lines = address.xpath('./div//text()') + address_type = None + if lines[0] == "Legislative Building Address": + address_type = "legislature" + elif lines[0] == "Constituency Address": + address_type = "constituency" + else: + raise AssertionError(f"Unexpected address type: {lines[0]}") + handle_address(lines[1:], address_type) + handle_phone(lines[1:], address_type) - constituency_lines = contact.xpath('.//div[@class="col-md-4"][2]/div//text()') - assert constituency_lines[0] == "Constituency Address" - handle_address(constituency_lines[1:], "constituency") - handle_phone(constituency_lines[1:], "constituency") + email = self.get_email(page.xpath('//div[@id="content"]')[0], error=False) + if email: + p.add_contact("email", email) - email = self.get_email(contact, error=False) - if email: - p.add_contact("email", email) + websites = re.findall( + r'Website:\s*(http\S+)', + ' '.join(page.xpath('//div[@class="col-md-4"]/div//text()')) + ) + if len(websites) == 1: + p.add_link(websites[0]) - yield p + yield p From 8160fe6f8e71388996dd57a6b5e5444a6ee46551 Mon Sep 17 00:00:00 2001 From: Simon Meers Date: Mon, 15 Jul 2024 15:02:33 +1000 Subject: [PATCH 56/66] black formatting --- ca_sk/people.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/ca_sk/people.py b/ca_sk/people.py index b6a584b7..352430cf 100644 --- a/ca_sk/people.py +++ b/ca_sk/people.py @@ -19,7 +19,7 @@ def scrape(self): district = member.xpath("./td")[2].text_content().strip() url = member.xpath("./td[1]/a/@href")[0] page = self.lxmlize(url) - party = page.xpath('//div[contains(@class, "mla-header")]')[0].text.split(' - ')[1].strip() + party = page.xpath('//div[contains(@class, "mla-header")]')[0].text.split(" - ")[1].strip() p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) p.add_source(COUNCIL_PAGE) @@ -32,7 +32,7 @@ def 
scrape(self): def handle_address(lines, address_type): address_lines = [] for line in lines: - if re.match(r'(Room|Phone|Fax)\:', line): + if re.match(r"(Room|Phone|Fax)\:", line): break address_lines.append(line) if address_lines: @@ -43,12 +43,12 @@ def handle_address(lines, address_type): ) def handle_phone(lines, phone_type): - matches = re.findall(r'Phone\:\s*(306-[\d\-]+)', '\n'.join(lines)) + matches = re.findall(r"Phone\:\s*(306-[\d\-]+)", "\n".join(lines)) if len(matches) == 1: p.add_contact("voice", matches[0], phone_type, area_code=306) for address in page.xpath('//div[@class="col-md-3"]'): - lines = address.xpath('./div//text()') + lines = address.xpath("./div//text()") address_type = None if lines[0] == "Legislative Building Address": address_type = "legislature" @@ -64,8 +64,7 @@ def handle_phone(lines, phone_type): p.add_contact("email", email) websites = re.findall( - r'Website:\s*(http\S+)', - ' '.join(page.xpath('//div[@class="col-md-4"]/div//text()')) + r"Website:\s*(http\S+)", " ".join(page.xpath('//div[@class="col-md-4"]/div//text()')) ) if len(websites) == 1: p.add_link(websites[0]) From 1fe12c6940cb9c0d47e9f1c803c17d17a3cbc28c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 17:48:45 +0000 Subject: [PATCH 57/66] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.3.0 → 24.4.2](https://github.com/psf/black/compare/24.3.0...24.4.2) - [github.com/pycqa/flake8: 7.0.0 → 7.1.0](https://github.com/pycqa/flake8/compare/7.0.0...7.1.0) --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bba16ab1..47434be2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,11 +2,11 @@ ci: autoupdate_schedule: quarterly repos: - repo: 
https://github.com/psf/black - rev: 24.3.0 + rev: 24.4.2 hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 7.0.0 + rev: 7.1.0 hooks: - id: flake8 additional_dependencies: [flake8-comprehensions] From 5fdd7d43bb5c51487ac62b9cc5f41286f57fb0f4 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 16 Sep 2024 17:00:14 -0400 Subject: [PATCH 58/66] build: Upgrade Django. chore: Use ruff, uv. --- .pre-commit-config.yaml | 23 ++++---- ca_bc_coquitlam/people.py | 1 - ca_bc_surrey/people.py | 1 - ca_nl/people.py | 10 ++-- ca_ns_cape_breton/people.py | 3 +- ca_qc_cote_saint_luc/people.py | 3 +- patch.py | 6 +-- pyproject.toml | 13 +++-- requirements.in | 11 ++++ requirements.txt | 96 +++++++++++++++++++++++++++++----- setup.cfg | 2 - setup.py | 25 --------- tox.ini | 5 -- utils.py | 1 + 14 files changed, 124 insertions(+), 76 deletions(-) create mode 100644 requirements.in delete mode 100644 setup.cfg delete mode 100644 setup.py delete mode 100644 tox.ini diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 47434be2..5e135c6f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,16 +1,17 @@ ci: autoupdate_schedule: quarterly + skip: [pip-compile] +default_language_version: + python: python3.10 repos: - - repo: https://github.com/psf/black - rev: 24.4.2 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.3 hooks: - - id: black - - repo: https://github.com/pycqa/flake8 - rev: 7.1.0 + - id: ruff + - id: ruff-format + - repo: https://github.com/astral-sh/uv-pre-commit + rev: 0.4.4 hooks: - - id: flake8 - additional_dependencies: [flake8-comprehensions] - - repo: https://github.com/pycqa/isort - rev: 5.13.2 - hooks: - - id: isort + - id: pip-compile + name: pip-compile requirements.in + args: [requirements.in, -o, requirements.txt] diff --git a/ca_bc_coquitlam/people.py b/ca_bc_coquitlam/people.py index 51e98652..e8147c56 100644 --- a/ca_bc_coquitlam/people.py +++ 
b/ca_bc_coquitlam/people.py @@ -7,7 +7,6 @@ class CoquitlamPersonScraper(CanadianScraper): - def scrape(self): def build_email(script): w = re.findall(r'w = "(.*?)"', script)[0] diff --git a/ca_bc_surrey/people.py b/ca_bc_surrey/people.py index b0240acd..f9654b73 100644 --- a/ca_bc_surrey/people.py +++ b/ca_bc_surrey/people.py @@ -12,7 +12,6 @@ def scrape(self): assert len(members), "No members found" seat_number = 1 for member in members: - role, name = member.xpath('.//a[@class="teaser__link"]/h4')[0].text_content().split(" ", 1) district = "Surrey (seat {})".format(seat_number) seat_number += 1 diff --git a/ca_nl/people.py b/ca_nl/people.py index a75bec04..b9f38932 100644 --- a/ca_nl/people.py +++ b/ca_nl/people.py @@ -1,9 +1,8 @@ import json import re -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "https://www.assembly.nl.ca/js/members-index.js" @@ -26,9 +25,7 @@ def scrape(self): page = self.get(COUNCIL_PAGE) members = re.search( r"members = (\[(.+)\]);", page.content.decode().replace("[Member-elect]", ""), re.DOTALL - ).groups()[ - 0 - ] # extract javascript array + ).groups()[0] # extract javascript array members = re.sub("", "", members) # remove comments members = re.sub("", "", members).replace("", "") # tags members = members.replace('"', r"\"") # escape double quotes @@ -60,7 +57,8 @@ def scrape(self): ) if member.get("email"): p.add_contact( - "email", member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca") # seriously guys?! + "email", + member["email"].replace("@gov.nl.ca@gov.nl.ca", "@gov.nl.ca"), # seriously guys?! 
) p.add_source(COUNCIL_PAGE) diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py index dad984d0..6774bd7e 100644 --- a/ca_ns_cape_breton/people.py +++ b/ca_ns_cape_breton/people.py @@ -1,9 +1,8 @@ import html import re -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "http://www.cbrm.ns.ca/mayor-council-2.html" MAYOR_PAGE = "http://www.cbrm.ns.ca/mayor" diff --git a/ca_qc_cote_saint_luc/people.py b/ca_qc_cote_saint_luc/people.py index 56a1f225..9670ea1c 100644 --- a/ca_qc_cote_saint_luc/people.py +++ b/ca_qc_cote_saint_luc/people.py @@ -1,6 +1,5 @@ -from utils import CUSTOM_USER_AGENT +from utils import CUSTOM_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "https://cotesaintluc.org/fr/affaires-municipales/membres-du-conseil/" diff --git a/patch.py b/patch.py index 2d6482a0..8acf2c0c 100644 --- a/patch.py +++ b/patch.py @@ -27,9 +27,9 @@ (r"\A1 \d{3} \d{3}-\d{4}(?: x\d+)?\Z", lambda x: x["type"] in ("text", "voice", "fax", "cell", "video", "pager")), ] # Validate the format of contact_details[].note. -_contact_details["items"]["properties"]["note"][ - "pattern" -] = r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z" +_contact_details["items"]["properties"]["note"]["pattern"] = ( + r"\A(?:constituency|legislature|office|residence|)(?: \(\d\))?\Z" +) # contact_details[] must not include unexpected properties. 
_contact_details["items"]["additionalProperties"] = False diff --git a/pyproject.toml b/pyproject.toml index 8656c702..059d331a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,11 @@ -[tool.black] +[project] +name = "scrapers_ca" +version = "0.0.1" + +[tool.ruff] line-length = 119 +target-version = "py310" -[tool.isort] -profile = 'black' -line_length = 119 +[tool.ruff.lint] +select = ["C4", "E", "F", "I", "W"] +ignore = ["E501"] diff --git a/requirements.in b/requirements.in new file mode 100644 index 00000000..3cb35058 --- /dev/null +++ b/requirements.in @@ -0,0 +1,11 @@ +# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support. +git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5 +agate +agate-excel +django<5 +invoke +lxml +opencivicdata +regex +requests[security] +unidecode diff --git a/requirements.txt b/requirements.txt index c14d43a1..080af981 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,83 @@ -# 0.9.0 uses jsonschema instead of validictory, so we use a commit after 0.8.0 that adds Django 2.0 support. 
--e git+https://github.com/opencivicdata/pupa.git@f0791f7de07574039eff10d804e4683399a16ec5#egg=pupa -opencivicdata==3.3.1 -Django==2.2.28 - -# Scrapers -agate -agate-excel -lxml==4.9.1 -regex==2014.04.10 -requests[security]==2.32.0 - -# Maintenance +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt +agate==1.12.0 + # via + # -r requirements.in + # agate-excel +agate-excel==0.4.1 + # via -r requirements.in +asgiref==3.8.1 + # via django +babel==2.16.0 + # via agate +certifi==2024.8.30 + # via requests +charset-normalizer==3.3.2 + # via requests +dj-database-url==0.3.0 + # via pupa +django==4.2.16 + # via + # -r requirements.in + # opencivicdata + # pupa +et-xmlfile==1.1.0 + # via openpyxl +idna==3.10 + # via requests invoke==0.11.1 -Unidecode==0.04.14 + # via -r requirements.in +isodate==0.6.1 + # via agate +leather==0.4.0 + # via agate +lxml==4.9.1 + # via -r requirements.in +olefile==0.47 + # via agate-excel +opencivicdata==3.3.1 + # via + # -r requirements.in + # pupa +openpyxl==3.1.5 + # via agate-excel +parsedatetime==2.6 + # via agate +psycopg2==2.9.9 + # via pupa +psycopg2-binary==2.9.9 + # via opencivicdata +pupa @ git+https://github.com/opencivicdata/pupa@f0791f7de07574039eff10d804e4683399a16ec5 + # via -r requirements.in +python-slugify==8.0.4 + # via agate +pytimeparse==1.1.8 + # via agate +pytz==2024.2 + # via pupa +regex==2014.4.10 + # via -r requirements.in +requests==2.32.3 + # via + # -r requirements.in + # scrapelib +scrapelib==2.3.0 + # via pupa +six==1.16.0 + # via isodate +sqlparse==0.5.1 + # via django +text-unidecode==1.3 + # via python-slugify +typing-extensions==4.12.2 + # via asgiref +unidecode==0.4.14 + # via -r requirements.in +urllib3==1.26.20 + # via + # requests + # scrapelib +validictory==1.1.3 + # via pupa +xlrd==2.0.1 + # via agate-excel diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index cfb0df10..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ 
-[flake8] -extend-ignore = E203,E501 diff --git a/setup.py b/setup.py deleted file mode 100644 index 15719185..00000000 --- a/setup.py +++ /dev/null @@ -1,25 +0,0 @@ -# @see https://pythonhosted.org/an_example_pypi_project/setuptools.html -# @see https://pythonhosted.org/setuptools/setuptools.html -import os - -from setuptools import find_packages, setup - - -def read(fname): - return open(os.path.join(os.path.dirname(__file__), fname)).read() - - -setup( - name="scrapers_ca", - version="0.0.1", - author="Open North", - author_email="info@opennorth.ca", - description="Canadian legislative scrapers", - license="MIT", - url="https://github.com/opencivicdata/scrapers-ca", - packages=find_packages(), - long_description=read("README.md"), - install_requires=[ - "lxml", - ], -) diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 2c8b2524..00000000 --- a/tox.ini +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -exclude=disabled -ignore=E501,E731 -# E501 line too long (X > 79 characters) -# E731 do not assign a lambda expression, use a def diff --git a/utils.py b/utils.py index 99fef627..030728bb 100644 --- a/utils.py +++ b/utils.py @@ -256,6 +256,7 @@ class CSVScraper(CanadianScraper): """ Set the CSV file's delimiter. """ + delimiter = "," """ Set the CSV file's encoding, like 'windows-1252' ('utf-8' by default). From 66f4de3ad12a7a51ce8a476b4d3d689cf020e26b Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 16 Sep 2024 17:04:00 -0400 Subject: [PATCH 59/66] chore: Run ruff check . 
--fix --- ca/people.py | 2 +- ca_ab/people.py | 12 +-- ca_ab_grande_prairie/__init__.py | 2 +- ca_ab_grande_prairie/people.py | 2 +- ca_ab_grande_prairie_county_no_1/__init__.py | 4 +- ca_ab_lethbridge/__init__.py | 2 +- ca_ab_lethbridge/people.py | 2 +- ca_ab_wood_buffalo/__init__.py | 12 +-- ca_ab_wood_buffalo/people.py | 4 +- ca_bc_abbotsford/people.py | 4 +- ca_bc_burnaby/people.py | 2 +- ca_bc_coquitlam/people.py | 2 +- ca_bc_langley/people.py | 2 +- ca_bc_langley_city/people.py | 2 +- ca_bc_new_westminster/people.py | 2 +- ca_bc_richmond/people.py | 2 +- ca_bc_saanich/people.py | 2 +- ca_bc_surrey/people.py | 2 +- ca_bc_vancouver/__init__.py | 4 +- ca_bc_victoria/people.py | 2 +- ca_nb_moncton/__init__.py | 6 +- ca_nb_moncton/people.py | 2 +- ca_nb_saint_john/__init__.py | 6 +- ca_nl/people.py | 4 +- ca_nl_st_john_s/__init__.py | 6 +- ca_nl_st_john_s/people.py | 2 +- ca_ns/people.py | 18 ++-- ca_ns_cape_breton/people.py | 2 +- ca_on/people.py | 2 +- ca_on_ajax/__init__.py | 8 +- ca_on_belleville/__init__.py | 4 +- ca_on_belleville/people.py | 2 +- ca_on_brampton/__init__.py | 8 +- ca_on_brantford/__init__.py | 4 +- ca_on_caledon/people.py | 6 +- ca_on_cambridge/__init__.py | 6 +- ca_on_chatham_kent/__init__.py | 4 +- ca_on_chatham_kent/people.py | 2 +- ca_on_clarington/__init__.py | 2 +- ca_on_fort_erie/__init__.py | 2 +- ca_on_georgina/__init__.py | 2 +- ca_on_grimsby/__init__.py | 4 +- ca_on_grimsby/people.py | 2 +- ca_on_guelph/__init__.py | 4 +- ca_on_huron/__init__.py | 2 +- ca_on_lambton/__init__.py | 2 +- ca_on_lambton/people.py | 2 +- ca_on_lasalle/__init__.py | 2 +- ca_on_lasalle/people.py | 2 +- ca_on_lincoln/__init__.py | 4 +- ca_on_markham/__init__.py | 6 +- ca_on_markham/people.py | 2 +- ca_on_milton/__init__.py | 8 +- ca_on_newmarket/__init__.py | 4 +- ca_on_niagara/__init__.py | 2 +- ca_on_niagara_on_the_lake/__init__.py | 2 +- ca_on_north_dumfries/people.py | 2 +- ca_on_oakville/__init__.py | 8 +- ca_on_oshawa/__init__.py | 8 +- 
ca_on_peel/__init__.py | 12 +-- ca_on_pickering/__init__.py | 4 +- ca_on_richmond_hill/__init__.py | 6 +- ca_on_richmond_hill/people.py | 2 +- ca_on_sault_ste_marie/__init__.py | 4 +- ca_on_sault_ste_marie/people.py | 2 +- ca_on_st_catharines/__init__.py | 4 +- ca_on_thunder_bay/__init__.py | 4 +- ca_on_thunder_bay/people.py | 2 +- ca_on_uxbridge/__init__.py | 2 +- ca_on_vaughan/__init__.py | 6 +- ca_on_vaughan/people.py | 2 +- ca_on_waterloo_region/__init__.py | 6 +- ca_on_waterloo_region/people.py | 4 +- ca_on_welland/__init__.py | 4 +- ca_on_whitby/__init__.py | 6 +- ca_on_whitby/people.py | 2 +- ca_on_whitchurch_stouffville/__init__.py | 2 +- ca_on_woolwich/__init__.py | 4 +- ca_on_woolwich/people.py | 2 +- ca_pe_stratford/people.py | 2 +- ca_qc/people.py | 4 +- ca_qc_longueuil/__init__.py | 2 +- ca_qc_longueuil/people.py | 4 +- ca_qc_montreal_est/people.py | 2 +- ca_qc_quebec/people.py | 2 +- ca_qc_trois_rivieres/people.py | 2 +- disabled/ca_bc_municipalities/people.py | 2 +- .../ca_bc_municipalities_candidates/people.py | 2 +- disabled/ca_municipalities/people.py | 2 +- disabled/ca_nb_municipalities/people.py | 10 +-- disabled/ca_ns_municipalities/people.py | 2 +- patch.py | 2 +- tasks.py | 85 ++++++++----------- utils.py | 64 ++++++-------- 94 files changed, 231 insertions(+), 268 deletions(-) diff --git a/ca/people.py b/ca/people.py index 6cf7b5f4..a9605a35 100644 --- a/ca/people.py +++ b/ca/people.py @@ -119,7 +119,7 @@ def scrape_people(self, rows, gender): ): note = "constituency" if i: - note += " ({})".format(i + 1) + note += f" ({i + 1})" address = constituency_office_el.xpath("./p[1]")[0] address = address.text_content().strip().splitlines() diff --git a/ca_ab/people.py b/ca_ab/people.py index 601fa525..3edeb0bc 100644 --- a/ca_ab/people.py +++ b/ca_ab/people.py @@ -59,8 +59,8 @@ def scrape(self): field_names = next(reader) for name in OFFICE_FIELDS: assert field_names.count(name) == 2 - field_names[field_names.index(name)] = "{} 1".format(name) - 
field_names[field_names.index(name)] = "{} 2".format(name) + field_names[field_names.index(name)] = f"{name} 1" + field_names[field_names.index(name)] = f"{name} 2" rows = [dict(zip_longest(field_names, row)) for row in reader] assert len(rows), "No members found" for mla in rows: @@ -76,8 +76,8 @@ def scrape(self): row_xpath = '//td[normalize-space()="{}"]/..'.format( mla["Constituency Name"], ) - (detail_url,) = index.xpath("{}//a/@href".format(row_xpath)) - (photo_url,) = index.xpath("{}//img/@src".format(row_xpath)) + (detail_url,) = index.xpath(f"{row_xpath}//a/@href") + (photo_url,) = index.xpath(f"{row_xpath}//img/@src") district = mla["Constituency Name"] if district == "Calgary-Bhullar-McCall": district = "Calgary-McCall" @@ -108,10 +108,10 @@ def scrape(self): for suffix, note in addresses: for key, contact_type in (("Phone", "voice"), ("Fax", "fax")): - value = mla["{} Number {}".format(key, suffix)] + value = mla[f"{key} Number {suffix}"] if value and value != "Pending": p.add_contact(contact_type, value, note) - address = ", ".join(filter(bool, [mla["{} {}".format(field, suffix)] for field in ADDRESS_FIELDS])) + address = ", ".join(filter(bool, [mla[f"{field} {suffix}"] for field in ADDRESS_FIELDS])) if address: p.add_contact("address", address, note) diff --git a/ca_ab_grande_prairie/__init__.py b/ca_ab_grande_prairie/__init__.py index 67a6f1e5..42329fcc 100644 --- a/ca_ab_grande_prairie/__init__.py +++ b/ca_ab_grande_prairie/__init__.py @@ -17,7 +17,7 @@ def get_organizations(self): for seat_number in range(1, 9): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_ab_grande_prairie/people.py b/ca_ab_grande_prairie/people.py index 1508f5d2..9ff05187 100644 --- a/ca_ab_grande_prairie/people.py +++ b/ca_ab_grande_prairie/people.py @@ -14,7 +14,7 @@ def scrape(self): for councillor in councillors: 
role, name = councillor.xpath(".//h3")[0].text_content().split(" ", 1) if role == "Councillor": - district = "Grande Prairie (seat {})".format(seat_number) + district = f"Grande Prairie (seat {seat_number})" seat_number += 1 else: district = " Grande Prairie" diff --git a/ca_ab_grande_prairie_county_no_1/__init__.py b/ca_ab_grande_prairie_county_no_1/__init__.py index 632cdb9d..fc7fd5da 100644 --- a/ca_ab_grande_prairie_county_no_1/__init__.py +++ b/ca_ab_grande_prairie_county_no_1/__init__.py @@ -16,8 +16,8 @@ def get_organizations(self): for division_number in range(1, 10): organization.add_post( role="Councillor", - label="Division {}".format(division_number), - division_id="{}/division:{}".format(self.division_id, division_number), + label=f"Division {division_number}", + division_id=f"{self.division_id}/division:{division_number}", ) yield organization diff --git a/ca_ab_lethbridge/__init__.py b/ca_ab_lethbridge/__init__.py index 40d32197..d4e4c9c6 100644 --- a/ca_ab_lethbridge/__init__.py +++ b/ca_ab_lethbridge/__init__.py @@ -17,7 +17,7 @@ def get_organizations(self): for seat_number in range(1, 9): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_ab_lethbridge/people.py b/ca_ab_lethbridge/people.py index 5346d0e2..808c98e5 100644 --- a/ca_ab_lethbridge/people.py +++ b/ca_ab_lethbridge/people.py @@ -24,7 +24,7 @@ def scrape_person(self, url, seat_number): p = Person( primary_org="legislature", name=name, - district="Lethbridge (seat {})".format(seat_number + 1), + district=f"Lethbridge (seat {seat_number + 1})", role="Councillor", ) diff --git a/ca_ab_wood_buffalo/__init__.py b/ca_ab_wood_buffalo/__init__.py index 40ae2c69..91eee478 100644 --- a/ca_ab_wood_buffalo/__init__.py +++ b/ca_ab_wood_buffalo/__init__.py @@ -17,16 +17,16 @@ def get_organizations(self): for seat_number in range(1, 7): 
organization.add_post( role="Councillor", - label="Ward 1 (seat {})".format(seat_number), - division_id="{}/ward:1".format(self.division_id), + label=f"Ward 1 (seat {seat_number})", + division_id=f"{self.division_id}/ward:1", ) for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward 2 (seat {})".format(seat_number), - division_id="{}/ward:2".format(self.division_id), + label=f"Ward 2 (seat {seat_number})", + division_id=f"{self.division_id}/ward:2", ) - organization.add_post(role="Councillor", label="Ward 3", division_id="{}/ward:3".format(self.division_id)) - organization.add_post(role="Councillor", label="Ward 4", division_id="{}/ward:4".format(self.division_id)) + organization.add_post(role="Councillor", label="Ward 3", division_id=f"{self.division_id}/ward:3") + organization.add_post(role="Councillor", label="Ward 4", division_id=f"{self.division_id}/ward:4") yield organization diff --git a/ca_ab_wood_buffalo/people.py b/ca_ab_wood_buffalo/people.py index 2760aedd..b53bc03c 100644 --- a/ca_ab_wood_buffalo/people.py +++ b/ca_ab_wood_buffalo/people.py @@ -33,13 +33,13 @@ def scrape(self): for ward in wards: area = ward.text_content().split("–", 1)[1].strip() councillors = ward.xpath("./following-sibling::table[1]/tbody/tr/td/h3") - assert len(councillors), "No councillors found for {}".format(area) + assert len(councillors), f"No councillors found for {area}" for councillor in councillors: name = councillor.text_content() if area in ("Ward 1", "Ward 2"): seat_numbers[area] += 1 - district = "{} (seat {})".format(area, seat_numbers[area]) + district = f"{area} (seat {seat_numbers[area]})" else: district = area diff --git a/ca_bc_abbotsford/people.py b/ca_bc_abbotsford/people.py index 5d29da2f..db72003b 100644 --- a/ca_bc_abbotsford/people.py +++ b/ca_bc_abbotsford/people.py @@ -19,12 +19,12 @@ def scrape(self): ] assert len(councillors), "No councillors found" - assert len(councillors) == len(contact_data), "Expected {}, got 
{}".format(len(councillors), len(contact_data)) + assert len(councillors) == len(contact_data), f"Expected {len(councillors)}, got {len(contact_data)}" for councillor, contact in zip(councillors, contact_data): text = councillor.xpath(".//h3/a")[0].text_content() if text.startswith("Councill"): role = "Councillor" - district = "Abbotsford (seat {})".format(councillor_seat_number) + district = f"Abbotsford (seat {councillor_seat_number})" councillor_seat_number += 1 else: role = "Mayor" diff --git a/ca_bc_burnaby/people.py b/ca_bc_burnaby/people.py index 07155a92..856a1b34 100644 --- a/ca_bc_burnaby/people.py +++ b/ca_bc_burnaby/people.py @@ -25,7 +25,7 @@ def scrape(self): if role == "Mayor": district = "Burnaby" else: - district = "Burnaby (seat {})".format(councillor_seat_number) + district = f"Burnaby (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role, image=photo_url) diff --git a/ca_bc_coquitlam/people.py b/ca_bc_coquitlam/people.py index e8147c56..2df34976 100644 --- a/ca_bc_coquitlam/people.py +++ b/ca_bc_coquitlam/people.py @@ -35,7 +35,7 @@ def build_email(script): if role == "Mayor": district = "Coquitlam" else: - district = "Coquitlam (seat {})".format(councillor_seat_number) + district = f"Coquitlam (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_bc_langley/people.py b/ca_bc_langley/people.py index f6863abb..e3121e54 100644 --- a/ca_bc_langley/people.py +++ b/ca_bc_langley/people.py @@ -15,7 +15,7 @@ def scrape(self): page = self.lxmlize(url) name = page.xpath("//h1")[0].text_content().strip() - district = "Langley (seat {})".format(seat_number) + district = f"Langley (seat {seat_number})" seat_number += 1 email = self.get_email(page) phone = self.get_phone(page) diff --git a/ca_bc_langley_city/people.py b/ca_bc_langley_city/people.py index eaf1f3fd..0db77a03 
100644 --- a/ca_bc_langley_city/people.py +++ b/ca_bc_langley_city/people.py @@ -22,7 +22,7 @@ def scrape(self): phone_div = councillor.xpath('..//p[contains(., "Phone:")]')[0] phone = self.get_phone(phone_div) else: - district = "Langley (seat {})".format(councillor_seat_number) + district = f"Langley (seat {councillor_seat_number})" phone = ( "604 514 2800" # According to their site, all councillors can be contacted at this phone number ) diff --git a/ca_bc_new_westminster/people.py b/ca_bc_new_westminster/people.py index 44e96727..a6f4a8a0 100644 --- a/ca_bc_new_westminster/people.py +++ b/ca_bc_new_westminster/people.py @@ -15,7 +15,7 @@ def scrape(self): assert len(councillors), "No councillors found" for councillor in councillors: name = councillor.xpath(".//a[@name]")[0].text_content() - district = "New Westminster (seat {})".format(seat_number) + district = f"New Westminster (seat {seat_number})" seat_number += 1 p = Person(primary_org="legislature", name=name, role="Councillor", district=district) photo = councillor.xpath("//img/@src")[0] diff --git a/ca_bc_richmond/people.py b/ca_bc_richmond/people.py index 795637b3..c90d664b 100644 --- a/ca_bc_richmond/people.py +++ b/ca_bc_richmond/people.py @@ -21,7 +21,7 @@ def scrape(self): if role == "Mayor": district = "Richmond" else: - district = "Richmond (seat {})".format(councillor_seat_number) + district = f"Richmond (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_bc_saanich/people.py b/ca_bc_saanich/people.py index e7088e99..d3ea7da2 100644 --- a/ca_bc_saanich/people.py +++ b/ca_bc_saanich/people.py @@ -26,7 +26,7 @@ def scrape(self): district = "Saanich" else: role = "Councillor" - district = "Saanich (seat {})".format(councillor_seat_number) + district = f"Saanich (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, 
role=role) diff --git a/ca_bc_surrey/people.py b/ca_bc_surrey/people.py index f9654b73..bf877b91 100644 --- a/ca_bc_surrey/people.py +++ b/ca_bc_surrey/people.py @@ -13,7 +13,7 @@ def scrape(self): seat_number = 1 for member in members: role, name = member.xpath('.//a[@class="teaser__link"]/h4')[0].text_content().split(" ", 1) - district = "Surrey (seat {})".format(seat_number) + district = f"Surrey (seat {seat_number})" seat_number += 1 photo_url = member.xpath(".//figure//img/@src")[0] diff --git a/ca_bc_vancouver/__init__.py b/ca_bc_vancouver/__init__.py index f07c572d..3fa273f6 100644 --- a/ca_bc_vancouver/__init__.py +++ b/ca_bc_vancouver/__init__.py @@ -17,13 +17,13 @@ def get_organizations(self): for seat_number in range(1, 11): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for seat_number in range(1, 8): organization.add_post( role="Commissioner", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_bc_victoria/people.py b/ca_bc_victoria/people.py index b6c05cc8..9796b6ca 100644 --- a/ca_bc_victoria/people.py +++ b/ca_bc_victoria/people.py @@ -20,7 +20,7 @@ def scrape(self): phone = self.get_phone(councillor) url = councillor.xpath(".//h3/a/@href")[0] - district = "Victoria (seat {})".format(seat_number) + district = f"Victoria (seat {seat_number})" seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_nb_moncton/__init__.py b/ca_nb_moncton/__init__.py index c4a931d0..5d2abbe6 100644 --- a/ca_nb_moncton/__init__.py +++ b/ca_nb_moncton/__init__.py @@ -17,15 +17,15 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor at Large", - label="{} (seat {})".format(self.division_name, seat_number), + 
label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 5): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_nb_moncton/people.py b/ca_nb_moncton/people.py index f94e09db..e13a7aee 100644 --- a/ca_nb_moncton/people.py +++ b/ca_nb_moncton/people.py @@ -24,7 +24,7 @@ def scrape(self): role = councillor["Primary_role"] if role != "Mayor": seat_numbers[ward] += 1 - district = ward + " (seat {})".format(seat_numbers[ward]) + district = ward + f" (seat {seat_numbers[ward]})" else: district = ward name = councillor["Name"] diff --git a/ca_nb_saint_john/__init__.py b/ca_nb_saint_john/__init__.py index 6372f776..407b9322 100644 --- a/ca_nb_saint_john/__init__.py +++ b/ca_nb_saint_john/__init__.py @@ -18,15 +18,15 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 5): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_nl/people.py b/ca_nl/people.py index b9f38932..095cd04d 100644 --- a/ca_nl/people.py +++ b/ca_nl/people.py @@ -34,10 +34,10 @@ def scrape(self): assert len(members), "No members found" for member in json.loads(members): if not member["name"].strip(): - print("Skipping blank member: 
{}".format(member)) + print(f"Skipping blank member: {member}") continue if member["name"] == "Vacant": - print("Skipping vacant 'member': {}".format(member)) + print(f"Skipping vacant 'member': {member}") continue name = " ".join(reversed(member["name"].split(","))).strip() district = ( diff --git a/ca_nl_st_john_s/__init__.py b/ca_nl_st_john_s/__init__.py index 5b8632a8..c3fbca30 100644 --- a/ca_nl_st_john_s/__init__.py +++ b/ca_nl_st_john_s/__init__.py @@ -18,14 +18,14 @@ def get_organizations(self): for seat_number in range(1, 5): organization.add_post( role="Councillor at Large", - label="St. John's (seat {})".format(seat_number), + label=f"St. John's (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 6): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_nl_st_john_s/people.py b/ca_nl_st_john_s/people.py index ee1f41e2..7b0771ab 100644 --- a/ca_nl_st_john_s/people.py +++ b/ca_nl_st_john_s/people.py @@ -25,7 +25,7 @@ def scrape(self): district = "St. John's" if role != "Mayor" and role != "Deputy Mayor": role = "Councillor at Large" - district = "St. John's (seat {})".format(councillor_seat_number) + district = f"St. 
John's (seat {councillor_seat_number})" councillor_seat_number += 1 email = self.get_email(page) diff --git a/ca_ns/people.py b/ca_ns/people.py index 4013d33f..97763d39 100644 --- a/ca_ns/people.py +++ b/ca_ns/people.py @@ -18,7 +18,7 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) members = page.xpath( '//div[contains(@class, "view-display-id-page_mlas_current_tiles")]//div[contains(@class, "views-row-")]' - ) # noqa + ) assert len(members), "No members found" for member in members: district = member.xpath('.//div[contains(@class, "views-field-field-constituency")]/div/text()')[0] @@ -66,16 +66,14 @@ def scrape(self): if len(mailing_address) > 0: address = mailing_address - else: - if len(civic_address) > 0 or len(civic_address_alt) > 0: - if len(civic_address_alt) > 0: - address = civic_address_alt - else: - address = civic_address - address.remove(address[0]) # remove civic address + elif len(civic_address) > 0 or len(civic_address_alt) > 0: + if len(civic_address_alt) > 0: + address = civic_address_alt else: - if len(business_address) > 0: - address = business_address + address = civic_address + address.remove(address[0]) # remove civic address + elif len(business_address) > 0: + address = business_address address = list(map(str.strip, address)) p.add_contact("address", "\n".join(address), "constituency") diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py index 6774bd7e..f32939e3 100644 --- a/ca_ns_cape_breton/people.py +++ b/ca_ns_cape_breton/people.py @@ -54,7 +54,7 @@ def decode_email(script): councillor_url = councillor.xpath(".//a/@href")[0] p.add_source(councillor_url) page = self.lxmlize(councillor_url, user_agent=CUSTOM_USER_AGENT) - image = page.xpath('//img[contains(@title, "{0}")]/@src'.format(name)) + image = page.xpath(f'//img[contains(@title, "{name}")]/@src') if image: p.image = image[0] yield p diff --git a/ca_on/people.py b/ca_on/people.py index 82865e66..3836aa79 100644 --- a/ca_on/people.py +++ b/ca_on/people.py @@ 
-58,7 +58,7 @@ def scrape(self): p.extras["constituency_email"] = emails.pop(0) for heading, note in headings.items(): - office = node.xpath('//h3[contains(., "{}")]'.format(heading)) + office = node.xpath(f'//h3[contains(., "{heading}")]') if office: try: office_info = office[0].xpath( diff --git a/ca_on_ajax/__init__.py b/ca_on_ajax/__init__.py index 5f8340f8..ddbda0dc 100644 --- a/ca_on_ajax/__init__.py +++ b/ca_on_ajax/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 4): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_belleville/__init__.py b/ca_on_belleville/__init__.py index 6894249e..2c46ecb3 100644 --- a/ca_on_belleville/__init__.py +++ b/ca_on_belleville/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, stop): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_belleville/people.py b/ca_on_belleville/people.py index fce0386d..3aebe5c6 100644 --- a/ca_on_belleville/people.py +++ b/ca_on_belleville/people.py @@ -36,7 +36,7 @@ def scrape(self): councillors = ward.xpath("./following-sibling::*[img]") 
for councillor in councillors: self.seat_numbers[ward_name] += 1 - district = "{} (seat {})".format(ward_name, self.seat_numbers[ward_name]) + district = f"{ward_name} (seat {self.seat_numbers[ward_name]})" role = "Councillor" name = councillor.xpath("./following-sibling::p")[0].text_content() diff --git a/ca_on_brampton/__init__.py b/ca_on_brampton/__init__.py index 1c6f28d4..ca0cf1aa 100644 --- a/ca_on_brampton/__init__.py +++ b/ca_on_brampton/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 11): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_brantford/__init__.py b/ca_on_brantford/__init__.py index 86cb306a..5bac19e3 100644 --- a/ca_on_brantford/__init__.py +++ b/ca_on_brantford/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_caledon/people.py b/ca_on_caledon/people.py index 03ec332d..cbf51335 100644 --- a/ca_on_caledon/people.py +++ b/ca_on_caledon/people.py @@ -34,9 +34,7 @@ def scrape(self): # phone numbers populated by JS request contact_num = 
page.xpath('//div[@class="contactBody"]/div/@id')[0].replace("contactEntry_", "") contact_data = requests.get( - "https://www.caledon.ca//Modules/Contact/services/GetContactHTML.ashx?isMobile=false¶m={}&lang=en".format( - contact_num - ) + f"https://www.caledon.ca//Modules/Contact/services/GetContactHTML.ashx?isMobile=false¶m={contact_num}&lang=en" ).text voice = re.findall(r"(?<=tel://)\d+(?=\">)", contact_data) @@ -46,7 +44,7 @@ def scrape(self): if "&" in district: # Councillor for multiple wards wards = re.findall(r"\d", district) for ward_num in wards: - p = Person(primary_org="legislature", name=name, district="Ward {}".format(ward_num), role=role) + p = Person(primary_org="legislature", name=name, district=f"Ward {ward_num}", role=role) if voice: p.add_contact("voice", voice[0], "legislature") p.image = image diff --git a/ca_on_cambridge/__init__.py b/ca_on_cambridge/__init__.py index a3b13617..cbaa6a01 100644 --- a/ca_on_cambridge/__init__.py +++ b/ca_on_cambridge/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 9): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_chatham_kent/__init__.py b/ca_on_chatham_kent/__init__.py index 86be75b6..0a696434 100644 --- a/ca_on_chatham_kent/__init__.py +++ b/ca_on_chatham_kent/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, stop): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, 
ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_chatham_kent/people.py b/ca_on_chatham_kent/people.py index 290940d8..61f9dca2 100644 --- a/ca_on_chatham_kent/people.py +++ b/ca_on_chatham_kent/people.py @@ -30,7 +30,7 @@ def scrape(self): ward, name = re.split(r"(?<=\d)\s", title) name.replace("Councillor ", "") seat_numbers[ward] += 1 - district = "{} (seat {})".format(ward, seat_numbers[ward]) + district = f"{ward} (seat {seat_numbers[ward]})" url = councillor.xpath("./@ows_URL")[0].split(",")[0] page = self.lxmlize(url, user_agent="Mozilla/5.0") diff --git a/ca_on_clarington/__init__.py b/ca_on_clarington/__init__.py index c8732d1e..6c2b2c7f 100644 --- a/ca_on_clarington/__init__.py +++ b/ca_on_clarington/__init__.py @@ -17,6 +17,6 @@ def get_organizations(self): organization.add_post(role="Regional Councillor", label="Wards 1 and 2") organization.add_post(role="Regional Councillor", label="Wards 3 and 4") for ward_number in range(1, 5): - organization.add_post(role="Councillor", label="Ward {}".format(ward_number)) + organization.add_post(role="Councillor", label=f"Ward {ward_number}") yield organization diff --git a/ca_on_fort_erie/__init__.py b/ca_on_fort_erie/__init__.py index 6bc1d6fb..8d016a6f 100644 --- a/ca_on_fort_erie/__init__.py +++ b/ca_on_fort_erie/__init__.py @@ -15,6 +15,6 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 7): - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=self.division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=self.division_id) yield organization diff --git a/ca_on_georgina/__init__.py b/ca_on_georgina/__init__.py index 3dcef56c..ed2903dd 100644 --- a/ca_on_georgina/__init__.py +++ b/ca_on_georgina/__init__.py @@ -20,7 +20,7 @@ 
def get_organizations(self): # organization.add_post(role='Councillor', label='Ward {}'.format(ward_number), division_id=self.division_id) organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, ward_number), + label=f"{self.division_name} (seat {ward_number})", division_id=self.division_id, ) diff --git a/ca_on_grimsby/__init__.py b/ca_on_grimsby/__init__.py index 3ce8ccb9..abf7b183 100644 --- a/ca_on_grimsby/__init__.py +++ b/ca_on_grimsby/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_grimsby/people.py b/ca_on_grimsby/people.py index bdb30be3..a083cf24 100644 --- a/ca_on_grimsby/people.py +++ b/ca_on_grimsby/people.py @@ -23,7 +23,7 @@ def scrape(self): './/h5[contains(./strong, "Councillor")]|.//h5[contains(., "Councillor")]' )[i] name = re.split(r"\s", name_node.text_content(), 1)[1] - district = "{} (seat {})".format(area, i + 1) + district = f"{area} (seat {i + 1})" phone = self.get_phone(name_node.xpath('./following-sibling::*[contains(., "Phone")]')[0]) email = self.get_email(name_node.xpath("./following-sibling::p[contains(., 'Email')]")[0]) image = councillors_node.xpath(".//@src")[i] diff --git a/ca_on_guelph/__init__.py b/ca_on_guelph/__init__.py index cfb78e38..4b265924 100644 --- a/ca_on_guelph/__init__.py +++ b/ca_on_guelph/__init__.py @@ -19,8 +19,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + 
division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_huron/__init__.py b/ca_on_huron/__init__.py index 793a2a6c..389fc94d 100644 --- a/ca_on_huron/__init__.py +++ b/ca_on_huron/__init__.py @@ -56,7 +56,7 @@ def get_organizations(self): for seat_number in range(1, division["count"] + 1): organization.add_post( role="Councillor", - label="{} (seat {})".format(division_name, seat_number), + label=f"{division_name} (seat {seat_number})", division_id=division_id, ) diff --git a/ca_on_lambton/__init__.py b/ca_on_lambton/__init__.py index 57d42f26..eeaf9aae 100644 --- a/ca_on_lambton/__init__.py +++ b/ca_on_lambton/__init__.py @@ -18,7 +18,7 @@ def get_organizations(self): # @todo Fix labels along the lines of the regions for seat_number in range(1, 16): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_on_lambton/people.py b/ca_on_lambton/people.py index 8cb9dc7d..2757d6d7 100644 --- a/ca_on_lambton/people.py +++ b/ca_on_lambton/people.py @@ -24,7 +24,7 @@ def scrape(self): else: role = "Councillor" name = text.replace("Councillor ", "") - district = "Lambton (seat {})".format(councillor_seat_number) + district = f"Lambton (seat {councillor_seat_number})" councillor_seat_number += 1 p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_on_lasalle/__init__.py b/ca_on_lasalle/__init__.py index fa036878..003ae973 100644 --- a/ca_on_lasalle/__init__.py +++ b/ca_on_lasalle/__init__.py @@ -18,7 +18,7 @@ def get_organizations(self): for seat_number in range(1, 6): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_on_lasalle/people.py b/ca_on_lasalle/people.py index 79acea39..dd8d1bb2 
100644 --- a/ca_on_lasalle/people.py +++ b/ca_on_lasalle/people.py @@ -19,7 +19,7 @@ def scrape(self): if "Mayor" in role: district = "LaSalle" else: - district = "LaSalle (seat {})".format(councillor_seat_number) + district = f"LaSalle (seat {councillor_seat_number})" image = councillor.xpath(".//img/@src")[0] voice = re.search(r"\d{3}-\d{3}-\d{4} ext. \d+", councillor.text_content()) cell = re.search(r"\d{3}-\d{3}-\d{4}(?! ext)", councillor.text_content()) diff --git a/ca_on_lincoln/__init__.py b/ca_on_lincoln/__init__.py index 3f3bba06..7624ad1c 100644 --- a/ca_on_lincoln/__init__.py +++ b/ca_on_lincoln/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_markham/__init__.py b/ca_on_markham/__init__.py index f229f92a..26c51504 100644 --- a/ca_on_markham/__init__.py +++ b/ca_on_markham/__init__.py @@ -18,14 +18,14 @@ def get_organizations(self): for seat_number in range(1, 4): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 9): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_markham/people.py b/ca_on_markham/people.py index d2f73fa4..2b01dfd7 100644 --- a/ca_on_markham/people.py +++ b/ca_on_markham/people.py @@ -26,7 +26,7 @@ def scrape(self): role = "Councillor" elif "Regional" in district: role = "Regional 
Councillor" - district = "Markham (seat {})".format(regional_councillor_seat_number) + district = f"Markham (seat {regional_councillor_seat_number})" regional_councillor_seat_number += 1 else: role = district diff --git a/ca_on_milton/__init__.py b/ca_on_milton/__init__.py index a1247215..09f6e79f 100644 --- a/ca_on_milton/__init__.py +++ b/ca_on_milton/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 5): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_newmarket/__init__.py b/ca_on_newmarket/__init__.py index 82f8dbe4..3e8bc0e0 100644 --- a/ca_on_newmarket/__init__.py +++ b/ca_on_newmarket/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for ward_number in range(1, 8): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_niagara/__init__.py b/ca_on_niagara/__init__.py index 7c6bc4e9..bc5cc9dc 100644 --- a/ca_on_niagara/__init__.py +++ b/ca_on_niagara/__init__.py @@ -70,7 +70,7 @@ def get_organizations(self): organization.add_post(role="Mayor", label=division_name, division_id=division_id) for seat_number in range(1, division["count"] + 1): organization.add_post( - 
role="Councillor", label="{} (seat {})".format(division_name, seat_number), division_id=division_id + role="Councillor", label=f"{division_name} (seat {seat_number})", division_id=division_id ) yield organization diff --git a/ca_on_niagara_on_the_lake/__init__.py b/ca_on_niagara_on_the_lake/__init__.py index 92dd00b7..d195985e 100644 --- a/ca_on_niagara_on_the_lake/__init__.py +++ b/ca_on_niagara_on_the_lake/__init__.py @@ -18,7 +18,7 @@ def get_organizations(self): for seat_number in range(1, 9): organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) diff --git a/ca_on_north_dumfries/people.py b/ca_on_north_dumfries/people.py index 9573eaa8..ef0b252d 100644 --- a/ca_on_north_dumfries/people.py +++ b/ca_on_north_dumfries/people.py @@ -26,7 +26,7 @@ def scrape(self): if role == "Mayor": district = "North Dumfries" else: - district = "Ward {}".format(word_to_number[match.group(1)]) + district = f"Ward {word_to_number[match.group(1)]}" p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) diff --git a/ca_on_oakville/__init__.py b/ca_on_oakville/__init__.py index 9c0373c2..113d8b9b 100644 --- a/ca_on_oakville/__init__.py +++ b/ca_on_oakville/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 8): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + 
organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_oshawa/__init__.py b/ca_on_oshawa/__init__.py index eae140e7..14e62cd1 100644 --- a/ca_on_oshawa/__init__.py +++ b/ca_on_oshawa/__init__.py @@ -15,10 +15,8 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 6): - division_id = "{}/ward:{}".format(self.division_id, ward_number) - organization.add_post( - role="Regional Councillor", label="Ward {}".format(ward_number), division_id=division_id - ) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number), division_id=division_id) + division_id = f"{self.division_id}/ward:{ward_number}" + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}", division_id=division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=division_id) yield organization diff --git a/ca_on_peel/__init__.py b/ca_on_peel/__init__.py index b22abfc0..4f2a0754 100644 --- a/ca_on_peel/__init__.py +++ b/ca_on_peel/__init__.py @@ -20,21 +20,21 @@ def get_organizations(self): for ward_number in range(1, 7): organization.add_post( role="Councillor", - label="Caledon Ward {} (seat 1)".format(ward_number), - division_id="ocd-division/country:ca/csd:3521024/ward:{}".format(ward_number), + label=f"Caledon Ward {ward_number} (seat 1)", + division_id=f"ocd-division/country:ca/csd:3521024/ward:{ward_number}", ) for ward_number in range(1, 11): for seat_number in range(1, 3 if ward_number <= 6 else 2): organization.add_post( role="Councillor", - label="Brampton Ward {} (seat {})".format(ward_number, seat_number), - division_id="ocd-division/country:ca/csd:3521010/ward:{}".format(ward_number), + label=f"Brampton Ward {ward_number} (seat {seat_number})", + division_id=f"ocd-division/country:ca/csd:3521010/ward:{ward_number}", ) for ward_number in range(1, 
12): organization.add_post( role="Councillor", - label="Mississauga Ward {} (seat 1)".format(ward_number), - division_id="ocd-division/country:ca/csd:3521005/ward:{}".format(ward_number), + label=f"Mississauga Ward {ward_number} (seat 1)", + division_id=f"ocd-division/country:ca/csd:3521005/ward:{ward_number}", ) yield organization diff --git a/ca_on_pickering/__init__.py b/ca_on_pickering/__init__.py index 60739797..0acff44d 100644 --- a/ca_on_pickering/__init__.py +++ b/ca_on_pickering/__init__.py @@ -15,7 +15,7 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 4): - organization.add_post(role="Regional Councillor", label="Ward {}".format(ward_number)) - organization.add_post(role="Councillor", label="Ward {}".format(ward_number)) + organization.add_post(role="Regional Councillor", label=f"Ward {ward_number}") + organization.add_post(role="Councillor", label=f"Ward {ward_number}") yield organization diff --git a/ca_on_richmond_hill/__init__.py b/ca_on_richmond_hill/__init__.py index fd8c9735..32482ccc 100644 --- a/ca_on_richmond_hill/__init__.py +++ b/ca_on_richmond_hill/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 7): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_richmond_hill/people.py b/ca_on_richmond_hill/people.py index 9f92f22c..f6163c4a 100644 --- a/ca_on_richmond_hill/people.py +++ b/ca_on_richmond_hill/people.py @@ -14,7 +14,7 @@ def scrape(self): urls = 
page.xpath('//h3[contains(text(), "Regional and Local Councillors")]/following-sibling::p[1]//@href') assert len(urls), "No regional councillors found" for index, url in enumerate(urls, 1): - yield self.process(url, "Richmond Hill (seat {})".format(index), "Regional Councillor") + yield self.process(url, f"Richmond Hill (seat {index})", "Regional Councillor") councillors = page.xpath('//h3[text()="Local Councillors"]/following-sibling::p') assert len(councillors), "No councillors found" diff --git a/ca_on_sault_ste_marie/__init__.py b/ca_on_sault_ste_marie/__init__.py index 9cbf35f6..e329e1e8 100644 --- a/ca_on_sault_ste_marie/__init__.py +++ b/ca_on_sault_ste_marie/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_sault_ste_marie/people.py b/ca_on_sault_ste_marie/people.py index f4f3ceb7..5ff2672c 100644 --- a/ca_on_sault_ste_marie/people.py +++ b/ca_on_sault_ste_marie/people.py @@ -31,7 +31,7 @@ def scrape(self): role = "Councillor" area, name = title.split(" Councillor ") seat_numbers[area] += 1 - district = "{} (seat {})".format(area, seat_numbers[area]) + district = f"{area} (seat {seat_numbers[area]})" image = page.xpath(".//h3/img/@src")[0] contact_node = page.xpath('//div[@id="mainContent_left"]')[0] phone = self.get_phone(contact_node) diff --git a/ca_on_st_catharines/__init__.py b/ca_on_st_catharines/__init__.py index 40fe57d7..ce5f9aa4 100644 --- a/ca_on_st_catharines/__init__.py +++ b/ca_on_st_catharines/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="{} (seat {})".format(ward_name, seat_number), - 
division_id="{}/ward:{}".format(self.division_id, clean_type_id(ward_name)), + label=f"{ward_name} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{clean_type_id(ward_name)}", ) yield organization diff --git a/ca_on_thunder_bay/__init__.py b/ca_on_thunder_bay/__init__.py index 0f573abf..c94012f6 100644 --- a/ca_on_thunder_bay/__init__.py +++ b/ca_on_thunder_bay/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 6): organization.add_post( role="Councillor at Large", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number, ward_name in enumerate( ("Current River", "Red River", "McKellar", "McIntyre", "Northwood", "Westfort", "Neebing"), 1 ): organization.add_post( - role="Councillor", label=ward_name, division_id="{}/ward:{}".format(self.division_id, ward_number) + role="Councillor", label=ward_name, division_id=f"{self.division_id}/ward:{ward_number}" ) yield organization diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index f0123518..7a7f834e 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -29,7 +29,7 @@ def scrape(self): ].text_content() if "At Large" in district: role = "Councillor at Large" - district = "Thunder Bay (seat {})".format(seat_number) + district = f"Thunder Bay (seat {seat_number})" seat_number += 1 elif "Mayor" in district: district = "Thunder Bay" diff --git a/ca_on_uxbridge/__init__.py b/ca_on_uxbridge/__init__.py index f429091e..f0144483 100644 --- a/ca_on_uxbridge/__init__.py +++ b/ca_on_uxbridge/__init__.py @@ -16,6 +16,6 @@ def get_organizations(self): organization.add_post(role="Mayor", label=self.division_name, division_id=self.division_id) organization.add_post(role="Regional Councillor", label=self.division_name, division_id=self.division_id) for ward_number in range(1, 6): - organization.add_post(role="Councillor", label="Ward 
{}".format(ward_number), division_id=self.division_id) + organization.add_post(role="Councillor", label=f"Ward {ward_number}", division_id=self.division_id) yield organization diff --git a/ca_on_vaughan/__init__.py b/ca_on_vaughan/__init__.py index fc17bf88..77cf3f69 100644 --- a/ca_on_vaughan/__init__.py +++ b/ca_on_vaughan/__init__.py @@ -18,14 +18,14 @@ def get_organizations(self): for seat_number in range(1, 5): organization.add_post( role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number in range(1, 6): organization.add_post( role="Councillor", - label="Ward {}".format(ward_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number}", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_vaughan/people.py b/ca_on_vaughan/people.py index 9d25f274..75adb737 100644 --- a/ca_on_vaughan/people.py +++ b/ca_on_vaughan/people.py @@ -22,7 +22,7 @@ def scrape(self): district, name = title.split("Councillor") if "Regional" in district: role = "Regional Councillor" - district = "Vaughan (seat {})".format(regional_councillor_seat_number) + district = f"Vaughan (seat {regional_councillor_seat_number})" regional_councillor_seat_number += 1 elif "Ward" in district: role = "Councillor" diff --git a/ca_on_waterloo_region/__init__.py b/ca_on_waterloo_region/__init__.py index 0c5ad0a4..fca132eb 100644 --- a/ca_on_waterloo_region/__init__.py +++ b/ca_on_waterloo_region/__init__.py @@ -29,19 +29,19 @@ def get_organizations(self): for seat_number in range(1, 4): organization.add_post( role="Regional Councillor", - label="Cambridge (seat {})".format(seat_number), + label=f"Cambridge (seat {seat_number})", division_id="ocd-division/country:ca/csd:3530010", ) for seat_number in range(1, 6): organization.add_post( role="Regional Councillor", - label="Kitchener (seat 
{})".format(seat_number), + label=f"Kitchener (seat {seat_number})", division_id="ocd-division/country:ca/csd:3530013", ) for seat_number in range(1, 4): organization.add_post( role="Regional Councillor", - label="Waterloo (seat {})".format(seat_number), + label=f"Waterloo (seat {seat_number})", division_id="ocd-division/country:ca/csd:3530016", ) diff --git a/ca_on_waterloo_region/people.py b/ca_on_waterloo_region/people.py index ca0cdeb3..639006da 100644 --- a/ca_on_waterloo_region/people.py +++ b/ca_on_waterloo_region/people.py @@ -20,7 +20,7 @@ def scrape(self): area = re.sub(r"(?:City|Region|Township) of ", "", area) councillors = municipality.xpath("./following-sibling::tr[1]//a[not(@target)]") - assert len(councillors), "No councillors found for {}".format(area) + assert len(councillors), f"No councillors found for {area}" for councillor in councillors: name = councillor.text_content() @@ -29,7 +29,7 @@ def scrape(self): if re.search("Waterloo|Cambridge|Kitchener", area): seat_numbers[area] += 1 - district = "{} (seat {})".format(area, seat_numbers[area]) + district = f"{area} (seat {seat_numbers[area]})" else: district = area if "Regional Council" in area: diff --git a/ca_on_welland/__init__.py b/ca_on_welland/__init__.py index 5b8df234..5d388c54 100644 --- a/ca_on_welland/__init__.py +++ b/ca_on_welland/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, 3): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} (seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_whitby/__init__.py b/ca_on_whitby/__init__.py index dc57319f..7a3d88b7 100644 --- a/ca_on_whitby/__init__.py +++ b/ca_on_whitby/__init__.py @@ -17,14 +17,14 @@ def get_organizations(self): for seat_number in range(1, 5): organization.add_post( 
role="Regional Councillor", - label="{} (seat {})".format(self.division_name, seat_number), + label=f"{self.division_name} (seat {seat_number})", division_id=self.division_id, ) for ward_number, ward_name in enumerate(("North", "West", "Centre", "East"), 1): organization.add_post( role="Councillor", - label="{} Ward".format(ward_name), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"{ward_name} Ward", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_whitby/people.py b/ca_on_whitby/people.py index 24f08ab5..b0c5dd81 100644 --- a/ca_on_whitby/people.py +++ b/ca_on_whitby/people.py @@ -22,7 +22,7 @@ def scrape(self): else: name, role = name.split(", ") if role == "Regional Councillor": - district = "Whitby (seat {})".format(regional_councillor_seat_number) + district = f"Whitby (seat {regional_councillor_seat_number})" regional_councillor_seat_number += 1 else: district = role.split(" – ")[1] diff --git a/ca_on_whitchurch_stouffville/__init__.py b/ca_on_whitchurch_stouffville/__init__.py index 1e4c6e35..b48db3ee 100644 --- a/ca_on_whitchurch_stouffville/__init__.py +++ b/ca_on_whitchurch_stouffville/__init__.py @@ -19,7 +19,7 @@ def get_organizations(self): # organization.add_post(role='Councillor', label='Ward {}'.format(ward_number), division_id=self.division_id) organization.add_post( role="Councillor", - label="{} (seat {})".format(self.division_name, ward_number), + label=f"{self.division_name} (seat {ward_number})", division_id=self.division_id, ) diff --git a/ca_on_woolwich/__init__.py b/ca_on_woolwich/__init__.py index ce33ea11..e072a68f 100644 --- a/ca_on_woolwich/__init__.py +++ b/ca_on_woolwich/__init__.py @@ -18,8 +18,8 @@ def get_organizations(self): for seat_number in range(1, stop): organization.add_post( role="Councillor", - label="Ward {} (seat {})".format(ward_number, seat_number), - division_id="{}/ward:{}".format(self.division_id, ward_number), + label=f"Ward {ward_number} 
(seat {seat_number})", + division_id=f"{self.division_id}/ward:{ward_number}", ) yield organization diff --git a/ca_on_woolwich/people.py b/ca_on_woolwich/people.py index 2e728678..f2773670 100644 --- a/ca_on_woolwich/people.py +++ b/ca_on_woolwich/people.py @@ -21,7 +21,7 @@ def scrape(self): district = "Woolwich" else: seat_numbers[area] += 1 - district = area.group(0) + " (seat {})".format(seat_numbers[area]) + district = area.group(0) + f" (seat {seat_numbers[area]})" if "(" in name: name = name.split(" (")[0] info = councillor.xpath("./ancestor::tr[1]/following-sibling::tr")[0].text_content() diff --git a/ca_pe_stratford/people.py b/ca_pe_stratford/people.py index b1879e1f..158caf15 100644 --- a/ca_pe_stratford/people.py +++ b/ca_pe_stratford/people.py @@ -26,7 +26,7 @@ def scrape(self): role = "Councillor" area = re.findall(r"(?<=Ward \d,).*", councillor.text_content())[0].strip() seat_numbers[area] += 1 - district = "{} (seat {})".format(area, seat_numbers[area]) + district = f"{area} (seat {seat_numbers[area]})" p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) diff --git a/ca_qc/people.py b/ca_qc/people.py index 0c8568a0..3639da07 100644 --- a/ca_qc/people.py +++ b/ca_qc/people.py @@ -236,12 +236,12 @@ def scrape(self): p.add_link(twitter) for heading, note in headings.items(): - office = contact_page.xpath('//h3[contains(., "{}")]/parent::div'.format(heading)) + office = contact_page.xpath(f'//h3[contains(., "{heading}")]/parent::div') try: phone = self.get_phone(office[0]) office_info = contact_page.xpath( - '//h3[contains(., "{}")]/parent::div/address[1]/span/text()'.format(heading) + f'//h3[contains(., "{heading}")]/parent::div/address[1]/span/text()' ) office_items = [item for item in office_info if item.strip()] office_items = list(map(str.strip, office_items)) diff --git a/ca_qc_longueuil/__init__.py b/ca_qc_longueuil/__init__.py index c0c6b143..2a9425d9 100644 --- a/ca_qc_longueuil/__init__.py 
+++ b/ca_qc_longueuil/__init__.py @@ -21,7 +21,7 @@ def get_organizations(self): for seat_number in range(1, 4): organization.add_post( role="Conseiller", - label="{} (siège {})".format(division.name, seat_number), + label=f"{division.name} (siège {seat_number})", division_id=division.id, ) else: diff --git a/ca_qc_longueuil/people.py b/ca_qc_longueuil/people.py index 44d0c031..7227dbe1 100644 --- a/ca_qc_longueuil/people.py +++ b/ca_qc_longueuil/people.py @@ -20,7 +20,7 @@ def scrape(self): district = tr.xpath('.//p[contains(./strong, "District")]/a/text()')[0] if "Greenfield Park" in district: - district = "Greenfield Park (siège {})".format(seat_number) + district = f"Greenfield Park (siège {seat_number})" seat_number += 1 district = { @@ -46,7 +46,7 @@ def scrape(self): def scrape_mayor(self): page = self.lxmlize(MAYOR_PAGE) name = page.xpath("//h1[not(@class)]/text()")[0] - img = page.xpath('//img[contains(./@alt, "{}")]/@src'.format(name))[0] + img = page.xpath(f'//img[contains(./@alt, "{name}")]/@src')[0] p = Person(primary_org="legislature", name=name, district="Longueuil", role="Maire") p.add_source(COUNCIL_PAGE) p.add_source(MAYOR_PAGE) diff --git a/ca_qc_montreal_est/people.py b/ca_qc_montreal_est/people.py index 5409ca97..8d7fb6d4 100644 --- a/ca_qc_montreal_est/people.py +++ b/ca_qc_montreal_est/people.py @@ -16,7 +16,7 @@ def scrape(self): district = "Montréal-Est" role = "Maire" else: - district = "District {}".format(role_district[-1]) + district = f"District {role_district[-1]}" role = "Conseiller" p = Person(primary_org="legislature", name=name, district=district, role=role) diff --git a/ca_qc_quebec/people.py b/ca_qc_quebec/people.py index fbecc631..653a834d 100644 --- a/ca_qc_quebec/people.py +++ b/ca_qc_quebec/people.py @@ -28,7 +28,7 @@ def scrape(self): else: district = councillor.xpath('./p[@itemprop="jobTitle"]/a/text()')[0] district = ( - re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.U) + 
re.search(r"\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)", district, flags=re.UNICODE) .group(1) .strip() ) diff --git a/ca_qc_trois_rivieres/people.py b/ca_qc_trois_rivieres/people.py index f8ff9ff9..b80f295a 100644 --- a/ca_qc_trois_rivieres/people.py +++ b/ca_qc_trois_rivieres/people.py @@ -20,7 +20,7 @@ def scrape(self): email = self.get_email(self.lxmlize(url)) name, district = [x.strip() for x in member.xpath(".//figcaption//text()")] - district = re.sub(r"\A(?:de|des|du) ", lambda match: match.group(0).lower(), district, flags=re.I) + district = re.sub(r"\A(?:de|des|du) ", lambda match: match.group(0).lower(), district, flags=re.IGNORECASE) role = "Conseiller" if "Maire" in district: diff --git a/disabled/ca_bc_municipalities/people.py b/disabled/ca_bc_municipalities/people.py index 6e505ebb..f4100405 100644 --- a/disabled/ca_bc_municipalities/people.py +++ b/disabled/ca_bc_municipalities/people.py @@ -92,7 +92,7 @@ def scrape(self): if division_id in exclude_divisions: continue if division_id in processed_ids: - raise Exception("unhandled collision: {}".format(division_id)) + raise Exception(f"unhandled collision: {division_id}") division = Division.get(division_id) processed_divisions.add(division_name) diff --git a/disabled/ca_bc_municipalities_candidates/people.py b/disabled/ca_bc_municipalities_candidates/people.py index 109e4aa3..a103ee5c 100644 --- a/disabled/ca_bc_municipalities_candidates/people.py +++ b/disabled/ca_bc_municipalities_candidates/people.py @@ -116,7 +116,7 @@ def scrape(self): role = row["primary role"] if role not in expected_roles: - raise Exception("unexpected role: {}".format(role)) + raise Exception(f"unexpected role: {role}") if row["district id"]: district = format(division_id) else: diff --git a/disabled/ca_municipalities/people.py b/disabled/ca_municipalities/people.py index c20c3878..f274bc68 100644 --- a/disabled/ca_municipalities/people.py +++ b/disabled/ca_municipalities/people.py @@ -61,7 +61,7 @@ def scrape(self): if 
self.many_posts_per_area and role not in self.unique_roles: seat_numbers[role][district] += 1 - district = "{} (seat {})".format(district, seat_numbers[role][district]) + district = f"{district} (seat {seat_numbers[role][district]})" p = Person( primary_org=organization_classification, diff --git a/disabled/ca_nb_municipalities/people.py b/disabled/ca_nb_municipalities/people.py index c0c95580..6ae4c3e2 100644 --- a/disabled/ca_nb_municipalities/people.py +++ b/disabled/ca_nb_municipalities/people.py @@ -52,7 +52,7 @@ def scrape(self): if division.attrs["classification"] == "P": continue if division.name in names_to_ids: - raise Exception("unhandled collision: {}".format(division.name)) + raise Exception(f"unhandled collision: {division.name}") else: names_to_ids[division.name] = division.id @@ -79,11 +79,11 @@ def scrape(self): if division_id in exclude_divisions: continue if division_id in seen: - raise Exception("unhandled collision: {}".format(division_id)) + raise Exception(f"unhandled collision: {division_id}") seen.add(division_id) division_name = Division.get(division_id).name - organization_name = "{} {} Council".format(division_name, classifications[list_link.text]) + organization_name = f"{division_name} {classifications[list_link.text]} Council" organization = Organization(name=organization_name, classification="government") organization.add_source(detail_url) @@ -104,7 +104,7 @@ def scrape(self): for p in groups: role = p.xpath("./b/text()")[0].rstrip("s") if role not in expected_roles: - raise Exception("unexpected role: {}".format(role)) + raise Exception(f"unexpected role: {role}") councillors = p.xpath("./text()") assert len(councillors), "No councillors found" @@ -115,7 +115,7 @@ def scrape(self): if role in unique_roles: district = division_name else: - district = "{} (seat {})".format(division_name, seat_number) + district = f"{division_name} (seat {seat_number})" organization.add_post(role=role, label=district, division_id=division_id) diff 
--git a/disabled/ca_ns_municipalities/people.py b/disabled/ca_ns_municipalities/people.py index 3533a463..52459832 100644 --- a/disabled/ca_ns_municipalities/people.py +++ b/disabled/ca_ns_municipalities/people.py @@ -61,7 +61,7 @@ def scrape(self): for i, email in enumerate(emails): regex = name.split()[-1].lower() + "|" + "|".join(district.split()[-2:]).replace("of", "").lower() regex = regex.replace("||", "|") - matches = re.findall(r"{}".format(regex), email) + matches = re.findall(rf"{regex}", email) if matches: membership.add_contact_detail("email", emails.pop(i)) yield p diff --git a/patch.py b/patch.py index 8acf2c0c..97979862 100644 --- a/patch.py +++ b/patch.py @@ -133,7 +133,7 @@ r"\A" r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)" r"(?:" + name_fragment + r"(?:'|-| - | )" - r")+" + name_fragment + r"\Z" # noqa: W504 + r")+" + name_fragment + r"\Z" ) person_schema["properties"]["gender"]["enum"] = ["male", "female", ""] # @note https://github.com/opennorth/represent-canada-images checks whether an diff --git a/tasks.py b/tasks.py index d0581c57..1e8ef8d7 100644 --- a/tasks.py +++ b/tasks.py @@ -35,7 +35,7 @@ def modules_and_module_names_and_classes(): Returns modules, module names, and person scraper classes. 
""" for module_name in module_names(): - module = importlib.import_module("{}.people".format(module_name)) + module = importlib.import_module(f"{module_name}.people") class_name = next(key for key in module.__dict__.keys() if "PersonScraper" in key) yield (module, module_name, module.__dict__[class_name]) @@ -129,32 +129,28 @@ def get_definition(division_id, aggregation=False): pattern = "ca_{}_municipalities" if aggregation else "ca_{}" expected["module_name"] = pattern.format(ocd_type_id) if aggregation: - expected["name"] = "{} Municipalities".format(division.name) + expected["name"] = f"{division.name} Municipalities" elif ocd_type_id in ("nl", "ns"): - expected["name"] = "{} House of Assembly".format(division.name) + expected["name"] = f"{division.name} House of Assembly" elif ocd_type_id == "qc": expected["name"] = "Assemblée nationale du Québec" else: - expected["name"] = "Legislative Assembly of {}".format(division.name) + expected["name"] = f"Legislative Assembly of {division.name}" elif division._type == "cd": - expected["module_name"] = "ca_{}_{}".format( - province_or_territory_abbreviation(division.id), slug(division.name) - ) + expected["module_name"] = f"ca_{province_or_territory_abbreviation(division.id)}_{slug(division.name)}" name_infix = ocdid_to_type_name_map[division.id] if name_infix == "Regional municipality": name_infix = "Regional" - expected["name"] = "{} {} Council".format(division.name, name_infix) + expected["name"] = f"{division.name} {name_infix} Council" elif division._type == "csd": - expected["module_name"] = "ca_{}_{}".format( - province_or_territory_abbreviation(division.id), slug(division.name) - ) + expected["module_name"] = f"ca_{province_or_territory_abbreviation(division.id)}_{slug(division.name)}" if ocd_type_id[:2] == "24": if division.name[0] in vowels: - expected["name"] = "Conseil municipal d'{}".format(division.name) + expected["name"] = f"Conseil municipal d'{division.name}" else: - expected["name"] = "Conseil 
municipal de {}".format(division.name) + expected["name"] = f"Conseil municipal de {division.name}" else: name_infix = ocdid_to_type_name_map[division.id] if name_infix in ("Municipality", "Specialized municipality"): @@ -163,21 +159,21 @@ def get_definition(division_id, aggregation=False): name_infix = "District" elif name_infix == "Regional municipality": name_infix = "Regional" - expected["name"] = "{} {} Council".format(division.name, name_infix) + expected["name"] = f"{division.name} {name_infix} Council" elif division._type == "arrondissement": - expected["module_name"] = "ca_{}_{}_{}".format( - province_or_territory_abbreviation(division.parent.id), slug(division.parent.name), slug(division.name) + expected["module_name"] = ( + f"ca_{province_or_territory_abbreviation(division.parent.id)}_{slug(division.parent.name)}_{slug(division.name)}" ) if division.name[0] in vowels: - expected["name"] = "Conseil d'arrondissement d'{}".format(division.name) + expected["name"] = f"Conseil d'arrondissement d'{division.name}" elif division.name[:3] == "Le ": - expected["name"] = "Conseil d'arrondissement du {}".format(division.name[3:]) + expected["name"] = f"Conseil d'arrondissement du {division.name[3:]}" else: - expected["name"] = "Conseil d'arrondissement de {}".format(division.name) + expected["name"] = f"Conseil d'arrondissement de {division.name}" else: - raise Exception("{}: Unrecognized OCD type {}".format(division.id, division._type)) + raise Exception(f"{division.id}: Unrecognized OCD type {division._type}") # Determine the class name. 
class_name_parts = re.split("[ -]", re.sub("[—–]", "-", re.sub("['.]", "", division.name))) @@ -204,12 +200,11 @@ def council_pages(): for module, module_name, klass in modules_and_module_names_and_classes(): if klass.__bases__[0].__name__ == "CSVScraper": if hasattr(module, "COUNCIL_PAGE"): - print("{:<60} Delete COUNCIL_PAGE".format(module_name)) + print(f"{module_name:<60} Delete COUNCIL_PAGE") + elif hasattr(module, "COUNCIL_PAGE"): + print(f"{module_name:<60} {module.COUNCIL_PAGE}") else: - if hasattr(module, "COUNCIL_PAGE"): - print("{:<60} {}".format(module_name, module.COUNCIL_PAGE)) - else: - print("{:<60} Missing COUNCIL_PAGE".format(module_name)) + print(f"{module_name:<60} Missing COUNCIL_PAGE") @task @@ -219,7 +214,7 @@ def csv_list(): """ for module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "csv_url"): - print("{}: {}".format(module_name, klass.csv_url)) + print(f"{module_name}: {klass.csv_url}") @task @@ -229,7 +224,7 @@ def csv_stale(): """ for module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "updated_at") and klass.updated_at < date.today() - timedelta(days=365): - print("{}: Created on {} by {}".format(module_name, klass.updated_at, klass.contact_person)) + print(f"{module_name}: Created on {klass.updated_at} by {klass.contact_person}") @task @@ -263,23 +258,19 @@ def csv_error(): keys -= {"encoding"} if keys: - print("\n{}\n{}".format(module_name, klass.csv_url)) + print(f"\n{module_name}\n{klass.csv_url}") extra_keys = keys - {"corrections", "encoding", "header_converter"} if extra_keys: print("- Manually check the configuration of: {}".format(", ".join(extra_keys))) if "encoding" in keys: - print( - "- The CSV file should be encoded as 'utf-8' or 'windows-1252', not '{}'".format( - klass.encoding - ) - ) + print(f"- The CSV file should be encoded as 'utf-8' or 'windows-1252', not '{klass.encoding}'") if "corrections" in keys: for key, values in 
klass.corrections.items(): for actual, expected in values.items(): - print("- Change '{}' to '{}' in {}".format(actual, expected, key)) + print(f"- Change '{actual}' to '{expected}' in {key}") if "header_converter" in keys: print("- Correct column headers according to:") @@ -296,9 +287,7 @@ def tidy(): member_styles = {} for gid in range(3): reader = csv_dict_reader( - "https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={}&output=csv".format( - gid - ) + f"https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={gid}&output=csv" ) for row in reader: key = row["Identifier"] @@ -316,14 +305,14 @@ def tidy(): # Ensure division_id is unique. division_id = metadata["division_id"] if division_id in division_ids: - print("{:<60} Duplicate division_id {}".format(module_name, division_id)) + print(f"{module_name:<60} Duplicate division_id {division_id}") else: division_ids.add(division_id) # Ensure jurisdiction_id is unique. jurisdiction_id = metadata["jurisdiction_id"] if jurisdiction_id in jurisdiction_ids: - print("{:<60} Duplicate jurisdiction_id {}".format(module_name, jurisdiction_id)) + print(f"{module_name:<60} Duplicate jurisdiction_id {jurisdiction_id}") else: jurisdiction_ids.add(jurisdiction_id) @@ -331,14 +320,14 @@ def tidy(): # Ensure presence of url and styles of address. 
if division_id not in member_styles: - print("{:<60} Missing member style of address: {}".format(module_name, division_id)) + print(f"{module_name:<60} Missing member style of address: {division_id}") if division_id not in leader_styles: - print("{:<60} Missing leader style of address: {}".format(module_name, division_id)) + print(f"{module_name:<60} Missing leader style of address: {division_id}") url = metadata["url"] if url and not expected["url"]: parsed = urlsplit(url) if parsed.scheme not in ("http", "https") or parsed.path or parsed.query or parsed.fragment: - print("{:<60} Check: {}".format(module_name, url)) + print(f"{module_name:<60} Check: {url}") # Warn if the name or classification may be incorrect. name = metadata["name"] @@ -346,7 +335,7 @@ def tidy(): print("{:<60} Expected {}".format(name, expected["name"])) classification = metadata["classification"] if classification != "legislature": - print("{:<60} Expected legislature".format(classification)) + print(f"{classification:<60} Expected legislature") # Name the classes correctly. 
class_name = metadata["class_name"] @@ -390,10 +379,10 @@ def sources_and_assertions(): source_count = content.count("add_source") request_count = content.count("lxmlize") + content.count("self.get(") + content.count("requests.get") if source_count < request_count: - print("Expected {} sources after {} requests {}".format(source_count, request_count, path)) + print(f"Expected {source_count} sources after {request_count} requests {path}") if "CSVScraper" not in content and "assert len(" not in content: - print("Expected an assertion like: assert len(councillors), 'No councillors found' {}".format(path)) + print(f"Expected an assertion like: assert len(councillors), 'No councillors found' {path}") @task @@ -413,13 +402,13 @@ def validate_spreadsheet(url, identifier_header, geographic_name_header): if len(identifier) == 2: identifier = sgc_to_id[identifier] elif len(identifier) == 4: - identifier = "ocd-division/country:ca/cd:{}".format(identifier) + identifier = f"ocd-division/country:ca/cd:{identifier}" elif len(identifier) == 7: - identifier = "ocd-division/country:ca/csd:{}".format(identifier) + identifier = f"ocd-division/country:ca/csd:{identifier}" division = Division.get(identifier) if row[geographic_name_header] != division.name: - print("{}: name: {} not {}".format(identifier, division.name, row[geographic_name_header])) + print(f"{identifier}: name: {division.name} not {row[geographic_name_header]}") def module_name_to_metadata(module_name): diff --git a/utils.py b/utils.py index 030728bb..c6e727bb 100644 --- a/utils.py +++ b/utils.py @@ -93,9 +93,7 @@ styles_of_address = {} for gid in range(3): response = requests.get( - "https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={}&output=csv".format( - gid - ), + f"https://docs.google.com/spreadsheets/d/11qUKd5bHeG5KIzXYERtVgs3hKcd9yuZlt-tCTLBFRpI/pub?single=true&gid={gid}&output=csv", verify=SSL_VERIFY, ) if response.status_code == 200: @@ -115,21 +113,20 @@ 
def get_email(self, node, expression=".", *, error=True): Make sure that the node/expression is narrow enough to not capture a generic email address in the footer of the page, for example. """ - matches = [] # If the text would be split across multiple sub-tags. - for match in node.xpath('{}//*[contains(text(), "@")]'.format(expression)): + for match in node.xpath(f'{expression}//*[contains(text(), "@")]'): matches.append(match.text_content()) # The text version is more likely to be correct, as it is more visible, # e.g. ca_bc has one `href` of `mailto:first.last.mla@leg.bc.ca`. - for match in node.xpath('{}//a[contains(@href, "mailto:")]'.format(expression)): + for match in node.xpath(f'{expression}//a[contains(@href, "mailto:")]'): matches.append(unquote(match.attrib["href"])) # Some emails are obfuscated by Cloudflare. - for match in node.xpath('{}//@href[contains(., "cdn-cgi/l/email-protection")]'.format(expression)): + for match in node.xpath(f'{expression}//@href[contains(., "cdn-cgi/l/email-protection")]'): matches.append(self._cloudflare_decode(match)) # If the node has no sub-tags. if not matches: - for match in node.xpath('{}//text()[contains(., "@")]'.format(expression)): + for match in node.xpath(f'{expression}//text()[contains(., "@")]'): matches.append(match) if matches: for match in matches: @@ -137,9 +134,9 @@ def get_email(self, node, expression=".", *, error=True): if match: return match.group(1) if error: - raise Exception("No email pattern in {}".format(matches)) + raise Exception(f"No email pattern in {matches}") elif error: - raise Exception("No email node in {}".format(etree.tostring(node))) + raise Exception(f"No email node in {etree.tostring(node)}") # Helper function for self,get_email def _cloudflare_decode(self, link): @@ -157,7 +154,6 @@ def get_phone(self, node, *, area_codes=[], error=True): Don't use if multiple telephone numbers are present, e.g. voice and fax. If writing a new scraper, check that extensions are captured. 
""" - if isinstance(node, etree._ElementUnicodeResult): match = re.search( r"(?:\A|\D)(\(?\d{3}\)?\D?\d{3}\D?\d{4}(?:\s*(?:/|x|ext[.:]?|poste)[\s-]?\d+)?)(?:\D|\Z)", node @@ -183,14 +179,14 @@ def get_phone(self, node, *, area_codes=[], error=True): if match: return match.group(1) if error: - raise Exception("No phone pattern in {}".format(node.text_content())) + raise Exception(f"No phone pattern in {node.text_content()}") def get_link(self, node, substring, *, error=True): - match = node.xpath('.//a[contains(@href,"{}")]/@href'.format(substring)) + match = node.xpath(f'.//a[contains(@href,"{substring}")]/@href') if match: return match[0] if error: - raise Exception("No link matching {}".format(substring)) + raise Exception(f"No link matching {substring}") def get(self, *args, **kwargs): return super().get(*args, verify=SSL_VERIFY, **kwargs) @@ -213,17 +209,16 @@ def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_age else: page = lxml.html.fromstring(text) except etree.ParserError: - raise etree.ParserError("Document is empty {}".format(url)) + raise etree.ParserError(f"Document is empty {url}") meta = page.xpath('//meta[@http-equiv="refresh"]') if meta: _, url = meta[0].attrib["content"].split("=", 1) return self.lxmlize(url, encoding) - elif xml: - return page - else: - page.make_links_absolute(url) + if xml: return page + page.make_links_absolute(url) + return page def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs): if not data: @@ -232,7 +227,7 @@ def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows= data = StringIO() ftp = FTP(result.hostname) ftp.login(result.username, result.password) - ftp.retrbinary("RETR {}".format(result.path), lambda block: data.write(block.decode("utf-8"))) + ftp.retrbinary(f"RETR {result.path}", lambda block: data.write(block.decode("utf-8"))) ftp.quit() data.seek(0) else: @@ -347,19 +342,16 @@ def header_converter(self, s): 
Normalizes a column header name. By default, lowercases it and replaces underscores with spaces (e.g. because Esri fields can't contain spaces). """ - header = clean_string(s.lower().replace("_", " ")) if hasattr(self, "locale"): return self.column_headers[self.locale].get(header, header) - else: - return header + return header def is_valid_row(self, row): """ Returns whether the row should be imported. By default, skips empty rows and rows in which a name component is "Vacant". """ - empty = ("", "Vacant") if not any(row.values()): return False @@ -459,7 +451,7 @@ def scrape(self): if self.many_posts_per_area and role not in self.unique_roles: seat_numbers[role][district] += 1 - district = "{} (seat {})".format(district, seat_numbers[role][district]) + district = f"{district} (seat {seat_numbers[role][district]})" lines = [] if row.get("address line 1"): @@ -621,9 +613,7 @@ def get_organizations(self): if not children and parent.attrs["posts_count"]: for i in range(1, int(parent.attrs["posts_count"])): # exclude Mayor - organization.add_post( - role=member_role, label="{} (seat {})".format(parent.name, i), division_id=parent.id - ) + organization.add_post(role=member_role, label=f"{parent.name} (seat {i})", division_id=parent.id) yield organization @@ -663,9 +653,9 @@ def add_link(self, url, *, note=""): """ url = url.strip() if url.startswith("www."): - url = "http://{}".format(url) + url = f"http://{url}" if re.match(r"\A@[A-Za-z]+\Z", url): - url = "https://twitter.com/{}".format(url[1:]) + url = f"https://twitter.com/{url[1:]}" self.links.append({"note": note, "url": url}) def add_contact(self, type, value, note="", area_code=None): @@ -709,10 +699,8 @@ def clean_telephone_number(self, s, area_code=None): digits = re.sub(r"\A(\d)(\d{3})(\d{3})(\d{4})\Z", r"\1 \2 \3-\4", digits) if len(splits) == 2: return "{} x{}".format(digits, splits[1].rstrip(")")) - else: - return digits - else: - return s + return digits + return s def clean_address(self, s): """ @@ 
-740,14 +728,14 @@ def clean_address(self, s): ) -whitespace_re = re.compile(r"\s+", flags=re.U) -whitespace_and_newline_re = re.compile(r"[^\S\n]+", flags=re.U) +whitespace_re = re.compile(r"\s+", flags=re.UNICODE) +whitespace_and_newline_re = re.compile(r"[^\S\n]+", flags=re.UNICODE) honorific_prefix_re = re.compile(r"\A(?:Councillor|Dr|Hon|M|Mayor|Mme|Mr|Mrs|Ms|Miss)\.? ") honorific_suffix_re = re.compile(r", (?:Ph\.D, Q\.C\.)\Z") province_or_territory_abbreviation_memo = {} table = { - ord("​"): " ", # zero-width space + ord("\u200b"): " ", # zero-width space ord("’"): "'", ord("\xc2"): " ", # non-breaking space if mixing ISO-8869-1 into UTF-8 } @@ -785,4 +773,4 @@ def clean_type_id(type_id): def clean_french_prepositions(s): - return re.sub(r"\b(?:d'|de (?:l'|la )?|du |des |l')", "", clean_string(s), flags=re.I) + return re.sub(r"\b(?:d'|de (?:l'|la )?|du |des |l')", "", clean_string(s), flags=re.IGNORECASE) From e45d223a9e594fea92b5fd6dc1c60d71cce459f4 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 16 Sep 2024 17:10:03 -0400 Subject: [PATCH 60/66] chore: Run ruff check . 
--fix --unsafe-fixes --- ca_ab/people.py | 2 +- ca_bc_coquitlam/people.py | 3 +- ca_bc_langley/people.py | 2 +- ca_nl/people.py | 2 - ca_nl_st_john_s/people.py | 2 +- ca_ns_cape_breton/people.py | 3 +- ca_nt/people.py | 6 +- ca_nu/people.py | 6 +- ca_on/people.py | 2 +- ca_on_lasalle/people.py | 5 +- ca_on_north_dumfries/people.py | 5 +- ca_on_oakville/people.py | 5 +- ca_on_oshawa/people.py | 5 +- ca_on_wellesley/people.py | 5 +- ca_on_wilmot/people.py | 1 - ca_pe_charlottetown/people.py | 5 +- ca_pe_summerside/people.py | 1 - ca_qc_pointe_claire/people.py | 2 +- ca_qc_sainte_anne_de_bellevue/people.py | 2 +- ca_qc_sherbrooke/people.py | 3 +- ca_sk/people.py | 5 +- ca_yt/people.py | 6 +- .../ca_bc_municipalities_candidates/people.py | 5 +- disabled/ca_mb_municipalities/people.py | 5 +- disabled/ca_municipalities/people.py | 3 +- disabled/ca_nb_municipalities/people.py | 5 +- disabled/ca_sk_municipalities/people.py | 5 +- tasks.py | 67 ++++++------------- utils.py | 58 ++++++---------- 29 files changed, 71 insertions(+), 155 deletions(-) diff --git a/ca_ab/people.py b/ca_ab/people.py index 3edeb0bc..93696d59 100644 --- a/ca_ab/people.py +++ b/ca_ab/people.py @@ -24,7 +24,7 @@ def get_party(abbr): - """Return full party name from abbreviation""" + """Return full party name from abbreviation.""" return PARTIES[abbr] diff --git a/ca_bc_coquitlam/people.py b/ca_bc_coquitlam/people.py index 2df34976..4f7200e2 100644 --- a/ca_bc_coquitlam/people.py +++ b/ca_bc_coquitlam/people.py @@ -11,8 +11,7 @@ def scrape(self): def build_email(script): w = re.findall(r'w = "(.*?)"', script)[0] x = re.findall(r'x = "(.*?)"', script)[0] - email = w + "@" + x - return email + return w + "@" + x councillor_seat_number = 1 diff --git a/ca_bc_langley/people.py b/ca_bc_langley/people.py index e3121e54..b453cdfe 100644 --- a/ca_bc_langley/people.py +++ b/ca_bc_langley/people.py @@ -34,7 +34,7 @@ def scrape(self): address_block = page.xpath('//p/a[@rel="noopener 
noreferrer"]/parent::p')[0].text_content() line1 = address_block[address_block.find("Facility") + 8 : address_block.find("Langley,")] line2 = address_block[address_block.find("Langley,") : address_block.find("Phone") - 1] - address = ", ".join([line1, line2]) + address = f"{line1}, {line2}" p = Person(primary_org="legislature", name=name, role="Mayor", district="Langley") p.add_contact("email", email) p.add_contact("voice", phone, "legislature") diff --git a/ca_nl/people.py b/ca_nl/people.py index 095cd04d..3a86fcef 100644 --- a/ca_nl/people.py +++ b/ca_nl/people.py @@ -34,10 +34,8 @@ def scrape(self): assert len(members), "No members found" for member in json.loads(members): if not member["name"].strip(): - print(f"Skipping blank member: {member}") continue if member["name"] == "Vacant": - print(f"Skipping vacant 'member': {member}") continue name = " ".join(reversed(member["name"].split(","))).strip() district = ( diff --git a/ca_nl_st_john_s/people.py b/ca_nl_st_john_s/people.py index 7b0771ab..657fd8e9 100644 --- a/ca_nl_st_john_s/people.py +++ b/ca_nl_st_john_s/people.py @@ -23,7 +23,7 @@ def scrape(self): district = description[index : index + 6] else: district = "St. John's" - if role != "Mayor" and role != "Deputy Mayor": + if role not in ("Mayor", "Deputy Mayor"): role = "Councillor at Large" district = f"St. 
John's (seat {councillor_seat_number})" councillor_seat_number += 1 diff --git a/ca_ns_cape_breton/people.py b/ca_ns_cape_breton/people.py index f32939e3..9d9272a5 100644 --- a/ca_ns_cape_breton/people.py +++ b/ca_ns_cape_breton/people.py @@ -13,8 +13,7 @@ def scrape(self): def decode_email(script): raw_address = re.findall(r"(?<=addy).*?;\s*addy", script) local_part = html.unescape(raw_address[0]).split("= ", 1)[1].split(";", 1)[0] - email = re.sub(r"['\s+]", "", local_part) + "cbrm.ns.ca" - return email + return re.sub(r"['\s+]", "", local_part) + "cbrm.ns.ca" page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT) diff --git a/ca_nt/people.py b/ca_nt/people.py index d9460a78..2beb0773 100644 --- a/ca_nt/people.py +++ b/ca_nt/people.py @@ -1,3 +1,5 @@ +import contextlib + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -24,10 +26,8 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role="MLA") p.add_source(COUNCIL_PAGE) p.add_source(url) - try: + with contextlib.suppress(IndexError): p.image = page.xpath('//div[contains(@class, "field--name-field-media-image")]/img/@src')[0] - except IndexError: - pass contact = page.xpath('//*[contains(@class, "paragraph--type--office")]')[0] if len(contact.xpath('./div[contains(@class, "office-address-wrapper")]')) == 0: diff --git a/ca_nu/people.py b/ca_nu/people.py index 363f2c98..cca099d3 100644 --- a/ca_nu/people.py +++ b/ca_nu/people.py @@ -1,3 +1,5 @@ +import contextlib + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -22,10 +24,8 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) p.add_source(COUNCIL_PAGE) p.add_source(url) - try: + with contextlib.suppress(IndexError): p.image = page.xpath('//div[contains(@class, "field--name-field-member-photo")]/div[2]/img/@src')[0] - except IndexError: - pass contact = page.xpath('//div[contains(@class, 
"field--name-field-member-constituency")]/div[2]/div/p')[0] website = contact.xpath("./div[3]/div[3]/div[2]/a") diff --git a/ca_on/people.py b/ca_on/people.py index 3836aa79..885aa199 100644 --- a/ca_on/people.py +++ b/ca_on/people.py @@ -42,7 +42,7 @@ def scrape(self): '//div[@block="block-views-block-member-current-party-block"]//div[@class="view-content"]//text()' ) - party = [item for item in party if item.strip()][0] + party = next(item for item in party if item.strip()) p = Person(primary_org="legislature", name=name, district=district, role="MPP", party=party) p.add_source(COUNCIL_PAGE) p.add_source(url) diff --git a/ca_on_lasalle/people.py b/ca_on_lasalle/people.py index dd8d1bb2..a2c39372 100644 --- a/ca_on_lasalle/people.py +++ b/ca_on_lasalle/people.py @@ -16,10 +16,7 @@ def scrape(self): assert len(councillors), "No councillors found" for councillor in councillors: role, name = re.split(r"(?<=Mayor)|(?<=Councillor)", councillor.xpath(".//a/div")[0].text_content(), 1) - if "Mayor" in role: - district = "LaSalle" - else: - district = f"LaSalle (seat {councillor_seat_number})" + district = "LaSalle" if "Mayor" in role else f"LaSalle (seat {councillor_seat_number})" image = councillor.xpath(".//img/@src")[0] voice = re.search(r"\d{3}-\d{3}-\d{4} ext. \d+", councillor.text_content()) cell = re.search(r"\d{3}-\d{3}-\d{4}(?! 
ext)", councillor.text_content()) diff --git a/ca_on_north_dumfries/people.py b/ca_on_north_dumfries/people.py index ef0b252d..ffe19fc5 100644 --- a/ca_on_north_dumfries/people.py +++ b/ca_on_north_dumfries/people.py @@ -23,10 +23,7 @@ def scrape(self): role = match.group(2) name = match.group(3) - if role == "Mayor": - district = "North Dumfries" - else: - district = f"Ward {word_to_number[match.group(1)]}" + district = "North Dumfries" if role == "Mayor" else f"Ward {word_to_number[match.group(1)]}" p = Person(primary_org="legislature", name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) diff --git a/ca_on_oakville/people.py b/ca_on_oakville/people.py index e51f9ef1..93bcc493 100644 --- a/ca_on_oakville/people.py +++ b/ca_on_oakville/people.py @@ -20,10 +20,7 @@ def scrape(self): role = district_role else: district, role = re.split(r"(?<=\d)\s+", district_role, 1) - if "Regional" in role: - role = "Regional Councillor" - else: - role = "Councillor" + role = "Regional Councillor" if "Regional" in role else "Councillor" name = councillor.xpath(".//div[@class='user-name']/text()")[0] email = self.get_email(councillor) diff --git a/ca_on_oshawa/people.py b/ca_on_oshawa/people.py index 7105e8e2..5171e4ac 100644 --- a/ca_on_oshawa/people.py +++ b/ca_on_oshawa/people.py @@ -21,10 +21,7 @@ def scrape(self): name = info.replace("Mayor ", "") else: district, role_name = re.split(r"(?<=\d)\s", info, 1) - if "Regional" in role_name: - role = "Regional Councillor" - else: - role = "Councillor" + role = "Regional Councillor" if "Regional" in role_name else "Councillor" name = re.split(r"Councillor\s", role_name, 1)[1] photo_url = councillor.xpath(".//img/@src")[0] diff --git a/ca_on_wellesley/people.py b/ca_on_wellesley/people.py index 44be0158..1514047b 100644 --- a/ca_on_wellesley/people.py +++ b/ca_on_wellesley/people.py @@ -27,10 +27,7 @@ def scrape(self): district = srch.group(2).strip() phone = self.get_phone(member) email = self.get_email(member, 
error=False) - if position == "Mayor": - district = "Wellesley" - else: - district = post_number(district) + district = "Wellesley" if position == "Mayor" else post_number(district) p = Person(primary_org="legislature", name=name, district=district, role=position) p.add_contact("voice", phone, "legislature") diff --git a/ca_on_wilmot/people.py b/ca_on_wilmot/people.py index b72dbbba..d9676cfe 100644 --- a/ca_on_wilmot/people.py +++ b/ca_on_wilmot/people.py @@ -22,7 +22,6 @@ def scrape(self): phone = self.get_phone(councillor).replace("/", "") p.add_contact("voice", phone, "legislature") - print(name + ";", role + ";", district) yield p diff --git a/ca_pe_charlottetown/people.py b/ca_pe_charlottetown/people.py index 1f66dc69..ba216571 100644 --- a/ca_pe_charlottetown/people.py +++ b/ca_pe_charlottetown/people.py @@ -49,10 +49,7 @@ def scrape(self): for text in para.xpath('.//strong[contains(., "Phone")]/following-sibling::text()'): if re.search(r"\d", text): match = re.search(r"(.+) \((.+)\)", text) - if match.group(2) == "Fax": - contact_type = "fax" - else: - contact_type = "voice" + contact_type = "fax" if match.group(2) == "Fax" else "voice" p.add_contact(contact_type, match.group(1), match.group(2)) yield p diff --git a/ca_pe_summerside/people.py b/ca_pe_summerside/people.py index ee516c79..137386a0 100644 --- a/ca_pe_summerside/people.py +++ b/ca_pe_summerside/people.py @@ -49,6 +49,5 @@ def scrape(self): p.image = photo p.add_contact("voice", phone, "legislature") p.add_contact("email", email) - print(email) yield p diff --git a/ca_qc_pointe_claire/people.py b/ca_qc_pointe_claire/people.py index d94b944a..910d4915 100644 --- a/ca_qc_pointe_claire/people.py +++ b/ca_qc_pointe_claire/people.py @@ -23,7 +23,7 @@ def scrape(self): elif district: district = district[0].text_content().split(" – ")[0].strip() else: - assert False, "error parsing district" + raise AssertionError("error parsing district") p = Person(primary_org="legislature", name=name, 
district=district, role=role) p.image = councillor.xpath(".//@data-src")[0] diff --git a/ca_qc_sainte_anne_de_bellevue/people.py b/ca_qc_sainte_anne_de_bellevue/people.py index 48cbbdab..30d4662b 100644 --- a/ca_qc_sainte_anne_de_bellevue/people.py +++ b/ca_qc_sainte_anne_de_bellevue/people.py @@ -12,7 +12,7 @@ def scrape(self): councillors = page.xpath('//div[@class="block text"]') assert len(councillors), "No councillors found" - for i, councillor in enumerate(councillors): + for councillor in councillors: name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0] district = councillor.xpath(".//h2/text()")[0] diff --git a/ca_qc_sherbrooke/people.py b/ca_qc_sherbrooke/people.py index 18ad6b7a..ecfa1bb3 100644 --- a/ca_qc_sherbrooke/people.py +++ b/ca_qc_sherbrooke/people.py @@ -20,8 +20,7 @@ def get_content(url): data = script.split(" = ", 1)[1] data = json.loads(data) content = data["value"]["selected"]["content"]["fr"] - page = lxml.html.fromstring(content) - return page + return lxml.html.fromstring(content) page = get_content(COUNCIL_PAGE) councillors = page.xpath("//a[.//h3]") diff --git a/ca_sk/people.py b/ca_sk/people.py index 352430cf..81315a53 100644 --- a/ca_sk/people.py +++ b/ca_sk/people.py @@ -1,3 +1,4 @@ +import contextlib import re from utils import CanadianPerson as Person @@ -24,10 +25,8 @@ def scrape(self): p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) p.add_source(COUNCIL_PAGE) p.add_source(url) - try: + with contextlib.suppress(IndexError): p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] - except IndexError: - pass def handle_address(lines, address_type): address_lines = [] diff --git a/ca_yt/people.py b/ca_yt/people.py index d7f95eee..9e63c538 100644 --- a/ca_yt/people.py +++ b/ca_yt/people.py @@ -1,3 +1,5 @@ +import contextlib + from utils import CanadianPerson as Person from utils import CanadianScraper @@ -25,10 +27,8 @@ def scrape(self): p = 
Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) p.add_source(COUNCIL_PAGE) p.add_source(url) - try: + with contextlib.suppress(IndexError): p.image = page.xpath('//article[contains(@class, "member")]/p/img/@src')[0] - except IndexError: - pass contact = page.xpath('//article[contains(@class, "members-sidebar")]')[0] website = contact.xpath("./div[3]/div[3]/div[2]/a") diff --git a/disabled/ca_bc_municipalities_candidates/people.py b/disabled/ca_bc_municipalities_candidates/people.py index a103ee5c..dab5c42b 100644 --- a/disabled/ca_bc_municipalities_candidates/people.py +++ b/disabled/ca_bc_municipalities_candidates/people.py @@ -117,10 +117,7 @@ def scrape(self): role = row["primary role"] if role not in expected_roles: raise Exception(f"unexpected role: {role}") - if row["district id"]: - district = format(division_id) - else: - district = division_name + district = format(division_id) if row["district id"] else division_name organization.add_post(role=role, label=district, division_id=division_id) diff --git a/disabled/ca_mb_municipalities/people.py b/disabled/ca_mb_municipalities/people.py index 674b114f..87c2923a 100644 --- a/disabled/ca_mb_municipalities/people.py +++ b/disabled/ca_mb_municipalities/people.py @@ -15,10 +15,7 @@ def scrape(self): districts = page.xpath('//div[@id="ctl00_PublicContent_divSearchContent"]//tr')[5::3] for district in districts: title = district.xpath(".//td//text()") - if len(title[0]) > 1: - title = title[0] - else: - title = "".join(title[:2]) + title = title[0] if len(title[0]) > 1 else "".join(title[:2]) # @todo Need to distinguish between, e.g., R.M. 
and Town title = title.title() diff --git a/disabled/ca_municipalities/people.py b/disabled/ca_municipalities/people.py index f274bc68..4e8fc3c5 100644 --- a/disabled/ca_municipalities/people.py +++ b/disabled/ca_municipalities/people.py @@ -111,6 +111,5 @@ def scrape(self): p.validate() yield p - except Exception as e: - print(repr(e)) + except Exception: continue diff --git a/disabled/ca_nb_municipalities/people.py b/disabled/ca_nb_municipalities/people.py index 6ae4c3e2..a4f42954 100644 --- a/disabled/ca_nb_municipalities/people.py +++ b/disabled/ca_nb_municipalities/people.py @@ -112,10 +112,7 @@ def scrape(self): if "vacant" in name.lower(): continue - if role in unique_roles: - district = division_name - else: - district = f"{division_name} (seat {seat_number})" + district = division_name if role in unique_roles else f"{division_name} (seat {seat_number})" organization.add_post(role=role, label=district, division_id=division_id) diff --git a/disabled/ca_sk_municipalities/people.py b/disabled/ca_sk_municipalities/people.py index c0f67240..19e0a05b 100644 --- a/disabled/ca_sk_municipalities/people.py +++ b/disabled/ca_sk_municipalities/people.py @@ -34,10 +34,7 @@ def scrape(self): districts = [] for page in pages: index = re.search(r"(\s{6,})", page[0]) - if index: - index = index.end() - 1 - else: - index = -1 + index = index.end() - 1 if index else -1 dist1 = [] dist2 = [] for line in page: diff --git a/tasks.py b/tasks.py index 1e8ef8d7..fdf06e91 100644 --- a/tasks.py +++ b/tasks.py @@ -22,37 +22,29 @@ def module_names(): - """ - Returns all module names. - """ + """Returns all module names.""" for module_name in os.listdir("."): if os.path.isfile(os.path.join(module_name, "__init__.py")): yield module_name def modules_and_module_names_and_classes(): - """ - Returns modules, module names, and person scraper classes. 
- """ + """Returns modules, module names, and person scraper classes.""" for module_name in module_names(): module = importlib.import_module(f"{module_name}.people") - class_name = next(key for key in module.__dict__.keys() if "PersonScraper" in key) + class_name = next(key for key in module.__dict__ if "PersonScraper" in key) yield (module, module_name, module.__dict__[class_name]) def csv_dict_reader(url, encoding="utf-8"): - """ - Reads a remote CSV file. - """ + """Reads a remote CSV file.""" response = requests.get(url) response.encoding = encoding return csv.DictReader(StringIO(response.text)) def slug(name): - """ - Slugifies a division name. - """ + """Slugifies a division name.""" return unidecode( str(name) .lower() @@ -78,16 +70,12 @@ def province_or_territory_abbreviation(code): def type_id(id): - """ - Returns an OCD identifier's type ID. - """ + """Returns an OCD identifier's type ID.""" return id.rsplit(":", 1)[1] def get_definition(division_id, aggregation=False): - """ - Returns the expected configuration for a given division. - """ + """Returns the expected configuration for a given division.""" if not ocdid_to_type_name_map: # Map census division type codes to names. census_division_type_names = {} @@ -194,9 +182,7 @@ def get_definition(division_id, aggregation=False): @task def council_pages(): - """ - Prints scrapers' council page, or warns if it is missing or unneeded. - """ + """Prints scrapers' council page, or warns if it is missing or unneeded.""" for module, module_name, klass in modules_and_module_names_and_classes(): if klass.__bases__[0].__name__ == "CSVScraper": if hasattr(module, "COUNCIL_PAGE"): @@ -209,30 +195,24 @@ def council_pages(): @task def csv_list(): - """ - Lists scrapers with CSV data. 
- """ - for module, module_name, klass in modules_and_module_names_and_classes(): + """Lists scrapers with CSV data.""" + for _module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "csv_url"): print(f"{module_name}: {klass.csv_url}") @task def csv_stale(): - """ - Lists scrapers with stale manual CSV data. - """ - for module, module_name, klass in modules_and_module_names_and_classes(): + """Lists scrapers with stale manual CSV data.""" + for _module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "updated_at") and klass.updated_at < date.today() - timedelta(days=365): print(f"{module_name}: Created on {klass.updated_at} by {klass.contact_person}") @task def csv_error(): - """ - Notes corrections that CSV publishers should make. - """ - for module, module_name, klass in modules_and_module_names_and_classes(): + """Notes corrections that CSV publishers should make.""" + for _module, module_name, klass in modules_and_module_names_and_classes(): if klass.__bases__[0].__name__ == "CSVScraper": if "_candidates" in module_name and hasattr(klass, "updated_at"): continue @@ -279,9 +259,7 @@ def csv_error(): @task def tidy(): - """ - Checks that modules are configured correctly. - """ + """Checks that modules are configured correctly.""" # Map OCD identifiers to styles of address. leader_styles = {} member_styles = {} @@ -297,7 +275,7 @@ def tidy(): division_ids = set() jurisdiction_ids = set() for module_name in module_names(): - if module_name.endswith("_candidates") or module_name.endswith("_municipalities"): + if module_name.endswith(("_candidates", "_municipalities")): continue metadata = module_name_to_metadata(module_name) @@ -368,9 +346,7 @@ def tidy(): @task def sources_and_assertions(): - """ - Checks that sources are attributed and assertions are made. 
- """ + """Checks that sources are attributed and assertions are made.""" for module_name in module_names(): path = os.path.join(module_name, "people.py") with codecs.open(path, "r", "utf-8") as f: @@ -387,9 +363,7 @@ def sources_and_assertions(): @task def validate_spreadsheet(url, identifier_header, geographic_name_header): - """ - Validates the identifiers, geographic names and geographic types in a spreadsheet. - """ + """Validates the identifiers, geographic names and geographic types in a spreadsheet.""" sgc_to_id = {} for division in Division.all("ca", from_csv=ocd_division_csv): @@ -412,9 +386,7 @@ def validate_spreadsheet(url, identifier_header, geographic_name_header): def module_name_to_metadata(module_name): - """ - Copied from `reports.utils`. - """ + """Copied from `reports.utils`.""" module = importlib.import_module(module_name) for obj in module.__dict__.values(): division_id = getattr(obj, "division_id", None) @@ -431,3 +403,4 @@ def module_name_to_metadata(module_name): getattr(obj, "classification", "legislature"), ), } + return None diff --git a/utils.py b/utils.py index c6e727bb..5b3c3745 100644 --- a/utils.py +++ b/utils.py @@ -135,8 +135,10 @@ def get_email(self, node, expression=".", *, error=True): return match.group(1) if error: raise Exception(f"No email pattern in {matches}") - elif error: + return None + if error: raise Exception(f"No email node in {etree.tostring(node)}") + return None # Helper function for self,get_email def _cloudflare_decode(self, link): @@ -149,11 +151,13 @@ def _cloudflare_decode(self, link): return decoded_email - def get_phone(self, node, *, area_codes=[], error=True): + def get_phone(self, node, *, area_codes=None, error=True): """ Don't use if multiple telephone numbers are present, e.g. voice and fax. If writing a new scraper, check that extensions are captured. 
""" + if area_codes is None: + area_codes = [] if isinstance(node, etree._ElementUnicodeResult): match = re.search( r"(?:\A|\D)(\(?\d{3}\)?\D?\d{3}\D?\d{4}(?:\s*(?:/|x|ext[.:]?|poste)[\s-]?\d+)?)(?:\D|\Z)", node @@ -180,6 +184,7 @@ def get_phone(self, node, *, area_codes=[], error=True): return match.group(1) if error: raise Exception(f"No phone pattern in {node.text_content()}") + return None def get_link(self, node, substring, *, error=True): match = node.xpath(f'.//a[contains(@href,"{substring}")]/@href') @@ -187,6 +192,7 @@ def get_link(self, node, substring, *, error=True): return match[0] if error: raise Exception(f"No link matching {substring}") + return None def get(self, *args, **kwargs): return super().get(*args, verify=SSL_VERIFY, **kwargs) @@ -248,9 +254,7 @@ def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows= class CSVScraper(CanadianScraper): # File flags - """ - Set the CSV file's delimiter. - """ + """Set the CSV file's delimiter.""" delimiter = "," """ @@ -362,10 +366,7 @@ def is_valid_row(self, row): def scrape(self): seat_numbers = defaultdict(lambda: defaultdict(int)) - if self.extension: - extension = self.extension - else: - extension = os.path.splitext(self.csv_url)[1] + extension = self.extension if self.extension else os.path.splitext(self.csv_url)[1] if extension in (".xls", ".xlsx"): data = StringIO() binary = BytesIO(self.get(self.csv_url).content) @@ -480,11 +481,8 @@ def scrape(self): # District name,District ID,… # Toronto Centre,,… # ,3520005,… - if not row.get("district name") and row.get("district id"): - if len(row["district id"]) == 7: - p._related[0].extras["boundary_url"] = "/boundaries/census-subdivisions/{}/".format( - row["district id"] - ) + if not row.get("district name") and row.get("district id") and len(row["district id"]) == 7: + p._related[0].extras["boundary_url"] = "/boundaries/census-subdivisions/{}/".format(row["district id"]) if row.get("district name") in 
self.district_name_to_boundary_url: p._related[0].extras["boundary_url"] = self.district_name_to_boundary_url[row["district name"]] @@ -528,9 +526,7 @@ def scrape(self): class CanadianJurisdiction(Jurisdiction): - """ - Whether to create posts whose labels match division names or type IDs. - """ + """Whether to create posts whose labels match division names or type IDs.""" use_type_id = False """ @@ -603,10 +599,7 @@ def get_organizations(self): if valid_through and valid_through < datetime.now().strftime("%Y-%m-%d"): continue - if self.use_type_id: - label = child.id.rsplit("/", 1)[1].capitalize().replace(":", " ") - else: - label = child.name + label = child.id.rsplit("/", 1)[1].capitalize().replace(":", " ") if self.use_type_id else child.name # Yield posts to allow ca_on_toronto to make changes. post = Post(role=member_role, label=label, division_id=child.id, organization_id=organization._id) yield post @@ -620,9 +613,7 @@ def get_organizations(self): class CanadianPerson(Person): def __init__(self, *, name, district, role, **kwargs): - """ - Cleans a person's name, district, role and any other attributes. - """ + """Cleans a person's name, district, role and any other attributes.""" name = clean_name(name) district = clean_string(district).replace("&", "and") role = clean_string(role) @@ -636,9 +627,7 @@ def __init__(self, *, name, district, role, **kwargs): super().__init__(name=name, district=district, role=role, **kwargs) def __setattr__(self, name, value): - """ - Corrects gender values. - """ + """Corrects gender values.""" if name == "gender": value = value.lower() if value == "m": @@ -648,9 +637,7 @@ def __setattr__(self, name, value): super().__setattr__(name, value) def add_link(self, url, *, note=""): - """ - Corrects links without schemes or domains. 
- """ + """Corrects links without schemes or domains.""" url = url.strip() if url.startswith("www."): url = f"http://{url}" @@ -659,9 +646,7 @@ def add_link(self, url, *, note=""): self.links.append({"note": note, "url": url}) def add_contact(self, type, value, note="", area_code=None): - """ - Cleans and adds a contact detail to the person's membership. - """ + """Cleans and adds a contact detail to the person's membership.""" if type: type = clean_string(type) if note: @@ -684,9 +669,7 @@ def add_contact(self, type, value, note="", area_code=None): self._related[0].add_contact_detail(type=type, value=value, note=note) def clean_telephone_number(self, s, area_code=None): - """ - @see http://www.btb.termiumplus.gc.ca/tpv2guides/guides/favart/index-eng.html?lang=eng&lettr=indx_titls&page=9N6fM9QmOwCE.html - """ + """@see http://www.btb.termiumplus.gc.ca/tpv2guides/guides/favart/index-eng.html?lang=eng&lettr=indx_titls&page=9N6fM9QmOwCE.html.""" splits = re.split(r"(?:\b \(|/|x|ext[.:]?|p\.|poste)[\s-]?(?=\b|\d)", s, flags=re.IGNORECASE) digits = re.sub(r"\D", "", splits[0]) @@ -768,8 +751,7 @@ def clean_type_id(type_id): # "Spaces should be converted to underscores." type_id = re.sub(r" ", "_", type_id) # "All invalid characters should be converted to tilde (~)." 
- type_id = re.sub(r"[^\w.~-]", "~", type_id, re.UNICODE) - return type_id + return re.sub(r"[^\w.~-]", "~", type_id, re.UNICODE) def clean_french_prepositions(s): From 6a6a313c166633648d59d014745307b5952eb73d Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 16 Sep 2024 17:58:29 -0400 Subject: [PATCH 61/66] chore: Fix lints --- ca/people.py | 2 +- ca_nt/people.py | 8 +-- ca_nu/people.py | 8 +-- ca_on_chatham_kent/people.py | 2 +- ca_on_clarington/people.py | 2 +- ca_on_grimsby/people.py | 2 +- ca_on_lasalle/people.py | 4 +- ca_on_oakville/people.py | 2 +- ca_on_oshawa/people.py | 4 +- ca_on_pickering/people.py | 2 +- ca_on_thunder_bay/people.py | 4 +- ca_on_woolwich/people.py | 2 +- ca_qc_brossard/people.py | 36 ++++++------- ca_sk/people.py | 8 +-- ca_yt/people.py | 8 +-- disabled/ca_municipalities/people.py | 2 +- disabled/ca_nl_municipalities/people.py | 17 +++--- disabled/ca_ns_municipalities/people.py | 16 +++--- disabled/ca_pe_municipalities/people.py | 2 +- disabled/ca_sk_municipalities/people.py | 17 +++--- disabled/ca_yt_municipalities/people.py | 14 ++--- patch.py | 24 ++++----- pyproject.toml | 35 ++++++++++-- tasks.py | 34 ++++++------ utils.py | 71 ++++++++++++------------- 25 files changed, 178 insertions(+), 148 deletions(-) diff --git a/ca/people.py b/ca/people.py index a9605a35..49d55060 100644 --- a/ca/people.py +++ b/ca/people.py @@ -59,7 +59,7 @@ def scrape_people(self, rows, gender): photo_response = self.get(photo) if ( photo_response.status_code == 200 - and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1 + and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1 # noqa: S324 # non-cryptographic ): m.image = photo diff --git a/ca_nt/people.py b/ca_nt/people.py index 2beb0773..d04fb9db 100644 --- a/ca_nt/people.py +++ b/ca_nt/people.py @@ -35,7 +35,7 @@ def scrape(self): else: address_section = contact - def handle_address(contact, 
address_type): + def handle_address(p, contact, address_type): address_lines = [] po_box_line = ( "PO Box " @@ -56,7 +56,7 @@ def handle_address(contact, address_type): address_type, ) - def handle_phone(lines, phone_type): + def handle_phone(p, lines, phone_type): first_phone_added = False for line in lines: if "Assistant" in line.strip(): @@ -71,8 +71,8 @@ def handle_phone(lines, phone_type): first_phone_added = True contact_lines = contact.xpath(".//text()") - handle_address(address_section, "legislature") - handle_phone(contact_lines, "legislature") + handle_address(p, address_section, "legislature") + handle_phone(p, contact_lines, "legislature") email_elements = page.xpath( '//*[contains(@class, "field--paragraph--field-email")]/div[@class="field__item"]' diff --git a/ca_nu/people.py b/ca_nu/people.py index cca099d3..38b1c8eb 100644 --- a/ca_nu/people.py +++ b/ca_nu/people.py @@ -32,7 +32,7 @@ def scrape(self): if website: p.add_link(website[0].text_content()) - def handle_address(lines, address_type): + def handle_address(p, lines, address_type): address_lines = [] for line in lines: if ":" in line.strip(): # Room:, Phone:, Fax: @@ -45,15 +45,15 @@ def handle_address(lines, address_type): address_type, ) - def handle_phone(lines, phone_type): + def handle_phone(p, lines, phone_type): for line in lines: if "Phone:" in line: number = line.replace("Phone: (867) ", "") p.add_contact("voice", number, phone_type, area_code=867) address_lines = contact.xpath("./text()") - handle_address(address_lines, "legislature") - handle_phone(address_lines, "legislature") + handle_address(p, address_lines, "legislature") + handle_phone(p, address_lines, "legislature") email = self.get_email(contact, error=False) if email: diff --git a/ca_on_chatham_kent/people.py b/ca_on_chatham_kent/people.py index 61f9dca2..5ddd2165 100644 --- a/ca_on_chatham_kent/people.py +++ b/ca_on_chatham_kent/people.py @@ -20,7 +20,7 @@ def scrape(self): body = 'councillorsByWard50' response = 
requests.post(url=COUNCIL_DATA_URL, data=body, headers=headers) - page = etree.fromstring(response.content) + page = etree.fromstring(response.content) # noqa: S320 namespace = {"z": "#RowsetSchema", "rs": "urn:schemas-microsoft-com:rowset"} councillors = page.findall(".//z:row", namespace) diff --git a/ca_on_clarington/people.py b/ca_on_clarington/people.py index f2bc142b..01869853 100644 --- a/ca_on_clarington/people.py +++ b/ca_on_clarington/people.py @@ -15,7 +15,7 @@ def scrape(self): assert len(councillors), "No councillors found" for councillor in councillors: name, role_district = councillor.text_content().split(" - ") - role, district = re.split(r"(?<=Councillor) ", role_district, 1) + role, district = re.split(r"(?<=Councillor) ", role_district, maxsplit=1) content_node = councillor.xpath("../following-sibling::tr")[0] email = self.get_email(content_node) photo_url = content_node.xpath(".//img/@src")[0] diff --git a/ca_on_grimsby/people.py b/ca_on_grimsby/people.py index a083cf24..1a384ad2 100644 --- a/ca_on_grimsby/people.py +++ b/ca_on_grimsby/people.py @@ -22,7 +22,7 @@ def scrape(self): name_node = councillors_node.xpath( './/h5[contains(./strong, "Councillor")]|.//h5[contains(., "Councillor")]' )[i] - name = re.split(r"\s", name_node.text_content(), 1)[1] + name = re.split(r"\s", name_node.text_content(), maxsplit=1)[1] district = f"{area} (seat {i + 1})" phone = self.get_phone(name_node.xpath('./following-sibling::*[contains(., "Phone")]')[0]) email = self.get_email(name_node.xpath("./following-sibling::p[contains(., 'Email')]")[0]) diff --git a/ca_on_lasalle/people.py b/ca_on_lasalle/people.py index a2c39372..d0af6041 100644 --- a/ca_on_lasalle/people.py +++ b/ca_on_lasalle/people.py @@ -15,7 +15,9 @@ def scrape(self): councillors = page.xpath('//div[@class="fbg-row lb-imageBox cm-datacontainer"]') assert len(councillors), "No councillors found" for councillor in councillors: - role, name = re.split(r"(?<=Mayor)|(?<=Councillor)", 
councillor.xpath(".//a/div")[0].text_content(), 1) + role, name = re.split( + r"(?<=Mayor)|(?<=Councillor)", councillor.xpath(".//a/div")[0].text_content(), maxsplit=1 + ) district = "LaSalle" if "Mayor" in role else f"LaSalle (seat {councillor_seat_number})" image = councillor.xpath(".//img/@src")[0] voice = re.search(r"\d{3}-\d{3}-\d{4} ext. \d+", councillor.text_content()) diff --git a/ca_on_oakville/people.py b/ca_on_oakville/people.py index 93bcc493..c7362962 100644 --- a/ca_on_oakville/people.py +++ b/ca_on_oakville/people.py @@ -19,7 +19,7 @@ def scrape(self): district = "Oakville" role = district_role else: - district, role = re.split(r"(?<=\d)\s+", district_role, 1) + district, role = re.split(r"(?<=\d)\s+", district_role, maxsplit=1) role = "Regional Councillor" if "Regional" in role else "Councillor" name = councillor.xpath(".//div[@class='user-name']/text()")[0] diff --git a/ca_on_oshawa/people.py b/ca_on_oshawa/people.py index 5171e4ac..df02029e 100644 --- a/ca_on_oshawa/people.py +++ b/ca_on_oshawa/people.py @@ -20,9 +20,9 @@ def scrape(self): district = "Oshawa" name = info.replace("Mayor ", "") else: - district, role_name = re.split(r"(?<=\d)\s", info, 1) + district, role_name = re.split(r"(?<=\d)\s", info, maxsplit=1) role = "Regional Councillor" if "Regional" in role_name else "Councillor" - name = re.split(r"Councillor\s", role_name, 1)[1] + name = re.split(r"Councillor\s", role_name, maxsplit=1)[1] photo_url = councillor.xpath(".//img/@src")[0] phone = self.get_phone(councillor) diff --git a/ca_on_pickering/people.py b/ca_on_pickering/people.py index 78bfdd3f..420b3ede 100644 --- a/ca_on_pickering/people.py +++ b/ca_on_pickering/people.py @@ -21,7 +21,7 @@ def scrape(self): if "Councillor" in name: name = name.replace("Councillor", "").strip() role_ward = councillor.xpath(".//text()")[1] - role, ward = re.split(r"\s(?=Ward)", role_ward, 1) + role, ward = re.split(r"\s(?=Ward)", role_ward, maxsplit=1) else: name = name.replace("Mayor", "") role = 
"Mayor" diff --git a/ca_on_thunder_bay/people.py b/ca_on_thunder_bay/people.py index 7a7f834e..0c09dcac 100644 --- a/ca_on_thunder_bay/people.py +++ b/ca_on_thunder_bay/people.py @@ -1,7 +1,7 @@ import requests +from utils import DEFAULT_USER_AGENT, CanadianScraper from utils import CanadianPerson as Person -from utils import CanadianScraper COUNCIL_PAGE = "https://www.thunderbay.ca/en/city-hall/mayor-and-council-profiles.aspx" @@ -43,6 +43,6 @@ def scrape(self): yield p - def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None, xml=False): + def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += ":HIGH:!DH:!aNULL" # site uses a weak DH key return super().lxmlize(url, encoding, user_agent, cookies, xml) diff --git a/ca_on_woolwich/people.py b/ca_on_woolwich/people.py index f2773670..e1baf6d5 100644 --- a/ca_on_woolwich/people.py +++ b/ca_on_woolwich/people.py @@ -15,7 +15,7 @@ def scrape(self): councillors = page.xpath('//td[@data-name="accParent"]/h2') assert len(councillors), "No councillors found" for councillor in councillors: - role, name = re.split(r"\s", councillor.text_content(), 1) + role, name = re.split(r"\s", councillor.text_content(), maxsplit=1) area = re.search(r"Ward \d", name) if not area: district = "Woolwich" diff --git a/ca_qc_brossard/people.py b/ca_qc_brossard/people.py index f1b5eaac..d82df70d 100644 --- a/ca_qc_brossard/people.py +++ b/ca_qc_brossard/people.py @@ -12,25 +12,25 @@ class BrossardPersonScraper(CanadianScraper): def scrape(self): - def indexById(elementList): + def index_by_id(element_list): result = {} - for element in elementList: + for element in element_list: id = element["id"] result[id] = element return result # Gets the ids of all children elements recursively - def getChildren(parentId, elementDict): - returnList = [] - element = elementDict[parentId] + def get_children(parent_id, 
element_dict): + return_list = [] + element = element_dict[parent_id] if element.get("children"): for child in element.get("children"): if not re.search(r"^\d+$", child): continue - returnList.append(child) - if getChildren(child, elementDict): - returnList.extend(getChildren(child, elementDict)) - return returnList + return_list.append(child) + if get_children(child, element_dict): + return_list.extend(get_children(child, element_dict)) + return return_list # The whole page is rendered in javascript and stored as a massive json object page = requests.get(DATA_PAGE) @@ -39,20 +39,20 @@ def getChildren(parentId, elementDict): for container in containers: if container.get("contentType") != "CMSPage": continue - elements = indexById(container["properties"]["content"]["data"]) + elements = index_by_id(container["properties"]["content"]["data"]) - councillors = [] - for element in elements.values(): - if isinstance(element.get("children"), dict) and re.search( - r"DISTRICT \d+\s+[-|]\sSecteur", element.get("children").get("fr") - ): - councillors.append(element) + councillors = [ + element + for element in elements.values() + if isinstance(element.get("children"), dict) + and re.search(r"DISTRICT \d+\s+[-|]\sSecteur", element.get("children").get("fr")) + ] assert len(councillors), "No councillors found" for councillor in councillors: district = re.search(r"DISTRICT (\d+)", councillor["children"]["fr"]).group(0).title() parent_id = councillor["parent"] - children = getChildren(parent_id, elements) + children = get_children(parent_id, elements) phone = None for id in children: child = elements[id] @@ -86,7 +86,7 @@ def getChildren(parentId, elementDict): ): mayor = element parent_id = mayor["parent"] - children = getChildren(parent_id, elements) + children = get_children(parent_id, elements) phone = None for id in children: child = elements[id] diff --git a/ca_sk/people.py b/ca_sk/people.py index 81315a53..c3ecfc0f 100644 --- a/ca_sk/people.py +++ b/ca_sk/people.py @@ 
-28,7 +28,7 @@ def scrape(self): with contextlib.suppress(IndexError): p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0] - def handle_address(lines, address_type): + def handle_address(p, lines, address_type): address_lines = [] for line in lines: if re.match(r"(Room|Phone|Fax)\:", line): @@ -41,7 +41,7 @@ def handle_address(lines, address_type): address_type, ) - def handle_phone(lines, phone_type): + def handle_phone(p, lines, phone_type): matches = re.findall(r"Phone\:\s*(306-[\d\-]+)", "\n".join(lines)) if len(matches) == 1: p.add_contact("voice", matches[0], phone_type, area_code=306) @@ -55,8 +55,8 @@ def handle_phone(lines, phone_type): address_type = "constituency" else: raise AssertionError(f"Unexpected address type: {lines[0]}") - handle_address(lines[1:], address_type) - handle_phone(lines[1:], address_type) + handle_address(p, lines[1:], address_type) + handle_phone(p, lines[1:], address_type) email = self.get_email(page.xpath('//div[@id="content"]')[0], error=False) if email: diff --git a/ca_yt/people.py b/ca_yt/people.py index 9e63c538..b6dd62f4 100644 --- a/ca_yt/people.py +++ b/ca_yt/people.py @@ -35,7 +35,7 @@ def scrape(self): if website: p.add_link(website[0].text_content()) - def handle_address(lines, address_type): + def handle_address(p, lines, address_type): address_lines = [] for line in lines: if line.endswith(":"): # Room:, Phone:, Fax: @@ -48,7 +48,7 @@ def handle_address(lines, address_type): address_type, ) - def handle_phone(lines, phone_type): + def handle_phone(p, lines, phone_type): if "Phone:" in lines: next_line = lines[lines.index("Phone:") + 1] if next_line.endswith(":"): @@ -66,8 +66,8 @@ def handle_phone(lines, phone_type): address_lines = contact.xpath("//address//text()") contact_lines = contact.xpath("//p[2]//text()") assert address_lines[0].strip() == "Yukon Legislative Assembly" - handle_address(address_lines[1:], "legislature") - handle_phone(contact_lines[1:], "legislature") + 
handle_address(p, address_lines[1:], "legislature") + handle_phone(p, contact_lines[1:], "legislature") email = self.get_email(contact, error=False) if email: diff --git a/disabled/ca_municipalities/people.py b/disabled/ca_municipalities/people.py index 4e8fc3c5..e7537a4e 100644 --- a/disabled/ca_municipalities/people.py +++ b/disabled/ca_municipalities/people.py @@ -112,4 +112,4 @@ def scrape(self): yield p except Exception: - continue + pass diff --git a/disabled/ca_nl_municipalities/people.py b/disabled/ca_nl_municipalities/people.py index 3fbd9c6e..73262f79 100644 --- a/disabled/ca_nl_municipalities/people.py +++ b/disabled/ca_nl_municipalities/people.py @@ -1,8 +1,9 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile +import requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -16,15 +17,14 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0] - response = urlopen(url).read() - pdf = open("/tmp/nl.pdf", "w") - pdf.write(response) - pdf.close() + response = requests.get(url).content + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = subprocess.check_output(["pdftotext", "-layout", "/tmp/nl.pdf", "-"]) + data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607 pages = data.split("Municipal Directory")[1:] for page in pages: - page = page.splitlines(True) + page = page.splitlines(keepends=True) column_index = {} for line in page: if "Official Name" in line: @@ -81,4 +81,5 @@ def scrape(self): if address: membership.add_contact_detail("address", address, "legislature") yield p - os.system("rm /tmp/nl.pdf") + + os.unlink(pdf.name) diff --git a/disabled/ca_ns_municipalities/people.py b/disabled/ca_ns_municipalities/people.py index 52459832..9ed12739 100644 --- a/disabled/ca_ns_municipalities/people.py +++ 
b/disabled/ca_ns_municipalities/people.py @@ -1,8 +1,9 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile +import requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -13,16 +14,15 @@ class NovaScotiaMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = urlopen(COUNCIL_PAGE).read() - pdf = open("/tmp/ns.pdf", "w") - pdf.write(response) - pdf.close() + response = requests.get(COUNCIL_PAGE).content + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = subprocess.check_output(["pdftotext", "/tmp/ns.pdf", "-"]) + data = subprocess.check_output(["pdftotext", pdf.name, "-"]) # noqa: S603,S607 emails = re.findall(r"(?<=E-mail: ).+", data) data = re.split(r"Mayor |Warden ", data)[1:] for i, mayor in enumerate(data): - lines = mayor.splitlines(True) + lines = mayor.splitlines(keepends=True) name = lines.pop(0).strip() if name == "Jim Smith": continue @@ -66,4 +66,4 @@ def scrape(self): membership.add_contact_detail("email", emails.pop(i)) yield p - os.system("rm /tmp/ns.pdf") + os.unlink(pdf.name) diff --git a/disabled/ca_pe_municipalities/people.py b/disabled/ca_pe_municipalities/people.py index 3eb0154f..f39591bb 100644 --- a/disabled/ca_pe_municipalities/people.py +++ b/disabled/ca_pe_municipalities/people.py @@ -44,7 +44,7 @@ def scrape(self): councillors = page.xpath( '//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()' - )[0].splitlines(True) + )[0].splitlines(keepends=True) for councillor in councillors: name = ( councillor.replace("(Mayor)", "") diff --git a/disabled/ca_sk_municipalities/people.py b/disabled/ca_sk_municipalities/people.py index 19e0a05b..6f96d1ce 100644 --- a/disabled/ca_sk_municipalities/people.py +++ b/disabled/ca_sk_municipalities/people.py @@ -1,8 +1,9 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile +import 
requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -14,14 +15,13 @@ class SaskatchewanMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = urlopen(COUNCIL_PAGE).read() - pdf = open("/tmp/sk.pdf", "w") - pdf.write(response) - pdf.close() + response = requests.get(COUNCIL_PAGE).read() + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = subprocess.check_output(["pdftotext", "-layout", "/tmp/sk.pdf", "-"]) + data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607 - data = data.splitlines(True) + data = data.splitlines(keepends=True) pages = [] page = [] for line in data: @@ -96,4 +96,5 @@ def scrape(self): for key, value in contacts.items(): membership.add_contact_detail(key, value, "" if key == "email" else "legislature") yield p - os.system("rm /tmp/sk.pdf") + + os.unlink(pdf.name) diff --git a/disabled/ca_yt_municipalities/people.py b/disabled/ca_yt_municipalities/people.py index 130aa33c..2cd79127 100644 --- a/disabled/ca_yt_municipalities/people.py +++ b/disabled/ca_yt_municipalities/people.py @@ -1,8 +1,9 @@ import os import re import subprocess -from urllib.request import urlopen +import tempfile +import requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -13,12 +14,11 @@ class YukonMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = urlopen(COUNCIL_PAGE).read() - pdf = open("/tmp/yt.pdf", "w") - pdf.write(response) - pdf.close() + response = requests.get(COUNCIL_PAGE).content + with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: + pdf.write(response) - data = subprocess.check_output(["pdftotext", "-layout", "/tmp/yt.pdf", "-"]) + data = subprocess.check_output(["pdftotext", "-layout", pdf.name, "-"]) # noqa: S603,S607 data = re.split(r"\n\s*\n", data) for municipality in data: if "Councillors" not in municipality: @@ -81,4 +81,4 @@ def 
scrape(self): p.add_link(website) yield p - os.system("rm /tmp/yt.pdf") + os.unlink(pdf.name) diff --git a/patch.py b/patch.py index 97979862..6d6312e8 100644 --- a/patch.py +++ b/patch.py @@ -57,7 +57,7 @@ social_re = re.compile( r"(?:facebook|fb|instagram|linkedin|twitter|youtube)\.com|conservative\.ca" -) # XXX ca_candidates +) # special case: ca_candidates facebook_re = re.compile(r"facebook\.com") instagram_re = re.compile(r"instagram\.com") linkedin_re = re.compile(r"linkedin\.com") @@ -70,15 +70,15 @@ (1, lambda x: x["type"] == "email", "Membership has many emails"), ] -for type in ("address", "cell", "fax", "voice"): - for note in ("constituency", "legislature", "office", "residence"): - matchers.append( - ( - 1, - lambda x, type=type, note=note: x["type"] == type and x["note"] == note, - "Membership has contact_details with same type and note", - ) - ) +matchers.extend( + ( + 1, + lambda x, type=type, note=note: x["type"] == type and x["note"] == note, + "Membership has contact_details with same type and note", + ) + for type in ("address", "cell", "fax", "voice") + for note in ("constituency", "legislature", "office", "residence") +) # A membership should not have notes on emails, should have notes on non-emails, # should have at most one email, and should, in most cases, have at most one of @@ -147,7 +147,7 @@ organization_schema["properties"]["classification"]["enum"] += ["government"] -def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None): +def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None): # noqa: N802 value = x.get(fieldname) if isinstance(value, str): for pattern, method in arguments: @@ -158,7 +158,7 @@ def validate_conditionalPattern(self, x, fieldname, schema, path, arguments=None DatetimeValidator.validate_conditionalPattern = validate_conditionalPattern -def validate_maxMatchingItems(self, x, fieldname, schema, path, arguments=None): +def validate_maxMatchingItems(self, x, fieldname, 
schema, path, arguments=None): # noqa: N802 value = x.get(fieldname) if isinstance(value, list): for length, method, message in arguments: diff --git a/pyproject.toml b/pyproject.toml index 059d331a..85ef59fa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,37 @@ version = "0.0.1" [tool.ruff] line-length = 119 -target-version = "py310" +target-version = "py39" [tool.ruff.lint] -select = ["C4", "E", "F", "I", "W"] -ignore = ["E501"] +select = ["ALL"] +ignore = [ + "ANN", "C901", "COM812", "D203", "D212", "D415", "EM", "ISC001", "PERF203", "PLR091", "Q000", + "D1", "D205", + "DTZ", + "E501", + "ERA001", # commented-out code + "PLR2004", + "PLW2901", + "PTH", + "RUF012", + "S101", # assert + "S113", # timeout + "TRY003", # errors + + # Should be refactored: + "BLE001", # except Exception + "S110", # except pass + "TRY002", # raise Exception +] +allowed-confusables = ["’", "–"] + +[tool.ruff.lint.flake8-builtins] +builtins-ignorelist = ["id", "type"] + +[tool.ruff.lint.flake8-self] +extend-ignore-names = ["_ElementUnicodeResult", "_id", "_related", "_type"] + +[tool.ruff.lint.per-file-ignores] +"patch.py" = ["ARG001"] +"tasks.py" = ["T201"] diff --git a/tasks.py b/tasks.py index fdf06e91..004737c1 100644 --- a/tasks.py +++ b/tasks.py @@ -22,14 +22,14 @@ def module_names(): - """Returns all module names.""" + """Return all module names.""" for module_name in os.listdir("."): if os.path.isfile(os.path.join(module_name, "__init__.py")): yield module_name def modules_and_module_names_and_classes(): - """Returns modules, module names, and person scraper classes.""" + """Return modules, module names, and person scraper classes.""" for module_name in module_names(): module = importlib.import_module(f"{module_name}.people") class_name = next(key for key in module.__dict__ if "PersonScraper" in key) @@ -37,14 +37,14 @@ def modules_and_module_names_and_classes(): def csv_dict_reader(url, encoding="utf-8"): - """Reads a remote CSV file.""" + """Read a remote CSV 
file.""" response = requests.get(url) response.encoding = encoding return csv.DictReader(StringIO(response.text)) def slug(name): - """Slugifies a division name.""" + """Slugify a division name.""" return unidecode( str(name) .lower() @@ -70,12 +70,12 @@ def province_or_territory_abbreviation(code): def type_id(id): - """Returns an OCD identifier's type ID.""" + """Return an OCD identifier's type ID.""" return id.rsplit(":", 1)[1] -def get_definition(division_id, aggregation=False): - """Returns the expected configuration for a given division.""" +def get_definition(division_id, *, aggregation=False): + """Return the expected configuration for a given division.""" if not ocdid_to_type_name_map: # Map census division type codes to names. census_division_type_names = {} @@ -92,7 +92,7 @@ def get_definition(division_id, aggregation=False): requests.get("https://www12.statcan.gc.ca/census-recensement/2016/ref/dict/tab/t1_5-eng.cfm").content ) for text in document.xpath("//table//th[@headers]/text()"): - code, name = text.split(" – ", 1) # non-breaking space + code, name = text.split("\xa0– ", 1) census_subdivision_type_names[code] = name.split(" / ", 1)[0] # Map OCD identifiers to census types. 
@@ -182,7 +182,7 @@ def get_definition(division_id, aggregation=False): @task def council_pages(): - """Prints scrapers' council page, or warns if it is missing or unneeded.""" + """Print scrapers' council page, or warns if it is missing or unneeded.""" for module, module_name, klass in modules_and_module_names_and_classes(): if klass.__bases__[0].__name__ == "CSVScraper": if hasattr(module, "COUNCIL_PAGE"): @@ -195,7 +195,7 @@ def council_pages(): @task def csv_list(): - """Lists scrapers with CSV data.""" + """List scrapers with CSV data.""" for _module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "csv_url"): print(f"{module_name}: {klass.csv_url}") @@ -203,7 +203,7 @@ def csv_list(): @task def csv_stale(): - """Lists scrapers with stale manual CSV data.""" + """List scrapers with stale manual CSV data.""" for _module, module_name, klass in modules_and_module_names_and_classes(): if hasattr(klass, "updated_at") and klass.updated_at < date.today() - timedelta(days=365): print(f"{module_name}: Created on {klass.updated_at} by {klass.contact_person}") @@ -211,7 +211,7 @@ def csv_stale(): @task def csv_error(): - """Notes corrections that CSV publishers should make.""" + """Note corrections that CSV publishers should make.""" for _module, module_name, klass in modules_and_module_names_and_classes(): if klass.__bases__[0].__name__ == "CSVScraper": if "_candidates" in module_name and hasattr(klass, "updated_at"): @@ -259,7 +259,7 @@ def csv_error(): @task def tidy(): - """Checks that modules are configured correctly.""" + """Check that modules are configured correctly.""" # Map OCD identifiers to styles of address. 
leader_styles = {} member_styles = {} @@ -294,7 +294,7 @@ def tidy(): else: jurisdiction_ids.add(jurisdiction_id) - expected = get_definition(division_id, bool(module_name.endswith("_municipalities"))) + expected = get_definition(division_id, aggregation=bool(module_name.endswith("_municipalities"))) # Ensure presence of url and styles of address. if division_id not in member_styles: @@ -346,7 +346,7 @@ def tidy(): @task def sources_and_assertions(): - """Checks that sources are attributed and assertions are made.""" + """Check that sources are attributed and assertions are made.""" for module_name in module_names(): path = os.path.join(module_name, "people.py") with codecs.open(path, "r", "utf-8") as f: @@ -363,7 +363,7 @@ def sources_and_assertions(): @task def validate_spreadsheet(url, identifier_header, geographic_name_header): - """Validates the identifiers, geographic names and geographic types in a spreadsheet.""" + """Validate the identifiers, geographic names and geographic types in a spreadsheet.""" sgc_to_id = {} for division in Division.all("ca", from_csv=ocd_division_csv): @@ -386,7 +386,7 @@ def validate_spreadsheet(url, identifier_header, geographic_name_header): def module_name_to_metadata(module_name): - """Copied from `reports.utils`.""" + # Copied from reports.utils module = importlib.import_module(module_name) for obj in module.__dict__.values(): division_id = getattr(obj, "division_id", None) diff --git a/utils.py b/utils.py index 5b3c3745..c401dd1e 100644 --- a/utils.py +++ b/utils.py @@ -9,7 +9,7 @@ from zipfile import ZipFile import agate -import agateexcel # noqa +import agateexcel # noqa: F401 import lxml.html import requests from lxml import etree @@ -17,11 +17,12 @@ from pupa.scrape import Jurisdiction, Organization, Person, Post, Scraper from requests.packages.urllib3.exceptions import InsecureRequestWarning -import patch # patch patches validictory # noqa +import patch # patch patches validictory # noqa: F401 
requests.packages.urllib3.disable_warnings(InsecureRequestWarning) CUSTOM_USER_AGENT = "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)" +DEFAULT_USER_AGENT = requests.utils.default_user_agent() CONTACT_DETAIL_TYPE_MAP = { "Address": "address", @@ -82,10 +83,7 @@ "Voice Mail": "legislature", "Work": "legislature", } -if os.getenv("SSL_VERIFY", False): - SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt" -else: - SSL_VERIFY = bool(os.getenv("SSL_VERIFY", False)) +SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt" if os.getenv("SSL_VERIFY", "") else False email_re = re.compile(r"([A-Za-z0-9._-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,})") @@ -113,21 +111,21 @@ def get_email(self, node, expression=".", *, error=True): Make sure that the node/expression is narrow enough to not capture a generic email address in the footer of the page, for example. """ - matches = [] # If the text would be split across multiple sub-tags. - for match in node.xpath(f'{expression}//*[contains(text(), "@")]'): - matches.append(match.text_content()) + matches = [match.text_content() for match in node.xpath(f'{expression}//*[contains(text(), "@")]')] # The text version is more likely to be correct, as it is more visible, # e.g. ca_bc has one `href` of `mailto:first.last.mla@leg.bc.ca`. - for match in node.xpath(f'{expression}//a[contains(@href, "mailto:")]'): - matches.append(unquote(match.attrib["href"])) + matches.extend( + unquote(match.attrib["href"]) for match in node.xpath(f'{expression}//a[contains(@href, "mailto:")]') + ) # Some emails are obfuscated by Cloudflare. - for match in node.xpath(f'{expression}//@href[contains(., "cdn-cgi/l/email-protection")]'): - matches.append(self._cloudflare_decode(match)) + matches.extend( + self._cloudflare_decode(match) + for match in node.xpath(f'{expression}//@href[contains(., "cdn-cgi/l/email-protection")]') + ) # If the node has no sub-tags. 
if not matches: - for match in node.xpath(f'{expression}//text()[contains(., "@")]'): - matches.append(match) + matches = list(node.xpath(f'{expression}//text()[contains(., "@")]')) if matches: for match in matches: match = email_re.search(match) @@ -200,7 +198,7 @@ def get(self, *args, **kwargs): def post(self, *args, **kwargs): return super().post(*args, verify=SSL_VERIFY, **kwargs) - def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_agent(), cookies=None, xml=False): + def lxmlize(self, url, encoding=None, *, user_agent=DEFAULT_USER_AGENT, cookies=None, xml=False): self.user_agent = user_agent response = self.get(url, cookies=cookies) @@ -210,12 +208,12 @@ def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_age try: text = response.text if xml: - text = text.replace('', "") # XXX ca_bc - page = etree.fromstring(text) + text = text.replace('', "") # special case: ca_bc + page = etree.fromstring(text) # noqa: S320 else: page = lxml.html.fromstring(text) - except etree.ParserError: - raise etree.ParserError(f"Document is empty {url}") + except etree.ParserError as e: + raise etree.ParserError(f"Document is empty {url}") from e meta = page.xpath('//meta[@http-equiv="refresh"]') if meta: @@ -226,12 +224,12 @@ def lxmlize(self, url, encoding=None, user_agent=requests.utils.default_user_age page.make_links_absolute(url) return page - def csv_reader(self, url, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs): + def csv_reader(self, url, *, delimiter=",", header=False, encoding=None, skip_rows=0, data=None, **kwargs): if not data: result = urlparse(url) if result.scheme == "ftp": data = StringIO() - ftp = FTP(result.hostname) + ftp = FTP(result.hostname) # noqa: S321 ftp.login(result.username, result.password) ftp.retrbinary(f"RETR {result.path}", lambda block: data.write(block.decode("utf-8"))) ftp.quit() @@ -343,8 +341,9 @@ class CSVScraper(CanadianScraper): def header_converter(self, s): 
""" - Normalizes a column header name. By default, lowercases it and replaces - underscores with spaces (e.g. because Esri fields can't contain spaces). + Normalize a column header name. + + By default, lowercase it and replace underscores with spaces (e.g. because Esri fields can't contain spaces). """ header = clean_string(s.lower().replace("_", " ")) if hasattr(self, "locale"): @@ -353,8 +352,9 @@ def header_converter(self, s): def is_valid_row(self, row): """ - Returns whether the row should be imported. By default, skips empty rows - and rows in which a name component is "Vacant". + Return whether the row should be imported. + + By default, skip empty rows and rows in which a name component is "Vacant". """ empty = ("", "Vacant") if not any(row.values()): @@ -405,7 +405,7 @@ def scrape(self): # ca_qc_laval: "maire et president du comite executif", "conseiller et membre du comite executif" # ca_qc_montreal: "Conseiller de la ville; Membre…", "Maire d'arrondissement\nMembre…" if row.get("primary role"): - row["primary role"] = re.split(r"(?: (?:et)\b|[;\n])", row["primary role"], 1)[0].strip() + row["primary role"] = re.split(r"(?: (?:et)\b|[;\n])", row["primary role"], maxsplit=1)[0].strip() if not self.is_valid_row(row): continue @@ -613,7 +613,7 @@ def get_organizations(self): class CanadianPerson(Person): def __init__(self, *, name, district, role, **kwargs): - """Cleans a person's name, district, role and any other attributes.""" + """Clean a person's name, district, role and any other attributes.""" name = clean_name(name) district = clean_string(district).replace("&", "and") role = clean_string(role) @@ -627,7 +627,7 @@ def __init__(self, *, name, district, role, **kwargs): super().__init__(name=name, district=district, role=role, **kwargs) def __setattr__(self, name, value): - """Corrects gender values.""" + """Correct gender values.""" if name == "gender": value = value.lower() if value == "m": @@ -637,7 +637,7 @@ def __setattr__(self, name, value): 
super().__setattr__(name, value) def add_link(self, url, *, note=""): - """Corrects links without schemes or domains.""" + """Correct links without schemes or domains.""" url = url.strip() if url.startswith("www."): url = f"http://{url}" @@ -646,7 +646,7 @@ def add_link(self, url, *, note=""): self.links.append({"note": note, "url": url}) def add_contact(self, type, value, note="", area_code=None): - """Cleans and adds a contact detail to the person's membership.""" + """Clean and add a contact detail to the person's membership.""" if type: type = clean_string(type) if note: @@ -686,10 +686,7 @@ def clean_telephone_number(self, s, area_code=None): return s def clean_address(self, s): - """ - Corrects the postal code, abbreviates the province or territory name, and - formats the last line of the address. - """ + """Correct the postal code, abbreviate the province or territory name, and format the last line of the address.""" # The letter "O" instead of the numeral "0" is a common mistake. s = re.sub( r"\b[A-Z][O0-9][A-Z]\s?[O0-9][A-Z][O0-9]\b", lambda x: x.group(0).replace("O", "0"), clean_string(s) @@ -720,7 +717,7 @@ def clean_address(self, s): table = { ord("\u200b"): " ", # zero-width space ord("’"): "'", - ord("\xc2"): " ", # non-breaking space if mixing ISO-8869-1 into UTF-8 + ord("\xc2"): "\xa0", # non-breaking space if mixing ISO-8869-1 into UTF-8 } @@ -751,7 +748,7 @@ def clean_type_id(type_id): # "Spaces should be converted to underscores." type_id = re.sub(r" ", "_", type_id) # "All invalid characters should be converted to tilde (~)." 
- return re.sub(r"[^\w.~-]", "~", type_id, re.UNICODE) + return re.sub(r"[^\w.~-]", "~", type_id, flags=re.UNICODE) def clean_french_prepositions(s): From 6c5885f241bfb2c99e675d334261d44f9e83e2b8 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:07:33 -0400 Subject: [PATCH 62/66] fix: Don't think intention of 4bc9ded was to set SSL_VERIFY to False by default --- utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils.py b/utils.py index c401dd1e..3a5ded50 100644 --- a/utils.py +++ b/utils.py @@ -83,7 +83,7 @@ "Voice Mail": "legislature", "Work": "legislature", } -SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt" if os.getenv("SSL_VERIFY", "") else False +SSL_VERIFY = "/usr/lib/ssl/certs/ca-certificates.crt" if os.getenv("SSL_VERIFY", "") else True email_re = re.compile(r"([A-Za-z0-9._-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]{2,})") From 0d9bcec744359da6946daed9a9213e19d1c23948 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 16 Sep 2024 18:16:43 -0400 Subject: [PATCH 63/66] chore: Add comments to ruff configuration --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85ef59fa..28cf62c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ ignore = [ "DTZ", "E501", "ERA001", # commented-out code - "PLR2004", + "PLR2004", # magic "PLW2901", "PTH", "RUF012", @@ -22,7 +22,7 @@ ignore = [ "S113", # timeout "TRY003", # errors - # Should be refactored: + # To fix: "BLE001", # except Exception "S110", # except pass "TRY002", # raise Exception From db3825c356aa16dc3ed591f52316f826528f8882 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 21 Sep 2024 02:51:15 -0400 Subject: [PATCH 64/66] fix: ca_on_toronto --- ca_on_toronto/people.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/ca_on_toronto/people.py b/ca_on_toronto/people.py index 010a9867..02661961 100644 --- a/ca_on_toronto/people.py +++ b/ca_on_toronto/people.py @@ -12,3 +12,6 @@ class TorontoPersonScraper(CSVScraper): "councillor_ mckelvie@toronto.ca": "councillor_mckelvie@toronto.ca", }, } + + def is_valid_row(self, row): + return row["first name"] != "None" and row["last name"] != "None" From a7d4dd2d3673ebd5dbfe60c54aa55918e689e4c9 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 21 Sep 2024 03:07:34 -0400 Subject: [PATCH 65/66] chore: Use Scraper.get instead of requests.get --- ca_mb_winnipeg/people.py | 4 +--- ca_nb_moncton/people.py | 4 +--- ca_on_caledon/people.py | 4 +--- ca_on_chatham_kent/people.py | 3 +-- ca_on_windsor/people.py | 4 +--- ca_qc_brossard/people.py | 4 +--- disabled/ca_nl_municipalities/people.py | 3 +-- disabled/ca_ns_municipalities/people.py | 3 +-- disabled/ca_sk_municipalities/people.py | 3 +-- disabled/ca_yt_municipalities/people.py | 3 +-- 10 files changed, 10 insertions(+), 25 deletions(-) diff --git a/ca_mb_winnipeg/people.py b/ca_mb_winnipeg/people.py index 16e2af66..c3a2daa7 100644 --- a/ca_mb_winnipeg/people.py +++ b/ca_mb_winnipeg/people.py @@ -1,7 +1,5 @@ import json -import requests - from utils import CanadianPerson as Person from utils import CanadianScraper @@ -12,7 +10,7 @@ class WinnipegPersonScraper(CanadianScraper): def scrape(self): # from https://data.winnipeg.ca/Council-Services/Council-Data/r4tk-7dip/about_data api_url = "https://data.winnipeg.ca/resource/r4tk-7dip.json" - data = json.loads(requests.get(api_url).content) + data = json.loads(self.get(api_url).content) assert len(data), "No councillors found via API" page = self.lxmlize(COUNCIL_PAGE) diff --git a/ca_nb_moncton/people.py b/ca_nb_moncton/people.py index e13a7aee..aa47ed0f 100644 --- a/ca_nb_moncton/people.py +++ b/ca_nb_moncton/people.py @@ -1,8 +1,6 @@ import json from collections import defaultdict -import requests 
- from utils import CanadianPerson as Person from utils import CanadianScraper @@ -13,7 +11,7 @@ class MonctonPersonScraper(CanadianScraper): def scrape(self): seat_numbers = defaultdict(int) - data = json.loads(requests.get(API_URL).content)["features"] + data = json.loads(self.get(API_URL).content)["features"] assert len(data), "No councillors found" for item in data: diff --git a/ca_on_caledon/people.py b/ca_on_caledon/people.py index cbf51335..837019c1 100644 --- a/ca_on_caledon/people.py +++ b/ca_on_caledon/people.py @@ -1,7 +1,5 @@ import re -import requests - from utils import CanadianPerson as Person from utils import CanadianScraper @@ -33,7 +31,7 @@ def scrape(self): # phone numbers populated by JS request contact_num = page.xpath('//div[@class="contactBody"]/div/@id')[0].replace("contactEntry_", "") - contact_data = requests.get( + contact_data = self.get( f"https://www.caledon.ca//Modules/Contact/services/GetContactHTML.ashx?isMobile=false¶m={contact_num}&lang=en" ).text voice = re.findall(r"(?<=tel://)\d+(?=\">)", contact_data) diff --git a/ca_on_chatham_kent/people.py b/ca_on_chatham_kent/people.py index 5ddd2165..c5a02d6a 100644 --- a/ca_on_chatham_kent/people.py +++ b/ca_on_chatham_kent/people.py @@ -1,7 +1,6 @@ import re from collections import defaultdict -import requests from lxml import etree from utils import CanadianPerson as Person @@ -19,7 +18,7 @@ def scrape(self): headers = {"content-type": "text/xml"} body = 'councillorsByWard50' - response = requests.post(url=COUNCIL_DATA_URL, data=body, headers=headers) + response = self.post(url=COUNCIL_DATA_URL, data=body, headers=headers) page = etree.fromstring(response.content) # noqa: S320 namespace = {"z": "#RowsetSchema", "rs": "urn:schemas-microsoft-com:rowset"} diff --git a/ca_on_windsor/people.py b/ca_on_windsor/people.py index 707da754..7ee6649c 100644 --- a/ca_on_windsor/people.py +++ b/ca_on_windsor/people.py @@ -1,7 +1,5 @@ import json -import requests - from utils import CanadianPerson 
as Person from utils import CanadianScraper @@ -12,7 +10,7 @@ class WindsorPersonScraper(CanadianScraper): def scrape(self): page = self.lxmlize(COUNCIL_PAGE) data_url = page.xpath('//comment()[contains(., "SITE JS")]/following-sibling::script/@src')[0] - data = json.loads(requests.get(data_url).text.split(" = ")[1]) + data = json.loads(self.get(data_url).text.split(" = ")[1]) nav_items = [] for item in data: if item["RollupType"] == "SidebarNavigation": diff --git a/ca_qc_brossard/people.py b/ca_qc_brossard/people.py index d82df70d..348bae07 100644 --- a/ca_qc_brossard/people.py +++ b/ca_qc_brossard/people.py @@ -1,8 +1,6 @@ import json import re -import requests - from utils import CanadianPerson as Person from utils import CanadianScraper @@ -33,7 +31,7 @@ def get_children(parent_id, element_dict): return return_list # The whole page is rendered in javascript and stored as a massive json object - page = requests.get(DATA_PAGE) + page = self.get(DATA_PAGE) page = json.loads(page.content) containers = page["content"].values() for container in containers: diff --git a/disabled/ca_nl_municipalities/people.py b/disabled/ca_nl_municipalities/people.py index 73262f79..499b0e73 100644 --- a/disabled/ca_nl_municipalities/people.py +++ b/disabled/ca_nl_municipalities/people.py @@ -3,7 +3,6 @@ import subprocess import tempfile -import requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -17,7 +16,7 @@ def scrape(self): page = self.lxmlize(COUNCIL_PAGE) url = page.xpath('//a[contains(text(),"Municipal Directory")]/@href')[0] - response = requests.get(url).content + response = self.get(url).content with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: pdf.write(response) diff --git a/disabled/ca_ns_municipalities/people.py b/disabled/ca_ns_municipalities/people.py index 9ed12739..2d8b9fbe 100644 --- a/disabled/ca_ns_municipalities/people.py +++ b/disabled/ca_ns_municipalities/people.py @@ -3,7 +3,6 @@ import subprocess import 
tempfile -import requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -14,7 +13,7 @@ class NovaScotiaMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = requests.get(COUNCIL_PAGE).content + response = self.get(COUNCIL_PAGE).content with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: pdf.write(response) diff --git a/disabled/ca_sk_municipalities/people.py b/disabled/ca_sk_municipalities/people.py index 6f96d1ce..b162995d 100644 --- a/disabled/ca_sk_municipalities/people.py +++ b/disabled/ca_sk_municipalities/people.py @@ -3,7 +3,6 @@ import subprocess import tempfile -import requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -15,7 +14,7 @@ class SaskatchewanMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = requests.get(COUNCIL_PAGE).read() + response = self.get(COUNCIL_PAGE).read() with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: pdf.write(response) diff --git a/disabled/ca_yt_municipalities/people.py b/disabled/ca_yt_municipalities/people.py index 2cd79127..7c7ba0af 100644 --- a/disabled/ca_yt_municipalities/people.py +++ b/disabled/ca_yt_municipalities/people.py @@ -3,7 +3,6 @@ import subprocess import tempfile -import requests from pupa.scrape import Organization from utils import CanadianPerson as Person @@ -14,7 +13,7 @@ class YukonMunicipalitiesPersonScraper(CanadianScraper): def scrape(self): - response = requests.get(COUNCIL_PAGE).content + response = self.get(COUNCIL_PAGE).content with tempfile.NamedTemporaryFile(delete_on_close=False) as pdf: pdf.write(response) From a370b4cfa34a8792a6c56bc28adff5a46e97c911 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:15:48 +0000 Subject: [PATCH 66/66] [pre-commit.ci] pre-commit autoupdate (#433) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.6.3 → v0.6.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.6.3...v0.6.9) - [github.com/astral-sh/uv-pre-commit: 0.4.4 → 0.4.18](https://github.com/astral-sh/uv-pre-commit/compare/0.4.4...0.4.18) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5e135c6f..2aea3241 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,12 +5,12 @@ default_language_version: python: python3.10 repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.6.3 + rev: v0.6.9 hooks: - id: ruff - id: ruff-format - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.4.4 + rev: 0.4.18 hooks: - id: pip-compile name: pip-compile requirements.in