From 6ceb72bb318e91703a63517940050b4e9e0d38e3 Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Mon, 18 Nov 2024 14:47:06 -0500 Subject: [PATCH 1/5] Update people.py --- ca_bc/people.py | 78 ++++++++++--------------------------------------- 1 file changed, 15 insertions(+), 63 deletions(-) diff --git a/ca_bc/people.py b/ca_bc/people.py index 356e4379..e8f00f4a 100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -1,76 +1,28 @@ -import re +import json +import requests from utils import CanadianPerson as Person from utils import CanadianScraper -COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'" +COUNCIL_PAGE = "https://www.leg.bc.ca/members" +query = """ +query GetMLAsByConstituency($parliamentId: Int!) 
{\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n} +""" +variables = {"parliamentId": 43} class BritishColumbiaPersonScraper(CanadianScraper): def scrape(self): - parties = { - "BC NDP": "New Democratic Party of British Columbia", - "BC Liberal Party": "British Columbia Liberal Party", - } - - page = self.lxmlize(COUNCIL_PAGE, xml=True) - - nsmap = {"d": "http://schemas.microsoft.com/ado/2007/08/dataservices"} - members = page.xpath("//d:Cells", namespaces=nsmap) - assert len(members), "No members found" + response = requests.post(url="https://lims.leg.bc.ca/graphql", json={"query": query, "variables": variables}) + data = json.loads(response.content.decode("utf-8")) + members = data["data"]["allMemberParliaments"]["nodes"] for member in members: - url = member.xpath('./d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()', namespaces=nsmap)[ - 0 - ] - if "vacant" in url.lower(): - continue - page = self.lxmlize(url) + image = "https://lims.leg.bc.ca/public" + member["image"]["path"] + district = member["constituency"]["name"] + name = member["member"]["firstName"] + " " + member["member"]["lastName"] + party = member["party"]["name"] - name = ( - page.xpath('//div[contains(@class, "BCLASS-pagetitle")]//h3/text()')[0] - .replace("Wm.", "") - .replace(", Q.C.", "") - .replace(", K.C.", "") - .strip() - ) - district, party = cleanup_list(page.xpath('//div[@id="MinisterTitle"]/following-sibling::text()')) - party = parties.get(party, party) - p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party) + p = 
Person(primary_org="legislature", name=name, district=district, role="MLA", party=party, image=image) p.add_source(COUNCIL_PAGE) - p.add_source(url) - - p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0] - - email = page.xpath('//div[@class="convertToEmail"]//text()')[0].strip() - if "#" in email: - email = email.split("#")[0] - if email: - p.add_contact("email", email) - - office = ", ".join(cleanup_list(page.xpath('//h4[contains(text(), "Office:")]/ancestor::div/text()'))) - office = re.sub(r"\s{2,}", " ", office) - p.add_contact("address", office, "legislature") - - constituency = ", ".join( - cleanup_list(page.xpath('//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()')) - ) - constituency = re.sub(r"\s{2,}", " ", constituency).split(", Phone")[0] - p.add_contact("address", constituency, "constituency") - - phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()')) - - office_phone = phones[0] - p.add_contact("voice", office_phone, "legislature") - if len(phones) > 1: - constituency_phone = phones[1] - p.add_contact("voice", constituency_phone, "constituency") - - roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip() - if roles: - p.extras["roles"] = [roles] yield p - - -def cleanup_list(dirty_list): - return list(filter(None, (x.strip() for x in dirty_list))) From adff484f47c5963023a44efacfeadad738046dfb Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Mon, 18 Nov 2024 14:48:24 -0500 Subject: [PATCH 2/5] Update people.py --- ca_bc/people.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ca_bc/people.py b/ca_bc/people.py index e8f00f4a..dfa1bef4 100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -16,6 +16,7 @@ def scrape(self): response = requests.post(url="https://lims.leg.bc.ca/graphql", json={"query": query, "variables": variables}) data = json.loads(response.content.decode("utf-8")) members = data["data"]["allMemberParliaments"]["nodes"] + assert len(members), "No members 
found" for member in members: image = "https://lims.leg.bc.ca/public" + member["image"]["path"] district = member["constituency"]["name"] From 4c6e6d493043f43599c5c434080caef058309596 Mon Sep 17 00:00:00 2001 From: bzhangjma Date: Mon, 18 Nov 2024 15:09:47 -0500 Subject: [PATCH 3/5] Update people.py --- ca_bc/people.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ca_bc/people.py b/ca_bc/people.py index dfa1bef4..2eed34b5 100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -5,15 +5,15 @@ from utils import CanadianScraper COUNCIL_PAGE = "https://www.leg.bc.ca/members" -query = """ -query GetMLAsByConstituency($parliamentId: Int!) {\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n} -""" -variables = {"parliamentId": 43} +JSON = { + "query": "query GetMLAsByConstituency($parliamentId: Int!) 
{\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n}", + "variables": {"parliamentId": 43}, +} class BritishColumbiaPersonScraper(CanadianScraper): def scrape(self): - response = requests.post(url="https://lims.leg.bc.ca/graphql", json={"query": query, "variables": variables}) + response = requests.post(url="https://lims.leg.bc.ca/graphql", json=JSON) data = json.loads(response.content.decode("utf-8")) members = data["data"]["allMemberParliaments"]["nodes"] assert len(members), "No members found" From 6de91eeaf3d6620590310fc2d95f0a1f587e47b6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:56:58 -0500 Subject: [PATCH 4/5] ca_bc: Use multiline string for readability --- ca_bc/people.py | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/ca_bc/people.py b/ca_bc/people.py index 2eed34b5..68a7984c 100644 --- a/ca_bc/people.py +++ b/ca_bc/people.py @@ -1,21 +1,53 @@ import json -import requests +from textwrap import dedent from utils import CanadianPerson as Person from utils import CanadianScraper COUNCIL_PAGE = "https://www.leg.bc.ca/members" -JSON = { - "query": "query GetMLAsByConstituency($parliamentId: Int!) 
{\n allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {\n nodes {\n image: imageBySmallImageId {\n path\n description\n __typename\n }\n constituency: constituencyByConstituencyId {\n name\n __typename\n }\n member: memberByMemberId {\n firstName\n lastName\n __typename\n }\n isCounsel\n isDoctor\n isHonourable\n party: partyByPartyId {\n name\n abbreviation\n __typename\n }\n nodeId\n __typename\n }\n __typename\n }\n}", - "variables": {"parliamentId": 43}, -} class BritishColumbiaPersonScraper(CanadianScraper): def scrape(self): - response = requests.post(url="https://lims.leg.bc.ca/graphql", json=JSON) + response = self.post(url="https://lims.leg.bc.ca/graphql", json={ + "query": dedent("""\ + query GetMLAsByConstituency($parliamentId: Int!) { + allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) { + nodes { + image: imageBySmallImageId { + path + description + __typename + } + constituency: constituencyByConstituencyId { + name + __typename + } + member: memberByMemberId { + firstName + lastName + __typename + } + isCounsel + isDoctor + isHonourable + party: partyByPartyId { + name + abbreviation + __typename + } + nodeId + __typename + } + __typename + } + }""" + ), + "variables": {"parliamentId": 43}, + }) data = json.loads(response.content.decode("utf-8")) members = data["data"]["allMemberParliaments"]["nodes"] + assert len(members), "No members found" for member in members: image = "https://lims.leg.bc.ca/public" + member["image"]["path"] From 05054d96cb9a91c2313ee8546b508028ee2433f5 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:57:53 -0500 Subject: [PATCH 5/5] ca_bc: Fix validation to allow "Hon Chan" and "A'aliya" --- patch.py | 4 ++-- utils.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/patch.py b/patch.py index d681e08b..4540cfc8 100644 --- a/patch.py +++ b/patch.py @@ -123,7 +123,7 @@
r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|' r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|" r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|" - r"Ch'ng|Prud'homme|" + r"A'aliya|Ch'ng|Prud'homme|" r"D!ONNE|IsaBelle|Ya'ara" r")" ) @@ -131,7 +131,7 @@ # Name components can be joined by apostrophes, hyphens or spaces. person_schema["properties"]["name"]["pattern"] = re.compile( r"\A" - r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)" + r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)" r"(?:" + name_fragment + r"(?:'|-| - | )" r")+" + name_fragment + r"\Z" ) diff --git a/utils.py b/utils.py index 59561840..52ef6e34 100644 --- a/utils.py +++ b/utils.py @@ -739,9 +739,10 @@ def clean_string(s): def clean_name(s): - return honorific_suffix_re.sub( - "", honorific_prefix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip()) - ) + name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip()) + if name.count(" ") > 1: + return honorific_prefix_re.sub("", name) # Avoid truncating names like "Hon Chan" + return name def clean_type_id(type_id):