diff --git a/ca_bc/people.py b/ca_bc/people.py
index 356e4379..68a7984c 100644
--- a/ca_bc/people.py
+++ b/ca_bc/people.py
@@ -1,76 +1,75 @@
-import re
+import json
+from textwrap import dedent
 
 from utils import CanadianPerson as Person
 from utils import CanadianScraper
 
-COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'"
+COUNCIL_PAGE = "https://www.leg.bc.ca/members"
+GRAPHQL_URL = "https://lims.leg.bc.ca/graphql"
+IMAGE_BASE_URL = "https://lims.leg.bc.ca/public"
+# Sent as $parliamentId. NOTE(review): presumably must be bumped after each election - confirm.
+PARLIAMENT_ID = 43
+
+# GraphQL document: image, constituency, member and party of each active member of one parliament.
+MEMBERS_QUERY = dedent("""\
+    query GetMLAsByConstituency($parliamentId: Int!) {
+      allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {
+        nodes {
+          image: imageBySmallImageId {
+            path
+            description
+            __typename
+          }
+          constituency: constituencyByConstituencyId {
+            name
+            __typename
+          }
+          member: memberByMemberId {
+            firstName
+            lastName
+            __typename
+          }
+          isCounsel
+          isDoctor
+          isHonourable
+          party: partyByPartyId {
+            name
+            abbreviation
+            __typename
+          }
+          nodeId
+          __typename
+        }
+        __typename
+      }
+    }""")
 
 
 class BritishColumbiaPersonScraper(CanadianScraper):
     def scrape(self):
-        parties = {
-            "BC NDP": "New Democratic Party of British Columbia",
-            "BC Liberal Party": "British Columbia Liberal Party",
-        }
+        """Yield one Person per member node returned by the lims.leg.bc.ca GraphQL API."""
+        response = self.post(
+            url=GRAPHQL_URL,
+            json={"query": MEMBERS_QUERY, "variables": {"parliamentId": PARLIAMENT_ID}},
+        )
+        data = json.loads(response.content.decode("utf-8"))
+        # A GraphQL response reports failures in an "errors" list and may carry a null
+        # "data"; raise with that detail instead of failing on the lookup below.
+        if data.get("errors"):
+            raise RuntimeError(f"GraphQL query failed: {data['errors']}")
+        members = data["data"]["allMemberParliaments"]["nodes"]
 
-        page = self.lxmlize(COUNCIL_PAGE, xml=True)
-
-        nsmap = {"d": "http://schemas.microsoft.com/ado/2007/08/dataservices"}
-        members = page.xpath("//d:Cells", namespaces=nsmap)
         assert len(members), "No members found"
         for member in members:
-            url = member.xpath('./d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()', namespaces=nsmap)[
-                0
-            ]
-            if "vacant" in url.lower():
-                continue
-            page = self.lxmlize(url)
+            district = member["constituency"]["name"]
+            name = member["member"]["firstName"] + " " + member["member"]["lastName"]
+            party = member["party"]["name"]
 
-            name = (
-                page.xpath('//div[contains(@class, "BCLASS-pagetitle")]//h3/text()')[0]
-                .replace("Wm.", "")
-                .replace(", Q.C.", "")
-                .replace(", K.C.", "")
-                .strip()
-            )
-            district, party = cleanup_list(page.xpath('//div[@id="MinisterTitle"]/following-sibling::text()'))
-            party = parties.get(party, party)
             p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
             p.add_source(COUNCIL_PAGE)
-            p.add_source(url)
-
-            p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]
-
-            email = page.xpath('//div[@class="convertToEmail"]//text()')[0].strip()
-            if "#" in email:
-                email = email.split("#")[0]
-            if email:
-                p.add_contact("email", email)
-
-            office = ", ".join(cleanup_list(page.xpath('//h4[contains(text(), "Office:")]/ancestor::div/text()')))
-            office = re.sub(r"\s{2,}", " ", office)
-            p.add_contact("address", office, "legislature")
-
-            constituency = ", ".join(
-                cleanup_list(page.xpath('//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'))
-            )
-            constituency = re.sub(r"\s{2,}", " ", constituency).split(", Phone")[0]
-            p.add_contact("address", constituency, "constituency")
-
-            phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()'))
-
-            office_phone = phones[0]
-            p.add_contact("voice", office_phone, "legislature")
-            if len(phones) > 1:
-                constituency_phone = phones[1]
-                p.add_contact("voice", constituency_phone, "constituency")
-
-            roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip()
-            if roles:
-                p.extras["roles"] = [roles]
+            # NOTE(review): the image is a related record and may presumably come back
+            # null for a member with no photo - confirm; skip it rather than crash.
+            if member["image"]:
+                p.image = IMAGE_BASE_URL + member["image"]["path"]
 
             yield p
-
-
-def cleanup_list(dirty_list):
-    return list(filter(None, (x.strip() for x in dirty_list)))
diff --git a/patch.py b/patch.py
index d681e08b..4540cfc8 100644
--- a/patch.py
+++ b/patch.py
@@ -123,7 +123,7 @@
     r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|'
     r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
     r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|"
-    r"Ch'ng|Prud'homme|"
+    r"A'aliya|Ch'ng|Prud'homme|"
     r"D!ONNE|IsaBelle|Ya'ara"
     r")"
 )
@@ -131,7 +131,7 @@
 # Name components can be joined by apostrophes, hyphens or spaces.
 person_schema["properties"]["name"]["pattern"] = re.compile(
     r"\A"
-    r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
+    r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
     r"(?:" + name_fragment + r"(?:'|-| - | )"
     r")+" + name_fragment + r"\Z"
 )
diff --git a/utils.py b/utils.py
index 59561840..52ef6e34 100644
--- a/utils.py
+++ b/utils.py
@@ -739,9 +739,19 @@ def clean_string(s):
 
 
 def clean_name(s):
-    return honorific_suffix_re.sub(
-        "", honorific_prefix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
-    )
+    """Return *s* as a cleaned-up name.
+
+    The value is stringified and translated through ``table``; every
+    ``whitespace_re`` match becomes one space and the ends are stripped; then
+    ``honorific_suffix_re`` matches are removed. ``honorific_prefix_re`` matches
+    are removed only if the result contains more than one space.
+    """
+    name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
+    # Strip a leading honorific only when more than one space remains; in a shorter
+    # name the leading word may be the given name, as in "Hon Chan".
+    if name.count(" ") > 1:
+        return honorific_prefix_re.sub("", name)
+    return name
 
 
 def clean_type_id(type_id):