Skip to content

Commit

Permalink
Merge branch 'ca_bc_fix_2'
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Nov 18, 2024
2 parents 4f8f4a7 + 05054d9 commit d1f8d1e
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 66 deletions.
107 changes: 46 additions & 61 deletions ca_bc/people.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,61 @@
import re
import json
from textwrap import dedent

from utils import CanadianPerson as Person
from utils import CanadianScraper

COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'"
COUNCIL_PAGE = "https://www.leg.bc.ca/members"


class BritishColumbiaPersonScraper(CanadianScraper):
    """Scrape the active members (MLAs) of British Columbia's legislature.

    Member data comes from a single GraphQL request to lims.leg.bc.ca rather
    than from per-member HTML pages, so only the name, district, party and
    image of each member are collected.
    """

    def scrape(self):
        """Yield one ``Person`` for each active member of parliament id 43.

        Raises:
            AssertionError: if the GraphQL response lists no members.
        """
        # GraphQL ignores insignificant whitespace, so the query is indented
        # for readability and dedented before being sent.
        query = dedent("""\
            query GetMLAsByConstituency($parliamentId: Int!) {
              allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {
                nodes {
                  image: imageBySmallImageId {
                    path
                    description
                    __typename
                  }
                  constituency: constituencyByConstituencyId {
                    name
                    __typename
                  }
                  member: memberByMemberId {
                    firstName
                    lastName
                    __typename
                  }
                  isCounsel
                  isDoctor
                  isHonourable
                  party: partyByPartyId {
                    name
                    abbreviation
                    __typename
                  }
                  nodeId
                  __typename
                }
                __typename
              }
            }""")
        response = self.post(
            url="https://lims.leg.bc.ca/graphql",
            json={"query": query, "variables": {"parliamentId": 43}},
        )
        data = json.loads(response.content.decode("utf-8"))
        members = data["data"]["allMemberParliaments"]["nodes"]

        # Fail loudly if the endpoint returns an empty list, instead of
        # silently yielding nobody.
        assert len(members), "No members found"
        for member in members:
            # Image paths in the response are relative to the public file root.
            image = "https://lims.leg.bc.ca/public" + member["image"]["path"]
            district = member["constituency"]["name"]
            name = member["member"]["firstName"] + " " + member["member"]["lastName"]
            party = member["party"]["name"]

            p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party, image=image)
            p.add_source(COUNCIL_PAGE)

            yield p


def cleanup_list(dirty_list):
    """Return the stripped, non-empty strings from *dirty_list*, in order.

    Each item has its leading and trailing whitespace removed; items that are
    empty after stripping are dropped.
    """
    stripped = (item.strip() for item in dirty_list)
    return [item for item in stripped if item]
4 changes: 2 additions & 2 deletions patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@
r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|'
r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|"
r"Ch'ng|Prud'homme|"
r"A'aliya|Ch'ng|Prud'homme|"
r"D!ONNE|IsaBelle|Ya'ara"
r")"
)

# Name components can be joined by apostrophes, hyphens or spaces.
# The negative lookahead rejects names that begin with a title or honorific.
# "Hon" is intentionally not in that list, so that a name whose first
# component is "Hon" (e.g. "Hon Chan") passes validation.
person_schema["properties"]["name"]["pattern"] = re.compile(
    r"\A"
    r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
    r"(?:" + name_fragment + r"(?:'|-| - | )"
    r")+" + name_fragment + r"\Z"
)
Expand Down
7 changes: 4 additions & 3 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,9 +739,10 @@ def clean_string(s):


def clean_name(s):
    """Return *s* normalized for use as a person's name.

    The value is converted to ``str``, passed through the module's
    translation ``table``, has every ``whitespace_re`` match replaced by a
    single space, and is stripped. Matches of ``honorific_suffix_re`` are then
    removed. Matches of ``honorific_prefix_re`` are removed only when the
    result contains more than one space, i.e. at least three space-separated
    parts.
    """
    name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
    # With two parts or fewer, a leading "Hon" is more likely a given name than
    # an honorific; stripping it would truncate names like "Hon Chan".
    if name.count(" ") > 1:
        return honorific_prefix_re.sub("", name)
    return name


def clean_type_id(type_id):
Expand Down

0 comments on commit d1f8d1e

Please sign in to comment.