Merge branch 'ca_bc_fix_2'

opencivicdata · Nov 18, 2024 · d1f8d1e
2 parents 4f8f4a7 + 05054d9
commit d1f8d1e
Show file tree

Hide file tree

Showing 3 changed files with 52 additions and 66 deletions.
diff --git a/ca_bc/people.py b/ca_bc/people.py
@@ -1,76 +1,61 @@
-import re
+import json
+from textwrap import dedent
 
 from utils import CanadianPerson as Person
 from utils import CanadianScraper
 
-COUNCIL_PAGE = "https://www.leg.bc.ca/_api/search/query?querytext='(contentclass:sts_listitem%20OR%20IsDocument:True)%20SPSiteUrl:/content%20ListId:8ecafcaa-2bf9-4434-a60c-3663a9afd175%20MLAActiveOWSBOOL:1%20-LastNameOWSTEXT:Vacant'&selectproperties='Picture1OWSIMGE,Title,Path'&&sortlist='LastNameSort:ascending'&rowlimit=100&QueryTemplatePropertiesUrl='spfile://webroot/queryparametertemplate.xml'"
+COUNCIL_PAGE = "https://www.leg.bc.ca/members"
 
 
 class BritishColumbiaPersonScraper(CanadianScraper):
     def scrape(self):
-        parties = {
-            "BC NDP": "New Democratic Party of British Columbia",
-            "BC Liberal Party": "British Columbia Liberal Party",
-        }
+        response = self.post(url="https://lims.leg.bc.ca/graphql", json={
+            "query": dedent("""\
+                query GetMLAsByConstituency($parliamentId: Int!) {
+                  allMemberParliaments(condition: {parliamentId: $parliamentId, active: true}) {
+                    nodes {
+                      image: imageBySmallImageId {
+                        path
+                        description
+                        __typename
+                      }
+                      constituency: constituencyByConstituencyId {
+                        name
+                        __typename
+                      }
+                      member: memberByMemberId {
+                        firstName
+                        lastName
+                        __typename
+                      }
+                      isCounsel
+                      isDoctor
+                      isHonourable
+                      party: partyByPartyId {
+                        name
+                        abbreviation
+                        __typename
+                      }
+                      nodeId
+                      __typename
+                    }
+                    __typename
+                  }
+                }"""
+            ),
+            "variables": {"parliamentId": 43},
+        })
+        data = json.loads(response.content.decode("utf-8"))
+        members = data["data"]["allMemberParliaments"]["nodes"]
 
-        page = self.lxmlize(COUNCIL_PAGE, xml=True)
-
-        nsmap = {"d": "http://schemas.microsoft.com/ado/2007/08/dataservices"}
-        members = page.xpath("//d:Cells", namespaces=nsmap)
         assert len(members), "No members found"
         for member in members:
-            url = member.xpath('./d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()', namespaces=nsmap)[
-                0
-            ]
-            if "vacant" in url.lower():
-                continue
-            page = self.lxmlize(url)
+            image = "https://lims.leg.bc.ca/public" + member["image"]["path"]
+            district = member["constituency"]["name"]
+            name = member["member"]["firstName"] + " " + member["member"]["lastName"]
+            party = member["party"]["name"]
 
-            name = (
-                page.xpath('//div[contains(@class, "BCLASS-pagetitle")]//h3/text()')[0]
-                .replace("Wm.", "")
-                .replace(", Q.C.", "")
-                .replace(", K.C.", "")
-                .strip()
-            )
-            district, party = cleanup_list(page.xpath('//div[@id="MinisterTitle"]/following-sibling::text()'))
-            party = parties.get(party, party)
-            p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party)
+            p = Person(primary_org="legislature", name=name, district=district, role="MLA", party=party, image=image)
             p.add_source(COUNCIL_PAGE)
-            p.add_source(url)
-
-            p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]
-
-            email = page.xpath('//div[@class="convertToEmail"]//text()')[0].strip()
-            if "#" in email:
-                email = email.split("#")[0]
-            if email:
-                p.add_contact("email", email)
-
-            office = ", ".join(cleanup_list(page.xpath('//h4[contains(text(), "Office:")]/ancestor::div/text()')))
-            office = re.sub(r"\s{2,}", " ", office)
-            p.add_contact("address", office, "legislature")
-
-            constituency = ", ".join(
-                cleanup_list(page.xpath('//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'))
-            )
-            constituency = re.sub(r"\s{2,}", " ", constituency).split(", Phone")[0]
-            p.add_contact("address", constituency, "constituency")
-
-            phones = cleanup_list(page.xpath('//span[contains(text(), "Phone:")]/following-sibling::text()'))
-
-            office_phone = phones[0]
-            p.add_contact("voice", office_phone, "legislature")
-            if len(phones) > 1:
-                constituency_phone = phones[1]
-                p.add_contact("voice", constituency_phone, "constituency")
-
-            roles = page.xpath('//div[@id="MinisterTitle"]/text()')[0].strip()
-            if roles:
-                p.extras["roles"] = [roles]
 
             yield p
-
-
-def cleanup_list(dirty_list):
-    return list(filter(None, (x.strip() for x in dirty_list)))
diff --git a/patch.py b/patch.py
@@ -123,15 +123,15 @@
     r'[("](?:\p{Lu}+|\p{Lu}\p{Ll}*(?:-\p{Lu}\p{Ll}*)*)[)"]|'
     r"(?:D'|d'|De|de|Des|Di|Du|L'|La|Le|Mac|Mc|O'|San|St\.|Van|Vander?|van|vanden)?\p{Lu}\p{Ll}+|"
     r"\p{Lu}\p{Ll}+Anne?|Marie\p{Lu}\p{Ll}+|"
-    r"Ch'ng|Prud'homme|"
+    r"A'aliya|Ch'ng|Prud'homme|"
     r"D!ONNE|IsaBelle|Ya'ara"
     r")"
 )
 
 # Name components can be joined by apostrophes, hyphens or spaces.
 person_schema["properties"]["name"]["pattern"] = re.compile(
     r"\A"
-    r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|Hon|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
+    r"(?!(?:Chair|Commissioner|Conseiller|Councillor|Deputy|Dr|M|Maire|Mayor|Miss|Mme|Mr|Mrs|Ms|Regional|Warden)\b)"
     r"(?:" + name_fragment + r"(?:'|-| - | )"
     r")+" + name_fragment + r"\Z"
 )

diff --git a/utils.py b/utils.py
@@ -739,9 +739,10 @@ def clean_string(s):
 
 
 def clean_name(s):
-    return honorific_suffix_re.sub(
-        "", honorific_prefix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
-    )
+    name = honorific_suffix_re.sub("", whitespace_re.sub(" ", str(s).translate(table)).strip())
+    if name.count(" ") > 1:
+        return honorific_prefix_re.sub("", name)  # Avoid truncating names like "Hon Chan"
+    return name
 
 
 def clean_type_id(type_id):