import local council data #507

Merged · 38 commits · Apr 24, 2024

Commits

6eb6b50  import council RUC classification (struan, Mar 13, 2024)
0162ca7  imports for council climate emergency declarations and net zero targets (struan, Mar 13, 2024)
98ade8c  add some useful import functions (struan, Mar 13, 2024)
721dfcc  imports for council emissions data (struan, Mar 13, 2024)
56e867b  add council type filter function to import utils (struan, Mar 14, 2024)
dfc9678  update council filter to only return current authorities (struan, Mar 19, 2024)
5aafd1c  update council imports to filter out types not being imported (struan, Mar 19, 2024)
63256a1  import if council has a climate action plan (struan, Mar 20, 2024)
ac30751  update area page to allow related categories for place (struan, Mar 21, 2024)
5a28bd3  improve display of net zero declarations (struan, Mar 21, 2024)
27ab6fc  import council countries (struan, Mar 21, 2024)
427ce8d  update LatLon generator to add columns for all area types (struan, Mar 21, 2024)
fe1f549  add mapit calls to get GSS codes by area type from lat/lon and pc (struan, Mar 21, 2024)
dc0b5d9  fix base constituency count importer to handle multiple area types (struan, Mar 21, 2024)
0e4c4ba  import GWGW 2022 council event details (struan, Mar 21, 2024)
7cea948  add council areas to WI CSV generator (struan, Mar 25, 2024)
7cc910a  add council areas to foodbank CSV generator (struan, Mar 25, 2024)
07ed60e  add council areas to national trust property CSV generator (struan, Mar 25, 2024)
c4a0d2d  add council areas to RSPB reserve CSV generator (struan, Mar 25, 2024)
0dc6821  add council areas to power postcode generator (struan, Mar 25, 2024)
50cd90e  add council areas to onshore windfarm generator (struan, Mar 25, 2024)
f0c082e  add council areas to save the children generator (struan, Mar 25, 2024)
0bb89bd  add council areas to wildlife trust reserves generator (struan, Mar 25, 2024)
39723fa  update power postcodes importer to include councils (struan, Mar 26, 2024)
20cbbbe  update foodbank importer to handle councils (struan, Mar 26, 2024)
c5d0d02  update GBGW importers to handle councils (struan, Mar 26, 2024)
6341917  update NT property importer to handle councils (struan, Mar 26, 2024)
5ce9d64  update windfarm importer to handle councils (struan, Mar 26, 2024)
c39df7a  update RSPB importer to handle councils (struan, Mar 26, 2024)
1f0986c  update save the children importer to handle councils (struan, Mar 26, 2024)
e10c744  update wildlife trust reserve importer to handle councils (struan, Mar 26, 2024)
68c78de  update WI group importer to handle councils (struan, Mar 26, 2024)
83eaaf6  import council polling data from Onward and RenewableUK (struan, Mar 27, 2024)
6f70179  import council level IMD data (struan, Apr 2, 2024)
c3131a9  generate/import tearfund church data for all areas (struan, Apr 9, 2024)
28f3abf  update dataset list generator to add council area types (struan, Apr 9, 2024)
57bb97d  local council action scorecards import (struan, Apr 17, 2024)
3ef66ef  display changes to support local council scorecards (struan, Apr 17, 2024)

Changes from all commits

94 changes: 94 additions & 0 deletions hub/import_utils.py
@@ -0,0 +1,94 @@
from datetime import date
from functools import lru_cache

import pandas as pd
from mysoc_dataset import get_dataset_url

council_types = {"STC": ["CTY", "LBO", "MD", "SCO", "NID", "UA", "WPA"], "DIS": ["NMD"]}


@lru_cache
def get_authority_mapping() -> pd.DataFrame:
    """
    Return a dataframe mapping different names to authority codes
    """
    url = get_dataset_url(
        repo_name="uk_local_authority_names_and_codes",
        package_name="uk_la_future",
        version_name="1",
        file_name="lookup_name_to_registry.csv",
        done_survey=True,
    )
    return pd.read_csv(url)


@lru_cache
def get_council_df():
    """
    Return a dataframe of current, historical and planned councils,
    including their start and end dates
    """
    url = get_dataset_url(
        repo_name="uk_local_authority_names_and_codes",
        package_name="uk_la_future",
        version_name="1",
        file_name="uk_local_authorities_future.csv",
        done_survey=True,
    )
    return pd.read_csv(url)


def add_gss_codes(df: pd.DataFrame, code_column: str):
    """
    Given a DataFrame with a column of local-authority codes (code_column),
    add a "gss_code" column
    """
    authority_df = get_council_df()

    rows = len(df[code_column])
    df["gss_code"] = pd.Series([None] * rows, index=df.index)

    # match each authority code against the registry to find its GSS code
    for index, row in df.iterrows():
        authority_code = row[code_column]
        if not pd.isnull(authority_code):
            authority_match = authority_df[
                authority_df["local-authority-code"] == authority_code
            ]
            df.at[index, "gss_code"] = authority_match["gss-code"].values[0]

    return df


def _filter_authority_type(df: pd.DataFrame, types: list, gss_code: str):
    authority_df = get_council_df()

    today = date.today()

    rows = len(df[gss_code])
    df["type"] = pd.Series([None] * rows, index=df.index)
    df["start-date"] = pd.Series([None] * rows, index=df.index)
    df["end-date"] = pd.Series([None] * rows, index=df.index)

    # look up each authority's type and lifetime from the registry
    for index, row in df.iterrows():
        if not pd.isnull(row[gss_code]):
            authority_match = authority_df[authority_df["gss-code"] == row[gss_code]]
            df.at[index, "type"] = authority_match["local-authority-type"].values[0]
            df.at[index, "start-date"] = pd.to_datetime(
                authority_match["start-date"].values[0]
            ).date()
            df.at[index, "end-date"] = pd.to_datetime(
                authority_match["end-date"].values[0]
            ).date()

    df = df.loc[df["type"].isin(types)]

    # only select authorities with a start date in the past
    df = df.loc[(df["start-date"] < today) | df["start-date"].isna()]

    # only select authorities with an end date in the future
    df = df.loc[(df["end-date"] > today) | df["end-date"].isna()]

    return df


def filter_authority_type(
    df: pd.DataFrame, authority_type: str, gss_code: str = "gss-code"
):
    return _filter_authority_type(df, council_types[authority_type], gss_code)
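
For orientation, a minimal sketch of how these helpers might be chained in an import command follows; the CSV path and the "authority_code" column name are hypothetical illustrations, not code from this PR:

```python
# Hypothetical usage sketch; the file path and "authority_code" column
# are illustrative, not part of this PR.
import pandas as pd

from hub.import_utils import add_gss_codes, filter_authority_type

df = pd.read_csv("data/council_emissions.csv")  # hypothetical input file

# Map local-authority codes to GSS codes, then keep only current
# single-tier ("STC") authorities.
df = add_gss_codes(df, "authority_code")
df = filter_authority_type(df, "STC", "gss_code")
```

Note that filter_authority_type also drops authorities whose start date is in the future or whose end date has passed, so imports only attach data to authorities that currently exist.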
132 changes: 107 additions & 25 deletions hub/management/commands/base_generators.py
@@ -15,8 +15,28 @@
     RateLimitException,
 )
 
+mapit_types = {
+    "LBO": "STC",
+    "UTA": "STC",
+    "COI": "STC",
+    "LGD": "STC",
+    "CTY": "STC",
+    "MTD": "STC",
+    "NMD": "DIS",
+    "DIS": "DIS",
+    "WMC": "WMC",
+    "WMCF": "WMC23",
+}
+
+
 class BaseLatLonGeneratorCommand(BaseCommand):
+    uses_gss = False
+    uses_postcodes = False
+    out_file = None
+    location_col = "lat_lon"
+    legacy_col = "area"
+    cols = ["WMC", "WMC23", "STC", "DIS"]
 
     tqdm.pandas()
 
     def get_dataframe(self):
@@ -25,53 +45,112 @@ def get_dataframe(self):
 
         return df
 
-    def _process_lat_long(self, lat_lon=None, row_name=None):
-        lat = lat_lon[0]
-        lon = lat_lon[1]
-
-        if not pd.isna(lat) and not pd.isna(lon):
+    def _process_location(self, lat_lon=None, postcode=None, row_name=None):
+        lat, lon = None, None
+        if lat_lon is not None:
+            lat = lat_lon[0]
+            lon = lat_lon[1]
+
+        cols = [self.legacy_col, *self.cols]
+        if (self.uses_postcodes and not pd.isna(postcode)) or (
+            not pd.isna(lat) and not pd.isna(lon)
+        ):
+            areas = {}
             try:
                 mapit = MapIt()
-                gss_codes = mapit.wgs84_point_to_gss_codes(lon, lat)
-
-                area = Area.objects.filter(gss__in=gss_codes).first()
-                if area:
-                    return area.name
-                else:
-                    return None
+                if self.uses_postcodes:
+                    gss_codes = mapit.postcode_point_to_gss_codes_with_type(postcode)
+                else:
+                    gss_codes = mapit.wgs84_point_to_gss_codes_with_type(lon, lat)
+
+                for area_type, code in gss_codes.items():
+                    if mapit_types.get(area_type, None) is not None:
+                        if self.uses_gss:
+                            areas[mapit_types[area_type]] = code
+                        else:
+                            area = Area.objects.filter(
+                                gss=code, area_type__code=mapit_types[area_type]
+                            ).first()
+                            areas[mapit_types[area_type]] = area.name
+                    else:
+                        continue
             except (
                 NotFoundException,
                 BadRequestException,
                 InternalServerErrorException,
                 ForbiddenException,
             ) as error:
-                print(f"Error fetching row {row_name} with {lat}, {lon}: {error}")
-                return None
+                location_data = lat_lon
+                if self.uses_postcodes:
+                    location_data = postcode
+                self.stderr.write(
+                    f"Error fetching row {row_name} with {location_data}: {error}"
+                )
+                return pd.Series([None for t in cols], index=cols)
             except RateLimitException as error:
-                print(f"Mapit Error - {error}, waiting for a minute")
+                self.stderr.write(f"Mapit Error - {error}, waiting for a minute")
                 sleep(60)
                 return False
 
+            areas[self.legacy_col] = areas.get("WMC", None)
+            vals = [areas.get(t, None) for t in cols]
+            return pd.Series(vals, index=cols)
         else:
-            print(f"missing lat or lon for row {row_name}")
-            return None
+            self.stderr.write(f"missing location data for row {row_name}")
+            return pd.Series([None for t in cols], index=cols)
 
-    def process_lat_long(self, lat_lon=None, row_name=None):
-        success = self._process_lat_long(lat_lon=lat_lon, row_name=row_name)
+    def process_location(self, lat_lon=None, postcode=None, row_name=None):
+        success = self._process_location(
+            lat_lon=lat_lon, postcode=postcode, row_name=row_name
+        )
         # retry once if it fails so we can catch rate limit errors
         if success is False:
-            return self._process_lat_long(lat_lon=lat_lon, row_name=row_name)
+            return self._process_location(
+                lat_lon=lat_lon, postcode=postcode, row_name=row_name
+            )
         else:
             return success
 
+    def get_location_from_row(self, row):
+        if self.uses_postcodes:
+            return {"postcode": row["postcode"]}
+        else:
+            return {"lat_lon": [row["lat"], row["lon"]]}
+
     def process_data(self, df):
         if not self._quiet:
-            self.stdout.write("Generating Area name from lat + lon values")
+            self.stdout.write("Generating Area details from location values")
 
-        df["area"] = df.progress_apply(
-            lambda row: self.process_lat_long(
-                self.get_lat_lon_from_row(row), row[self.row_name]
-            ),
-            axis=1,
+        if not self._ignore and self.out_file is not None:
+            try:
+                # check that we've got all the output we're expecting before using
+                # the old values
+                old_df = pd.read_csv(self.out_file)
+                usecols = list(set(self.cols).intersection(df.columns))
+                if len(usecols) == len(self.cols):
+                    old_df = pd.read_csv(
+                        self.out_file, usecols=[self.lat_lon_row, *self.cols]
+                    )
+                    location_lookup = {
+                        row[self.location_col]: row[self.legacy_col]
+                        for index, row in old_df.iterrows()
+                    }
+                    if not self._quiet:
+                        self.stdout.write("Reading codes from existing file")
+                    df[self.legacy_col] = df.apply(
+                        lambda row: location_lookup.get((row[self.location_col]), None),
+                        axis=1,
+                    )
+            except FileNotFoundError:
+                self.stderr.write("No existing file.")
+
+        df = df.join(
+            df.progress_apply(
+                lambda row: self.process_location(
+                    row_name=row[self.row_name], **self.get_location_from_row(row)
+                ),
+                axis=1,
+            )
         )
 
         return df
@@ -89,6 +168,9 @@ def add_arguments(self, parser):
 
     def handle(self, quiet=False, ignore=False, *args, **options):
         self._quiet = quiet
+
+        if not self._quiet:
+            self.stdout.write(self.message)
        self._ignore = ignore
         df = self.get_dataframe()
         out_df = self.process_data(df)
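
To make the new hooks concrete, here is a hedged sketch of what a postcode-based generator subclass might declare; every value below (command names, messages, file paths, columns) is an assumption for illustration, not code from this PR:

```python
# Hypothetical subclass sketch; all attribute values here are illustrative.
from hub.management.commands.base_generators import BaseLatLonGeneratorCommand


class Command(BaseLatLonGeneratorCommand):
    help = "Generate CSV of example sites with area columns"  # hypothetical
    message = "Generating example site areas"                 # hypothetical

    row_name = "site_name"      # column used to identify rows in error output
    uses_postcodes = True       # look up areas by postcode rather than lat/lon
    location_col = "postcode"   # column matched against the cached out_file
    out_file = "data/example_site_areas.csv"  # hypothetical cached output
```

With uses_postcodes set, get_location_from_row feeds row["postcode"] into MapIt's postcode lookup; otherwise it falls back to the row's lat/lon pair. Either way, process_data joins the returned WMC/WMC23/STC/DIS series onto the source dataframe, reusing cached values from out_file when present.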
37 changes: 28 additions & 9 deletions hub/management/commands/base_importers.py
@@ -1,4 +1,3 @@
-from functools import cache
 from time import sleep
 
 from django.core.management.base import BaseCommand
@@ -27,6 +26,7 @@ def handle(self, *args, **options):
 
 class BaseAreaImportCommand(BaseCommand):
     area_type = "WMC"
+    uses_gss = False
 
     def __init__(self):
         super().__init__()
@@ -53,10 +53,15 @@ def delete_data(self):
             data_type=data_type, area__area_type__code=self.area_type
         ).delete()
 
-    @cache
     def get_area_type(self):
         return AreaType.objects.get(code=self.area_type)
 
+    def get_cons_col(self):
+        if hasattr(self, "cons_col_map"):
+            return self.cons_col_map[self.area_type]
+
+        return self.cons_col
+
     def add_data_sets(self, df=None):
         for name, config in self.data_sets.items():
             label = self.get_label(config)
@@ -175,6 +180,15 @@ def convert_to_new_con(self):
                 data_type, delete_old=True, quiet=self._quiet
             )
 
+    def handle(self, quiet=False, *args, **kwargs):
+        self._quiet = quiet
+        self.add_data_sets()
+        self.delete_data()
+        self.process_data()
+        self.update_averages()
+        self.update_max_min()
+        self.convert_to_new_con()
+
 
 class BaseImportFromDataFrameCommand(BaseAreaImportCommand):
     uses_gss = True
@@ -278,26 +292,29 @@ def process_lat_long(self, lat=None, lon=None, row_name=None):
 
 
 class BaseConstituencyGroupListImportCommand(BaseAreaImportCommand):
-    use_gss = False
+    do_not_convert = True
 
     def process_data(self):
         df = self.get_df()
 
         if not self._quiet:
-            self.stdout.write(self.message)
+            self.stdout.write(f"{self.message} ({self.area_type})")
 
         group_by = "constituency"
-        if self.use_gss:
+        if self.uses_gss:
             group_by = "gss"
+        if hasattr(self, "area_types"):
+            group_by = self.cons_col_map[self.area_type]
 
         for lookup, data in tqdm(df.groupby(group_by)):
             try:
                 area = Area.objects.filter(area_type__code=self.area_type)
-                if self.use_gss:
+                if self.uses_gss:
                     area = area.get(gss=lookup)
                 else:
                     area = area.get(name=lookup)
             except Area.DoesNotExist:
+                self.stderr.write(f"no area found for {lookup} and {self.area_type}")
                 continue
 
             json = []
@@ -328,16 +345,18 @@ def handle(self, quiet=False, *args, **kwargs):
 
 
 class BaseConstituencyCountImportCommand(BaseAreaImportCommand):
+    do_not_convert = True
+
     def set_data_type(self):
         self.data_type = list(self.data_types.values())[0]
 
     def get_dataframe(self):
         df = pd.read_csv(self.data_file)
-        df = df.astype({self.cons_col: "str"})
+        df = df.astype({self.get_cons_col(): "str"})
         return df
 
     def _get_areas_from_row(self, row):
-        value = row[self.cons_col]
+        value = row[self.get_cons_col()]
         if self.uses_gss:
             areas = Area.objects.filter(gss__in=value.split(","))
         else:
@@ -347,7 +366,7 @@ def _get_areas_from_row(self, row):
 
     def process_data(self, df):
         if not self._quiet:
-            self.stdout.write(self.message)
+            self.stdout.write(f"{self.message} ({self.area_type})")
 
         for index, row in tqdm(df.iterrows(), disable=self._quiet, total=df.shape[0]):
             areas = self._get_areas_from_row(row)
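
As a rough illustration of the new cons_col_map hook, a count importer that ships one GSS-code column per area type might look like the following; the dataset, column names, and omitted data_types config are hypothetical:

```python
# Hypothetical importer sketch; file name and column mapping are illustrative,
# and the data_types configuration a real importer needs is omitted here.
from hub.management.commands.base_importers import BaseConstituencyCountImportCommand


class Command(BaseConstituencyCountImportCommand):
    help = "Import count of example sites per area"  # hypothetical
    message = "Importing example site counts"        # hypothetical

    uses_gss = True
    data_file = "data/example_site_areas.csv"  # hypothetical input
    # one column of comma-separated GSS codes per area type;
    # get_cons_col() selects the right one for the current area_type
    cons_col_map = {
        "WMC": "WMC",
        "WMC23": "WMC23",
        "STC": "STC",
        "DIS": "DIS",
    }
```

Because BaseAreaImportCommand now defines handle(), a subclass like this mainly declares configuration: delete_data, process_data, the averages and max/min updates, and the new-constituency conversion all run in a fixed order.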