From 789f053e82411e358710128c05d3a7e3d4b52ad9 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Wed, 9 Feb 2022 15:37:12 -0800
Subject: [PATCH 01/11] fix: avoid duplicate 'interconnect' columns

---
 prereise/gather/griddata/hifld/orchestration.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/prereise/gather/griddata/hifld/orchestration.py b/prereise/gather/griddata/hifld/orchestration.py
index afbdbcf56..ef51308cd 100644
--- a/prereise/gather/griddata/hifld/orchestration.py
+++ b/prereise/gather/griddata/hifld/orchestration.py
@@ -106,7 +106,8 @@ def create_grid(output_folder=None):
                 col_names += ["type", "GenFuelCost", "GenIOB", "GenIOC", "GenIOD"]
             if name == "dcline":
                 col_names += ["from_interconnect", "to_interconnect"]
-            else:
+            elif name not in {"sub", "bus2sub"}:
+                # these tables already have 'interconnect' within their col_names
                 col_names += ["interconnect"]
             powersimdata_outputs[name] = full_tables[name][col_names]
 

From c1a18d7e3a39b86a23f8068e6c886d61ccd53fa3 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Mon, 7 Feb 2022 16:13:17 -0800
Subject: [PATCH 02/11] feat: add helper function for lat, lon to unit vector

---
 prereise/gather/helpers.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/prereise/gather/helpers.py b/prereise/gather/helpers.py
index c6189e5d7..15eef14d7 100644
--- a/prereise/gather/helpers.py
+++ b/prereise/gather/helpers.py
@@ -1,4 +1,5 @@
 import os
+from math import cos, radians, sin
 
 import numpy as np
 import pandas as pd
@@ -97,3 +98,21 @@ def get_monthly_net_generation(state, eia_form_923, resource, hps=True):
     eia_net_generation = list(np.nan_to_num(eia_net_generation))
 
     return eia_net_generation
+
+
+def latlon_to_xyz(latitude, longitude):
+    """Convert (latitude, longitude) to unit vector.
+
+    :param float latitude: latitude of the site (in deg.). Equator is the zero point.
+    :param float longitude: longitude of the site (in deg.) measured eastward from
+        Greenwich, UK.
+    :return: (*list*) -- 3-component [x, y, z] unit vector.
+    """
+    cos_lat = cos(radians(latitude))
+    sin_lat = sin(radians(latitude))
+    cos_lon = cos(radians(longitude))
+    sin_lon = sin(radians(longitude))
+
+    uv = [cos_lat * cos_lon, cos_lat * sin_lon, sin_lat]
+
+    return uv

From 03e07a94082aa9e5beaa30b39491341944eb9f71 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Thu, 17 Feb 2022 15:16:48 -0800
Subject: [PATCH 03/11] data: add balancing authority to interconnect mapping

---
 prereise/gather/griddata/hifld/const.py | 73 +++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/prereise/gather/griddata/hifld/const.py b/prereise/gather/griddata/hifld/const.py
index 1edf0a6fb..3250232c3 100644
--- a/prereise/gather/griddata/hifld/const.py
+++ b/prereise/gather/griddata/hifld/const.py
@@ -194,6 +194,79 @@
     "Flywheels",
 }
 
+balancingauthority2interconnect = {
+    "AEC": "Eastern",
+    "AECI": "Eastern",
+    "AVA": "Western",
+    "AVRN": "Western",
+    "AZPS": "Western",
+    "BANC": "Western",
+    "BPAT": "Western",
+    "CHPD": "Western",
+    "CISO": "Western",
+    "CPLE": "Eastern",
+    "CPLW": "Eastern",
+    "CSTO": "Western",
+    "DEAA": "Western",
+    "DOPD": "Western",
+    "DUK": "Eastern",
+    "EEI": "Eastern",
+    "EPE": "Western",
+    "ERCO": "ERCOT",
+    "FMPP": "Eastern",
+    "FPC": "Eastern",
+    "FPL": "Eastern",
+    "GCPD": "Western",
+    "GRIF": "Western",
+    "GRIS": "Western",
+    "GRMA": "Western",
+    "GVL": "Eastern",
+    "GWA": "Western",
+    "HGMA": "Western",
+    "HST": "Eastern",
+    "IID": "Western",
+    "IPCO": "Western",
+    "ISNE": "Eastern",
+    "JEA": "Eastern",
+    "LDWP": "Western",
+    "LGEE": "Eastern",
+    "MISO": "Eastern",
+    "NBSO": "Eastern",
+    "NEVP": "Western",
+    "NSB": "Eastern",
+    "NWMT": "Western",
+    "NYIS": "Eastern",
+    "OVEC": "Eastern",
+    "PACE": "Western",
+    "PACW": "Western",
+    "PGE": "Western",
+    "PJM": "Eastern",
+    "PNM": "Western",
+    "PSCO": "Western",
+    "PSEI": "Western",
+    "SC": "Eastern",
+    "SCEG": "Eastern",
+    "SCL": "Western",
+    "SEC": "Eastern",
+    "SEPA": "Eastern",
+    "SOCO": "Eastern",
+    "SPA": "Eastern",
+    "SRP": "Western",
+    "SWPP": "Eastern",
+    "TAL": "Eastern",
+    "TEC": "Eastern",
+    "TEPC": "Western",
+    "TIDC": "Western",
+    "TPWR": "Western",
+    "TVA": "Eastern",
+    # "WACM": "Western",  # can be Western or Eastern
+    # "WALC": "Western",  # can be Western or Eastern
+    # "WAUW": "Western",  # can be Western or Eastern
+    "WWA": "Western",
+    "YAD": "Eastern",
+}
+
+# Usage of this is deprecated, since these data seem noisier than Balancing Authorities
 nercregion2interconnect = {
     "ASCC": "Alaska",  # Not currently used
     "HICC": "Hawaii",  # Not currently used

From 2f56d1f5d8962fd91eccbde0b45572ae6f84d080 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Fri, 18 Feb 2022 12:33:39 -0800
Subject: [PATCH 04/11] refactor: use BA as primary interconnect mapper, NERC
 region as fallback

---
 .../griddata/hifld/data_process/generators.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/prereise/gather/griddata/hifld/data_process/generators.py b/prereise/gather/griddata/hifld/data_process/generators.py
index bbac3f36c..b38bdf090 100644
--- a/prereise/gather/griddata/hifld/data_process/generators.py
+++ b/prereise/gather/griddata/hifld/data_process/generators.py
@@ -289,17 +289,16 @@ def build_plant(bus, substations, kwargs={}):
     substation_groupby = substations.groupby(["interconnect", "ZIP"])
     epa_ampd_groupby = epa_ampd.groupby(["ORISPL_CODE", "UNITID"])
 
-    # Add information
-    generators["interconnect"] = (
-        generators["Plant Code"]
-        .map(plants["NERC Region"])
-        .map(const.nercregion2interconnect)
+    # Add information to generators based on Form 860 Plant table
+    generators = generators.merge(plants, on="Plant Code", suffixes=(None, "_860Plant"))
+    generators.rename(
+        {"Latitude": "lat", "Longitude": "lon", "Zip": "ZIP"}, axis=1, inplace=True
     )
-    generators["lat"] = generators["Plant Code"].map(plants["Latitude"])
-    generators["lon"] = generators["Plant Code"].map(plants["Longitude"])
-    generators["ZIP"] = generators["Plant Code"].map(plants["Zip"])
-    generators["Balancing Authority Code"] = generators["Plant Code"].map(
-        plants["Balancing Authority Code"]
+    # Map interconnect via BA first (more reliable) then by NERC Region
+    generators["interconnect"] = (
+        generators["Balancing Authority Code"]
+        .map(const.balancingauthority2interconnect)
+        .combine_first(generators["NERC Region"].map(const.nercregion2interconnect))
     )
     print("Mapping generators to substations... (this may take several minutes)")
     generators["sub_id"] = generators.apply(

From 886190629d40b3673ca19014ec9e74d05a0af10c Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Fri, 18 Feb 2022 12:14:23 -0800
Subject: [PATCH 05/11] refactor: override existing substation MIN_VOLT and
 MAX_VOLT whenever possible

---
 prereise/gather/griddata/hifld/data_process/transmission.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/prereise/gather/griddata/hifld/data_process/transmission.py b/prereise/gather/griddata/hifld/data_process/transmission.py
index 611f912f4..55d7e0e4b 100644
--- a/prereise/gather/griddata/hifld/data_process/transmission.py
+++ b/prereise/gather/griddata/hifld/data_process/transmission.py
@@ -822,6 +822,10 @@ def build_transmission(method="line2sub", **kwargs):
         lambda x: estimate_branch_rating(x, bus["baseKV"]), axis=1
     )
 
+    # Update substation max & min voltages using bus data (from lines)
+    substations["MAX_VOLT"].update(bus.groupby("sub_id")["baseKV"].apply(max))
+    substations["MIN_VOLT"].update(bus.groupby("sub_id")["baseKV"].apply(min))
+
     # Rename columns to match PowerSimData expectations
     branch.rename({"type": "branch_device_type"}, axis=1, inplace=True)
     substations.rename(

From 7666200fd6e1c86c4a3d772566e529efce1efece Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Mon, 31 Jan 2022 10:19:38 -0800
Subject: [PATCH 06/11] refactor: aggregate hydro generators by plant

---
 .../griddata/hifld/data_process/generators.py | 45 ++++++++++++++++---
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/prereise/gather/griddata/hifld/data_process/generators.py b/prereise/gather/griddata/hifld/data_process/generators.py
index b38bdf090..524113c6a 100644
--- a/prereise/gather/griddata/hifld/data_process/generators.py
+++ b/prereise/gather/griddata/hifld/data_process/generators.py
@@ -246,6 +246,30 @@ def estimate_coefficients(generator, regressions):
     generators.update(linear_heat_rate_assumptions)
 
 
+def aggregate_hydro_generators_by_plant_id(generators):
+    """Combine hydro generators within the same plant into aggregated larger generators.
+    'Pmin' and 'Pmax' values will be summed, all other attributes (including the index)
+    will be the first non-null entry of each column within the plant grouping.
+
+    :param pandas.DataFrame generators: data frame of generators.
+    :return: (*pandas.DataFrame*) -- data frame of generators, with hydro generators
+        aggregated.
+    """
+    indiv_hydro_gens = generators.query("`Energy Source 1` == 'WAT'").copy()
+    original_hydro_indices = indiv_hydro_gens.index.tolist()
+    # Move the original index into an 'index' column so that it survives the groupby
+    indiv_hydro_gens.reset_index(inplace=True)
+    hydro_groupby = indiv_hydro_gens.groupby("Plant Code")
+    # first() takes each column's first non-null entry within each plant grouping
+    aggregated_hydro = hydro_groupby.first()
+    aggregated_hydro[["Pmin", "Pmax"]] = hydro_groupby[["Pmin", "Pmax"]].sum()
+    # Reset/set index to restore 'Plant Code' as a column and original index numbering
+    aggregated_hydro.reset_index(inplace=True)
+    aggregated_hydro.set_index("index", inplace=True)  # 'index' was the original
+    generators = pd.concat([generators.drop(original_hydro_indices), aggregated_hydro])
+    return generators
+
+
 def build_plant(bus, substations, kwargs={}):
     """Use source data on generating units from EIA/EPA, along with transmission network
     data, to produce a plant data frame.
@@ -300,13 +324,8 @@ def build_plant(bus, substations, kwargs={}):
         .map(const.balancingauthority2interconnect)
         .combine_first(generators["NERC Region"].map(const.nercregion2interconnect))
     )
-    print("Mapping generators to substations... (this may take several minutes)")
-    generators["sub_id"] = generators.apply(
-        lambda x: map_generator_to_sub_by_location(x, substation_groupby), axis=1
-    )
-    generators["bus_id"] = generators.apply(
-        lambda x: map_generator_to_bus_by_sub(x, bus_groupby), axis=1
-    )
+
+    # Ensure we have Pmax and Pmin for each generator
     generators["Pmax"] = generators[
         ["Summer Capacity (MW)", "Winter Capacity (MW)"]
     ].max(axis=1)
@@ -314,6 +333,18 @@ def build_plant(bus, substations, kwargs={}):
     generators = generators.loc[~generators["Pmax"].isnull()]
     generators.rename({"Minimum Load (MW)": "Pmin"}, inplace=True, axis=1)
     generators["Pmin"] = generators["Pmin"].fillna(0)
+
+    # Aggregate hydro generators within each plant
+    generators = aggregate_hydro_generators_by_plant_id(generators)
+
+    print("Mapping generators to substations... (this may take several minutes)")
+    generators["sub_id"] = generators.apply(
+        lambda x: map_generator_to_sub_by_location(x, substation_groupby), axis=1
+    )
+    generators["bus_id"] = generators.apply(
+        lambda x: map_generator_to_bus_by_sub(x, bus_groupby), axis=1
+    )
+
     print("Fitting heat rate curves to EPA data... (this may take several minutes)")
     heat_rate_curve_estimates = generators.apply(
         lambda x: estimate_heat_rate_curve(

From 052f83cb97984b32efc3bb073e108667fd8effcc Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Wed, 26 Jan 2022 15:17:08 -0800
Subject: [PATCH 07/11] refactor: map larger generators to higher-voltage buses

---
 .../gather/griddata/hifld/data_process/generators.py   | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/prereise/gather/griddata/hifld/data_process/generators.py b/prereise/gather/griddata/hifld/data_process/generators.py
index 524113c6a..4f48fe59c 100644
--- a/prereise/gather/griddata/hifld/data_process/generators.py
+++ b/prereise/gather/griddata/hifld/data_process/generators.py
@@ -83,7 +83,15 @@ def map_generator_to_bus_by_sub(generator, bus_groupby):
     if pd.isna(generator.sub_id):
         return pd.NA
     else:
-        return bus_groupby.get_group(generator.sub_id)["baseKV"].idxmin()
+        bus_voltages = bus_groupby.get_group(generator.sub_id)["baseKV"]
+        if len(bus_voltages) == 1 or generator.Pmax < 200:
+            # Return the lowest-voltage bus for small generators, or the only bus
+            return bus_voltages.idxmin()
+        if generator.Pmax < 500:
+            # Return the second-lowest-voltage bus for mid-sized generators
+            return bus_voltages.sort_values().index[1]
+        # Return the highest-voltage bus for large generators
+        return bus_voltages.idxmax()
 
 
 def estimate_heat_rate_curve(

From facd3d38212fc5834c9d587a6cb429e58efdbd46 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Mon, 7 Feb 2022 16:13:39 -0800
Subject: [PATCH 08/11] refactor: map generators to substations using voltage
 information

---
 .../griddata/hifld/data_process/generators.py | 185 +++++++++++++-----
 1 file changed, 134 insertions(+), 51 deletions(-)

diff --git a/prereise/gather/griddata/hifld/data_process/generators.py b/prereise/gather/griddata/hifld/data_process/generators.py
index 4f48fe59c..ed4318bd0 100644
--- a/prereise/gather/griddata/hifld/data_process/generators.py
+++ b/prereise/gather/griddata/hifld/data_process/generators.py
@@ -1,10 +1,14 @@
+from math import asin
+
+import numpy as np
 import pandas as pd
-from powersimdata.utility.distance import haversine
 from scipy.optimize import curve_fit
+from scipy.spatial import KDTree
 from scipy.stats import linregress
 
 from prereise.gather.griddata.hifld import const
 from prereise.gather.griddata.hifld.data_access import load
+from prereise.gather.helpers import latlon_to_xyz
 
 
 def floatify(value, default=float("nan")):
@@ -22,54 +26,127 @@ def floatify(value, default=float("nan")):
         return default
 
 
-def map_generator_to_sub_by_location(generator, substation_groupby):
-    """Determine a likely substation for a generator to be connected to. Priority order
-    of mapping is: 1) if location is available and one or more substations exist in that
-    ZIP code, map by location to closest substation within that ZIP code, 2) if location
-    is available but no substations exist in that ZIP code, map to the closest
-    substation within neighboring ZIP codes, 3) if only ZIP code is available
-    (no location), and one or more substations exist, map to an arbitrarily chosen
-    substation within that ZIP code, 4) if only ZIP code is available (no location)
-    but no substations exist in that ZIP code, return NA.
-
-    :param pandas.Series generator: one generating unit from data frame.
-    :param pandas.GroupBy substation_groupby: data frame of substations, grouped by
-        (interconnect, ZIP).
-    :return: (*int/pd.NA*) -- substation ID if the generator can be mapped successfully
-        to a substation, else pd.NA.
+def map_generators_to_sub_by_location(
+    generators, substations, inplace=True, report_worst=None
+):
+    """Determine the closest substation to each generator. For generators without
+    latitude and longitude, an attempt will be made to match via ZIP code, and failing
+    that the generator's 'sub_id' will be left as a pandas.NA value.
+
+    :param pandas.DataFrame generators: generator data. Required columns:
+        'interconnect', 'lat', 'lon', 'ZIP', 'Grid Voltage (kV)', 'Pmax'.
+    :param pandas.DataFrame substations: substation data. Required columns:
+        'interconnect', 'lat', 'lon', 'ZIP', 'MAX_VOLT'.
+    :param bool inplace: whether to modify the generator table inplace with new 'sub_id'
+        and 'sub_dist' columns or to return a new one.
+    :param int report_worst: if not None, display the distances of the worst N mappings.
+    :return: (*pandas.DataFrame/None*) -- if ``inplace`` is `False`, return the modified
+        DataFrame; otherwise return nothing.
     """
-    lookup_params = tuple(generator.loc[["interconnect", "ZIP"]])
-    if pd.isna(generator["lat"]) or pd.isna(generator["lon"]):
-        # No location available
-        try:
-            matching_subs = substation_groupby.get_group(lookup_params)
-            return matching_subs.index[0]
-        except KeyError:
-            return pd.NA
-    try:
-        # This ZIP code contains substations, this block will execute successfully
-        matching_subs = substation_groupby.get_group(lookup_params)
-    except KeyError:
-        # If this ZIP code does not contain substations, this block will execute, and
-        # we select a set of 'nearby' substations
-        zip_range = [int(generator.loc["ZIP"]) + offset for offset in range(-100, 101)]
-        zip_range_strings = [str(z).rjust(5, "0") for z in zip_range]
-        try:
-            matching_subs = pd.concat(
-                [
-                    substation_groupby.get_group((generator.loc["interconnect"], z))
-                    for z in zip_range_strings
-                    if (generator.loc["interconnect"], z) in substation_groupby.groups
-                ]
-            )
-        except ValueError:
-            # If no matching subs within the given interconnection and ZIPs, give up
+
+    def get_closest_substation(generator, voltage_trees, subs_voltage_lookup):
+        if not isinstance(generator["xyz"], list):
-            return pd.NA
+            return pd.Series({"dist": pd.NA, "sub_id": pd.NA})
-    distance_to_subs = matching_subs.apply(
-        lambda x: haversine((x.lat, x.lon), (generator.lat, generator.lon)),
+        if pd.isnull(generator["voltage_class"]) or generator["Pmax"] < 100:
+            grouper_key = generator["interconnect"]
+        else:
+            grouper_key = (generator["interconnect"], generator["voltage_class"])
+        chord_dist, array_index = voltage_trees[grouper_key].query(generator["xyz"])
+        sub_id = subs_voltage_lookup[grouper_key][array_index]
+        # Translate chord distance (unit circle) to great circle distance (miles)
+        dist_in_miles = 3963 * 2 * asin(chord_dist / 2)  # use 3963 mi as earth radius
+        return pd.Series({"dist": dist_in_miles, "sub_id": sub_id})
+
+    def classify_voltages(voltage, voltage_ranges):
+        for v_range, bounds in voltage_ranges.items():
+            if bounds["min"] <= voltage <= bounds["max"]:
+                return v_range
+        return float("nan")
+
+    voltage_ranges = {
+        "under 100": {"min": 0, "max": 99},
+        "100-161": {"min": 100, "max": 161},
+        "220-287": {"min": 220, "max": 287},
+        "345": {"min": 345, "max": 345},
+        "500": {"min": 500, "max": 500},
+        "735 and above": {"min": 735, "max": float("inf")},
+    }
+
+    # Translate lat/lon to 3D positions (assume spherical earth, origin at center)
+    substations_with_xyz = substations.assign(
+        xyz=substations.apply(lambda x: latlon_to_xyz(x["lat"], x["lon"]), axis=1)
+    )
+    generators_with_xyz = generators.assign(
+        xyz=generators.apply(
+            lambda x: (
+                pd.NA
+                if pd.isna(x["lat"]) or pd.isna(x["lon"])
+                else latlon_to_xyz(x["lat"], x["lon"])
+            ),
+            axis=1,
+        )
+    )
+
+    # Bin voltages into broad classes
+    generators_with_xyz["voltage_class"] = generators["Grid Voltage (kV)"].map(
+        lambda x: classify_voltages(x, voltage_ranges)
+    )
+
+    # Group substations by voltage to build KDTrees
+    subs_voltage_lookup = {
+        (interconnect, voltage_level): substations_with_xyz.loc[
+            (substations_with_xyz["interconnect"] == interconnect)
+            & (substations_with_xyz["MAX_VOLT"] >= voltage_range["min"])
+        ].index
+        for interconnect in generators["interconnect"].unique()
+        for voltage_level, voltage_range in voltage_ranges.items()
+    }
+    # Group substations by ZIP code for a fallback for generators without coordinates
+    subs_zip_groupby = substations_with_xyz.groupby(["interconnect", "ZIP"])
+
+    # Create a KDTree for each combination of voltage and interconnect
+    voltage_trees = {
+        key: KDTree(np.vstack(substations_with_xyz.loc[sub_ids, "xyz"]))
+        for key, sub_ids in subs_voltage_lookup.items()
+        if len(sub_ids) > 0
+    }
+    # Create a KDTree for each interconnect (all voltages)
+    subs_interconnect_groupby = substations_with_xyz.groupby("interconnect")
+    for interconnect in generators["interconnect"].unique():
+        tree_subs = subs_interconnect_groupby.get_group(interconnect)
+        voltage_trees[interconnect] = KDTree(np.vstack(tree_subs["xyz"]))
+        subs_voltage_lookup[interconnect] = tree_subs.index
+
+    # Query the appropriate tree for each generator to get the closest substation ID
+    mapping_results = generators_with_xyz.apply(
+        lambda x: get_closest_substation(x, voltage_trees, subs_voltage_lookup),
         axis=1,
     )
-    return distance_to_subs.idxmin()
+    # For generators without coordinates, try to pick a substation with a matching ZIP
+    for g in generators.loc[mapping_results["sub_id"].isnull()].index:
+        try:
+            candidates = subs_zip_groupby.get_group(
+                (generators.loc[g, "interconnect"], generators.loc[g, "ZIP"])
+            )
+            # arbitrarily choose the first one
+            mapping_results.loc[g, "sub_id"] = candidates.index[0]
+        except KeyError:
+            continue  # No coordinates, no matching ZIP, we're out of luck
+
+    if report_worst is not None:
+        print(
+            mapping_results.sort_values("dist", ascending=False)
+            .join(generators[["Plant Code", "Grid Voltage (kV)", "Pmax"]])
+            .head(report_worst)
+        )
+
+    if inplace:
+        generators["sub_id"] = mapping_results["sub_id"]
+        generators["sub_dist"] = mapping_results["dist"]
+    else:
+        return generators_with_xyz.drop(["xyz", "voltage_class"], axis=1).join(
+            mapping_results
+        )
 
 
 def map_generator_to_bus_by_sub(generator, bus_groupby):
@@ -318,11 +395,19 @@ def build_plant(bus, substations, kwargs={}):
     bus_groupby = bus.groupby(bus["sub_id"].astype(int))
     # Filter substations with no buses
     substations = substations.loc[set(bus_groupby.groups.keys())]
-    substation_groupby = substations.groupby(["interconnect", "ZIP"])
     epa_ampd_groupby = epa_ampd.groupby(["ORISPL_CODE", "UNITID"])
 
     # Add information to generators based on Form 860 Plant table
-    generators = generators.merge(plants, on="Plant Code", suffixes=(None, "_860Plant"))
+    # Merging this way allows column-on-column merge while preserving original index
+    generators = (
+        generators.reset_index()
+        .merge(
+            plants,
+            on="Plant Code",
+            suffixes=(None, "_860Plant"),
+        )
+        .set_index("index")
+    )
     generators.rename(
         {"Latitude": "lat", "Longitude": "lon", "Zip": "ZIP"}, axis=1, inplace=True
     )
@@ -332,6 +417,7 @@ def build_plant(bus, substations, kwargs={}):
         .map(const.balancingauthority2interconnect)
         .combine_first(generators["NERC Region"].map(const.nercregion2interconnect))
     )
+    generators["Grid Voltage (kV)"] = generators["Grid Voltage (kV)"].map(floatify)
 
     # Ensure we have Pmax and Pmin for each generator
     generators["Pmax"] = generators[
@@ -345,10 +431,7 @@ def build_plant(bus, substations, kwargs={}):
     # Aggregate hydro generators within each plant
     generators = aggregate_hydro_generators_by_plant_id(generators)
 
-    print("Mapping generators to substations... (this may take several minutes)")
-    generators["sub_id"] = generators.apply(
-        lambda x: map_generator_to_sub_by_location(x, substation_groupby), axis=1
-    )
+    map_generators_to_sub_by_location(generators, substations)
     generators["bus_id"] = generators.apply(
         lambda x: map_generator_to_bus_by_sub(x, bus_groupby), axis=1
     )

From e7c6259a6490ad60ea77b20f3e277e04f44fe760 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Tue, 22 Feb 2022 15:44:09 -0800
Subject: [PATCH 09/11] refactor: connect transformers in 'cascade' rather than
 'tree'

---
 .../griddata/hifld/data_process/tests/test_transmission.py    | 2 +-
 prereise/gather/griddata/hifld/data_process/transmission.py   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/prereise/gather/griddata/hifld/data_process/tests/test_transmission.py b/prereise/gather/griddata/hifld/data_process/tests/test_transmission.py
index 7165a0bc7..10ea4a3cf 100644
--- a/prereise/gather/griddata/hifld/data_process/tests/test_transmission.py
+++ b/prereise/gather/griddata/hifld/data_process/tests/test_transmission.py
@@ -118,7 +118,7 @@ def test_create_transformers():
         dtype="float",
     )
     expected_transformers = pd.DataFrame(
-        {"from_bus_id": [0, 2, 4, 5], "to_bus_id": [1, 3, 6, 6]}
+        {"from_bus_id": [0, 2, 4, 5], "to_bus_id": [1, 3, 5, 6]}
     )
     transformers = create_transformers(bus)
     assert_frame_equal(transformers, expected_transformers)
diff --git a/prereise/gather/griddata/hifld/data_process/transmission.py b/prereise/gather/griddata/hifld/data_process/transmission.py
index 55d7e0e4b..760c41082 100644
--- a/prereise/gather/griddata/hifld/data_process/transmission.py
+++ b/prereise/gather/griddata/hifld/data_process/transmission.py
@@ -526,9 +526,9 @@ def create_transformers(bus):
         ["from_bus_id", "to_bus_id"].
     """
     bus_pairs = [
-        (b, volt_series.idxmax())
+        (b, volt_series.sort_values().index[i + 1])
         for sub, volt_series in bus.groupby("sub_id")["baseKV"]
-        for b in volt_series.sort_values().index[:-1]
+        for i, b in enumerate(volt_series.sort_values().index[:-1])
         if len(volt_series) > 1
     ]
 

From 42b9be39095e5dbb7e3e3f472c77cc36851be85e Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Tue, 8 Mar 2022 18:16:05 -0800
Subject: [PATCH 10/11] feat: add proxy substations

---
 prereise/gather/griddata/hifld/const.py                | 10 ++++++++++
 .../gather/griddata/hifld/data_process/transmission.py |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/prereise/gather/griddata/hifld/const.py b/prereise/gather/griddata/hifld/const.py
index 3250232c3..00fb76b3d 100644
--- a/prereise/gather/griddata/hifld/const.py
+++ b/prereise/gather/griddata/hifld/const.py
@@ -511,6 +511,16 @@
     "west": -125,
 }
 
+proxy_substations = [
+    {"LATITUDE": 35.0514, "LONGITUDE": -81.0694, "NAME": "Catawba", "STATE": "SC"},
+    {
+        "LATITUDE": 44.2853,
+        "LONGITUDE": -105.3826,
+        "NAME": "Neil Simpson II",
+        "STATE": "WY",
+    },
+]
+
 seams_substations = {
     "east_west": {
         202364,  # 'LAMAR HVDC TIE'
diff --git a/prereise/gather/griddata/hifld/data_process/transmission.py b/prereise/gather/griddata/hifld/data_process/transmission.py
index 760c41082..b8b3ada04 100644
--- a/prereise/gather/griddata/hifld/data_process/transmission.py
+++ b/prereise/gather/griddata/hifld/data_process/transmission.py
@@ -745,6 +745,9 @@ def build_transmission(method="line2sub", **kwargs):
     # Filter substations based on their `LINES` attribute, check for location dupes
     substations = filter_substations_with_zero_lines(hifld_substations)
     check_for_location_conflicts(substations)
+    # Append the proxy substations to the source data
+    substations = pd.concat([substations, pd.DataFrame(const.proxy_substations)])
+    substations.index.name = "ID"
 
     # Filter out keyword arguments for filter_islands_and_connect_with_mst function
     island_kwargs = dict()

From ffb2d4da265788ded166fcf65123abc53cb37ee9 Mon Sep 17 00:00:00 2001
From: Daniel Olsen <daniel@breakthroughenergy.org>
Date: Wed, 9 Mar 2022 09:46:19 -0800
Subject: [PATCH 11/11] feat: add overrides to substation LINES filtering

---
 prereise/gather/griddata/hifld/const.py                     | 2 ++
 prereise/gather/griddata/hifld/data_process/transmission.py | 1 +
 2 files changed, 3 insertions(+)

diff --git a/prereise/gather/griddata/hifld/const.py b/prereise/gather/griddata/hifld/const.py
index 00fb76b3d..2e9e570d8 100644
--- a/prereise/gather/griddata/hifld/const.py
+++ b/prereise/gather/griddata/hifld/const.py
@@ -521,6 +521,8 @@
     },
 ]
 
+substations_lines_filter_override = {301995}
+
 seams_substations = {
     "east_west": {
         202364,  # 'LAMAR HVDC TIE'
diff --git a/prereise/gather/griddata/hifld/data_process/transmission.py b/prereise/gather/griddata/hifld/data_process/transmission.py
index b8b3ada04..6a64dea12 100644
--- a/prereise/gather/griddata/hifld/data_process/transmission.py
+++ b/prereise/gather/griddata/hifld/data_process/transmission.py
@@ -743,6 +743,7 @@ def build_transmission(method="line2sub", **kwargs):
     hifld_zones = get_zone(os.path.join(hifld_data_dir, "zone.csv"))  # noqa: F841
 
     # Filter substations based on their `LINES` attribute, check for location dupes
+    hifld_substations.loc[const.substations_lines_filter_override, "LINES"] = None
     substations = filter_substations_with_zero_lines(hifld_substations)
     check_for_location_conflicts(substations)
     # Append the proxy substations to the source data