Change aggregation and remove SQLite from ETL pipeline (#23)
* Updated jupyter notebook to directly compare

I wanted to see how far off the aggregated file (src/lib/data.json) is, all in a single dataframe.

* Add Caity Maple's 2026 campaign to config

* Remove prisma & aggregate in-memory from JSON files

I think the database was an unnecessary layer of complexity, so I'm aggregating using in-memory JSON objects instead. Also ensuring that the FPPC ID of the reporting committee is carried all the way through to the data.
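
In place of a Prisma query against SQLite, the new approach filters plain JSON arrays in memory. A minimal sketch of that swap, with made-up sample records (the `"schedule-a"` key and `fppcId` field mirror the diff; the data is illustrative):

```javascript
// Sample in-memory data in the shape the loader produces (illustrative values).
const data = {
  "schedule-a": [
    { fppcId: "1435034", contributorLastName: "Smith", amount: 100 },
    { fppcId: "1435034", contributorLastName: "Smith", amount: 250 },
    { fppcId: "9999999", contributorLastName: "Jones", amount: 500 },
  ],
};

// Instead of `prisma.scheduleA.findMany({ where: { fppcId: { in: ids } } })`,
// filter the plain JSON array by committee ID:
function findByCommittees(records, committeeIds) {
  return records.filter((d) => committeeIds.includes(d.fppcId));
}

const scheduleA = findByCommittees(data["schedule-a"], ["1435034"]);
const total = scheduleA.reduce((sum, d) => sum + d.amount, 0);
console.log(total); // 350
```

The trade-off is that the whole dataset must fit in memory, but for a single year of filings that is a reasonable simplification over maintaining a database layer.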

* Account for different aggregated data shape

Aggregation now keeps the contributors separated by filing committee for easier auditing, but that means we want to group everything back together before we show it to the user.
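
The regrouping described above could look roughly like this (a sketch, not the app's actual route code; field names follow the diff, sample rows are invented):

```javascript
// One row per (contributor, filing committee), as the aggregate step now emits.
const rows = [
  { contributorId: "smith--jane--95814", fppcId: "1435034", amount: 100 },
  { contributorId: "smith--jane--95814", fppcId: "1458316", amount: 50 },
  { contributorId: "jones--bob--95816", fppcId: "1435034", amount: 75 },
];

// Merge rows for the same contributor, summing amounts and keeping the
// committee IDs so the per-committee breakdown isn't lost.
function regroup(rows) {
  const byContributor = new Map();
  for (const row of rows) {
    const existing = byContributor.get(row.contributorId);
    if (existing) {
      existing.amount += row.amount;
      existing.committees.push(row.fppcId);
    } else {
      byContributor.set(row.contributorId, {
        contributorId: row.contributorId,
        amount: row.amount,
        committees: [row.fppcId],
      });
    }
  }
  return Array.from(byContributor.values());
}

const merged = regroup(rows);
console.log(merged.length); // 2
```

Keeping the committee-separated rows in `$lib/data.json` and merging only at display time means the audit trail survives in the published data file.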

* Re-run Jupyter notebook

Removed prisma, but still need to remove it from the election route, which doesn't work

* Update readme since sqlite was removed

* Readme changes
jeremiak authored Sep 26, 2023
1 parent 3a549a3 commit 14a7576
Showing 14 changed files with 346 additions and 639 deletions.
561 changes: 206 additions & 355 deletions Campaign_finance_check.ipynb

Large diffs are not rendered by default.

31 changes: 2 additions & 29 deletions README.md
@@ -18,40 +18,13 @@ The application is built using [SvelteKit](https://kit.svelte.dev). If you're un

## Data generation

If you'd like to run the scraper, use `node scripts/index.js`.

The web scraper requires Node and sqlite-utils.

### Install [sqlite-utils](https://sqlite-utils.datasette.io/en/stable/installation.html)

#### Mac
```shell
brew install sqlite-utils
```

#### Windows and Linux
```shell
pipx install sqlite-utils
```

Windows and Linux require [pipx](https://pypa.github.io/pipx/installation/) to install sqlite-utils.

#### Windows Install pipx
```shell
py -3 -m pip install --user pipx
py -3 -m pipx ensurepath
```

#### Linux Mint (Ubuntu) Install pipx
```shell
sudo apt install pipx
```
You'll need Node installed if you'd like to run the scraper. Once you're ready to go, run the command `node scripts/index.js`.

### Scraper methodology

Runs from `scripts/index.js`, which calls out to the other files in `scripts/`. It:
1. downloads a single year, usually the current year, from the two portals as a ZIP archive (`download.js`)
2. unzips the downloaded file into an Excel file (`extract.js`)
3. convert that Excel file into a series of `.json` files within `data/` (`transform.js`) - the files are stored here so that we only have to download a single year's worth of data to update
4. loads the data into a SQLite database so we can do some subsequent aggregation (`loads.js`)
4. loads all the data in-memory so we can do some subsequent aggregation (`loads.js`)
5. create `$lib/data.json` with the data needed for the body route (`aggregate.js`)
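
The five steps above are a linear pipeline where each stage consumes the previous stage's output. A minimal sketch of how such sequencing might look (the stub steps below stand in for `download.js`, `extract.js`, `transform.js`, `loads.js`, and `aggregate.js`; the real `scripts/index.js` may wire things differently):

```javascript
// Run an array of async steps in order, threading each result into the next.
async function runPipeline(steps) {
  let result;
  for (const step of steps) {
    result = await step(result);
  }
  return result;
}

// Stub stages mirroring the README's five steps (names and return values
// are illustrative, not the actual scripts' interfaces).
const ran = [];
const steps = [
  async () => { ran.push("download"); return "archive.zip"; },
  async (zip) => { ran.push("extract"); return `${zip} -> data.xlsx`; },
  async (xlsx) => { ran.push("transform"); return ["data/a.json"]; },
  async (files) => { ran.push("load"); return { "schedule-a": [] }; },
  async (data) => { ran.push("aggregate"); return "$lib/data.json"; },
];

runPipeline(steps).then((out) => console.log(ran.join(","), out));
```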
4 changes: 4 additions & 0 deletions config.js
@@ -71,6 +71,10 @@ const config = {
id: "1435034",
name: "CAITY MAPLE FOR CITY COUNCIL 2022",
},
{
id: "1458316",
name: "CAITY MAPLE FOR CITY COUNCIL 2026",
},
],
}, {
name: "Eric Guerra",
82 changes: 0 additions & 82 deletions package-lock.json

Some generated files are not rendered by default.

2 changes: 0 additions & 2 deletions package.json
@@ -16,7 +16,6 @@
"license": "CC-BY-SA",
"dependencies": {},
"devDependencies": {
"@prisma/client": "^5.0.0",
"@sveltejs/adapter-netlify": "^2.0.8",
"@sveltejs/kit": "^1.22.3",
"convert-excel-to-json": "^1.7.0",
@@ -30,7 +29,6 @@
"p-queue": "^7.3.4",
"prettier": "^3.0.0",
"prettier-plugin-svelte": "^3.0.0",
"prisma": "^5.0.0",
"progress": "^2.0.3",
"puppeteer": "^20.9.0",
"sass": "^1.64.0",
7 changes: 0 additions & 7 deletions prisma/.env

This file was deleted.

70 changes: 0 additions & 70 deletions prisma/schema.prisma

This file was deleted.

99 changes: 50 additions & 49 deletions scripts/aggregate.js
@@ -1,9 +1,5 @@
import _ from "lodash";
import { rollup, sum } from "d3-array";
import Queue from "p-queue";
import { PrismaClient } from "@prisma/client";

const prisma = new PrismaClient();

function createContributorId(d) {
return [
@@ -14,61 +10,66 @@ function createContributorId(d) {
].join("--");
}

export default async function aggregate(legislators, body) {
export default function aggregate(data, legislators, body) {
const legislatorsWithContributors = [];
const queue = new Queue({ concurrency: 2 });

legislators.forEach((legislator) => {
const { name, title, committees } = legislator;
const committeeIds = committees.map((d) => d.id);
queue.add(async () => {
const scheduleA = await prisma.scheduleA.findMany({
where: {
fppcId: { in: committeeIds },
},
});
const scheduleA = data['schedule-a'].filter((d) => committeeIds.includes(d.fppcId));
const scheduleC = data['schedule-c'].filter((d) => committeeIds.includes(d.fppcId));
const scheduleAAndC = [...scheduleA, ...scheduleC]

const rolled = rollup(scheduleA, (contributions) => {
const amount = sum(contributions, (d) => d.amount);
const {
filerName,
fppcId,
contributorCommitteeId,
contributorFirstName,
contributorLastName,
contributorCity,
contributorState,
contributorType,
contributorZip,
} = contributions[0];
return {
filerName,
fppcId,
contributorCommitteeId,
contributorFirstName,
contributorLastName,
contributorCity,
contributorState,
contributorType,
contributorZip,
amount,
};
}, createContributorId);
const rolled = rollup(scheduleAAndC, (contributions) => {
const amount = sum(contributions, (d) => d.amount);
const {
filerName,
fppcId,
contributorCommitteeId,
contributorFirstName,
contributorLastName,
contributorCity,
contributorState,
contributorType,
contributorZip,
} = contributions[0];
const dates = contributions.map(d => ({
date: d.date,
amount: d.amount
}))
return {
filerName,
fppcId,
contributorCommitteeId,
contributorFirstName,
contributorLastName,
contributorCity,
contributorState,
contributorType,
contributorZip,
amount,
dates,
};
}, createContributorId, d => d.fppcId);

const contributors = Array.from(rolled).map((d) => d[1]);
const sorted = _.orderBy(contributors, ["amount"], ["desc"]);
const contributors = []
rolled.forEach((d, contributorId) => {
d.forEach((data, fppcId) => {
contributors.push(data)
})
})

// const contributors = Array.from(rolled).map((d) => d[1]);
const sorted = _.orderBy(contributors, ["amount"], ["desc"]);

legislatorsWithContributors.push({
name,
title,
body,
committees,
contributors: sorted,
});
legislatorsWithContributors.push({
name,
title,
body,
committees,
contributors: sorted,
});
});

await queue.onIdle();

return legislatorsWithContributors;
}
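
The new `rollup` call passes two key functions (`createContributorId` and `d => d.fppcId`), so it returns a nested Map — contributor ID to committee ID to reduced record — which the loop after it flattens back into an array. A dependency-free sketch of that shape and the flattening step (this mimics d3-array's behavior rather than importing it; the sample records are invented):

```javascript
// Group records by two keys, then reduce each innermost group — the nested-Map
// shape d3-array's rollup(data, reduce, key1, key2) produces.
function nestedRollup(records, reduce, key1, key2) {
  const outer = new Map();
  for (const d of records) {
    const k1 = key1(d);
    if (!outer.has(k1)) outer.set(k1, new Map());
    const inner = outer.get(k1);
    const k2 = key2(d);
    if (!inner.has(k2)) inner.set(k2, []);
    inner.get(k2).push(d);
  }
  for (const inner of outer.values()) {
    for (const [k2, group] of inner) inner.set(k2, reduce(group));
  }
  return outer;
}

const records = [
  { id: "jane", fppcId: "1435034", amount: 100 },
  { id: "jane", fppcId: "1458316", amount: 50 },
  { id: "bob", fppcId: "1435034", amount: 75 },
];

const rolled = nestedRollup(
  records,
  (group) => ({ amount: group.reduce((s, d) => s + d.amount, 0) }),
  (d) => d.id,
  (d) => d.fppcId
);

// Flatten the nested Map back into a plain array, as the diff's forEach loop does:
const contributors = [];
rolled.forEach((inner) => {
  inner.forEach((value) => contributors.push(value));
});
console.log(contributors.length); // 3
```

Because the second key is the filing committee, contributions to different committees by the same contributor stay as separate rows here — the per-committee separation the commit message calls out for easier auditing.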