[issue 2927] Expand issue_history table to include project and sprint (…

…#2992) ## Summary Fixes #2927 ### Time to review: __5 mins__ ## Changes proposed > What was added, updated, or removed in this PR. - Extends transform/load to support the case in which an issue occurs multiple times in an input json file but with different project and sprint value for each instance - Alters `gh_issue_history` table to add columns `project_id` and `sprint_id`, and updates the issue transform/load step accordingly - Changes the unique constraint on `gh_issue_history` from `(issue_id, d_effective)` to `(issue_id, project_id, d_effective)` thereby allowing an issue to be mapped to concurrent sprints in separate projects - Updates tests ## Context for reviewers > Testing instructions, background context, more in-depth details of the implementation, and anything else you'd like to call out or ask reviewers. Explain how the changes were verified. The current schema does not allow for an issue to be mapped to multiple concurrent sprints (e.g. sprint 1.4 in project 13 and sprint 1.4 in project 17). The limitation causes minor calculation errors in percent complete metrics. This PR removes the limitation by extending the schema and the transform/load logic. Note: With this PR, the table `gh_issue_sprint_map` becomes obsolete, but it still exists in the database and transform/load is still writing to it. After the new iteration of `gh_issue_history` is verified to be sufficient for percent complete metrics calculation, the obsolete table and code will be deleted. ## Additional information > Screenshots, GIF demos, code examples or output to help show the changes working as expected. In schema version 4, `gh_issue_history` looks like this: ``` Column | Type | Collation | Nullable | Default -------------+-----------------------------+-----------+----------+---------------------------------------------- id | integer | | not null | nextval('gh_issue_history_id_seq'::regclass) issue_id | integer | | not null | status | text | | | is_closed | integer | | not null | points | integer | | not null | 0 d_effective | date | | not null | t_created | timestamp without time zone | | | CURRENT_TIMESTAMP t_modified | timestamp without time zone | | | CURRENT_TIMESTAMP Indexes: "gh_issue_history_pkey" PRIMARY KEY, btree (id) "gh_ih_i1" btree (issue_id, d_effective) "gh_issue_history_issue_id_d_effective_key" UNIQUE CONSTRAINT, btree (issue_id, d_effective) ``` In schema version 5, `gh_issue_history` looks like this: ``` Column | Type | Collation | Nullable | Default -------------+-----------------------------+-----------+----------+---------------------------------------------- id | integer | | not null | nextval('gh_issue_history_id_seq'::regclass) issue_id | integer | | not null | status | text | | | is_closed | integer | | not null | points | integer | | not null | 0 d_effective | date | | not null | t_created | timestamp without time zone | | | CURRENT_TIMESTAMP t_modified | timestamp without time zone | | | CURRENT_TIMESTAMP project_id | integer | | | 0 sprint_id | integer | | | 0 Indexes: "gh_issue_history_pkey" PRIMARY KEY, btree (id) "gh_ih_i1" btree (issue_id, d_effective) "gh_issue_history_issue_id_project_id_d_effective_key" UNIQUE CONSTRAINT, btree (issue_id, project_id, d_effective) ```
HHS · Nov 22, 2024 · 3fc12a4 · 3fc12a4
1 parent 37ca2a3
commit 3fc12a4
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 12 deletions.
diff --git a/analytics/src/analytics/datasets/etl_dataset.py b/analytics/src/analytics/datasets/etl_dataset.py
@@ -144,6 +144,11 @@ def get_issue(self, issue_ghid: str) -> pd.Series:
         query_string = f"issue_ghid == '{issue_ghid}'"
         return self.df.query(query_string).iloc[0]
 
+    def get_issues(self, issue_ghid: str) -> pd.DataFrame:
+        """Fetch data about a given issue."""
+        query_string = f"issue_ghid == '{issue_ghid}'"
+        return self.df.query(query_string)
+
     def get_issue_ghids(self) -> NDArray[Any]:
         """Fetch an array of unique non-null issue ghids."""
         df = self.df[self.df.issue_ghid.notna()]

diff --git a/analytics/src/analytics/integrations/etldb/issue_model.py b/analytics/src/analytics/integrations/etldb/issue_model.py
@@ -99,18 +99,21 @@ def _insert_facts(
             "points": issue_df["issue_points"],
             "sprint_id": ghid_map[EtlEntityType.SPRINT].get(issue_df["sprint_ghid"]),
             "effective": self.dbh.effective_date,
+            "project_id": ghid_map[EtlEntityType.PROJECT].get(issue_df["project_ghid"]),
         }
         history_id = None
         map_id = None
 
         # insert into fact table: issue_history
         cursor = self.dbh.connection()
         insert_sql1 = text(
-            "insert into gh_issue_history (issue_id, status, is_closed, points, d_effective) "
-            "values (:issue_id, :status, :is_closed, :points, :effective) "
-            "on conflict (issue_id, d_effective) "
-            "do update set (status, is_closed, points, t_modified) = "
-            "(:status, :is_closed, :points, current_timestamp) "
+            "insert into gh_issue_history "
+            "(issue_id, status, is_closed, points, d_effective, project_id, sprint_id) "
+            "values "
+            "(:issue_id, :status, :is_closed, :points, :effective, :project_id, :sprint_id) "
+            "on conflict (issue_id, project_id, d_effective) "
+            "do update set (status, is_closed, points, t_modified, sprint_id) = "
+            "(:status, :is_closed, :points, current_timestamp, :sprint_id) "
             "returning id",
         )
         result1 = cursor.execute(insert_sql1, insert_values)
@@ -119,6 +122,7 @@ def _insert_facts(
             history_id = row1[0]
 
         # insert into fact table: issue_sprint_map
+        # note: issue_sprint_map will be removed after validating changes to issue_history
         insert_sql2 = text(
             "insert into gh_issue_sprint_map (issue_id, sprint_id, d_effective) "
             "values (:issue_id, :sprint_id, :effective) "

diff --git a/analytics/src/analytics/integrations/etldb/main.py b/analytics/src/analytics/integrations/etldb/main.py
@@ -141,10 +141,11 @@ def sync_issues(db: EtlDb, dataset: EtlDataset, ghid_map: dict) -> dict:
     result = {}
     model = EtlIssueModel(db)
     for ghid in dataset.get_issue_ghids():
-        issue_df = dataset.get_issue(ghid)
-        result[ghid], _ = model.sync_issue(issue_df, ghid_map)
-        if VERBOSE:
-            print(f"ISSUE '{ghid}' issue_id = {result[ghid]}")
+        all_rows = dataset.get_issues(ghid)
+        for _, issue_df in all_rows.iterrows():
+            result[ghid], _ = model.sync_issue(issue_df, ghid_map)
+            if VERBOSE:
+                print(f"ISSUE '{ghid}' issue_id = {result[ghid]}")
     return result
 
 

diff --git a/.../src/analytics/integrations/etldb/migrations/versions/0005_add_proj_col_to_issue_hist.sql b/.../src/analytics/integrations/etldb/migrations/versions/0005_add_proj_col_to_issue_hist.sql
@@ -0,0 +1,6 @@
+ALTER TABLE IF EXISTS gh_issue_history ADD COLUMN IF NOT EXISTS sprint_id INTEGER;
+ALTER TABLE IF EXISTS gh_issue_history ADD COLUMN IF NOT EXISTS project_id INTEGER NOT NULL DEFAULT 0;
+ALTER TABLE IF EXISTS gh_issue_history ALTER COLUMN project_id DROP DEFAULT;
+ALTER TABLE IF EXISTS gh_issue_history DROP CONSTRAINT IF EXISTS gh_issue_history_issue_id_d_effective_key;
+ALTER TABLE IF EXISTS gh_issue_history ADD UNIQUE (issue_id, project_id, d_effective);
+
diff --git a/analytics/tests/datasets/test_etldb.py b/analytics/tests/datasets/test_etldb.py
@@ -14,7 +14,7 @@ def test_load_from_json_files(self):
 
         row_count = dataset.df.shape[0]
         col_count = dataset.df.shape[1]
-        assert row_count == 22
+        assert row_count == 23
         assert col_count == 27
 
     def test_deliverable_fetchers(self):
@@ -56,6 +56,9 @@ def test_issue_fetchers(self):
         issue = dataset.get_issue(ghid)
         assert issue["issue_opened_at"] == "2024-11-07"
 
+        rows = dataset.get_issues(ghid)
+        assert len(rows) == 2
+
     def test_sprint_fetchers(self):
         """Deliverable fetchers should return expected values."""
         dataset = EtlDataset.load_from_json_file(self.TEST_FILE_1)
@@ -90,7 +93,7 @@ def test_project_fetchers(self):
         assert len(unique_ghids) == 2
 
         ghid = unique_ghids[0]
-        assert ghid == 13
+        assert ghid == 17
 
         project = dataset.get_project(ghid)
         assert project["project_name"] == "HHS"
diff --git a/analytics/tests/etldb_test_01.json b/analytics/tests/etldb_test_01.json
@@ -1,7 +1,7 @@
 [
   {
     "project_owner": "HHS",
-    "project_number": 13,
+    "project_number": 17,
     "issue_title": "Draft email content for participants",
     "issue_url": "https://github.com/HHS/simpler-grants-gov/issues/2763",
     "issue_parent": "https://github.com/HHS/simpler-grants-gov/issues/2719",
@@ -658,5 +658,35 @@
     "epic_url": "https://github.com/HHS/simpler-grants-gov/issues/2353",
     "epic_title": "Public dashboard with at least one static chart",
     "issue_state": "open"
+  },
+  {
+    "project_owner": "HHS",
+    "project_number": 13,
+    "issue_title": "Draft email content for participants",
+    "issue_url": "https://github.com/HHS/simpler-grants-gov/issues/2763",
+    "issue_parent": "https://github.com/HHS/simpler-grants-gov/issues/2719",
+    "issue_type": "Task",
+    "issue_is_closed": true,
+    "issue_opened_at": "2024-11-07",
+    "issue_closed_at": "2024-11-12",
+    "issue_points": null,
+    "issue_status": "Done",
+    "sprint_id": null,
+    "sprint_name": null,
+    "sprint_start": null,
+    "sprint_length": null,
+    "sprint_end": null,
+    "quad_id": null,
+    "quad_name": null,
+    "quad_start": null,
+    "quad_length": null,
+    "quad_end": null,
+    "deliverable_pillar": null,
+    "deliverable_url": null,
+    "deliverable_title": null,
+    "deliverable_status": null,
+    "epic_url": null,
+    "epic_title": null,
+    "issue_state": "closed"
   }
 ]