[Issue #2664] Join data from multiple sprint boards (#2694)
- Adds a new `analytics.etl` sub-package and moves the transformation
code in `analytics.datasets.issues` to `analytics.etl.github`
- Updates the `export_sprint_data()` and `export_roadmap_data()` functions to
include `project_owner` and `project_number` fields
- Updates the `export_github_data()` entry point to accept a config file
instead of separate command line options for the sprint board owner,
project number, sprint field, etc.
- Allows users to specify multiple sprint project boards in the config
file, which are loaded and combined by the `GitHubProjectETL` class (see
the usage sketch below)
- Updates docs to reference the updated entrypoint for the GitHub export
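
For reference, a minimal sketch of the new config-driven flow from Python, mirroring the `cli.py` changes in this commit (the file paths are placeholders):

```python
from pathlib import Path

from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL
from analytics.etl.utils import load_config

# Parse the project board config (placeholder path) into a GitHubProjectConfig
config = load_config(Path("config/github-projects.json"), GitHubProjectConfig)

# The CLI sets these from --temp-dir and --output-file; hard-coded here for illustration
config.temp_dir = "data"
config.output_file = "data/delivery-data.json"

# Export each sprint board plus the roadmap, then join and flatten the results
GitHubProjectETL(config).run()
```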
widal001 authored Nov 4, 2024
1 parent 785079e commit 96cc55e
Showing 17 changed files with 944 additions and 382 deletions.
22 changes: 16 additions & 6 deletions analytics/Makefile
@@ -7,6 +7,8 @@ REPO ?= simpler-grants-gov
SPRINT_PROJECT ?= 13
ROADMAP_PROJECT ?= 12
OUTPUT_DIR ?= data
CONFIG_DIR ?= config
PROJECT_CONFIG_FILE ?= $(CONFIG_DIR)/github-projects.json
SPRINT_FILE ?= $(OUTPUT_DIR)/sprint-data.json
ROADMAP_FILE ?= $(OUTPUT_DIR)/roadmap-data.json
ISSUE_FILE ?= $(OUTPUT_DIR)/issue-data.json
@@ -169,12 +171,9 @@ delivery-data-export:
@echo "=> Exporting GitHub issue and sprint data for delivery metrics"
@echo "====================================================="
$(POETRY) analytics export gh_delivery_data \
--owner $(ORG) \
--sprint-project $(SPRINT_PROJECT) \
--roadmap-project $(ROADMAP_PROJECT) \
--config-file $(PROJECT_CONFIG_FILE) \
--output-file $(DELIVERY_FILE) \
--points-field "$(POINTS_FIELD)" \
--sprint-field "$(SPRINT_FIELD)"
--temp-dir $(OUTPUT_DIR)

issue-data-export:
@echo "=> Exporting issue data from the repository"
@@ -187,12 +186,23 @@ issue-data-export:
gh-data-export: sprint-data-export issue-data-export roadmap-data-export delivery-data-export

sprint-burndown:
@echo "=> Running sprint burndown report"
@echo "=> Running sprint burndown report for HHS/13"
@echo "====================================================="
$(POETRY) analytics calculate sprint_burndown \
--issue-file $(DELIVERY_FILE) \
--output-dir $(OUTPUT_DIR) \
--sprint "$(SPRINT)" \
--project 13 \
--unit $(UNIT) \
--$(ACTION)
@echo "====================================================="
@echo "=> Running sprint burndown report for HHS/17"
@echo "====================================================="
$(POETRY) analytics calculate sprint_burndown \
--issue-file $(DELIVERY_FILE) \
--output-dir $(OUTPUT_DIR) \
--sprint "$(SPRINT)" \
--project 17 \
--unit $(UNIT) \
--$(ACTION)

22 changes: 22 additions & 0 deletions analytics/config/github-projects.json
@@ -0,0 +1,22 @@
{
"roadmap_project": {
"owner": "HHS",
"project_number": 12,
"quad_field": "Quad",
"pillar_field": "Pillar"
},
"sprint_projects": [
{
"owner": "HHS",
"project_number": 13,
"sprint_field": "Sprint",
"points_field": "Story Points"
},
{
"owner": "HHS",
"project_number": 17,
"sprint_field": "Sprint",
"points_field": "Points"
}
]
}
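
The models that parse this file live in `analytics.etl.github` and are not shown in this diff. A hypothetical pydantic sketch of how the JSON above could map onto `GitHubProjectConfig` (class and field names below are assumptions, not the actual definitions):

```python
from pydantic import BaseModel

# Hypothetical models -- shaped after the JSON above, not copied from analytics.etl.github
class RoadmapConfig(BaseModel):
    owner: str
    project_number: int
    quad_field: str = "Quad"
    pillar_field: str = "Pillar"


class SprintBoardConfig(BaseModel):
    owner: str
    project_number: int
    sprint_field: str = "Sprint"
    points_field: str = "Points"


class GitHubProjectConfig(BaseModel):
    roadmap_project: RoadmapConfig
    sprint_projects: list[SprintBoardConfig]
    temp_dir: str = "data"  # overridden by the CLI via --temp-dir
    output_file: str = "data/delivery-data.json"  # overridden via --output-file
```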
59 changes: 23 additions & 36 deletions analytics/src/analytics/cli.py
@@ -11,6 +11,8 @@

from analytics.datasets.deliverable_tasks import DeliverableTasks
from analytics.datasets.issues import GitHubIssues
from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL
from analytics.etl.utils import load_config
from analytics.integrations import db, github, slack
from analytics.metrics.base import BaseMetric, Unit
from analytics.metrics.burndown import SprintBurndown
Expand All @@ -22,6 +24,7 @@

# fmt: off
# Instantiate typer options with help text for the commands below
CONFIG_FILE_ARG = typer.Option(help="Path to JSON file with configurations for this entrypoint")
SPRINT_FILE_ARG = typer.Option(help="Path to file with exported sprint data")
ISSUE_FILE_ARG = typer.Option(help="Path to file with exported issue data")
ROADMAP_FILE_ARG = typer.Option(help="Path to file with exported roadmap data")
@@ -85,44 +88,20 @@ def export_github_issue_data(

@export_app.command(name="gh_delivery_data")
def export_github_data(
owner: Annotated[str, OWNER_ARG],
sprint_project: Annotated[int, PROJECT_ARG],
roadmap_project: Annotated[int, PROJECT_ARG],
config_file: Annotated[str, CONFIG_FILE_ARG],
output_file: Annotated[str, OUTPUT_FILE_ARG],
sprint_field: Annotated[str, FIELD_ARG] = "Sprint",
points_field: Annotated[str, FIELD_ARG] = "Points",
tmp_dir: Annotated[str, TMP_DIR_ARG] = "data",
temp_dir: Annotated[str, TMP_DIR_ARG],
) -> None:
"""Export and flatten metadata about GitHub issues used for delivery metrics."""
# Specify path to intermediate files
sprint_file = Path(tmp_dir) / "sprint-data.json"
roadmap_file = Path(tmp_dir) / "roadmap-data.json"

# Export sprint and roadmap data
logger.info("Exporting roadmap data")
github.export_roadmap_data(
owner=owner,
project=roadmap_project,
quad_field="Quad",
pillar_field="Pillar",
output_file=str(roadmap_file),
)
logger.info("Exporting sprint data")
github.export_sprint_data(
owner=owner,
project=sprint_project,
sprint_field=sprint_field,
points_field=points_field,
output_file=str(sprint_file),
)

# load and flatten data into GitHubIssues dataset
logger.info("Transforming exported data")
issues = GitHubIssues.load_from_json_files(
sprint_file=str(sprint_file),
roadmap_file=str(roadmap_file),
)
issues.to_json(output_file)
# Configure ETL pipeline
config_path = Path(config_file)
if not config_path.exists():
typer.echo(f"Not a path to a valid config file: {config_path}")
config = load_config(config_path, GitHubProjectConfig)
config.temp_dir = temp_dir
config.output_file = output_file
# Run ETL pipeline
GitHubProjectETL(config).run()


# ===========================================================
@@ -139,12 +118,20 @@ def calculate_sprint_burndown(
show_results: Annotated[bool, SHOW_RESULTS_ARG] = False,
post_results: Annotated[bool, POST_RESULTS_ARG] = False,
output_dir: Annotated[str, OUTPUT_DIR_ARG] = "data",
owner: Annotated[str, OWNER_ARG] = "HHS",
project: Annotated[int, PROJECT_ARG] = 13,
) -> None:
"""Calculate the burndown for a particular sprint."""
# load the input data
sprint_data = GitHubIssues.from_json(issue_file)
# calculate burndown
burndown = SprintBurndown(sprint_data, sprint=sprint, unit=unit)
burndown = SprintBurndown(
sprint_data,
sprint=sprint,
unit=unit,
project=project,
owner=owner,
)
show_and_or_post_results(
metric=burndown,
show_results=show_results,
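
With the new `--owner` and `--project` options, burndown can be computed per sprint board. A rough Python equivalent using the names in the diff above; the sprint name and the `Unit.points` member are illustrative assumptions:

```python
from analytics.datasets.issues import GitHubIssues
from analytics.metrics.base import Unit
from analytics.metrics.burndown import SprintBurndown

# Load the combined dataset produced by `analytics export gh_delivery_data`
sprint_data = GitHubIssues.from_json("data/delivery-data.json")

# Burndown for one sprint on the HHS/17 board
burndown = SprintBurndown(
    sprint_data,
    sprint="Sprint 10",   # placeholder sprint name
    unit=Unit.points,     # assumed Unit member
    project=17,
    owner="HHS",
)
```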
145 changes: 13 additions & 132 deletions analytics/src/analytics/datasets/issues.py
@@ -2,13 +2,11 @@

import logging
from enum import Enum
from typing import Self

import pandas as pd
from pydantic import BaseModel, Field, ValidationError
from pydantic import BaseModel, Field

from analytics.datasets.base import BaseDataset
from analytics.datasets.utils import load_json_file

logger = logging.getLogger(__name__)

@@ -31,7 +29,10 @@ class IssueType(Enum):
class IssueMetadata(BaseModel):
"""Stores information about issue type and parent (if applicable)."""

# Common metadata -- attributes about the issue common to both projects
# Project metadata -- attributes about the sprint project board
project_owner: str
project_number: int
# Issue metadata -- attributes about the issue common to both projects
issue_title: str
issue_url: str
issue_parent: str | None
@@ -77,6 +78,7 @@ def __init__(self, df: pd.DataFrame) -> None:
self.sprint_col = "sprint_name"
self.sprint_start_col = "sprint_start"
self.sprint_end_col = "sprint_end"
self.project_col = "project_number"
self.date_cols = [
self.sprint_start_col,
self.sprint_end_col,
@@ -127,133 +129,12 @@ def get_sprint_name_from_date(self, date: pd.Timestamp) -> str | None:

def to_dict(self) -> list[dict]:
"""Convert this dataset to a python dictionary."""
# Convert date cols into dates
# Temporarily convert date cols into strings before exporting
for col in self.date_cols:
# strip off the timestamp portion of the date
self.df[col] = self.df[col].dt.strftime("%Y-%m-%d")
return super().to_dict()

@classmethod
def load_from_json_files(
cls,
sprint_file: str = "data/sprint-data.json",
roadmap_file: str = "data/roadmap-data.json",
) -> Self:
"""Load GitHubIssues dataset from input json files."""
# Load sprint and roadmap data
sprint_data_in = load_json_file(sprint_file)
roadmap_data_in = load_json_file(roadmap_file)
# Populate a lookup table with this data
lookup: dict = {}
lookup = populate_issue_lookup_table(lookup, roadmap_data_in)
lookup = populate_issue_lookup_table(lookup, sprint_data_in)
# Flatten and write issue level data to output file
issues = flatten_issue_data(lookup)
return cls(pd.DataFrame(data=issues))


# ===============================================================
# Transformation helper functions
# ===============================================================


def populate_issue_lookup_table(
lookup: dict[str, IssueMetadata],
issues: list[dict],
) -> dict[str, IssueMetadata]:
"""Populate a lookup table that maps issue URLs to their issue type and parent."""
for i, issue in enumerate(issues):
try:
entry = IssueMetadata.model_validate(issue)
except ValidationError as err: # noqa: PERF203
logger.error("Error parsing row %d, skipped.", i) # noqa: TRY400
logger.debug("Error: %s", err)
continue
lookup[entry.issue_url] = entry
return lookup


def get_parent_with_type(
child_url: str,
lookup: dict[str, IssueMetadata],
type_wanted: IssueType,
) -> IssueMetadata | None:
"""
Traverse the lookup table to find an issue's parent with a specific type.
This is useful if we have multiple nested issues, and we want to find the
top level deliverable or epic that a given task or bug is related to.
"""
# Get the initial child issue and its parent (if applicable) from the URL
child = lookup.get(child_url)
if not child:
err = f"Lookup doesn't contain issue with url: {child_url}"
raise ValueError(err)
if not child.issue_parent:
return None

# Travel up the issue hierarchy until we:
# - Find a parent issue with the desired type
# - Get to an issue without a parent
# - Have traversed 5 issues (breaks out of issue cycles)
max_traversal = 5
parent_url = child.issue_parent
for _ in range(max_traversal):
parent = lookup.get(parent_url)
# If no parent is found, return None
if not parent:
return None
# If the parent matches the desired type, return it
if IssueType(parent.issue_type) == type_wanted:
return parent
# If the parent doesn't have its own parent, return None
if not parent.issue_parent:
return None
# Otherwise update the parent_url to "grandparent" and continue
parent_url = parent.issue_parent

# Return the URL of the parent deliverable (or None)
return None


def flatten_issue_data(lookup: dict[str, IssueMetadata]) -> list[dict]:
"""Flatten issue data and inherit data from parent epic an deliverable."""
result: list[dict] = []
for issue in lookup.values():
# If the issue is a deliverable or epic, move to the next one
if IssueType(issue.issue_type) in [IssueType.DELIVERABLE, IssueType.EPIC]:
continue

# Get the parent deliverable, if the issue has one
deliverable = get_parent_with_type(
child_url=issue.issue_url,
lookup=lookup,
type_wanted=IssueType.DELIVERABLE,
)
if deliverable:
# Set deliverable metadata
issue.deliverable_title = deliverable.issue_title
issue.deliverable_url = deliverable.issue_url
issue.deliverable_pillar = deliverable.deliverable_pillar
# Set quad metadata
issue.quad_id = deliverable.quad_id
issue.quad_name = deliverable.quad_name
issue.quad_start = deliverable.quad_start
issue.quad_end = deliverable.quad_end
issue.quad_length = deliverable.quad_length

# Get the parent epic, if the issue has one
epic = get_parent_with_type(
child_url=issue.issue_url,
lookup=lookup,
type_wanted=IssueType.EPIC,
)
if epic:
issue.epic_title = epic.issue_title
issue.epic_url = epic.issue_url

# Add the issue to the results
result.append(issue.__dict__)

# Return the results
return result
# Return the dictionary
export_dict = super().to_dict()
# Convert date columns back into dates
for col in self.date_cols:
self.df[col] = pd.to_datetime(self.df[col]).dt.floor("d")
return export_dict
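
The revised `to_dict()` stringifies the date columns for export and then converts them back, presumably so the in-memory DataFrame keeps its datetime columns after an export. A simplified, standalone sketch of that round-trip (not the actual class):

```python
import pandas as pd

# Stand-in for one of the GitHubIssues date columns
df = pd.DataFrame({"sprint_start": pd.to_datetime(["2024-11-04", "2024-11-18"])})

# Temporarily convert the date column to strings before exporting
df["sprint_start"] = df["sprint_start"].dt.strftime("%Y-%m-%d")
records = df.to_dict(orient="records")  # e.g. [{'sprint_start': '2024-11-04'}, ...]

# Convert the column back into dates so later date arithmetic still works
df["sprint_start"] = pd.to_datetime(df["sprint_start"]).dt.floor("d")
```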
1 change: 1 addition & 0 deletions analytics/src/analytics/etl/__init__.py
@@ -0,0 +1 @@
"""Run extract, transform, and load (ETL) pipelines to generate datasets."""