[Issue #2664] Join data from multiple sprint boards (#2694)
- Adds a new `analytics.etl` sub-package and moves the transformation
code in `analytics.datasets.issues` to `analytics.etl.github`
- Updates the `export_sprint_data()` and `export_roadmap_data()` functions to
include `project_owner` and `project_number` fields
- Updates the `export_github_data()` entry point to accept a config file
instead of separate command line options for the sprint board owner,
project number, sprint field, etc.
- Allows users to specify multiple sprint project boards in the config
file, which are loaded and combined by the `GitHubProjectETL` class (see
the usage sketch below)
- Updates docs to reference the updated entrypoint for the GitHub export
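
For reference, a minimal sketch of the new config-driven flow from Python, mirroring the `cli.py` changes in this commit (the file paths are placeholders):

```python
from pathlib import Path

from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL
from analytics.etl.utils import load_config

# Parse the project board config (placeholder path) into a GitHubProjectConfig
config = load_config(Path("config/github-projects.json"), GitHubProjectConfig)

# The CLI sets these from --temp-dir and --output-file; hard-coded here for illustration
config.temp_dir = "data"
config.output_file = "data/delivery-data.json"

# Export each sprint board plus the roadmap, then join and flatten the results
GitHubProjectETL(config).run()
```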
widal001 authored Nov 4, 2024
1 parent 785079e commit 96cc55e
Showing 17 changed files with 944 additions and 382 deletions.
22 changes: 16 additions & 6 deletions analytics/Makefile
@@ -7,6 +7,8 @@ REPO ?= simpler-grants-gov
SPRINT_PROJECT ?= 13
ROADMAP_PROJECT ?= 12
OUTPUT_DIR ?= data
CONFIG_DIR ?= config
PROJECT_CONFIG_FILE ?= $(CONFIG_DIR)/github-projects.json
SPRINT_FILE ?= $(OUTPUT_DIR)/sprint-data.json
ROADMAP_FILE ?= $(OUTPUT_DIR)/roadmap-data.json
ISSUE_FILE ?= $(OUTPUT_DIR)/issue-data.json
@@ -169,12 +171,9 @@ delivery-data-export:
@echo "=> Exporting GitHub issue and sprint data for delivery metrics"
@echo "====================================================="
$(POETRY) analytics export gh_delivery_data \
--owner $(ORG) \
--sprint-project $(SPRINT_PROJECT) \
--roadmap-project $(ROADMAP_PROJECT) \
--config-file $(PROJECT_CONFIG_FILE) \
--output-file $(DELIVERY_FILE) \
--points-field "$(POINTS_FIELD)" \
--sprint-field "$(SPRINT_FIELD)"
--temp-dir $(OUTPUT_DIR)

issue-data-export:
@echo "=> Exporting issue data from the repository"
@@ -187,12 +186,23 @@ issue-data-export:
gh-data-export: sprint-data-export issue-data-export roadmap-data-export delivery-data-export

sprint-burndown:
@echo "=> Running sprint burndown report"
@echo "=> Running sprint burndown report for HHS/13"
@echo "====================================================="
$(POETRY) analytics calculate sprint_burndown \
--issue-file $(DELIVERY_FILE) \
--output-dir $(OUTPUT_DIR) \
--sprint "$(SPRINT)" \
--project 13 \
--unit $(UNIT) \
--$(ACTION)
@echo "====================================================="
@echo "=> Running sprint burndown report for HHS/17"
@echo "====================================================="
$(POETRY) analytics calculate sprint_burndown \
--issue-file $(DELIVERY_FILE) \
--output-dir $(OUTPUT_DIR) \
--sprint "$(SPRINT)" \
--project 17 \
--unit $(UNIT) \
--$(ACTION)

22 changes: 22 additions & 0 deletions analytics/config/github-projects.json
@@ -0,0 +1,22 @@
{
"roadmap_project": {
"owner": "HHS",
"project_number": 12,
"quad_field": "Quad",
"pillar_field": "Pillar"
},
"sprint_projects": [
{
"owner": "HHS",
"project_number": 13,
"sprint_field": "Sprint",
"points_field": "Story Points"
},
{
"owner": "HHS",
"project_number": 17,
"sprint_field": "Sprint",
"points_field": "Points"
}
]
}
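
The models that parse this file live in `analytics.etl.github` and are not shown in this diff. A hypothetical pydantic sketch of how the JSON above could map onto `GitHubProjectConfig` (class and field names below are assumptions, not the actual definitions):

```python
from pydantic import BaseModel

# Hypothetical models -- shaped after the JSON above, not copied from analytics.etl.github
class RoadmapConfig(BaseModel):
    owner: str
    project_number: int
    quad_field: str = "Quad"
    pillar_field: str = "Pillar"


class SprintBoardConfig(BaseModel):
    owner: str
    project_number: int
    sprint_field: str = "Sprint"
    points_field: str = "Points"


class GitHubProjectConfig(BaseModel):
    roadmap_project: RoadmapConfig
    sprint_projects: list[SprintBoardConfig]
    temp_dir: str = "data"  # overridden by the CLI via --temp-dir
    output_file: str = "data/delivery-data.json"  # overridden via --output-file
```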
59 changes: 23 additions & 36 deletions analytics/src/analytics/cli.py
@@ -11,6 +11,8 @@

from analytics.datasets.deliverable_tasks import DeliverableTasks
from analytics.datasets.issues import GitHubIssues
from analytics.etl.github import GitHubProjectConfig, GitHubProjectETL
from analytics.etl.utils import load_config
from analytics.integrations import db, github, slack
from analytics.metrics.base import BaseMetric, Unit
from analytics.metrics.burndown import SprintBurndown
Expand All @@ -22,6 +24,7 @@

# fmt: off
# Instantiate typer options with help text for the commands below
CONFIG_FILE_ARG = typer.Option(help="Path to JSON file with configurations for this entrypoint")
SPRINT_FILE_ARG = typer.Option(help="Path to file with exported sprint data")
ISSUE_FILE_ARG = typer.Option(help="Path to file with exported issue data")
ROADMAP_FILE_ARG = typer.Option(help="Path to file with exported roadmap data")
@@ -85,44 +88,20 @@ def export_github_issue_data(

@export_app.command(name="gh_delivery_data")
def export_github_data(
owner: Annotated[str, OWNER_ARG],
sprint_project: Annotated[int, PROJECT_ARG],
roadmap_project: Annotated[int, PROJECT_ARG],
config_file: Annotated[str, CONFIG_FILE_ARG],
output_file: Annotated[str, OUTPUT_FILE_ARG],
sprint_field: Annotated[str, FIELD_ARG] = "Sprint",
points_field: Annotated[str, FIELD_ARG] = "Points",
tmp_dir: Annotated[str, TMP_DIR_ARG] = "data",
temp_dir: Annotated[str, TMP_DIR_ARG],
) -> None:
"""Export and flatten metadata about GitHub issues used for delivery metrics."""
# Specify path to intermediate files
sprint_file = Path(tmp_dir) / "sprint-data.json"
roadmap_file = Path(tmp_dir) / "roadmap-data.json"

# Export sprint and roadmap data
logger.info("Exporting roadmap data")
github.export_roadmap_data(
owner=owner,
project=roadmap_project,
quad_field="Quad",
pillar_field="Pillar",
output_file=str(roadmap_file),
)
logger.info("Exporting sprint data")
github.export_sprint_data(
owner=owner,
project=sprint_project,
sprint_field=sprint_field,
points_field=points_field,
output_file=str(sprint_file),
)

# load and flatten data into GitHubIssues dataset
logger.info("Transforming exported data")
issues = GitHubIssues.load_from_json_files(
sprint_file=str(sprint_file),
roadmap_file=str(roadmap_file),
)
issues.to_json(output_file)
# Configure ETL pipeline
config_path = Path(config_file)
if not config_path.exists():
typer.echo(f"Not a path to a valid config file: {config_path}")
config = load_config(config_path, GitHubProjectConfig)
config.temp_dir = temp_dir
config.output_file = output_file
# Run ETL pipeline
GitHubProjectETL(config).run()


# ===========================================================
@@ -139,12 +118,20 @@ def calculate_sprint_burndown(
show_results: Annotated[bool, SHOW_RESULTS_ARG] = False,
post_results: Annotated[bool, POST_RESULTS_ARG] = False,
output_dir: Annotated[str, OUTPUT_DIR_ARG] = "data",
owner: Annotated[str, OWNER_ARG] = "HHS",
project: Annotated[int, PROJECT_ARG] = 13,
) -> None:
"""Calculate the burndown for a particular sprint."""
# load the input data
sprint_data = GitHubIssues.from_json(issue_file)
# calculate burndown
burndown = SprintBurndown(sprint_data, sprint=sprint, unit=unit)
burndown = SprintBurndown(
sprint_data,
sprint=sprint,
unit=unit,
project=project,
owner=owner,
)
show_and_or_post_results(
metric=burndown,
show_results=show_results,
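
With the new `--owner` and `--project` options, burndown can be computed per sprint board. A rough Python equivalent using the names in the diff above; the sprint name and the `Unit.points` member are illustrative assumptions:

```python
from analytics.datasets.issues import GitHubIssues
from analytics.metrics.base import Unit
from analytics.metrics.burndown import SprintBurndown

# Load the combined dataset produced by `analytics export gh_delivery_data`
sprint_data = GitHubIssues.from_json("data/delivery-data.json")

# Burndown for one sprint on the HHS/17 board
burndown = SprintBurndown(
    sprint_data,
    sprint="Sprint 10",   # placeholder sprint name
    unit=Unit.points,     # assumed Unit member
    project=17,
    owner="HHS",
)
```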
145 changes: 13 additions & 132 deletions analytics/src/analytics/datasets/issues.py
@@ -2,13 +2,11 @@

import logging
from enum import Enum
from typing import Self

import pandas as pd
from pydantic import BaseModel, Field, ValidationError
from pydantic import BaseModel, Field

from analytics.datasets.base import BaseDataset
from analytics.datasets.utils import load_json_file

logger = logging.getLogger(__name__)

@@ -31,7 +29,10 @@ class IssueType(Enum):
class IssueMetadata(BaseModel):
"""Stores information about issue type and parent (if applicable)."""

# Common metadata -- attributes about the issue common to both projects
# Project metadata -- attributes about the sprint project board
project_owner: str
project_number: int
# Issue metadata -- attributes about the issue common to both projects
issue_title: str
issue_url: str
issue_parent: str | None
@@ -77,6 +78,7 @@ def __init__(self, df: pd.DataFrame) -> None:
self.sprint_col = "sprint_name"
self.sprint_start_col = "sprint_start"
self.sprint_end_col = "sprint_end"
self.project_col = "project_number"
self.date_cols = [
self.sprint_start_col,
self.sprint_end_col,
@@ -127,133 +129,12 @@ def get_sprint_name_from_date(self, date: pd.Timestamp) -> str | None:

def to_dict(self) -> list[dict]:
"""Convert this dataset to a python dictionary."""
# Convert date cols into dates
# Temporarily convert date cols into strings before exporting
for col in self.date_cols:
# strip off the timestamp portion of the date
self.df[col] = self.df[col].dt.strftime("%Y-%m-%d")
return super().to_dict()

@classmethod
def load_from_json_files(
cls,
sprint_file: str = "data/sprint-data.json",
roadmap_file: str = "data/roadmap-data.json",
) -> Self:
"""Load GitHubIssues dataset from input json files."""
# Load sprint and roadmap data
sprint_data_in = load_json_file(sprint_file)
roadmap_data_in = load_json_file(roadmap_file)
# Populate a lookup table with this data
lookup: dict = {}
lookup = populate_issue_lookup_table(lookup, roadmap_data_in)
lookup = populate_issue_lookup_table(lookup, sprint_data_in)
# Flatten and write issue level data to output file
issues = flatten_issue_data(lookup)
return cls(pd.DataFrame(data=issues))


# ===============================================================
# Transformation helper functions
# ===============================================================


def populate_issue_lookup_table(
lookup: dict[str, IssueMetadata],
issues: list[dict],
) -> dict[str, IssueMetadata]:
"""Populate a lookup table that maps issue URLs to their issue type and parent."""
for i, issue in enumerate(issues):
try:
entry = IssueMetadata.model_validate(issue)
except ValidationError as err: # noqa: PERF203
logger.error("Error parsing row %d, skipped.", i) # noqa: TRY400
logger.debug("Error: %s", err)
continue
lookup[entry.issue_url] = entry
return lookup


def get_parent_with_type(
child_url: str,
lookup: dict[str, IssueMetadata],
type_wanted: IssueType,
) -> IssueMetadata | None:
"""
Traverse the lookup table to find an issue's parent with a specific type.
This is useful if we have multiple nested issues, and we want to find the
top level deliverable or epic that a given task or bug is related to.
"""
# Get the initial child issue and its parent (if applicable) from the URL
child = lookup.get(child_url)
if not child:
err = f"Lookup doesn't contain issue with url: {child_url}"
raise ValueError(err)
if not child.issue_parent:
return None

# Travel up the issue hierarchy until we:
# - Find a parent issue with the desired type
# - Get to an issue without a parent
# - Have traversed 5 issues (breaks out of issue cycles)
max_traversal = 5
parent_url = child.issue_parent
for _ in range(max_traversal):
parent = lookup.get(parent_url)
# If no parent is found, return None
if not parent:
return None
# If the parent matches the desired type, return it
if IssueType(parent.issue_type) == type_wanted:
return parent
# If the parent doesn't have its own parent, return None
if not parent.issue_parent:
return None
# Otherwise update the parent_url to "grandparent" and continue
parent_url = parent.issue_parent

# Return the URL of the parent deliverable (or None)
return None


def flatten_issue_data(lookup: dict[str, IssueMetadata]) -> list[dict]:
"""Flatten issue data and inherit data from parent epic an deliverable."""
result: list[dict] = []
for issue in lookup.values():
# If the issue is a deliverable or epic, move to the next one
if IssueType(issue.issue_type) in [IssueType.DELIVERABLE, IssueType.EPIC]:
continue

# Get the parent deliverable, if the issue has one
deliverable = get_parent_with_type(
child_url=issue.issue_url,
lookup=lookup,
type_wanted=IssueType.DELIVERABLE,
)
if deliverable:
# Set deliverable metadata
issue.deliverable_title = deliverable.issue_title
issue.deliverable_url = deliverable.issue_url
issue.deliverable_pillar = deliverable.deliverable_pillar
# Set quad metadata
issue.quad_id = deliverable.quad_id
issue.quad_name = deliverable.quad_name
issue.quad_start = deliverable.quad_start
issue.quad_end = deliverable.quad_end
issue.quad_length = deliverable.quad_length

# Get the parent epic, if the issue has one
epic = get_parent_with_type(
child_url=issue.issue_url,
lookup=lookup,
type_wanted=IssueType.EPIC,
)
if epic:
issue.epic_title = epic.issue_title
issue.epic_url = epic.issue_url

# Add the issue to the results
result.append(issue.__dict__)

# Return the results
return result
# Return the dictionary
export_dict = super().to_dict()
# Convert date columns back into dates
for col in self.date_cols:
self.df[col] = pd.to_datetime(self.df[col]).dt.floor("d")
return export_dict
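
The revised `to_dict()` stringifies the date columns for export and then converts them back, presumably so the in-memory DataFrame keeps its datetime columns after an export. A simplified, standalone sketch of that round-trip (not the actual class):

```python
import pandas as pd

# Stand-in for one of the GitHubIssues date columns
df = pd.DataFrame({"sprint_start": pd.to_datetime(["2024-11-04", "2024-11-18"])})

# Temporarily convert the date column to strings before exporting
df["sprint_start"] = df["sprint_start"].dt.strftime("%Y-%m-%d")
records = df.to_dict(orient="records")  # e.g. [{'sprint_start': '2024-11-04'}, ...]

# Convert the column back into dates so later date arithmetic still works
df["sprint_start"] = pd.to_datetime(df["sprint_start"]).dt.floor("d")
```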
1 change: 1 addition & 0 deletions analytics/src/analytics/etl/__init__.py
@@ -0,0 +1 @@
"""Run extract, transform, and load (ETL) pipelines to generate datasets."""