[Issue #2590] Replace gh in analytics ETL #3393

Open · wants to merge 12 commits into base: main
14 changes: 0 additions & 14 deletions analytics/Dockerfile
@@ -18,7 +18,6 @@ RUN apt-get update \
libpq-dev \
postgresql \
wget \
jq \
Collaborator Author:
Removing jq because we no longer need it for transformations
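
For context, a minimal sketch of the kind of field selection that used to run through jq and now lives in the Python ETL code (both the jq filter and the function below are illustrative, not the actual removed command):

import json

# Old approach (illustrative): shell out to jq inside the container, e.g.
#   jq '[.items[] | {title, status}]' export.json
# New approach: do the same selection natively in Python.
def select_fields(path: str) -> list[dict]:
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return [{"title": i["title"], "status": i["status"]} for i in data["items"]]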

# Install security updates
# https://pythonspeed.com/articles/security-updates-in-docker/
&& apt-get upgrade --yes \
@@ -28,24 +27,11 @@ RUN apt-get update \
libpq-dev \
postgresql \
wget \
jq \
# Reduce the image size by clearing apt cached lists
# Complies with https://github.com/codacy/codacy-hadolint/blob/master/codacy-hadolint/docs/description/DL3009.md
&& rm -fr /var/lib/apt/lists/* \
&& rm /etc/ssl/private/ssl-cert-snakeoil.key

# Install gh CLI
# docs: https://github.com/cli/cli/blob/trunk/docs/install_linux.md
Comment on lines -37 to -38
Collaborator Author:
Removing this script because we no longer need the gh CLI

SHELL ["/bin/bash", "-o", "pipefail", "-c"]
RUN mkdir -p /etc/apt/keyrings \
&& wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \
&& chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \
&& apt-get update \
&& apt-get install gh -y \
&& rm -fr /var/lib/apt/lists/* \
&& gh --version

ARG RUN_UID
ARG RUN_USER

5 changes: 3 additions & 2 deletions analytics/config.py
@@ -1,14 +1,14 @@
"""Loads configuration variables from settings files

"""
import os
from typing import Optional
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import Field

# reads environment variables from .env files defaulting to "local.env"
class PydanticBaseEnvConfig(BaseSettings):
model_config = SettingsConfigDict(env_file="%s.env" % os.getenv("ENVIRONMENT", "local"), extra="allow")

class DBSettings(PydanticBaseEnvConfig):
db_host: str = Field(alias="DB_HOST")
@@ -19,6 +19,7 @@ class DBSettings(PydanticBaseEnvConfig):
ssl_mode: str = Field("require", alias="DB_SSL_MODE")
db_schema: str = Field ("app", alias="DB_SCHEMA")
slack_bot_token: str = Field(alias="ANALYTICS_SLACK_BOT_TOKEN")
github_token: str = Field(alias="GH_TOKEN")
Collaborator Author:
Added this because we now need to reference it directly within the codebase, instead of indirectly like we did previously with the gh CLI
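
A minimal sketch of the difference (the gh invocation in the comment is illustrative of the old indirect pattern; get_db_settings and github_token are the names used in client.py later in this PR):

# Before: the token was only consumed indirectly, via the gh CLI reading
# the GH_TOKEN environment variable (illustrative):
#   subprocess.run(["gh", "project", "item-list", ...], check=True)

# After: the token is read from settings and attached to API requests directly.
from config import get_db_settings

settings = get_db_settings()
headers = {"Authorization": f"Bearer {settings.github_token}"}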

Collaborator:
Since we are in this file, can we rename DBSettings to something more accurate?

reporting_channel_id: str = Field(alias="ANALYTICS_REPORTING_CHANNEL_ID")
aws_region: Optional[str] = Field(None, alias="AWS_REGION")
local_env: bool = True if os.getenv("ENVIRONMENT", "local") == "local" else False
3 changes: 2 additions & 1 deletion analytics/local.env
@@ -31,14 +31,15 @@ MB_DB_PASS=secret123
MB_DB_HOST=grants-analytics-db

###########################
# Slack Configuration #
# Secret Configuration #
###########################
# Do not add these values to this file
# to avoid mistakenly committing them.
# Set these in your shell
# by doing `export ANALYTICS_REPORTING_CHANNEL_ID=whatever`
ANALYTICS_REPORTING_CHANNEL_ID=DO_NOT_SET_HERE
ANALYTICS_SLACK_BOT_TOKEN=DO_NOT_SET_HERE
GH_TOKEN=DO_NOT_SET_HERE
Collaborator Author:
Prevents tests from failing if someone hasn't set their GitHub token locally.


############################
# Logging
3 changes: 0 additions & 3 deletions analytics/pyproject.toml
@@ -68,8 +68,6 @@ line-length = 100
[tool.ruff.lint]
select = ["ALL"]
ignore = [
"ANN101", # missing type annotation for self
"ANN102", # missing type annotation for cls
Comment on lines -71 to -72
Collaborator Author:
Removed these because the latest version of ruff no longer includes them

"D203", # no blank line before class
"D212", # multi-line summary first line
"FIX002", # line contains TODO
@@ -78,7 +76,6 @@ ignore = [
"PTH123", # `open()` should be replaced by `Path.open()`
"RUF012", # Mutable class attributes should be annotated with `typing.ClassVar`
"TD003", # missing an issue link on TODO
"PT004", # pytest fixture leading underscore - is marked deprecated
Collaborator Author:
Same with this one

"FA102", # Adding "from __future__ import annotations" to any new-style type annotation
]

8 changes: 7 additions & 1 deletion analytics/src/analytics/etl/github.py
@@ -101,6 +101,7 @@ def __init__(self, config: GitHubProjectConfig) -> None:
self.config = config
# Declare private attributes shared across ETL steps
self._transient_files: list[InputFiles]
self.client = github.GitHubGraphqlClient()
self.dataset: GitHubIssues

def run(self) -> None:
@@ -121,7 +122,8 @@ def extract(self) -> None:
output_file=roadmap_file,
)

# Export sprint data
# Export sprint data for each GitHub project that the scrum teams use
# to manage their sprints, e.g. HHS/17 and HHS/13
input_files: list[InputFiles] = []
for sprint_board in self.config.sprint_projects:
project = sprint_board.project_number
@@ -167,6 +169,7 @@ def _export_roadmap_data(
)
# Export the data
github.export_roadmap_data(
client=self.client,
owner=roadmap.owner,
project=roadmap.project_number,
quad_field=roadmap.quad_field,
@@ -186,6 +189,7 @@ def _export_sprint_data(
sprint_board.project_number,
)
github.export_sprint_data(
client=self.client,
owner=sprint_board.owner,
project=sprint_board.project_number,
sprint_field=sprint_board.sprint_field,
@@ -201,6 +205,8 @@

def run_transformation_pipeline(files: InputFiles) -> list[dict]:
"""Load data from input files and apply transformations."""
# Log the current sprint for which we're running the transformations
logger.info("Running transformations for sprint: %s", files.sprint)
# Load sprint and roadmap data
sprint_data_in = load_json_file(files.sprint)
roadmap_data_in = load_json_file(files.roadmap)
2 changes: 2 additions & 0 deletions analytics/src/analytics/integrations/github/__init__.py
@@ -1,10 +1,12 @@
"""Export data from GitHub."""

__all__ = [
"GitHubGraphqlClient",
"export_roadmap_data",
"export_sprint_data",
]

from analytics.integrations.github.client import GitHubGraphqlClient
from analytics.integrations.github.main import (
export_roadmap_data,
export_sprint_data,
)
139 changes: 139 additions & 0 deletions analytics/src/analytics/integrations/github/client.py
@@ -0,0 +1,139 @@
"""Expose a client for making calls to GitHub's GraphQL API."""

import logging
from typing import Any

import requests

from config import get_db_settings

logger = logging.getLogger(__name__)


class GraphqlError(Exception):
"""
Exception raised for errors returned by the GraphQL API.

Attributes
----------
errors : list
List of error details returned by the API.
message : str
Human-readable explanation of the error.

"""

def __init__(self, errors: list[dict]) -> None:
"""Initialize the GraphqlError."""
self.errors = errors
self.message = f"GraphQL API returned errors: {errors}"
super().__init__(self.message)


class GitHubGraphqlClient:
"""
A client to interact with GitHub's GraphQL API.

Methods
-------
execute_paginated_query(query, variables, data_path, batch_size=100)
Executes a paginated GraphQL query and returns all results.

"""

def __init__(self) -> None:
"""
Initialize the GitHubClient.

Parameters
----------
token : str
GitHub personal access token for authentication.

"""
settings = get_db_settings()
self.endpoint = "https://api.github.com/graphql"
self.headers = {
"Authorization": f"Bearer {settings.github_token}",
"Content-Type": "application/json",
"GraphQL-Features": "sub_issues,issue_types",
}

def execute_query(self, query: str, variables: dict[str, str | int]) -> dict:
"""
Make a POST request to the GitHub GraphQL API.

Parameters
----------
query : str
The GraphQL query string.
variables : dict
A dictionary of variables to pass to the query.

Returns
-------
dict
The JSON response from the API.

"""
response = requests.post(
self.endpoint,
headers=self.headers,
json={"query": query, "variables": variables},
timeout=60,
)
response.raise_for_status()
result = response.json()
if "errors" in result:
raise GraphqlError(result["errors"])
return result

def execute_paginated_query(
self,
query: str,
variables: dict[str, Any],
path_to_nodes: list[str],
batch_size: int = 100,
) -> list[dict]:
"""
Execute a paginated GraphQL query.

Parameters
----------
query : str
The GraphQL query string.
variables : dict
A dictionary of variables to pass to the query.
path_to_nodes : list of str
The path to traverse the response data to extract the "nodes" list,
so the nodes can be combined from multiple paginated responses.
batch_size : int, optional
The number of items to fetch per batch, by default 100.

Returns
-------
list of dict
The combined results from all paginated responses.

"""
all_data = []
has_next_page = True
variables["batch"] = batch_size
variables["endCursor"] = None

while has_next_page:
response = self.execute_query(query, variables)
data = response["data"]

# Traverse the data path to extract nodes
for key in path_to_nodes:
data = data[key]

all_data.extend(data["nodes"])

# Handle pagination
page_info = data["pageInfo"]
has_next_page = page_info["hasNextPage"]
variables["endCursor"] = page_info["endCursor"]

return all_data
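
A usage sketch for the new client. The query below is illustrative rather than part of this PR, but it follows the contract execute_paginated_query expects: it accepts $batch and $endCursor variables and exposes nodes plus pageInfo { hasNextPage endCursor } at the location named by path_to_nodes. HHS/13 is one of the sprint boards mentioned above.

from analytics.integrations.github.client import GitHubGraphqlClient

# Illustrative query: list the titles of items on a project board.
query = """
query ($login: String!, $project: Int!, $batch: Int!, $endCursor: String) {
  organization(login: $login) {
    projectV2(number: $project) {
      items(first: $batch, after: $endCursor) {
        nodes { content { ... on Issue { title } } }
        pageInfo { hasNextPage endCursor }
      }
    }
  }
}
"""

client = GitHubGraphqlClient()
items = client.execute_paginated_query(
    query,
    variables={"login": "HHS", "project": 13},
    path_to_nodes=["organization", "projectV2", "items"],
)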