-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Issue #2590] Replace gh
in analytics ETL
#3393
base: main
Are you sure you want to change the base?
Changes from all commits
13b94af
f075de0
73a6f79
fe41e37
23a3478
69d7109
26e3fd2
7c3bbb0
538b009
a1e7969
93e1012
e347dba
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,7 +18,6 @@ RUN apt-get update \ | |
libpq-dev \ | ||
postgresql \ | ||
wget \ | ||
jq \ | ||
# Install security updates | ||
# https://pythonspeed.com/articles/security-updates-in-docker/ | ||
&& apt-get upgrade --yes \ | ||
|
@@ -28,24 +27,11 @@ RUN apt-get update \ | |
libpq-dev \ | ||
postgresql \ | ||
wget \ | ||
jq \ | ||
# Reduce the image size by clear apt cached lists | ||
# Complies with https://github.com/codacy/codacy-hadolint/blob/master/codacy-hadolint/docs/description/DL3009.md | ||
&& rm -fr /var/lib/apt/lists/* \ | ||
&& rm /etc/ssl/private/ssl-cert-snakeoil.key | ||
|
||
# Install gh CLI | ||
# docs: https://github.com/cli/cli/blob/trunk/docs/install_linux.md | ||
Comment on lines
-37
to
-38
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removing this script because we no longer need the |
||
SHELL ["/bin/bash", "-o", "pipefail", "-c"] | ||
RUN mkdir -p /etc/apt/keyrings \ | ||
&& wget -qO- https://cli.github.com/packages/githubcli-archive-keyring.gpg | tee /etc/apt/keyrings/githubcli-archive-keyring.gpg > /dev/null \ | ||
&& chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \ | ||
&& echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" | tee /etc/apt/sources.list.d/github-cli.list > /dev/null \ | ||
&& apt-get update \ | ||
&& apt-get install gh -y \ | ||
&& rm -fr /var/lib/apt/lists/* \ | ||
&& gh --version | ||
|
||
ARG RUN_UID | ||
ARG RUN_USER | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,14 @@ | ||
"""Loads configuration variables from settings files | ||
|
||
""" | ||
import os | ||
import os | ||
from typing import Optional | ||
from pydantic_settings import BaseSettings, SettingsConfigDict | ||
from pydantic import Field | ||
|
||
# reads environment variables from .env files defaulting to "local.env" | ||
class PydanticBaseEnvConfig(BaseSettings): | ||
model_config = SettingsConfigDict(env_file="%s.env" % os.getenv("ENVIRONMENT", "local"), extra="allow") | ||
model_config = SettingsConfigDict(env_file="%s.env" % os.getenv("ENVIRONMENT", "local"), extra="allow") | ||
|
||
class DBSettings(PydanticBaseEnvConfig): | ||
db_host: str = Field(alias="DB_HOST") | ||
|
@@ -19,6 +19,7 @@ class DBSettings(PydanticBaseEnvConfig): | |
ssl_mode: str = Field("require", alias="DB_SSL_MODE") | ||
db_schema: str = Field ("app", alias="DB_SCHEMA") | ||
slack_bot_token: str = Field(alias="ANALYTICS_SLACK_BOT_TOKEN") | ||
github_token: str = Field(alias="GH_TOKEN") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added this because we now need to reference it directly within the codebase, instead of indirectly like we did previously with the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we are in this file, can we rename |
||
reporting_channel_id: str = Field(alias="ANALYTICS_REPORTING_CHANNEL_ID") | ||
aws_region: Optional[str] = Field(None, alias="AWS_REGION") | ||
local_env: bool = True if os.getenv("ENVIRONMENT", "local") == "local" else False | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,14 +31,15 @@ MB_DB_PASS=secret123 | |
MB_DB_HOST=grants-analytics-db | ||
|
||
########################### | ||
# Slack Configuration # | ||
# Secret Configuration # | ||
########################### | ||
# Do not add these values to this file | ||
# to avoid mistakenly committing them. | ||
# Set these in your shell | ||
# by doing `export ANALYTICS_REPORTING_CHANNEL_ID=whatever` | ||
ANALYTICS_REPORTING_CHANNEL_ID=DO_NOT_SET_HERE | ||
ANALYTICS_SLACK_BOT_TOKEN=DO_NOT_SET_HERE | ||
GH_TOKEN=DO_NOT_SET_HERE | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Prevents tests from failing if someone hasn't set their GitHub token locally. |
||
|
||
############################ | ||
# Logging | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,8 +68,6 @@ line-length = 100 | |
[tool.ruff.lint] | ||
select = ["ALL"] | ||
ignore = [ | ||
"ANN101", # missing type annotation for self | ||
"ANN102", # missing type annotation for cls | ||
Comment on lines
-71
to
-72
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed these because they've been removed in the latest version of ruff |
||
"D203", # no blank line before class | ||
"D212", # multi-line summary first line | ||
"FIX002", # line contains TODO | ||
|
@@ -78,7 +76,6 @@ ignore = [ | |
"PTH123", # `open()` should be replaced by `Path.open()` | ||
"RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` | ||
"TD003", # missing an issue link on TODO | ||
"PT004", # pytest fixture leading underscore - is marked deprecated | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same with this one |
||
"FA102", # Adding "from __future__ import annotations" to any new-style type annotation | ||
] | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
"""Expose a client for making calls to GitHub's GraphQL API.""" | ||
|
||
import logging | ||
from typing import Any | ||
|
||
import requests | ||
|
||
from config import get_db_settings | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class GraphqlError(Exception): | ||
""" | ||
Exception raised for errors returned by the GraphQL API. | ||
|
||
Attributes | ||
---------- | ||
errors : list | ||
List of error details returned by the API. | ||
message : str | ||
Human-readable explanation of the error. | ||
|
||
""" | ||
|
||
def __init__(self, errors: list[dict]) -> None: | ||
"""Initialize the GraphqlError.""" | ||
self.errors = errors | ||
self.message = f"GraphQL API returned errors: {errors}" | ||
super().__init__(self.message) | ||
|
||
|
||
class GitHubGraphqlClient: | ||
""" | ||
A client to interact with GitHub's GraphQL API. | ||
|
||
Methods | ||
------- | ||
execute_paginated_query(query, variables, data_path, batch_size=100) | ||
Executes a paginated GraphQL query and returns all results. | ||
|
||
""" | ||
|
||
def __init__(self) -> None: | ||
""" | ||
Initialize the GitHubClient. | ||
|
||
Parameters | ||
---------- | ||
token : str | ||
GitHub personal access token for authentication. | ||
|
||
""" | ||
settings = get_db_settings() | ||
self.endpoint = "https://api.github.com/graphql" | ||
self.headers = { | ||
"Authorization": f"Bearer {settings.github_token}", | ||
"Content-Type": "application/json", | ||
"GraphQL-Features": "sub_issues,issue_types", | ||
} | ||
|
||
def execute_query(self, query: str, variables: dict[str, str | int]) -> dict: | ||
""" | ||
Make a POST request to the GitHub GraphQL API. | ||
|
||
Parameters | ||
---------- | ||
query : str | ||
The GraphQL query string. | ||
variables : dict | ||
A dictionary of variables to pass to the query. | ||
|
||
Returns | ||
------- | ||
dict | ||
The JSON response from the API. | ||
|
||
""" | ||
response = requests.post( | ||
self.endpoint, | ||
headers=self.headers, | ||
json={"query": query, "variables": variables}, | ||
timeout=60, | ||
) | ||
response.raise_for_status() | ||
result = response.json() | ||
if "errors" in result: | ||
raise GraphqlError(result["errors"]) | ||
return result | ||
|
||
def execute_paginated_query( | ||
self, | ||
query: str, | ||
variables: dict[str, Any], | ||
path_to_nodes: list[str], | ||
batch_size: int = 100, | ||
) -> list[dict]: | ||
""" | ||
Execute a paginated GraphQL query. | ||
|
||
Parameters | ||
---------- | ||
query : str | ||
The GraphQL query string. | ||
variables : dict | ||
A dictionary of variables to pass to the query. | ||
path_to_nodes : list of str | ||
The path to traverse the response data to extract the "nodes" list, | ||
so the nodes can be combined from multiple paginated responses. | ||
batch_size : int, optional | ||
The number of items to fetch per batch, by default 100. | ||
|
||
Returns | ||
------- | ||
list of dict | ||
The combined results from all paginated responses. | ||
|
||
""" | ||
all_data = [] | ||
has_next_page = True | ||
variables["batch"] = batch_size | ||
variables["endCursor"] = None | ||
|
||
while has_next_page: | ||
response = self.execute_query(query, variables) | ||
data = response["data"] | ||
|
||
# Traverse the data path to extract nodes | ||
for key in path_to_nodes: | ||
data = data[key] | ||
|
||
all_data.extend(data["nodes"]) | ||
|
||
# Handle pagination | ||
page_info = data["pageInfo"] | ||
has_next_page = page_info["hasNextPage"] | ||
variables["endCursor"] = page_info["endCursor"] | ||
|
||
return all_data |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removing
jq
because we no longer need it for transformations