Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ops 2662 can migration 2 #2945

Merged
merged 13 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions backend/data_tools/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ azure-storage-blob = "==12.22.0"
azure-identity = "==1.17.1"
azure-keyvault-secrets = "==4.8.0"
loguru = "==0.7.2"
click = "==8.1.7"
aiohttp = "==3.10.10"

[dev-packages]
nox = "==2024.4.15"
Expand Down
580 changes: 500 additions & 80 deletions backend/data_tools/Pipfile.lock

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions backend/data_tools/environment/azure.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,46 @@ def db_connection_string(self) -> str:
db_host = os.getenv("PGHOST")
db_port = os.getenv("PGPORT")
db_name = os.getenv("PGDATABASE")

if not db_username or not db_password or not db_host or not db_port or not db_name:
raise ValueError("Missing environment variables for database connection.")

return (
f"postgresql+psycopg2://{db_username}:{db_password}@{db_host}:{db_port}/{db_name}"
)

@property
def verbosity(self) -> bool:
    """Return ``True``: verbose output is always on for this environment."""
    return True

@property
def is_remote(self) -> bool:
    """Return ``True``: this configuration describes a remote environment."""
    return True

@property
def vault_url(self) -> str:
    """Return the Azure vault URL read from the ``VAULT_URL`` environment variable.

    The return type is ``str`` (not ``str | None``) because this
    implementation never returns ``None``; it raises instead.

    Raises:
        ValueError: If ``VAULT_URL`` is unset or empty.
    """
    url = os.getenv("VAULT_URL")
    if not url:
        raise ValueError("Missing environment variable for Azure Vault URL.")
    return url

@property
def vault_file_storage_key(self) -> str:
    """Return the value of the ``VAULT_FILE_STORAGE_KEY`` environment variable.

    Raises:
        ValueError: If ``VAULT_FILE_STORAGE_KEY`` is unset or empty.
    """
    if storage_key := os.getenv("VAULT_FILE_STORAGE_KEY"):
        return storage_key
    raise ValueError("Missing environment variable for Azure Vault File Storage Key.")

@property
def file_storage_auth_method(self) -> str:
    """Return the file storage auth method from ``FILE_STORAGE_AUTH_METHOD``.

    The value must be one of ``"access_key"``, ``"rbac"`` or ``"mi"``.
    The return type is ``str`` because this implementation never returns
    ``None``; it raises instead.

    Raises:
        ValueError: If the variable is unset or empty, or holds a value
            outside the allowed set.
    """
    # Named for what it holds: the *method*, which may itself be "access_key".
    auth_method = os.getenv("FILE_STORAGE_AUTH_METHOD")

    if not auth_method:
        raise ValueError("Missing environment variable for FILE_STORAGE_AUTH_METHOD.")

    if auth_method not in ("access_key", "rbac", "mi"):
        raise ValueError("Invalid value for FILE_STORAGE_AUTH_METHOD. Must be either 'access_key' or 'rbac' or 'mi'.")

    return auth_method
20 changes: 20 additions & 0 deletions backend/data_tools/environment/dev.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,23 @@ def db_connection_string(self) -> str:
@property
def verbosity(self) -> bool:
return True

@property
def is_remote(self) -> bool:
return False

@property
def file_system_path(self) -> str:
return "."

@property
def vault_url(self) -> str | None:
return None

@property
def vault_file_storage_key(self) -> str | None:
return None

@property
def file_storage_auth_method(self) -> str | None:
return None
20 changes: 20 additions & 0 deletions backend/data_tools/environment/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,23 @@ def db_connection_string(self) -> str:
@property
def verbosity(self) -> bool:
return False

@property
def is_remote(self) -> bool:
return False

@property
def file_system_path(self) -> str:
return "."

@property
def vault_url(self) -> str | None:
return None

@property
def vault_file_storage_key(self) -> str | None:
return None

@property
def file_storage_auth_method(self) -> str | None:
return None
20 changes: 20 additions & 0 deletions backend/data_tools/environment/pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,23 @@ def db_connection_string(self) -> str:
@property
def verbosity(self) -> bool:
return True

@property
def is_remote(self) -> bool:
return False

@property
def file_system_path(self) -> str:
return "."

@property
def vault_url(self) -> str | None:
return None

@property
def vault_file_storage_key(self) -> str | None:
return None

@property
def file_storage_auth_method(self) -> str | None:
return None
39 changes: 38 additions & 1 deletion backend/data_tools/environment/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,47 @@ class DataToolsConfig(Protocol):
@property
@abstractmethod
def db_connection_string(self) -> str:
"""
Returns the connection string for the SQLAlchemy engine.
"""
...


@property
@abstractmethod
def verbosity(self) -> bool:
"""
Returns whether the SQLAlchemy engine is verbose or not.
"""
...

@property
@abstractmethod
def is_remote(self) -> bool:
"""
Returns whether the environment is remote or not, e.g. Azure, AWS, etc.
"""
...

@property
@abstractmethod
def vault_url(self) -> str | None:
"""
Returns the path to the cloud vault url when the environment is remote else returns None.
"""
...

@property
@abstractmethod
def vault_file_storage_key(self) -> str | None:
"""
Returns the key to the cloud vault file storage when the environment is remote else returns None.
"""
...

@property
@abstractmethod
def file_storage_auth_method(self) -> str | None:
"""
Returns whether to use the access key or role-based access control when the environment is remote else returns None.
"""
...
11 changes: 11 additions & 0 deletions backend/data_tools/scripts/azure/build_and_push_data_tools.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#
# Build the data-tools image and push it to an Azure Container Registry.
#
# Usage: build_and_push_data_tools.sh <registry-name>
#
# Run from the directory that contains Dockerfile.data-tools: the Dockerfile
# path is relative and the build context is ".".

# Stop at the first failure so a failed login or build can never tag and push
# a stale local image.
set -euo pipefail

REGISTRY_NAME=${1:?usage: $0 <registry-name>}

LOCAL_IMAGE="data-tools-test"
REMOTE_IMAGE="${REGISTRY_NAME}.azurecr.io/${LOCAL_IMAGE}:latest"

az acr login --name "${REGISTRY_NAME}"

docker build -f Dockerfile.data-tools -t "${LOCAL_IMAGE}" --platform linux/amd64 .

docker tag "${LOCAL_IMAGE}" "${REMOTE_IMAGE}"

docker push "${REMOTE_IMAGE}"
23 changes: 23 additions & 0 deletions backend/data_tools/scripts/azure/create_azure_vault.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
#
# Create an Azure Key Vault, grant a managed identity access to its secrets,
# and store the file storage access key in it.
#
# Usage: create_azure_vault.sh <resource-group> <vault-name> <file-storage-access-key> <managed-identity-name>

# Stop at the first failure so the secret is never written after a failed step.
set -euo pipefail

RESOURCE_GROUP_NAME=${1:?usage: $0 <resource-group> <vault-name> <file-storage-access-key> <managed-identity-name>}
VAULT_NAME=${2:?missing vault name}
FILE_STORAGE_ACCESS_KEY=${3:?missing file storage access key}
MI_NAME=${4:?missing managed identity name}

# An access policy is keyed by the identity's object (principal) id, not its name.
# NOTE(review): assumes the managed identity lives in RESOURCE_GROUP_NAME — confirm.
MI_OBJECT_ID=$(az identity show --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}" --query principalId --output tsv)

# Create the vault with the access-policy permission model, which the
# set-policy call below requires.
az keyvault create \
  --name "${VAULT_NAME}" \
  --resource-group "${RESOURCE_GROUP_NAME}" \
  --location eastus \
  --enable-rbac-authorization false

# Let the managed identity read, list and write secrets.
az keyvault set-policy \
  --name "${VAULT_NAME}" \
  --resource-group "${RESOURCE_GROUP_NAME}" \
  --object-id "${MI_OBJECT_ID}" \
  --secret-permissions get list set

# Store the secret in the vault created above (same name, no suffix).
# NOTE(review): --value exposes the key on the command line; consider --file.
az keyvault secret set --vault-name "${VAULT_NAME}" --name "file-storage-access-key" --value "${FILE_STORAGE_ACCESS_KEY}"
29 changes: 29 additions & 0 deletions backend/data_tools/scripts/azure/create_container_app_job.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env bash
#
# Create the manually triggered "data-tools-test-job" Container Apps job that
# runs get_csv.sh from the data-tools image.
#
# Usage: create_container_app_job.sh <resource-group> <managed-identity-name> \
#            <container-app-environment> <registry-name> [input-csv-url] [output-csv-url]
#
# The database settings (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE and
# their ADMIN_* counterparts) are read from the caller's environment and
# forwarded to the job.

# Stop at the first failure so a failed identity lookup never creates a job
# with empty identity values. "-u" is deliberately not set: the PG* variables
# may legitimately be unset in the caller's environment.
set -eo pipefail

RESOURCE_GROUP_NAME=${1:?usage: $0 <resource-group> <managed-identity-name> <container-app-environment> <registry-name> [input-csv-url] [output-csv-url]}
MI_NAME=${2:?missing managed identity name}
CAE_NAME=${3:?missing container app environment name}
REGISTRY_NAME=${4:?missing registry name}

# Optional overrides; the defaults are the URLs that were previously hard-coded.
INPUT_CSV_URL=${5:-https://tsytx8kx85test.blob.core.windows.net/data/can.tsv}
OUTPUT_CSV_URL=${6:-https://tsytx8kx85test.blob.core.windows.net/data/can.tsv}

# Look up the managed identity's resource id, client id and principal id.
MI_ID=$(az identity show --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}" --query id --output tsv)
MI_CLIENT_ID=$(az identity show --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}" --query clientId --output tsv)
MI_OBJECT_ID=$(az identity show --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}" --query principalId --output tsv)

# Create a container app job.
# NOTE(review): --args is passed as one comma-separated string; confirm this
# is the form the CLI expects for the container's startup arguments.
# NOTE(review): PGPASSWORD / ADMIN_PGPASSWORD are passed as plain environment
# variables; consider job secrets referenced from --env-vars instead.
az containerapp job create \
  --name "data-tools-test-job" \
  --resource-group "${RESOURCE_GROUP_NAME}" \
  --image "${REGISTRY_NAME}.azurecr.io/data-tools-test:latest" \
  --cpu 0.25 \
  --memory 0.5Gi \
  --trigger-type Manual \
  --args "/bin/ash, -c, ./data_tools/scripts/get_csv.sh azure ${INPUT_CSV_URL} ${OUTPUT_CSV_URL}" \
  --parallelism 1 \
  --replica-timeout 600 \
  --replica-retry-limit 0 \
  --replica-completion-count 1 \
  --environment "${CAE_NAME}" \
  --registry-identity "${MI_ID}" \
  --registry-server "${REGISTRY_NAME}.azurecr.io" \
  --env-vars ENV=azure FILE_STORAGE_AUTH_METHOD=mi MI_CLIENT_ID="${MI_CLIENT_ID}" MI_OBJECT_ID="${MI_OBJECT_ID}" PGUSER="${PGUSER}" ADMIN_PGUSER="${ADMIN_PGUSER}" PGPASSWORD="${PGPASSWORD}" ADMIN_PGPASSWORD="${ADMIN_PGPASSWORD}" PGHOST="${PGHOST}" PGPORT="${PGPORT}" ADMIN_PGHOST="${ADMIN_PGHOST}" ADMIN_PGPORT="${ADMIN_PGPORT}" PGDATABASE="${PGDATABASE}"
24 changes: 24 additions & 0 deletions backend/data_tools/scripts/azure/create_container_registry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
#
# Create a resource group, a Basic-SKU container registry and a user-assigned
# managed identity, then grant the identity the AcrPull role on the registry.
#
# Usage: create_container_registry.sh <resource-group> <registry-name> <managed-identity-name> <subscription-id>

# Stop at the first failure so later steps never run against missing resources.
set -euo pipefail

RESOURCE_GROUP_NAME=${1:?usage: $0 <resource-group> <registry-name> <managed-identity-name> <subscription-id>}
REGISTRY_NAME=${2:?missing registry name}
MI_NAME=${3:?missing managed identity name}
SUBSCRIPTION_ID=${4:?missing subscription id}

# Create a resource group
az group create --name "${RESOURCE_GROUP_NAME}" --location eastus

# Create a container registry
az acr create --name "${REGISTRY_NAME}" --resource-group "${RESOURCE_GROUP_NAME}" --sku Basic

# Create a user managed identity
az identity create --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}"

# Get the managed identity's principal (object) id
MI_PRINCIPAL_ID=$(az identity show --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}" --query principalId --output tsv)

# Assign the AcrPull role on the registry to the managed identity.
# The identity was created moments ago, so pass its object id and principal
# type explicitly; this avoids the directory lookup that --assignee performs
# and that can fail until the new principal has propagated.
az role assignment create \
  --role "AcrPull" \
  --assignee-object-id "${MI_PRINCIPAL_ID}" \
  --assignee-principal-type ServicePrincipal \
  --scope "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP_NAME}/providers/Microsoft.ContainerRegistry/registries/${REGISTRY_NAME}"
13 changes: 13 additions & 0 deletions backend/data_tools/scripts/azure/create_mi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
#
# Create a resource group and a user-assigned managed identity inside it.
#
# Usage: create_mi.sh <managed-identity-name> <resource-group>

# Stop at the first failure so the identity is not attempted when the
# resource group could not be created.
set -euo pipefail

MI_NAME=${1:?usage: $0 <managed-identity-name> <resource-group>}
RESOURCE_GROUP_NAME=${2:?missing resource group name}

# Create a resource group
az group create --name "${RESOURCE_GROUP_NAME}" --location eastus

# Create a user managed identity
az identity create --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}"

# To delete the managed identity again:
#   az identity delete --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}"
9 changes: 9 additions & 0 deletions backend/data_tools/scripts/azure/create_resource_group.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
#
# Create an Azure resource group in the eastus location.
#
# Usage: create_resource_group.sh <resource-group-name>

readonly LOCATION="eastus"
readonly GROUP_NAME="$1"

az group create --location "${LOCATION}" --name "${GROUP_NAME}"

# To remove the group again (returns without waiting for completion):
#   az group delete --name "<resource-group-name>" --yes --no-wait
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ RESOURCE_GROUP_NAME=$1
STORAGE_ACCOUNT_NAME=$2
ROLE_EMAIL=$3
SUBSCRIPTION_ID=$4
MI_NAME=$5

# Create a resource group
az group create --name "${RESOURCE_GROUP_NAME}" --location eastus
Expand All @@ -20,11 +21,19 @@ az storage account create \
--min-tls-version TLS1_2 \
--allow-blob-public-access false

MI_ID=$(az identity show --name "${MI_NAME}" --resource-group "${RESOURCE_GROUP_NAME}" --query principalId --output tsv)

# Assign a role with managed identity to the storage account
az role assignment create \
--role "Storage Blob Data Contributor" \
--assignee "${MI_ID}"\
--scope "/subscriptions/${SUBSCRIPTION_ID}/resourceGroups/${RESOURCE_GROUP_NAME}/providers/Microsoft.Storage/storageAccounts/${STORAGE_ACCOUNT_NAME}"

# Get storage account key
az storage account keys list \
--resource-group "${RESOURCE_GROUP_NAME}" \
--account-name "${STORAGE_ACCOUNT_NAME}" \
--query "[0].value" --output json
#az storage account keys list \
# --resource-group "${RESOURCE_GROUP_NAME}" \
# --account-name "${STORAGE_ACCOUNT_NAME}" \
# --query "[0].value" --output json

# Create a container
az storage container create \
Expand Down
11 changes: 11 additions & 0 deletions backend/data_tools/scripts/get_csv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,19 @@ set -eo pipefail

# Prepend the current directory to PYTHONPATH.
export PYTHONPATH=".:${PYTHONPATH}"

# Positional arguments, forwarded to the loader below.
ENV="$1"
INPUT_CSV="$2"
OUTPUT_CSV="$3"

printf '%s\n' "Activating virtual environment..."
. .venv/bin/activate

# Echo the received arguments.
printf '%s\n' \
  "ENV is ${ENV}" \
  "INPUT_CSV is ${INPUT_CSV}" \
  "OUTPUT_CSV is ${OUTPUT_CSV}"

printf '%s\n' "Running script..."
python data_tools/src/load_cans/main.py \
  --env "${ENV}" \
  --input-csv "${INPUT_CSV}" \
  --output-csv "${OUTPUT_CSV}"
Loading