diff --git a/.github/workflows/local-setup-test.yaml b/.github/workflows/local-setup-test.yaml index 847cb5a..23400a4 100644 --- a/.github/workflows/local-setup-test.yaml +++ b/.github/workflows/local-setup-test.yaml @@ -23,10 +23,10 @@ jobs: poetry install - name: Run local unit tests run: | - ./go.sh run-local-unit-test + poetry run python -m pytest tests/unit - name: Run local integration tests run: | - ./go.sh run-local-integration-test + poetry run python -m pytest tests/integration windows: runs-on: windows-latest @@ -65,11 +65,10 @@ jobs: winutils.exe chmod 777 D:\a\dataengineer-transformations-python\dataengineer-transformations-python - name: Install Python Dependencies run: | - scripts\install.bat poetry install - name: Run local unit tests run: | - .\go.ps1 run-local-unit-test + poetry run python -m pytest tests/unit - name: Run local integration tests run: | - .\go.ps1 run-local-integration-test + poetry run python -m pytest tests/integration diff --git a/.gitpod.Dockerfile b/.gitpod.Dockerfile index f9cf755..863de24 100644 --- a/.gitpod.Dockerfile +++ b/.gitpod.Dockerfile @@ -5,13 +5,9 @@ USER root WORKDIR /opt RUN if [ "$(arch)" = "aarch64" ] ; then ARCHITECTURE="aarch64" ; else ARCHITECTURE="x64"; fi && \ wget -O OpenJDK.tar.gz https://github.com/AdoptOpenJDK/openjdk11-binaries/releases/download/jdk-11.0.11%2B9/OpenJDK11U-jdk_${ARCHITECTURE}_linux_hotspot_11.0.11_9.tar.gz && \ - wget -O scala.tgz https://downloads.lightbend.com/scala/2.13.5/scala-2.13.5.tgz && \ - wget -O spark-hadoop.tgz https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz RUN tar xzf OpenJDK.tar.gz && \ - tar xvf scala.tgz && \ - tar xvf spark-hadoop.tgz -ENV PATH="/opt/jdk-11.0.11+9/bin:/opt/scala-2.13.5/bin:/opt/spark-3.5.1-bin-hadoop3/bin:$PATH" - +ENV JAVA_HOME="/opt/jdk-11.0.11+9" \ + PATH="/opt/jdk-11.0.11+9/bin:$PATH" #TODO : Change the user to non root user #USER 185 @@ -19,4 +15,4 @@ WORKDIR /app COPY ./pyproject.toml /app/pyproject.toml -RUN pyenv 
install 3.11.4 && pyenv global 3.11.4 \ No newline at end of file +RUN pyenv install 3.11.4 && pyenv global 3.11.4 && poetry env use "${HOME}/.pyenv/versions/3.11.4/bin/python3" \ No newline at end of file diff --git a/.gitpod.yml b/.gitpod.yml index 0744f2b..aab9f1f 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -7,9 +7,7 @@ image: tasks: - init: | - pyenv install 3.11.4 - pyenv shell 3.11.4 - poetry env use "${HOME}/.pyenv/versions/3.11.4/bin/python3" poetry install poetry env info - make tests + poetry run python -m pytest tests/unit + poetry run python -m pytest tests/integration diff --git a/.vscode/extensions.json b/.vscode/extensions.json index 9657cff..a584819 100644 --- a/.vscode/extensions.json +++ b/.vscode/extensions.json @@ -1,4 +1,3 @@ { - "recommendations": ["ms-python.python", "donjayamanne.python-environment-manager"] - } - \ No newline at end of file + "recommendations": ["ms-python.python"] +} diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index 0b1dade..0000000 --- a/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -ARG PYTHON_VERSION=3.11 -FROM --platform=linux/amd64 python:$PYTHON_VERSION -USER root -WORKDIR /opt -RUN if [ "$(arch)" = "aarch64" ] ; then ARCHITECTURE="aarch64" ; else ARCHITECTURE="x64"; fi && \ - wget -O OpenJDK.tar.gz https://github.com/AdoptOpenJDK/openjdk11-binaries/releases/download/jdk-11.0.11%2B9/OpenJDK11U-jdk_${ARCHITECTURE}_linux_hotspot_11.0.11_9.tar.gz && \ - wget -O scala.tgz https://downloads.lightbend.com/scala/2.13.5/scala-2.13.5.tgz && \ - wget -O spark-hadoop.tgz https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3-scala2.13.tgz -RUN tar xzf OpenJDK.tar.gz && \ - tar xvf scala.tgz && \ - tar xvf spark-hadoop.tgz -ENV PATH="/opt/jdk-11.0.11+9/bin:/opt/scala-2.13.5/bin:/opt/spark-3.5.1-bin-hadoop3/bin:$PATH" - -RUN curl -sSL https://install.python-poetry.org | python3 - -ENV PATH="/root/.local/bin:${PATH}" -RUN poetry config virtualenvs.in-project false - -#TODO : Change the user to non 
root user -#USER 185 -WORKDIR /app - -COPY ./pyproject.toml /app/pyproject.toml - -RUN poetry install \ No newline at end of file diff --git a/README-DOCKER.md b/README-DOCKER.md deleted file mode 100644 index 9daae3d..0000000 --- a/README-DOCKER.md +++ /dev/null @@ -1,229 +0,0 @@ -# Data transformations with Python -This is a collection of _Python_ jobs that are supposed to transform data. -These jobs are using _PySpark_ to process larger volumes of data and are supposed to run on a _Spark_ cluster (via `spark-submit`). - -## Pre-requisites - -We use [`batect`](https://batect.dev/) to dockerise the tasks in this exercise. -`batect` is a lightweight wrapper around Docker that helps to ensure tasks run consistently (across linux, mac windows). -Similarly, `go.sh` / `go.ps` enables commands to be consistent across linux, mac & windows. -With `batect`, the only dependencies that need to be installed are Docker and Java >=8. Every other dependency is managed inside Docker containers. -If docker desktop can't be installed then Colima could be used on Mac and Linux. - -> **For Windows, docker desktop is the only option for using container to run application -otherwise local laptop should be set up.** - -Please make sure you have the following installed and can run them - -* Docker Desktop or Colima -* Java (11) - - -You could use following instructions as guidelines to install Docker or Colima and Java. - -```bash -# Install pre-requisites needed by batect -# For mac users: -./go.sh install-with-docker-desktop -OR -./go.sh install-with-colima - -# For windows/linux users: -# Please ensure Docker and java >=8 is installed -scripts\install_choco.ps1 -scripts\install.bat - -# For local laptop setup ensure that Java 11 with Spark 3.2.1 is available. More details in README-LOCAL.md -``` - -> **If you are using Colima, please ensure that you start Colima. 
For staring Colima, you could use following command:** - -`./go.sh start-colima` - - -> **Please install poetry if you would like to use lint command. Instructions to install poetry in [README-LOCAL](README.md) ** - - -## List of commands - -General pattern apart from installation and starting of Colima is: - -`./go.sh run--` - -type could be local, colima or docker-desktop - -action could be unit-test, integration-test or job. - -Full list of commands for Mac and Linux users is as follows: - -| S.No. | Command | Action | -| :---: | :---- | :--- | -| 1 | ./go.sh lint | Static analysis, code style, etc. (please install poetry if you would like to use this command) | -| 2 | ./go.sh linting | Static analysis, code style, etc. (please install poetry if you would like to use this command) | -| 3 | ./go.sh install-with-docker-desktop | Install the application requirements along with docker desktop | -| 4 | ./go.sh install-with-colima | Install the application requirements along with colima | -| 5 | ./go.sh start-colima | Start Colima | -| 6 | ./go.sh run-local-unit-test | Run unit tests on local machine | -| 7 | ./go.sh run-colima-unit-test | Run unit tests on containers using Colima | -| 8 | ./go.sh run-docker-desktop-unit-test | Run unit tests on containers using Docker Desktop | -| 9 | ./go.sh run-local-integration-test | Run integration tests on local machine | -| 10 | ./go.sh run-colima-integration-test | Run integration tests on containers using Colima | -| 11 | ./go.sh run-docker-desktop-integration-test | Run integration tests on containers using Docker Desktop | -| 12 | ./go.sh run-local-job | Run job on local machine | -| 13 | ./go.sh run-colima-job | Run job on containers using Colima | -| 14 | ./go.sh run-docker-desktop-job | Run job on containers using Docker Desktop | -| 15 | ./go.sh Usage | Display usage | - - -Full list of commands for Windows users is as follows: - -| S.No. 
| Command | Action | -| :---: | :---- | :--- | -| 1 | go.ps1 linting | Static analysis, code style, etc. (please install poetry if you would like to use this command) | -| 2 | go.ps1 install-with-docker-desktop | Install the application requirements along with docker desktop | -| 3 | go.ps1 run-local-unit-test | Run unit tests on local machine | -| 4 | go.ps1 run-docker-desktop-unit-test | Run unit tests on containers using Docker Desktop | -| 5 | go.ps1 run-local-integration-test | Run integration tests on local machine | -| 6 | go.ps1 run-docker-desktop-integration-test | Run integration tests on containers using Docker Desktop | -| 7 | go.ps1 run-local-job | Run job on local machine | -| 8 | go.ps1 run-docker-desktop-job | Run job on containers using Docker Desktop | -| 9 | go.ps1 Usage | Display usage | - - -## Jobs - -There are two applications in this repo: Word Count, and Citibike. - -Currently, these exist as skeletons, and have some initial test cases which are defined but ignored. -For each application, please un-ignore the tests and implement the missing logic. - -### Word Count -A NLP model is dependent on a specific input file. This job is supposed to preprocess a given text file to produce this -input file for the NLP model (feature engineering). This job will count the occurrences of a word within the given text -file (corpus). - -There is a dump of the datalake for this under `resources/word_count/words.txt` with a text file. - -#### Input -Simple `*.txt` file containing text. - -#### Output -A single `*.csv` file containing data similar to: -```csv -"word","count" -"a","3" -"an","5" -... 
-``` - -#### Run the job using Docker Desktop on Mac or Linux - -```bash -JOB=wordcount ./go.sh run-docker-desktop-job -``` - -#### Run the job using Docker Desktop on Windows - -```bash -$env:JOB = wordcount -.\go.ps1 run-docker-desktop-job -``` - -#### Run the job using Colima - -```bash -JOB=wordcount ./go.sh run-colima-job -``` - -### Citibike -***This problem uses data made publicly available by [Citibike](https://citibikenyc.com/), a New York based bike share company.*** - -For analytics purposes, the BI department of a hypothetical bike share company would like to present dashboards, displaying the -distance each bike was driven. There is a `*.csv` file that contains historical data of previous bike rides. This input -file needs to be processed in multiple steps. There is a pipeline running these jobs. - -![citibike pipeline](docs/citibike.png) - -There is a dump of the datalake for this under `resources/citibike/citibike.csv` with historical data. - -#### Ingest -Reads a `*.csv` file and transforms it to parquet format. The column names will be sanitized (whitespaces replaced). - -##### Input -Historical bike ride `*.csv` file: -```csv -"tripduration","starttime","stoptime","start station id","start station name","start station latitude",... -364,"2017-07-01 00:00:00","2017-07-01 00:06:05",539,"Metropolitan Ave & Bedford Ave",40.71534825,... -... -``` - -##### Output -`*.parquet` files containing the same content -```csv -"tripduration","starttime","stoptime","start_station_id","start_station_name","start_station_latitude",... -364,"2017-07-01 00:00:00","2017-07-01 00:06:05",539,"Metropolitan Ave & Bedford Ave",40.71534825,... -... 
-``` - -##### Run the job using Docker Desktop on Mac or Linux - -```bash -JOB=citibike_ingest ./go.sh run-docker-desktop-job -``` - -##### Run the job using Docker Desktop on Windows - -```bash -$env:JOB = citibike_ingest -.\go.ps1 run-docker-desktop-job -``` - -##### Run the job using Colima - -```bash -JOB=citibike_ingest ./go.sh run-colima-job -``` - -#### Distance calculation -This job takes bike trip information and calculates the "as the crow flies" distance traveled for each trip. -It reads the previously ingested data parquet files. - -Hint: - - For distance calculation, consider using [**Haversine formula**](https://en.wikipedia.org/wiki/Haversine_formula) as an option. - -##### Input -Historical bike ride `*.parquet` files -```csv -"tripduration",... -364,... -... -``` - -##### Outputs -`*.parquet` files containing historical data with distance column containing the calculated distance. -```csv -"tripduration",...,"distance" -364,...,1.34 -... -``` - -##### Run the job - -##### Run the job using Docker Desktop on Mac or Linux - -```bash -JOB=citibike_distance_calculation ./go.sh run-docker-desktop-job -``` - -##### Run the job using Docker Desktop on Windows - -```bash -$env:JOB = citibike_distance_calculation -.\go.ps1 run-docker-desktop-job -``` - -##### Run the job using Colima - -```bash -JOB=citibike_distance_calculation ./go.sh run-colima-job -``` \ No newline at end of file diff --git a/README.md b/README.md index 5cb73d6..26a1f78 100644 --- a/README.md +++ b/README.md @@ -1,83 +1,157 @@ # Data transformations with Python -This is a collection of _Python_ jobs that are supposed to transform data. + +This coding challenge is a collection of _Python_ jobs that are supposed to extract, transform and load data. These jobs are using _PySpark_ to process larger volumes of data and are supposed to run on a _Spark_ cluster (via `spark-submit`). -## Pre-requisites +## Gearing Up for the Pairing Session + +**✅ Goals** + +1. 
**Get a working environment** + Either local ([local](#local-setup), or using [gitpod](#gitpod-setup)) +2. **Get a high-level understanding of the code and test dataset structure** +3. Have your preferred text editor or IDE setup and ready to go. + +**❌ Non-Goals** + +- solving the exercises / writing code + > ⚠️ The exercises will be given at the time of interview, and solved by pairing with the interviewer. + +### Local Setup + +> 💡 If you don't manage to run the local setup or you have restrictions to install software in your laptop, use the [gitpod](#gitpod-setup) one + +#### Pre-requisites + Please make sure you have the following installed and can run them -* Python (3.11.x), you can use for example [pyenv](https://github.com/pyenv/pyenv#installation) to manage your python versions locally -* [Poetry](https://python-poetry.org/docs/#installation) -* Java (11) - * To run pySpark, it's important that the environment variable `JAVA_HOME` is set correctly, check via `echo $JAVA_HOME` - * [test_validate_spark_environment.py](/tests/integration/test_validate_spark_environment.py) will help you figure out if your environment will work -## Install all dependencies +- Python (3.11.X), you can use for example [pyenv](https://github.com/pyenv/pyenv#installation) to manage your python versions locally +- [Poetry](https://python-poetry.org/docs/#installation) +- Java (11), you can use [sdkman](https://sdkman.io/) to install and manage java locally + +#### Windows users + +We recommend using WSL 2 on Windows for this exercise, due to the [lack of support](https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems) of windows paths from Hadoop/Spark. 
+ +Follow instructions on the [Windows official page](https://learn.microsoft.com/en-us/windows/wsl/setup/environment) + +> 💡 In case of issues, like missing permissions on the machine, please use the [gitpod setup](#gitpod-setup) + +#### Install all dependencies + ```bash poetry install ``` -## Setup -### Run tests +### Gitpod setup + +Alternatively, you can set up the environment using + +[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/techops-recsys-lateral-hiring/dataengineer-transformations-python) + +There's an initialize script setup that takes around 3 minutes to complete. Once you paste this repository link in a new Workspace, please wait until the packages are installed. After everything is set up, select Poetry's environment by clicking on the thumbs up icon, then navigate to the Testing tab and hit the refresh icon to discover tests. + +Note that you can use gitpod's web interface or set up [ssh to Gitpod](https://www.gitpod.io/docs/references/ides-and-editors/vscode#connecting-to-vs-code-desktop) so that you can use VS Code from local to remote to Gitpod + +Remember to stop the vm and restart it just before the interview. + +### Verify setup + +> All of the following commands should be running successfully #### Run unit tests + ```bash poetry run pytest tests/unit ``` #### Run integration tests + ```bash poetry run pytest tests/integration ``` #### Run style checks + ```bash poetry run mypy --ignore-missing-imports --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs \ data_transformations tests poetry run pylint data_transformations tests ``` -This is running the linter and a type checker. -## Create package (optional) -This will create a `tar.gz` and a `.wheel` in `dist/` folder: -```bash -# Install pre-requisites needed by batect -# For mac users: -./go.sh install-with-docker-desktop -OR -./go.sh install-with-colima +### Anything else? 
-# For windows/linux users: -# Please ensure Docker and java >=8 is installed -scripts\install_choco.ps1 -scripts\install.bat +All commands are passing? +You are good to go! -# For local laptop setup ensure that Java 11 with Spark 3.5.1 is available. -``` -More: https://python-poetry.org/docs/cli/#build +> ⚠️ do not try to solve the exercises ahead of the interview ---- -# STOP HERE: Do not code before the interview begins. ---- +You are allowed to customize your environment (having the test in vscode directly for example): feel free to spend the time making this comfortable for you. This is not an expectation. ## Jobs -There are two applications in this repo: Word Count, and Citibike. +There are two exercises in this repo: Word Count, and Citibike. + +Currently, these exist as skeletons, and have some **initial test cases** which are defined but some are skipped. -Currently, these exist as skeletons, and have some initial test cases which are defined but ignored. -For each application, please un-ignore the tests and implement the missing logic. +The following section provides context over them. + +> ⚠️ do not try to solve the exercises ahead of the interview + +### Code walk + +``` + +/ +├─ /data_transformations # Contains the main python library +│ # with the code to the transformations +│ +├─ /jobs # Contains the entry points to the jobs +│ # performs argument parsing, and are +│ # passed to `spark-submit` +│ +├─ /resources # Contains the raw datasets for the jobs +│ +├─ /tests +│ ├─ /unit # contains basic unit tests for the code +│ └─ /integration # contains integration tests for the jobs +│ # and the setup +│ +├─ .gitignore +├─ .gitpod\* # required for the gitpod setup +├─ .pylintrc # configuration for pylint +├─ LICENCE +├─ poetry.lock +├─ pyproject.toml +└─ README.md # The current file + +``` ### Word Count + A NLP model is dependent on a specific input file. 
This job is supposed to preprocess a given text file to produce this input file for the NLP model (feature engineering). This job will count the occurrences of a word within the given text -file (corpus). +file (corpus). There is a dump of the datalake for this under `resources/word_count/words.txt` with a text file. +```mermaid +--- +title: Word Count Pipeline +--- +flowchart LR + Raw["fa:fa-file words.txt"] --> J1{{word_count.py}} --> Bronze["fa:fa-file-csv word_count.csv"] +``` + #### Input + Simple `*.txt` file containing text. #### Output + A single `*.csv` file containing data similar to: + ```csv "word","count" "a","3" @@ -86,9 +160,9 @@ A single `*.csv` file containing data similar to: ``` #### Run the job -Please make sure to package the code before submitting the spark job (`poetry build`) + ```bash -poetry run spark-submit \ +poetry build && poetry run spark-submit \ --master local \ --py-files dist/data_transformations-*.whl \ jobs/word_count.py \ @@ -104,7 +178,13 @@ For analytics purposes, the BI department of a hypothetical bike share company w distance each bike was driven. There is a `*.csv` file that contains historical data of previous bike rides. This input file needs to be processed in multiple steps. There is a pipeline running these jobs. -![citibike pipeline](docs/citibike.png) +```mermaid +--- +title: Citibike Pipeline +--- +flowchart TD + Raw["fa:fa-file-csv citibike.csv"] --> J1{{citibike_ingest.py}} --> Bronze["fa:fa-table-columns citibike.parquet"] --> J2{{citibike_distance_calculation.py}} --> Silver["fa:fa-table-columns citibike_distance.parquet"] +``` There is a dump of the datalake for this under `resources/citibike/citibike.csv` with historical data. 
@@ -133,9 +213,9 @@ Historical bike ride `*.csv` file: ``` ##### Run the job -Please make sure to package the code before submitting the spark job (`poetry build`) + ```bash -poetry run spark-submit \ +poetry build && poetry run spark-submit \ --master local \ --py-files dist/data_transformations-*.whl \ jobs/citibike_ingest.py \ @@ -145,7 +225,7 @@ poetry run spark-submit \ #### Distance calculation -This job takes bike trip information and calculates the "as the crow flies" distance traveled for each trip. +This job takes bike trip information and adds the "as the crow flies" distance traveled for each trip. It reads the previously ingested data parquet files. Hint: @@ -173,9 +253,9 @@ Historical bike ride `*.parquet` files ``` ##### Run the job -Please make sure to package the code before submitting the spark job (`poetry build`) + ```bash -poetry run spark-submit \ +poetry build && poetry run spark-submit \ --master local \ --py-files dist/data_transformations-*.whl \ jobs/citibike_distance_calculation.py \ @@ -183,28 +263,15 @@ poetry run spark-submit \ ``` -## Running the code inside container - -If you would like to run the code in Docker, please follow instructions [here](README-DOCKER.md). - -## Running the code on Gitpod - -Alternatively, you can setup the environment using - -[![Open in Gitpod](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/techops-recsys-lateral-hiring/dataengineer-transformations-python) - -It's recommend that you setup ssh to Gitpod so that you can use VS Code from local to remote to Gitpod. - -There's an initialize script setup that takes around 3 minutes to complete. Once you use paste this repository link in new Workspace, please wait until the packages are installed. After everything is setup, select Poetry's environment by clicking on thumbs up icon and navigate to Testing tab and hit refresh icon to discover tests. 
- -### Common issue with VS Code's Testing +--- -If Testing tab complains about Python Interpreter, run `poetry shell` in terminal to get the bin path, replace activate with python3 to resolve the issue. +> ⚠️ do not try to solve the exercises ahead of the interview -If poetry shell activate with this path +--- -`/workspace/.pyenv_mirror/poetry/virtualenvs/{project_name}-py{python_version}/bin/activate` +## Reading List -Paste this into Python Interpreter prompt +If you are unfamiliar with some of the tools used here, we recommend some resources to get started -`/workspace/.pyenv_mirror/poetry/virtualenvs/{project_name}-py{python_version}/bin/python3` +- **pytest**: [official](https://docs.pytest.org/en/8.2.x/getting-started.html#get-started) +- **pyspark**: [official](https://spark.apache.org/docs/latest/api/python/index.html) and especially the [DataFrame quickstart](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_df.html) diff --git a/batect b/batect deleted file mode 100755 index 3fc8809..0000000 --- a/batect +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env bash - -{ - set -euo pipefail - - # This file is part of Batect. - # Do not modify this file. It will be overwritten next time you upgrade Batect. - # You should commit this file to version control alongside the rest of your project. It should not be installed globally. - # For more information, visit https://github.com/batect/batect. 
- - VERSION="0.85.0" - CHECKSUM="${BATECT_DOWNLOAD_CHECKSUM:-901ed73295be75d295cec1d06315f7026b36ccb1666660b8af432cfbbc7feae8}" - DOWNLOAD_URL_ROOT=${BATECT_DOWNLOAD_URL_ROOT:-"https://updates.batect.dev/v1/files"} - DOWNLOAD_URL=${BATECT_DOWNLOAD_URL:-"$DOWNLOAD_URL_ROOT/$VERSION/batect-$VERSION.jar"} - QUIET_DOWNLOAD=${BATECT_QUIET_DOWNLOAD:-false} - - BATECT_WRAPPER_CACHE_DIR=${BATECT_CACHE_DIR:-"$HOME/.batect/cache"} - VERSION_CACHE_DIR="$BATECT_WRAPPER_CACHE_DIR/$VERSION" - JAR_PATH="$VERSION_CACHE_DIR/batect-$VERSION.jar" - BATECT_WRAPPER_DID_DOWNLOAD=false - - SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - - function main() { - if ! haveVersionCachedLocally; then - download - BATECT_WRAPPER_DID_DOWNLOAD=true - fi - - checkChecksum - runApplication "$@" - } - - function haveVersionCachedLocally() { - [ -f "$JAR_PATH" ] - } - - function download() { - checkForCurl - - mkdir -p "$VERSION_CACHE_DIR" - temp_file=$(mktemp) - - if [[ $QUIET_DOWNLOAD == 'true' ]]; then - curl --silent --fail --show-error --location --output "$temp_file" --retry 3 --retry-connrefused "$DOWNLOAD_URL" - else - echo "Downloading Batect version $VERSION from $DOWNLOAD_URL..." - curl -# --fail --show-error --location --output "$temp_file" --retry 3 --retry-connrefused "$DOWNLOAD_URL" - fi - - mv "$temp_file" "$JAR_PATH" - } - - function checkChecksum() { - local_checksum=$(getLocalChecksum) - - if [[ "$local_checksum" != "$CHECKSUM" ]]; then - echo "The downloaded version of Batect does not have the expected checksum. Delete '$JAR_PATH' and then re-run this script to download it again." 
- exit 1 - fi - } - - function getLocalChecksum() { - if [[ "$(uname)" == "Darwin" ]]; then - shasum -a 256 "$JAR_PATH" | cut -d' ' -f1 - else - sha256sum "$JAR_PATH" | cut -d' ' -f1 - fi - } - - function runApplication() { - java_path=$(getPathToJava) - checkForJava "$java_path" - - java_version_info=$(getJavaVersionInfo "$java_path") - checkJavaVersion "$java_version_info" - - java_version=$(extractJavaVersion "$java_version_info") - java_version_major=$(extractJavaMajorVersion "$java_version") - - if (( java_version_major >= 9 )); then - JAVA_OPTS=(--add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED) - else - JAVA_OPTS=() - fi - - if [[ "$(uname -o 2>&1)" == "Msys" ]] && hash winpty 2>/dev/null && [ -t /dev/stdin ]; then - GIT_BASH_PTY_WORKAROUND=(winpty) - else - GIT_BASH_PTY_WORKAROUND=() - fi - - BATECT_WRAPPER_SCRIPT_DIR="$SCRIPT_PATH" \ - BATECT_WRAPPER_CACHE_DIR="$BATECT_WRAPPER_CACHE_DIR" \ - BATECT_WRAPPER_DID_DOWNLOAD="$BATECT_WRAPPER_DID_DOWNLOAD" \ - HOSTNAME="$HOSTNAME" \ - exec \ - ${GIT_BASH_PTY_WORKAROUND[@]+"${GIT_BASH_PTY_WORKAROUND[@]}"} \ - "$java_path" \ - -Djava.net.useSystemProxies=true \ - ${JAVA_OPTS[@]+"${JAVA_OPTS[@]}"} \ - -jar "$JAR_PATH" \ - "$@" - } - - function checkForCurl() { - if ! hash curl 2>/dev/null; then - echo "curl is not installed or not on your PATH. Please install it and try again." >&2 - exit 1 - fi - } - - function getPathToJava() { - if useJavaHome; then - echo "$JAVA_HOME/bin/java" - else - echo "java" - fi - } - - function useJavaHome() { - test -n "${JAVA_HOME+x}" - } - - function checkForJava() { - local java_path="$1" - - if ! hash "$java_path" 2>/dev/null; then - showJavaNotInstalledError - fi - } - - function showJavaNotInstalledError() { - if useJavaHome; then - echo "JAVA_HOME is set to '$JAVA_HOME', but there is no Java executable at '$JAVA_HOME/bin/java'." >&2 - else - echo "Java is not installed or not on your PATH. Please install it and try again." 
>&2 - fi - - exit 1 - } - - function checkJavaVersion() { - java_version_info="$1" - java_version=$(extractJavaVersion "$java_version_info") - java_version_major=$(extractJavaMajorVersion "$java_version") - java_version_minor=$(extractJavaMinorVersion "$java_version") - - if (( java_version_major < 1 || ( java_version_major == 1 && java_version_minor <= 7 ) )); then - if useJavaHome; then - echo "The version of Java that is available in JAVA_HOME is version $java_version, but version 1.8 or greater is required." >&2 - echo "If you have a newer version of Java installed, please make sure JAVA_HOME is set correctly." >&2 - echo "JAVA_HOME takes precedence over any versions of Java available on your PATH." >&2 - else - echo "The version of Java that is available on your PATH is version $java_version, but version 1.8 or greater is required." >&2 - echo "If you have a newer version of Java installed, please make sure your PATH is set correctly." >&2 - fi - - exit 1 - fi - - if ! javaIs64Bit "$java_version_info"; then - if useJavaHome; then - echo "The version of Java that is available in JAVA_HOME is a 32-bit version, but Batect requires a 64-bit Java runtime." >&2 - echo "If you have a 64-bit version of Java installed, please make sure JAVA_HOME is set correctly." >&2 - echo "JAVA_HOME takes precedence over any versions of Java available on your PATH." >&2 - else - echo "The version of Java that is available on your PATH is a 32-bit version, but Batect requires a 64-bit Java runtime." >&2 - echo "If you have a 64-bit version of Java installed, please make sure your PATH is set correctly." 
>&2 - fi - - exit 1 - fi - } - - function getJavaVersionInfo() { - local java_path="$1" - - "$java_path" -version 2>&1 || showJavaNotInstalledError - } - - function extractJavaVersion() { - echo "$1" | grep version | sed -En ';s/.* version "([0-9]+)(\.([0-9]+))?.*".*/\1.\3/p;' - } - - function extractJavaMajorVersion() { - java_version=$1 - - echo "${java_version%.*}" - } - - function extractJavaMinorVersion() { - java_version=$1 - java_version_minor="${java_version#*.}" - - echo "${java_version_minor:-0}" - } - - function javaIs64Bit() { - echo "$1" | grep -q '64-[Bb]it' - } - - main "$@" - exit $? -} diff --git a/batect.cmd b/batect.cmd deleted file mode 100644 index aba86ae..0000000 --- a/batect.cmd +++ /dev/null @@ -1,473 +0,0 @@ -@echo off -rem This file is part of Batect. -rem Do not modify this file. It will be overwritten next time you upgrade Batect. -rem You should commit this file to version control alongside the rest of your project. It should not be installed globally. -rem For more information, visit https://github.com/batect/batect. - -setlocal EnableDelayedExpansion - -set "version=0.85.0" - -if "%BATECT_CACHE_DIR%" == "" ( - set "BATECT_CACHE_DIR=%USERPROFILE%\.batect\cache" -) - -set "rootCacheDir=!BATECT_CACHE_DIR!" 
-set "cacheDir=%rootCacheDir%\%version%" -set "ps1Path=%cacheDir%\batect-%version%.ps1" - -set script=Set-StrictMode -Version 2.0^ - -$ErrorActionPreference = 'Stop'^ - -^ - -$Version='0.85.0'^ - -^ - -function getValueOrDefault($value, $default) {^ - - if ($value -eq $null) {^ - - $default^ - - } else {^ - - $value^ - - }^ - -}^ - -^ - -$DownloadUrlRoot = getValueOrDefault $env:BATECT_DOWNLOAD_URL_ROOT "https://updates.batect.dev/v1/files"^ - -$UrlEncodedVersion = [Uri]::EscapeDataString($Version)^ - -$DownloadUrl = getValueOrDefault $env:BATECT_DOWNLOAD_URL "$DownloadUrlRoot/$UrlEncodedVersion/batect-$UrlEncodedVersion.jar"^ - -$ExpectedChecksum = getValueOrDefault $env:BATECT_DOWNLOAD_CHECKSUM '901ed73295be75d295cec1d06315f7026b36ccb1666660b8af432cfbbc7feae8'^ - -^ - -$RootCacheDir = getValueOrDefault $env:BATECT_CACHE_DIR "$env:USERPROFILE\.batect\cache"^ - -$VersionCacheDir = "$RootCacheDir\$Version"^ - -$JarPath = "$VersionCacheDir\batect-$Version.jar"^ - -$DidDownload = 'false'^ - -^ - -function main() {^ - - if (-not (haveVersionCachedLocally)) {^ - - download^ - - $DidDownload = 'true'^ - - }^ - -^ - - checkChecksum^ - - runApplication @args^ - -}^ - -^ - -function haveVersionCachedLocally() {^ - - Test-Path $JarPath^ - -}^ - -^ - -function download() {^ - - Write-Output "Downloading Batect version $Version from $DownloadUrl..."^ - -^ - - createCacheDir^ - -^ - - $oldProgressPreference = $ProgressPreference^ - -^ - - try {^ - - [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12^ - -^ - - # Turn off the progress bar to significantly reduce download times - see https://github.com/PowerShell/PowerShell/issues/2138#issuecomment-251165868^ - - $ProgressPreference = 'SilentlyContinue'^ - -^ - - Invoke-WebRequest -Uri $DownloadUrl -OutFile $JarPath ^| Out-Null^ - - } catch {^ - - $Message = $_.Exception.Message^ - -^ - - Write-Host -ForegroundColor Red "Downloading failed with error: $Message"^ - - exit 1^ - - } finally {^ - - 
$ProgressPreference = $oldProgressPreference^ - - }^ - -}^ - -^ - -function checkChecksum() {^ - - $localChecksum = (Get-FileHash -Algorithm 'SHA256' $JarPath).Hash.ToLower()^ - -^ - - if ($localChecksum -ne $expectedChecksum) {^ - - Write-Host -ForegroundColor Red "The downloaded version of Batect does not have the expected checksum. Delete '$JarPath' and then re-run this script to download it again."^ - - exit 1^ - - }^ - -}^ - -^ - -function createCacheDir() {^ - - if (-not (Test-Path $VersionCacheDir)) {^ - - New-Item -ItemType Directory -Path $VersionCacheDir ^| Out-Null^ - - }^ - -}^ - -^ - -function runApplication() {^ - - $java = findJava^ - - $javaVersion = checkJavaVersion $java^ - -^ - - if ($javaVersion.Major -ge 9) {^ - - $javaArgs = @("--add-opens", "java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens", "java.base/java.io=ALL-UNNAMED")^ - - } else {^ - - $javaArgs = @()^ - - }^ - -^ - - $combinedArgs = $javaArgs + @("-Djava.net.useSystemProxies=true", "-jar", $JarPath) + $args^ - - $env:HOSTNAME = $env:COMPUTERNAME^ - - $env:BATECT_WRAPPER_CACHE_DIR = $RootCacheDir^ - - $env:BATECT_WRAPPER_DID_DOWNLOAD = $DidDownload^ - -^ - - $info = New-Object System.Diagnostics.ProcessStartInfo^ - - $info.FileName = $java.Source^ - - $info.Arguments = combineArgumentsToString($combinedArgs)^ - - $info.RedirectStandardError = $false^ - - $info.RedirectStandardOutput = $false^ - - $info.UseShellExecute = $false^ - -^ - - $process = New-Object System.Diagnostics.Process^ - - $process.StartInfo = $info^ - - $process.Start() ^| Out-Null^ - - $process.WaitForExit()^ - -^ - - exit $process.ExitCode^ - -}^ - -^ - -function useJavaHome() {^ - - return ($env:JAVA_HOME -ne $null)^ - -}^ - -^ - -function findJava() {^ - - if (useJavaHome) {^ - - $java = Get-Command "$env:JAVA_HOME\bin\java" -ErrorAction SilentlyContinue^ - -^ - - if ($java -eq $null) {^ - - Write-Host -ForegroundColor Red "JAVA_HOME is set to '$env:JAVA_HOME', but there is no Java executable at 
'$env:JAVA_HOME\bin\java.exe'."^ - - exit 1^ - - }^ - -^ - - return $java^ - - }^ - -^ - - $java = Get-Command "java" -ErrorAction SilentlyContinue^ - -^ - - if ($java -eq $null) {^ - - Write-Host -ForegroundColor Red "Java is not installed or not on your PATH. Please install it and try again."^ - - exit 1^ - - }^ - -^ - - return $java^ - -}^ - -^ - -function checkJavaVersion([System.Management.Automation.CommandInfo]$java) {^ - - $versionInfo = getJavaVersionInfo $java^ - - $rawVersion = getJavaVersion $versionInfo^ - - $parsedVersion = New-Object Version -ArgumentList $rawVersion^ - - $minimumVersion = "1.8"^ - -^ - - if ($parsedVersion -lt (New-Object Version -ArgumentList $minimumVersion)) {^ - - if (useJavaHome) {^ - - Write-Host -ForegroundColor Red "The version of Java that is available in JAVA_HOME is version $rawVersion, but version $minimumVersion or greater is required."^ - - Write-Host -ForegroundColor Red "If you have a newer version of Java installed, please make sure JAVA_HOME is set correctly."^ - - Write-Host -ForegroundColor Red "JAVA_HOME takes precedence over any versions of Java available on your PATH."^ - - } else {^ - - Write-Host -ForegroundColor Red "The version of Java that is available on your PATH is version $rawVersion, but version $minimumVersion or greater is required."^ - - Write-Host -ForegroundColor Red "If you have a newer version of Java installed, please make sure your PATH is set correctly."^ - - }^ - -^ - - exit 1^ - - }^ - -^ - - if (-not ($versionInfo -match "64\-[bB]it")) {^ - - if (useJavaHome) {^ - - Write-Host -ForegroundColor Red "The version of Java that is available in JAVA_HOME is a 32-bit version, but Batect requires a 64-bit Java runtime."^ - - Write-Host -ForegroundColor Red "If you have a 64-bit version of Java installed, please make sure JAVA_HOME is set correctly."^ - - Write-Host -ForegroundColor Red "JAVA_HOME takes precedence over any versions of Java available on your PATH."^ - - } else {^ - - Write-Host 
-ForegroundColor Red "The version of Java that is available on your PATH is a 32-bit version, but Batect requires a 64-bit Java runtime."^ - - Write-Host -ForegroundColor Red "If you have a 64-bit version of Java installed, please make sure your PATH is set correctly."^ - - }^ - -^ - - exit 1^ - - }^ - -^ - - return $parsedVersion^ - -}^ - -^ - -function getJavaVersionInfo([System.Management.Automation.CommandInfo]$java) {^ - - $info = New-Object System.Diagnostics.ProcessStartInfo^ - - $info.FileName = $java.Source^ - - $info.Arguments = "-version"^ - - $info.RedirectStandardError = $true^ - - $info.RedirectStandardOutput = $true^ - - $info.UseShellExecute = $false^ - -^ - - $process = New-Object System.Diagnostics.Process^ - - $process.StartInfo = $info^ - - $process.Start() ^| Out-Null^ - - $process.WaitForExit()^ - -^ - - $stderr = $process.StandardError.ReadToEnd()^ - - return $stderr^ - -}^ - -^ - -function getJavaVersion([String]$versionInfo) {^ - - $versionLine = ($versionInfo -split [Environment]::NewLine)[0]^ - -^ - - if (-not ($versionLine -match "version `"([0-9]+)(\.([0-9]+))?.*`"")) {^ - - Write-Error "Java reported a version that does not match the expected format: $versionLine"^ - - }^ - -^ - - $major = $Matches.1^ - -^ - - if ($Matches.Count -ge 3) {^ - - $minor = $Matches.3^ - - } else {^ - - $minor = "0"^ - - }^ - -^ - - return "$major.$minor"^ - -}^ - -^ - -function combineArgumentsToString([Object[]]$arguments) {^ - - $combined = @()^ - -^ - - $arguments ^| %% { $combined += escapeArgument($_) }^ - -^ - - return $combined -join " "^ - -}^ - -^ - -function escapeArgument([String]$argument) {^ - - return '"' + $argument.Replace('"', '"""') + '"'^ - -}^ - -^ - -main @args^ - - - -if not exist "%cacheDir%" ( - mkdir "%cacheDir%" -) - -echo !script! > "%ps1Path%" - -set BATECT_WRAPPER_SCRIPT_DIR=%~dp0 - -rem Why do we explicitly exit? -rem cmd.exe appears to read this script one line at a time and then executes it. 
-rem If we modify the script while it is still running (eg. because we're updating it), then cmd.exe does all kinds of odd things -rem because it continues execution from the next byte (which was previously the end of the line). -rem By explicitly exiting on the same line as starting the application, we avoid these issues as cmd.exe has already read the entire -rem line before we start the application and therefore will always exit. - -rem Why do we set PSModulePath? -rem See issue #627 -set "PSModulePath=" -powershell.exe -ExecutionPolicy Bypass -NoLogo -NoProfile -File "%ps1Path%" %* && exit /b 0 || exit /b !ERRORLEVEL! - -rem What's this for? -rem This is so the tests for the wrapper has a way to ensure that the line above terminates the script correctly. -echo WARNING: you should never see this, and if you do, then Batect's wrapper script has a bug diff --git a/batect.yml b/batect.yml deleted file mode 100644 index f401b4f..0000000 --- a/batect.yml +++ /dev/null @@ -1,41 +0,0 @@ -containers: - pyspark: - build_directory: . - volumes: - - local: . 
- container: /app - options: cached - - type: cache - name: poetry-dependencies - container: /root/.cache/pypoetry/virtualenvs - -tasks: - unit-test: - description: Unit tests - group: Test - run: - container: pyspark - entrypoint: scripts/mac_or_linux/unit-test.sh - - integration-test: - description: Integration tests - group: Test - run: - container: pyspark - entrypoint: scripts/mac_or_linux/integration-test.sh - - style-checks: - description: Lint and type check - group: Test - run: - container: pyspark - entrypoint: scripts/mac_or_linux/linting.sh - - run-job: - description: Run spark job - group: Run - run: - container: pyspark - entrypoint: scripts/mac_or_linux/run-job.sh - environment: - JOB: $JOB diff --git a/docs/citibike.drawio b/docs/citibike.drawio deleted file mode 100644 index 55ba18c..0000000 --- a/docs/citibike.drawio +++ /dev/null @@ -1 +0,0 @@ -7VjdbpswGH2aXDbCGCi5bLNuq6ap0yqtl5VjDFhxMDUmIXv6GbBJ+ElJu6atql5hH9uf7XPOZzATOF8V3wRK4588IGxiW0ExgV8mtg2A76lHiWxrxPNBDUSCBrrTDrilf4kGLY3mNCBZq6PknEmatkHMk4Rg2cKQEHzT7hZy1p41RRHpAbcYsT56RwMZ16jvWjv8O6FRbGYGlm5ZIdNZA1mMAr7Zg+DVBM4F57IurYo5YSV5hpd63NcDrc3CBEnkMQNukt+X2z/+2V1x/fMaLufF3Y/4TKuzRizXG57YHlqlE3iZLLLy0a9bmEq6oEtSTk9TwmhC9Bbl1vAmeJ4EpJwaqEGbmEpymyJctm6UUxQWyxXTzSFPpJYeOKqOGI0SVWEkVFu7XBMhqZLkQsOS1wupVq7aSHGQEtAQrRxK+IpIsVVdtBkh1NpsdtLOjF7xvqyeBpG2U9TE2jGuCpr0JwgAegIYdu9pEpFMTtPtI+RaTyc3pIzNOeOiigXDMLQxVngmBV+SvZbAW3iu959MmwEmYQzzfp/5ps8+8ycj3h5yPpOasBbf3kPOTcNZVlF5oToAJy3qBNHtqhSVT6PgFGdrE1StsY5bd+lJqk6HtCwmXJJxTRcIL6PKBTe5rPKvxgMkljdqFJUlz9bUcscNECDih4MG8LBPFuHLGAB29Pf6+vtDiWedSn94UP+ArrtK3gc0kyjB5F4dQzhnSFKeVJnZqLs36gPkqzMbz1f7VfPVeZl8PZysipyHnMjPhNWKO92MdfsnNhhwwOxUDnCPydiXNUWT9wddMZz/J1rWqEePOo0+unPP/Y5zXXt6pHcBcE9k3vMB83Z0IUlwUd5YVA0zlGUUP/5iqAOQoHeB6RClJuG5wGT8Y0giERE59rXaJ36P1qF3gsEEKV+c6/Zyh5jWM/zitMqWon2wmJtk99Og3qUetH8R6sQBnZPNdTuBahp6gSrdm10/3wr+u7aCuWSPWcF5SyvY5x0vgOd6wWsHcuzX9cLsXXvBOdIL8C29AMGIhMd6Abpv6wWzj3dqBnikGdxPMzxqBlXd/fSru+9+ncKrfw== \ No newline at end of file diff --git 
a/docs/citibike.png b/docs/citibike.png deleted file mode 100644 index 807c2c6..0000000 Binary files a/docs/citibike.png and /dev/null differ diff --git a/go.ps1 b/go.ps1 deleted file mode 100644 index 013fdd3..0000000 --- a/go.ps1 +++ /dev/null @@ -1,71 +0,0 @@ -$ErrorActionPreference = "Stop" - -$action=$args[0] - -function Get-Usage { - Write-Host $MyInvocation.PSCommandPath " [--] [options ...]" - Write-Host "Commands:" - Write-Host " linting Static analysis, code style, etc. (please install poetry if you would like to use this command)" - Write-Host " precommit Run sensible checks before committing" - Write-Host " install-with-docker-desktop Install the application requirements along with docker desktop" - Write-Host " run-local-unit-test Run unit tests on local machine" - Write-Host " run-docker-desktop-unit-test Run unit tests on containers using Docker Desktop" - Write-Host " run-local-integration-test Run integration tests on local machine" - Write-Host " run-docker-desktop-integration-test Run integration tests on containers using Docker Desktop" - Write-Host " run-local-job Run job on local machine" - Write-Host " run-docker-desktop-job Run job on containers using Docker Desktop" -} - -switch ($action) -{ - linting { - scripts/win/linting.ps1 - Break - } - precommit { - Write-Host "Precommit Checks" - Break - } - install-with-docker-desktop { - scripts/win/install_with_docker_desktop.ps1 - Break - } - run-local-unit-test { - Write-Host "Running unit tests on local machine" - poetry run pytest tests/unit - Break - } - run-docker-desktop-unit-test { - Write-Host "Running unit tests on containers using Docker Desktop" - ./batect unit-test - Break - } - run-local-integration-test { - Write-Host "Running integration tests on local machine" - scripts/win/run-local-integration-test.ps1 - Break - } - run-docker-desktop-integration-test { - Write-Host "Running integration tests on containers using Docker Desktop" - 
scripts/win/run-docker-desktop-integration-test.ps1 - Break - } - run-local-job { - Write-Host "Running job on local machine" - ./scripts/win/run-job.ps1 - Break - } - run-docker-desktop-job { - "Running job on containers using Docker Desktop" - ./scripts/win/run-docker-desktop-job.ps1 - Break - } - usage { - Get-Usage - Break - } - default { - Get-Usage - Break - } -} \ No newline at end of file diff --git a/go.sh b/go.sh deleted file mode 100755 index c269556..0000000 --- a/go.sh +++ /dev/null @@ -1,268 +0,0 @@ -#!/bin/bash - -set -euo pipefail - - -function trace() { - { - local tracing - [[ "$-" = *"x"* ]] && tracing=true || tracing=false - set +x - } 2>/dev/null - if [ "$tracing" != true ]; then - # Bash's own trace mode is off, so explicitely write the message. - echo "$@" >&2 - else - # Restore trace - set -x - fi -} - - -function contains () { - local e match="$1" - shift - for e; do [[ "$e" == "$match" ]] && return 0; done - return 1 -} - - -# Parse arguments. -operations=() -subcommand_opts=() -while true; do - case "${1:-}" in - lint|linting) - operations+=( linting ) - shift - ;; - precommit) - operations+=( precommit ) - shift - ;; - install-with-docker-desktop) - operations+=( install-with-docker-desktop ) - shift - ;; - install-with-colima) - operations+=( install-with-colima ) - shift - ;; - start-colima) - operations+=( start-colima ) - shift - ;; - run-local-unit-test) - operations+=( run-local-unit-test ) - shift - ;; - run-docker-desktop-unit-test) - operations+=( run-docker-desktop-unit-test ) - shift - ;; - run-colima-unit-test) - operations+=( run-colima-unit-test ) - shift - ;; - run-local-integration-test) - operations+=( run-local-integration-test ) - shift - ;; - run-docker-desktop-integration-test) - operations+=( run-docker-desktop-integration-test ) - shift - ;; - run-colima-integration-test) - operations+=( run-colima-integration-test ) - shift - ;; - run-local-job) - operations+=( run-local-job ) - shift - ;; - 
run-docker-desktop-job) - operations+=( run-docker-desktop-job ) - shift - ;; - run-colima-job) - operations+=( run-colima-job ) - shift - ;; - --) - shift - break - ;; - -h|--help) - operations+=( usage ) - shift - ;; - *) - break - ;; - esac -done -if [ "${#operations[@]}" -eq 0 ]; then - operations=( usage ) -fi -if [ "$#" -gt 0 ]; then - subcommand_opts=( "$@" ) -fi - - -function usage() { - trace "$0 [--] [options ...]" - trace "Commands:" - trace " linting Static analysis, code style, etc.(please install poetry if you would like to use this command)" - trace " precommit Run sensible checks before committing" - trace " install-with-docker-desktop Install the application requirements along with docker desktop" - trace " install-with-colima Install the application requirements along with colima" - trace " start-colima Start Colima" - trace " run-local-unit-test Run unit tests on local machine" - trace " run-colima-unit-test Run unit tests on containers using Colima" - trace " run-docker-desktop-unit-test Run unit tests on containers using Docker Desktop" - trace " run-local-integration-test Run integration tests on local machine" - trace " run-colima-integration-test Run integration tests on containers using Colima" - trace " run-docker-desktop-integration-test Run integration tests on containers using Docker Desktop" - trace " run-local-job Run job on local machine" - trace " run-colima-job Run job on containers using Colima" - trace " run-docker-desktop-job Run job on containers using Docker Desktop" - trace "Options are passed through to the sub-command." 
-} - - -function linting() { - trace "Linting" - ./scripts/mac_or_linux/linting.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function precommit() { - trace "Precommit Checks" -} - - -function install-with-docker-desktop() { - trace "Install the application requirements along with docker desktop" - ./scripts/mac_or_linux/install-with-docker-desktop.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function install-with-colima() { - trace "Install the application requirements along with docker desktop" - ./scripts/mac_or_linux/install-with-colima.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function start-colima() { - trace "Starting Colima" - ./scripts/mac_or_linux/start-colima.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-local-unit-test() { - trace "Running unit tests on local machine" - ./scripts/mac_or_linux/unit-test.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-colima-unit-test() { - trace "Running unit tests on containers using Colima" - ./scripts/mac_or_linux/run-colima-unit-test.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-docker-desktop-unit-test() { - trace "Running unit tests on containers using Docker Desktop" - ./scripts/mac_or_linux/run-docker-desktop-unit-test.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-local-integration-test() { - trace "Running integration tests on local machine" - ./scripts/mac_or_linux/integration-test.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-colima-integration-test() { - trace "Running integration tests on containers using Colima" - ./scripts/mac_or_linux/run-colima-integration-test.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-docker-desktop-integration-test() { - trace "Running integration tests on containers using Docker Desktop" - ./scripts/mac_or_linux/run-docker-desktop-integration-test.sh 
"${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-local-job() { - trace "Running job on local machine" - ./scripts/mac_or_linux/run-job.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-colima-job() { - trace "Running job on containers using Colima" - ./scripts/mac_or_linux/run-colima-job.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -function run-docker-desktop-job() { - trace "Running job on containers using Docker Desktop" - ./scripts/mac_or_linux/run-docker-desktop-job.sh "${subcommand_opts[@]:+${subcommand_opts[@]}}" -} - - -script_directory="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )" -cd "${script_directory}/" - - -if contains usage "${operations[@]}"; then - usage - exit 1 -fi -if contains linting "${operations[@]}"; then - linting -fi -if contains precommit "${operations[@]}"; then - precommit -fi -if contains install-with-docker-desktop "${operations[@]}"; then - install-with-docker-desktop -fi -if contains install-with-colima "${operations[@]}"; then - install-with-colima -fi -if contains start-colima "${operations[@]}"; then - start-colima -fi -if contains run-local-unit-test "${operations[@]}"; then - run-local-unit-test -fi -if contains run-colima-unit-test "${operations[@]}"; then - run-colima-unit-test -fi -if contains run-docker-desktop-unit-test "${operations[@]}"; then - run-docker-desktop-unit-test -fi -if contains run-local-integration-test "${operations[@]}"; then - run-local-integration-test -fi -if contains run-colima-integration-test "${operations[@]}"; then - run-colima-integration-test -fi -if contains run-docker-desktop-integration-test "${operations[@]}"; then - run-docker-desktop-integration-test -fi -if contains run-local-job "${operations[@]}"; then - run-local-job -fi -if contains run-colima-job "${operations[@]}"; then - run-colima-job -fi -if contains run-docker-desktop-job "${operations[@]}"; then - run-docker-desktop-job -fi - - -trace "Exited cleanly." 
\ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d005ffa..e96c428 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ numpy = "^1.20.2" pandas = "^2.0.0" pyarrow = "^12.0.0" # This is pinned to stay in sync with the version -# That gets downloaded in `Dockerfile` +# That gets downloaded in `.gitpod.Dockerfile` pyspark = "3.5.1" python = "~3.11" diff --git a/scripts/install.bat b/scripts/install.bat deleted file mode 100644 index 57f4555..0000000 --- a/scripts/install.bat +++ /dev/null @@ -1,4 +0,0 @@ -where java -IF %ERRORLEVEL% EQU 0 (ECHO "JAVA IS INSTALLED") ELSE (choco install adoptopenjdk11) -where docker -IF %ERRORLEVEL% EQU 0 (ECHO "DOCKER IS INSTALLED") ELSE (choco install docker-desktop) diff --git a/scripts/install_choco.ps1 b/scripts/install_choco.ps1 deleted file mode 100644 index a42202c..0000000 --- a/scripts/install_choco.ps1 +++ /dev/null @@ -1,2 +0,0 @@ -Set-ExecutionPolicy AllSigned -Set-ExecutionPolicy Bypass -Scope Process -Force; [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072; iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) diff --git a/scripts/mac_or_linux/install-with-colima.sh b/scripts/mac_or_linux/install-with-colima.sh deleted file mode 100755 index d8c2b9a..0000000 --- a/scripts/mac_or_linux/install-with-colima.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -# batect dependencies -echo "Installing homebrew if it's not installed..." -which brew || /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" - -echo "Installing Docker if it's not installed..." -which docker || brew install docker - -echo "Installing Colima if it's not installed..." -which colima || brew install colima - -echo "Installing java if it's not installed..." -which java -if [ $? 
-ne 0 ]; then - brew tap adoptopenjdk/openjdk - brew cask install adoptopenjdk11 -fi diff --git a/scripts/mac_or_linux/install-with-docker-desktop.sh b/scripts/mac_or_linux/install-with-docker-desktop.sh deleted file mode 100755 index 23ec747..0000000 --- a/scripts/mac_or_linux/install-with-docker-desktop.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -# batect dependencies -echo "Installing homebrew if it's not installed..." -which brew || /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install.sh)" - -echo "Installing docker desktop if it's not installed..." -which docker || brew install --cask docker - -echo "Installing java if it's not installed..." -which java -if [ $? -ne 0 ]; then - brew tap adoptopenjdk/openjdk - brew cask install adoptopenjdk11 -fi diff --git a/scripts/mac_or_linux/integration-test.sh b/scripts/mac_or_linux/integration-test.sh deleted file mode 100755 index b9e3184..0000000 --- a/scripts/mac_or_linux/integration-test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -poetry run pytest tests/integration diff --git a/scripts/mac_or_linux/linting.sh b/scripts/mac_or_linux/linting.sh deleted file mode 100755 index b7b8de4..0000000 --- a/scripts/mac_or_linux/linting.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -echo "Running type checks" -poetry run mypy --ignore-missing-imports --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs \ - data_transformations tests - -echo "Running lint checks" -poetry run pylint data_transformations tests diff --git a/scripts/mac_or_linux/run-colima-integration-test.sh b/scripts/mac_or_linux/run-colima-integration-test.sh deleted file mode 100755 index 3c327ae..0000000 --- a/scripts/mac_or_linux/run-colima-integration-test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -./batect --docker-host=unix://$HOME/.colima/docker.sock integration-test diff --git 
a/scripts/mac_or_linux/run-colima-job.sh b/scripts/mac_or_linux/run-colima-job.sh deleted file mode 100755 index 8907689..0000000 --- a/scripts/mac_or_linux/run-colima-job.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -./batect --docker-host=unix://$HOME/.colima/docker.sock run-job diff --git a/scripts/mac_or_linux/run-colima-unit-test.sh b/scripts/mac_or_linux/run-colima-unit-test.sh deleted file mode 100755 index 4095b9d..0000000 --- a/scripts/mac_or_linux/run-colima-unit-test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -./batect --docker-host=unix://$HOME/.colima/docker.sock unit-test diff --git a/scripts/mac_or_linux/run-docker-desktop-integration-test.sh b/scripts/mac_or_linux/run-docker-desktop-integration-test.sh deleted file mode 100755 index 6261c19..0000000 --- a/scripts/mac_or_linux/run-docker-desktop-integration-test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -./batect integration-test diff --git a/scripts/mac_or_linux/run-docker-desktop-job.sh b/scripts/mac_or_linux/run-docker-desktop-job.sh deleted file mode 100755 index 996082f..0000000 --- a/scripts/mac_or_linux/run-docker-desktop-job.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -./batect run-job diff --git a/scripts/mac_or_linux/run-docker-desktop-unit-test.sh b/scripts/mac_or_linux/run-docker-desktop-unit-test.sh deleted file mode 100755 index f8878c8..0000000 --- a/scripts/mac_or_linux/run-docker-desktop-unit-test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -./batect unit-test diff --git a/scripts/mac_or_linux/run-job.sh b/scripts/mac_or_linux/run-job.sh deleted file mode 100755 index f61ea37..0000000 --- a/scripts/mac_or_linux/run-job.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -poetry build - -jobName=$(echo "${JOB}" | awk '{ print tolower($1) }') - -if [[ "${jobName}" == "citibike_ingest" ]]; then - INPUT_FILE_PATH="./resources/citibike/citibike.csv" - 
JOB_ENTRY_POINT="jobs/citibike_ingest.py" - OUTPUT_PATH="./output_int" -elif [[ "${jobName}" == "citibike_distance_calculation" ]]; then - INPUT_FILE_PATH="./output_int" - JOB_ENTRY_POINT="jobs/citibike_distance_calculation.py" - OUTPUT_PATH="./output" -elif [[ "${jobName}" == "wordcount" ]]; then - INPUT_FILE_PATH="./resources/word_count/words.txt" - JOB_ENTRY_POINT="jobs/word_count.py" - OUTPUT_PATH="./output" -else - echo "Job name provided was : ${JOB} : failed" - echo "Job name deduced was : ${jobName} : failed" - echo "Please enter a valid job name (citibike_ingest, citibike_distance_calculation or wordcount)" - exit 1 -fi - -rm -rf $OUTPUT_PATH - - - -poetry run spark-submit \ - --master local \ - --py-files dist/data_transformations-*.whl \ - $JOB_ENTRY_POINT \ - $INPUT_FILE_PATH \ - $OUTPUT_PATH - diff --git a/scripts/mac_or_linux/start-colima.sh b/scripts/mac_or_linux/start-colima.sh deleted file mode 100755 index bac1283..0000000 --- a/scripts/mac_or_linux/start-colima.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -set -euo pipefail - -echo "Starting Colima" -colima start diff --git a/scripts/mac_or_linux/unit-test.sh b/scripts/mac_or_linux/unit-test.sh deleted file mode 100755 index 2164b32..0000000 --- a/scripts/mac_or_linux/unit-test.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -poetry run pytest tests/unit diff --git a/scripts/win/install_with_docker_desktop.ps1 b/scripts/win/install_with_docker_desktop.ps1 deleted file mode 100644 index f88176a..0000000 --- a/scripts/win/install_with_docker_desktop.ps1 +++ /dev/null @@ -1,23 +0,0 @@ -# WIP -Write-host "Please install java and docker desktop manually or use scripts/install.bat as an example. 
" -# ECHO "Checking if java is installed" -# -# "where java" | cmd -# Write-Host $LASTEXITCODE -# Write-Host 1 -# try{ -# -# Write-Host "JAVA IS INSTALLED" -# } -# catch{ -# choco install adoptopenjdk11 -# } -# -# -# where docker -# IF ($LASTEXITCODE -eq 0) { -# ECHO "DOCKER IS INSTALLED" -# } -# ELSE { -# choco install docker-desktop -# } diff --git a/scripts/win/linting.ps1 b/scripts/win/linting.ps1 deleted file mode 100755 index 6bce2b0..0000000 --- a/scripts/win/linting.ps1 +++ /dev/null @@ -1,5 +0,0 @@ -Write-Host "Running type checks" -poetry run mypy --ignore-missing-imports --disallow-untyped-calls --disallow-untyped-defs --disallow-incomplete-defs data_transformations tests - -Write-Host "Running lint checks" -poetry run pylint data_transformations tests diff --git a/scripts/win/run-docker-desktop-integration-test.ps1 b/scripts/win/run-docker-desktop-integration-test.ps1 deleted file mode 100755 index 8f3c5a1..0000000 --- a/scripts/win/run-docker-desktop-integration-test.ps1 +++ /dev/null @@ -1 +0,0 @@ -./batect integration-test diff --git a/scripts/win/run-docker-desktop-job.ps1 b/scripts/win/run-docker-desktop-job.ps1 deleted file mode 100755 index 9249a25..0000000 --- a/scripts/win/run-docker-desktop-job.ps1 +++ /dev/null @@ -1 +0,0 @@ -./batect run-job diff --git a/scripts/win/run-job.ps1 b/scripts/win/run-job.ps1 deleted file mode 100755 index b15ee73..0000000 --- a/scripts/win/run-job.ps1 +++ /dev/null @@ -1,41 +0,0 @@ -poetry build - -$JOB=[System.Environment]::GetEnvironmentVariable('JOB') -$jobName=$JOB.ToLower() - -switch($jobName) -{ - - citibike_ingest { - $INPUT_FILE_PATH="resources/citibike/citibike.csv" - $JOB_ENTRY_POINT="jobs/citibike_ingest.py" - $OUTPUT_PATH="./output_int" - Break - } - citibike_distance_calculation { - $INPUT_FILE_PATH="./output_int" - $JOB_ENTRY_POINT="jobs/citibike_distance_calculation.py" - $OUTPUT_PATH="./output" - Break - } - wordcount { - $INPUT_FILE_PATH="resources/word_count/words.txt" - 
$JOB_ENTRY_POINT="jobs/word_count.py" - $OUTPUT_PATH="./output" - Break - } - default { - Write-Host "Job name provided was : ${JOB} : failed" - Write-Host "Job name deduced was : ${jobName} : failed" - Write-Host "Please enter a valid job name (citibike_ingest, citibike_distance_calculation or wordcount)" - exit 1 - Break - } - - -} - - -rm -rf $OUTPUT_PATH - -poetry run spark-submit --master local --py-files dist/data_transformations-*.whl $JOB_ENTRY_POINT $INPUT_FILE_PATH $OUTPUT_PATH diff --git a/scripts/win/run-local-integration-test.ps1 b/scripts/win/run-local-integration-test.ps1 deleted file mode 100755 index c910832..0000000 --- a/scripts/win/run-local-integration-test.ps1 +++ /dev/null @@ -1 +0,0 @@ -poetry run pytest tests/integration diff --git a/scripts/win/run-local-unit-test.ps1 b/scripts/win/run-local-unit-test.ps1 deleted file mode 100755 index 5e2c6ac..0000000 --- a/scripts/win/run-local-unit-test.ps1 +++ /dev/null @@ -1 +0,0 @@ -poetry run pytest tests/unit diff --git a/scripts/win/write-test.ps1 b/scripts/win/write-test.ps1 deleted file mode 100644 index d230747..0000000 --- a/scripts/win/write-test.ps1 +++ /dev/null @@ -1 +0,0 @@ -Write-host $args[0] \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/integration/test_distance_transformer.py b/tests/integration/test_distance_transformer.py index e812618..5de4572 100644 --- a/tests/integration/test_distance_transformer.py +++ b/tests/integration/test_distance_transformer.py @@ -98,7 +98,7 @@ def test_should_maintain_all_data_it_reads(SPARK) -> None: @pytest.mark.skip def test_should_add_distance_column_with_calculated_distance(SPARK) -> None: - given_ingest_folder, given_transform_folder = __create_ingest_and_transform_folders() + given_ingest_folder, given_transform_folder = __create_ingest_and_transform_folders(SPARK) distance_transformer.run(SPARK, given_ingest_folder, given_transform_folder) 
actual_dataframe = SPARK.read.parquet(given_transform_folder)