Skip to content

Commit

Permalink
Merge pull request #37 from lauris-tw/37_lauris_bump_python_version
Browse files Browse the repository at this point in the history
Bump Python version from 3.9 to 3.11
  • Loading branch information
lauris-tw authored Apr 30, 2024
2 parents 634688a + a0bb7e7 commit d20d831
Show file tree
Hide file tree
Showing 8 changed files with 533 additions and 367 deletions.
46 changes: 32 additions & 14 deletions .github/workflows/local-setup-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@ jobs:
os: [ubuntu-latest, macos-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: "11"
- name: Install poetry
run: pipx install poetry
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.9"
python-version: "3.11"
cache: "poetry"
- name: Install Python Dependencies
run: |
Expand All @@ -31,27 +31,45 @@ jobs:
windows:
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: "11"
- name: Install poetry
run: pipx install poetry
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.9"
python-version: "3.11"
cache: "poetry"
- name: Install Hadoop for Windows
# See https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
# that recommends https://github.com/steveloughran/winutils
# that recommends https://github.com/cdarlint/winutils
#
# Setting environment variables: e.g.
# $env:HADOOP_HOME = "$pwd\winutils\hadoop-3.3.5"
# $env:Path += ";$pwd\winutils\hadoop-3.3.5\bin"
# requires a special handling: https://stackoverflow.com/questions/61858388/how-do-i-set-an-enviroment-variable-in-github-action-on-a-windows-server
#
# Reading / Writing to parquet through winutils requires Microsoft Visual C++ 2010 Service Pack 1
# https://stackoverflow.com/questions/45947375/why-does-starting-a-streaming-query-lead-to-exitcodeexception-exitcode-1073741
run: |
choco install vcredist2010
git clone --depth 1 -b master https://github.com/cdarlint/winutils.git
echo "HADOOP_HOME=$pwd\winutils\hadoop-3.3.5" >> $env:GITHUB_ENV
echo ";$pwd\winutils\hadoop-3.3.5\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Test Hadoop Setup
run: |
winutils.exe chmod 777 D:\a\dataengineer-transformations-python\dataengineer-transformations-python
- name: Install Python Dependencies
run: |
scripts\install.bat
poetry install
- name: Run local unit tests
run: |
.\go.ps1 run-local-unit-test
# This doesn't work at the moment because running spark locally on Windows
# requires a special setup for Hadoop
# TODO: https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
# - name: Run local integration tests
# run: |
# .\go.ps1 run-local-integration-test
- name: Run local integration tests
run: |
.\go.ps1 run-local-integration-test
9 changes: 5 additions & 4 deletions .gitpod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
# and commit this file to your remote git repository to share the goodness with others.

image:
file: .gitpod.Dockerfile
file: .gitpod.Dockerfile

tasks:
- init: |
pyenv shell 3.9.10
poetry env use "${HOME}/.pyenv/versions/3.9.10/bin/python3"
pyenv install 3.11.4
pyenv shell 3.11.4
poetry env use "${HOME}/.pyenv/versions/3.11.4/bin/python3"
poetry install
poetry env info
make tests
make tests
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
ARG PYTHON_VERSION=3.9.10
ARG PYTHON_VERSION=3.11
FROM --platform=linux/amd64 python:$PYTHON_VERSION
USER root
WORKDIR /opt
RUN if [ "$(arch)" = "aarch64" ] ; then ARCHITECTURE="aarch64" ; else ARCHITECTURE="x64"; fi && \
wget -O OpenJDK.tar.gz https://github.com/AdoptOpenJDK/openjdk11-binaries/releases/download/jdk-11.0.11%2B9/OpenJDK11U-jdk_${ARCHITECTURE}_linux_hotspot_11.0.11_9.tar.gz && \
wget -O scala.tgz https://downloads.lightbend.com/scala/2.13.5/scala-2.13.5.tgz && \
wget -O spark-hadoop.tgz https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
wget -O spark-hadoop.tgz https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3-scala2.13.tgz
RUN tar xzf OpenJDK.tar.gz && \
tar xvf scala.tgz && \
tar xvf spark-hadoop.tgz
Expand Down
45 changes: 40 additions & 5 deletions README-LOCAL.md
Original file line number Diff line number Diff line change
@@ -1,45 +1,58 @@
# Data transformations with Python

This is a collection of _Python_ jobs that are supposed to transform data.
These jobs are using _PySpark_ to process larger volumes of data and are supposed to run on a _Spark_ cluster (via `spark-submit`).

## Pre-requisites

Please make sure you have the following installed and can run them
* Python (3.9 or later), you can use for example [pyenv](https://github.com/pyenv/pyenv#installation) to manage your python versions locally
* [Poetry](https://python-poetry.org/docs/#installation)
* Java (1.8)

- Python (3.11 or later), you can use for example [pyenv](https://github.com/pyenv/pyenv#installation) to manage your python versions locally
- [Poetry](https://python-poetry.org/docs/#installation)
- Java (11)

## Install all dependencies

```bash
poetry install
```

## Run tests

To run all tests:

```bash
make tests
```

### Run unit tests

```bash
make unit-test
```

### Run integration tests

```bash
make integration-test
```

## Create package

This will create a `tar.gz` and a `.wheel` in `dist/` folder:

```bash
poetry build
```

More: https://python-poetry.org/docs/cli/#build

## Run style checks

```bash
make style-checks
```

This is running the linter and a type checker.

## Jobs
Expand All @@ -50,17 +63,21 @@ Currently, these exist as skeletons, and have some initial test cases which are
For each application, please un-ignore the tests and implement the missing logic.

### Word Count

An NLP model is dependent on a specific input file. This job is supposed to preprocess a given text file to produce this
input file for the NLP model (feature engineering). This job will count the occurrences of a word within the given text
file (corpus).
file (corpus).

There is a dump of the datalake for this under `resources/word_count/words.txt` with a text file.

#### Input

Simple `*.txt` file containing text.

#### Output

A single `*.csv` file containing data similar to:

```csv
"word","count"
"a","3"
Expand All @@ -69,7 +86,9 @@ A single `*.csv` file containing data similar to:
```

#### Run the job

Please make sure to package the code before submitting the spark job (`poetry build`)

```bash
poetry run spark-submit \
--master local \
Expand All @@ -80,6 +99,7 @@ poetry run spark-submit \
```

### Citibike

For analytics purposes the BI department of a bike share company would like to present dashboards, displaying the
distance each bike was driven. There is a `*.csv` file that contains historical data of previous bike rides. This input
file needs to be processed in multiple steps. There is a pipeline running these jobs.
Expand All @@ -89,26 +109,33 @@ file needs to be processed in multiple steps. There is a pipeline running these
There is a dump of the datalake for this under `resources/citibike/citibike.csv` with historical data.

#### Ingest

Reads a `*.csv` file and transforms it to parquet format. The column names will be sanitized (whitespaces replaced).

##### Input

Historical bike ride `*.csv` file:

```csv
"tripduration","starttime","stoptime","start station id","start station name","start station latitude",...
364,"2017-07-01 00:00:00","2017-07-01 00:06:05",539,"Metropolitan Ave & Bedford Ave",40.71534825,...
...
```

##### Output

`*.parquet` files containing the same content

```csv
"tripduration","starttime","stoptime","start_station_id","start_station_name","start_station_latitude",...
364,"2017-07-01 00:00:00","2017-07-01 00:06:05",539,"Metropolitan Ave & Bedford Ave",40.71534825,...
...
```

##### Run the job

Please make sure to package the code before submitting the spark job (`poetry build`)

```bash
poetry run spark-submit \
--master local \
Expand All @@ -119,30 +146,38 @@ poetry run spark-submit \
```

#### Distance calculation

This job takes bike trip information and calculates the "as the crow flies" distance traveled for each trip.
It reads the previously ingested data parquet files.

Hint:
- For distance calculation, consider using [**Haversine formula**](https://en.wikipedia.org/wiki/Haversine_formula) as an option.

- For distance calculation, consider using [**Haversine formula**](https://en.wikipedia.org/wiki/Haversine_formula) as an option.

##### Input

Historical bike ride `*.parquet` files

```csv
"tripduration",...
364,...
...
```

##### Outputs

`*.parquet` files containing historical data with distance column containing the calculated distance.

```csv
"tripduration",...,"distance"
364,...,1.34
...
```

##### Run the job

Please make sure to package the code before submitting the spark job (`poetry build`)

```bash
poetry run spark-submit \
--master local \
Expand Down
4 changes: 2 additions & 2 deletions batect
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
# You should commit this file to version control alongside the rest of your project. It should not be installed globally.
# For more information, visit https://github.com/batect/batect.

VERSION="0.82.0"
CHECKSUM="${BATECT_DOWNLOAD_CHECKSUM:-6d3e7e26e718f705d8a344c85048c2821aedd8ae84fec1db2251fe6f3adec3ea}"
VERSION="0.84.0"
CHECKSUM="${BATECT_DOWNLOAD_CHECKSUM:-e39f3e73f0772b3716a4a01624e26009d9da5f5a274464a598d5b265c1e52964}"
DOWNLOAD_URL_ROOT=${BATECT_DOWNLOAD_URL_ROOT:-"https://updates.batect.dev/v1/files"}
DOWNLOAD_URL=${BATECT_DOWNLOAD_URL:-"$DOWNLOAD_URL_ROOT/$VERSION/batect-$VERSION.jar"}
QUIET_DOWNLOAD=${BATECT_QUIET_DOWNLOAD:-false}
Expand Down
6 changes: 3 additions & 3 deletions batect.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ rem For more information, visit https://github.com/batect/batect.

setlocal EnableDelayedExpansion

set "version=0.82.0"
set "version=0.84.0"

if "%BATECT_CACHE_DIR%" == "" (
set "BATECT_CACHE_DIR=%USERPROFILE%\.batect\cache"
Expand All @@ -22,7 +22,7 @@ $ErrorActionPreference = 'Stop'^

^

$Version='0.82.0'^
$Version='0.84.0'^

^

Expand All @@ -48,7 +48,7 @@ $UrlEncodedVersion = [Uri]::EscapeDataString($Version)^

$DownloadUrl = getValueOrDefault $env:BATECT_DOWNLOAD_URL "$DownloadUrlRoot/$UrlEncodedVersion/batect-$UrlEncodedVersion.jar"^

$ExpectedChecksum = getValueOrDefault $env:BATECT_DOWNLOAD_CHECKSUM '6d3e7e26e718f705d8a344c85048c2821aedd8ae84fec1db2251fe6f3adec3ea'^
$ExpectedChecksum = getValueOrDefault $env:BATECT_DOWNLOAD_CHECKSUM 'e39f3e73f0772b3716a4a01624e26009d9da5f5a274464a598d5b265c1e52964'^

^

Expand Down
Loading

0 comments on commit d20d831

Please sign in to comment.