Skip to content

Commit

Permalink
Merge pull request #37 from lauris-tw/37_lauris_bump_python_version
Browse files Browse the repository at this point in the history
Bump Python version from 3.9 to 3.11
  • Loading branch information
lauris-tw authored Apr 30, 2024
2 parents 634688a + a0bb7e7 commit d20d831
Show file tree
Hide file tree
Showing 8 changed files with 533 additions and 367 deletions.
46 changes: 32 additions & 14 deletions .github/workflows/local-setup-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,16 @@ jobs:
os: [ubuntu-latest, macos-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: "11"
- name: Install poetry
run: pipx install poetry
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.9"
python-version: "3.11"
cache: "poetry"
- name: Install Python Dependencies
run: |
Expand All @@ -31,27 +31,45 @@ jobs:
windows:
runs-on: windows-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-java@v3
- uses: actions/checkout@v4
- uses: actions/setup-java@v4
with:
distribution: "zulu"
java-version: "11"
- name: Install poetry
run: pipx install poetry
- uses: actions/setup-python@v4
- uses: actions/setup-python@v5
with:
python-version: "3.9"
python-version: "3.11"
cache: "poetry"
- name: Install Hadoop for Windows
# See https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
# that recommends https://github.com/steveloughran/winutils
# that recommends https://github.com/cdarlint/winutils
#
# Setting environment variables: e.g.
# $env:HADOOP_HOME = "$pwd\winutils\hadoop-3.3.5"
# $env:Path += ";$pwd\winutils\hadoop-3.3.5\bin"
# requires a special handling: https://stackoverflow.com/questions/61858388/how-do-i-set-an-enviroment-variable-in-github-action-on-a-windows-server
#
# Reading / Writing to parquet through winutils requires Microsoft Visual C++ 2010 Service Pack 1
# https://stackoverflow.com/questions/45947375/why-does-starting-a-streaming-query-lead-to-exitcodeexception-exitcode-1073741
run: |
choco install vcredist2010
git clone --depth 1 -b master https://github.com/cdarlint/winutils.git
echo "HADOOP_HOME=$pwd\winutils\hadoop-3.3.5" >> $env:GITHUB_ENV
echo ";$pwd\winutils\hadoop-3.3.5\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
- name: Test Hadoop Setup
run: |
winutils.exe chmod 777 D:\a\dataengineer-transformations-python\dataengineer-transformations-python
- name: Install Python Dependencies
run: |
scripts\install.bat
poetry install
- name: Run local unit tests
run: |
.\go.ps1 run-local-unit-test
# This doesn't work at the moment because running spark locally on Windows
# requires a special setup for Hadoop
# TODO: https://cwiki.apache.org/confluence/display/HADOOP2/WindowsProblems
# - name: Run local integration tests
# run: |
# .\go.ps1 run-local-integration-test
- name: Run local integration tests
run: |
.\go.ps1 run-local-integration-test
9 changes: 5 additions & 4 deletions .gitpod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
# and commit this file to your remote git repository to share the goodness with others.

image:
file: .gitpod.Dockerfile
file: .gitpod.Dockerfile

tasks:
- init: |
pyenv shell 3.9.10
poetry env use "${HOME}/.pyenv/versions/3.9.10/bin/python3"
pyenv install 3.11.4
pyenv shell 3.11.4
poetry env use "${HOME}/.pyenv/versions/3.11.4/bin/python3"
poetry install
poetry env info
make tests
make tests
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
ARG PYTHON_VERSION=3.9.10
ARG PYTHON_VERSION=3.11
FROM --platform=linux/amd64 python:$PYTHON_VERSION
USER root
WORKDIR /opt
RUN if [ "$(arch)" = "aarch64" ] ; then ARCHITECTURE="aarch64" ; else ARCHITECTURE="x64"; fi && \
wget -O OpenJDK.tar.gz https://github.com/AdoptOpenJDK/openjdk11-binaries/releases/download/jdk-11.0.11%2B9/OpenJDK11U-jdk_${ARCHITECTURE}_linux_hotspot_11.0.11_9.tar.gz && \
wget -O scala.tgz https://downloads.lightbend.com/scala/2.13.5/scala-2.13.5.tgz && \
wget -O spark-hadoop.tgz https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
wget -O spark-hadoop.tgz https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3-scala2.13.tgz
RUN tar xzf OpenJDK.tar.gz && \
tar xvf scala.tgz && \
tar xvf spark-hadoop.tgz
Expand Down
45 changes: 40 additions & 5 deletions README-LOCAL.md
Original file line number Diff line number Diff line change
@@ -1,45 +1,58 @@
# Data transformations with Python

This is a collection of _Python_ jobs that are supposed to transform data.
These jobs are using _PySpark_ to process larger volumes of data and are supposed to run on a _Spark_ cluster (via `spark-submit`).

## Pre-requisites

Please make sure you have the following installed and can run them
* Python (3.9 or later), you can use for example [pyenv](https://github.com/pyenv/pyenv#installation) to manage your python versions locally
* [Poetry](https://python-poetry.org/docs/#installation)
* Java (1.8)

- Python (3.11 or later), you can use for example [pyenv](https://github.com/pyenv/pyenv#installation) to manage your python versions locally
- [Poetry](https://python-poetry.org/docs/#installation)
- Java (11)

## Install all dependencies

```bash
poetry install
```

## Run tests

To run all tests:

```bash
make tests
```

### Run unit tests

```bash
make unit-test
```

### Run integration tests

```bash
make integration-test
```

## Create package

This will create a `tar.gz` and a `.wheel` in `dist/` folder:

```bash
poetry build
```

More: https://python-poetry.org/docs/cli/#build

## Run style checks

```bash
make style-checks
```

This is running the linter and a type checker.

## Jobs
Expand All @@ -50,17 +63,21 @@ Currently, these exist as skeletons, and have some initial test cases which are
For each application, please un-ignore the tests and implement the missing logic.

### Word Count

An NLP model is dependent on a specific input file. This job is supposed to preprocess a given text file to produce this
input file for the NLP model (feature engineering). This job will count the occurrences of a word within the given text
file (corpus).
file (corpus).

There is a dump of the datalake for this under `resources/word_count/words.txt` with a text file.

#### Input

Simple `*.txt` file containing text.

#### Output

A single `*.csv` file containing data similar to:

```csv
"word","count"
"a","3"
Expand All @@ -69,7 +86,9 @@ A single `*.csv` file containing data similar to:
```

#### Run the job

Please make sure to package the code before submitting the spark job (`poetry build`)

```bash
poetry run spark-submit \
--master local \
Expand All @@ -80,6 +99,7 @@ poetry run spark-submit \
```

### Citibike

For analytics purposes the BI department of a bike share company would like to present dashboards, displaying the
distance each bike was driven. There is a `*.csv` file that contains historical data of previous bike rides. This input
file needs to be processed in multiple steps. There is a pipeline running these jobs.
Expand All @@ -89,26 +109,33 @@ file needs to be processed in multiple steps. There is a pipeline running these
There is a dump of the datalake for this under `resources/citibike/citibike.csv` with historical data.

#### Ingest

Reads a `*.csv` file and transforms it to parquet format. The column names will be sanitized (whitespaces replaced).

##### Input

Historical bike ride `*.csv` file:

```csv
"tripduration","starttime","stoptime","start station id","start station name","start station latitude",...
364,"2017-07-01 00:00:00","2017-07-01 00:06:05",539,"Metropolitan Ave & Bedford Ave",40.71534825,...
...
```

##### Output

`*.parquet` files containing the same content

```csv
"tripduration","starttime","stoptime","start_station_id","start_station_name","start_station_latitude",...
364,"2017-07-01 00:00:00","2017-07-01 00:06:05",539,"Metropolitan Ave & Bedford Ave",40.71534825,...
...
```

##### Run the job

Please make sure to package the code before submitting the spark job (`poetry build`)

```bash
poetry run spark-submit \
--master local \
Expand All @@ -119,30 +146,38 @@ poetry run spark-submit \
```

#### Distance calculation

This job takes bike trip information and calculates the "as the crow flies" distance traveled for each trip.
It reads the previously ingested data parquet files.

Hint:
- For distance calculation, consider using [**Haversine formula**](https://en.wikipedia.org/wiki/Haversine_formula) as an option.

- For distance calculation, consider using [**Haversine formula**](https://en.wikipedia.org/wiki/Haversine_formula) as an option.

##### Input

Historical bike ride `*.parquet` files

```csv
"tripduration",...
364,...
...
```

##### Outputs

`*.parquet` files containing historical data with distance column containing the calculated distance.

```csv
"tripduration",...,"distance"
364,...,1.34
...
```

##### Run the job

Please make sure to package the code before submitting the spark job (`poetry build`)

```bash
poetry run spark-submit \
--master local \
Expand Down
4 changes: 2 additions & 2 deletions batect
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
# You should commit this file to version control alongside the rest of your project. It should not be installed globally.
# For more information, visit https://github.com/batect/batect.

VERSION="0.82.0"
CHECKSUM="${BATECT_DOWNLOAD_CHECKSUM:-6d3e7e26e718f705d8a344c85048c2821aedd8ae84fec1db2251fe6f3adec3ea}"
VERSION="0.84.0"
CHECKSUM="${BATECT_DOWNLOAD_CHECKSUM:-e39f3e73f0772b3716a4a01624e26009d9da5f5a274464a598d5b265c1e52964}"
DOWNLOAD_URL_ROOT=${BATECT_DOWNLOAD_URL_ROOT:-"https://updates.batect.dev/v1/files"}
DOWNLOAD_URL=${BATECT_DOWNLOAD_URL:-"$DOWNLOAD_URL_ROOT/$VERSION/batect-$VERSION.jar"}
QUIET_DOWNLOAD=${BATECT_QUIET_DOWNLOAD:-false}
Expand Down
6 changes: 3 additions & 3 deletions batect.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ rem For more information, visit https://github.com/batect/batect.

setlocal EnableDelayedExpansion

set "version=0.82.0"
set "version=0.84.0"

if "%BATECT_CACHE_DIR%" == "" (
set "BATECT_CACHE_DIR=%USERPROFILE%\.batect\cache"
Expand All @@ -22,7 +22,7 @@ $ErrorActionPreference = 'Stop'^

^

$Version='0.82.0'^
$Version='0.84.0'^

^

Expand All @@ -48,7 +48,7 @@ $UrlEncodedVersion = [Uri]::EscapeDataString($Version)^

$DownloadUrl = getValueOrDefault $env:BATECT_DOWNLOAD_URL "$DownloadUrlRoot/$UrlEncodedVersion/batect-$UrlEncodedVersion.jar"^

$ExpectedChecksum = getValueOrDefault $env:BATECT_DOWNLOAD_CHECKSUM '6d3e7e26e718f705d8a344c85048c2821aedd8ae84fec1db2251fe6f3adec3ea'^
$ExpectedChecksum = getValueOrDefault $env:BATECT_DOWNLOAD_CHECKSUM 'e39f3e73f0772b3716a4a01624e26009d9da5f5a274464a598d5b265c1e52964'^

^

Expand Down
Loading

0 comments on commit d20d831

Please sign in to comment.