Skip to content

Commit

Permalink
add: using spark submit
Browse files Browse the repository at this point in the history
  • Loading branch information
Haeun-Oh committed Oct 29, 2023
1 parent 1298bd4 commit 84c8525
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 0 deletions.
10 changes: 10 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ services:
volumes:
- ./data:/data

# One-shot Spark job container that uploads raw data into MinIO.
data-uploader:
# Image is built from the Dockerfile in ./docker/data-uploader.
build: ./docker/data-uploader
container_name: data-uploader
# Wait until MinIO reports healthy before starting, so the S3A
# endpoint is reachable when spark-submit runs.
depends_on:
minio:
condition: service_healthy
# Runs the uploader script once; the container exits when the job ends.
command: spark-submit data_uploader.py
volumes:
# Mount the shared Spark config into the image's Spark install
# (presumably carries the S3A endpoint/credentials — verify the
# contents of spark-defaults.conf).
- ./docker/spark/spark-defaults.conf:/usr/local/spark/conf/spark-defaults.conf

networks:
default:
name: simple-data-flow
46 changes: 46 additions & 0 deletions docker/data-uploader/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
FROM python:3.10.11-slim

USER root
ARG openjdk_version=17

# Spark needs a JVM; curl is used below to fetch Spark and the jars.
# All apt work happens in ONE layer: combining update+install avoids the
# stale-package-index pitfall, and cleaning the lists in the same layer
# keeps the image small (a later `rm` cannot shrink earlier layers).
RUN apt-get update --yes && \
    apt-get install --yes --no-install-recommends \
        ca-certificates-java \
        curl \
        "openjdk-${openjdk_version}-jre-headless" && \
    apt-get clean && rm -rf /var/lib/apt/lists/*

ENV SPARK_VERSION=3.3.3
ENV SPARK_HOME=/usr/local/spark
ENV SPARK_OPTS="--driver-java-options=-Xms1024M --driver-java-options=-Xmx4096M --driver-java-options=-Dlog4j.logLevel=info" \
    PATH="${PATH}:${SPARK_HOME}/bin"

# Install Spark under /usr/local and symlink it to $SPARK_HOME.
# `curl -fL` makes the build FAIL on an HTTP error instead of silently
# saving an error page as the tarball (the original `curl -O` would).
# `tar -C` extracts straight into place — no mv needed — and the stale
# `rm -rf /usr/local/spark-3.3.0-bin-hadoop3` from the original is gone:
# this image never contains a 3.3.0 install.
RUN curl -fLO https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz \
    && tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz -C /usr/local/ \
    && rm -f spark-${SPARK_VERSION}-bin-hadoop3.tgz \
    && ln -s /usr/local/spark-${SPARK_VERSION}-bin-hadoop3 ${SPARK_HOME}

# Runtime jars for S3A (MinIO) access and Delta Lake, downloaded
# directly into Spark's jars directory (no intermediate mv step).
RUN set -eux; \
    for jar in \
        software/amazon/awssdk/s3/2.18.41/s3-2.18.41.jar \
        com/amazonaws/aws-java-sdk/1.12.367/aws-java-sdk-1.12.367.jar \
        com/amazonaws/aws-java-sdk-bundle/1.11.1026/aws-java-sdk-bundle-1.11.1026.jar \
        io/delta/delta-core_2.12/2.2.0/delta-core_2.12-2.2.0.jar \
        io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar \
        org/apache/hadoop/hadoop-aws/3.3.2/hadoop-aws-3.3.2.jar; \
    do \
        curl -fL -o "${SPARK_HOME}/jars/$(basename "$jar")" "https://repo1.maven.org/maven2/$jar"; \
    done

# delta-spark is pinned to match the delta-core_2.12-2.2.0 jar above.
RUN pip install --no-cache-dir -U pip && \
    pip install --no-cache-dir delta-spark==2.2.0

# Give the app its own directory instead of copying into /.
WORKDIR /app
COPY data_uploader.py data_uploader.py
7 changes: 7 additions & 0 deletions docker/data-uploader/data_uploader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # Build a Delta-enabled Spark session; configure_spark_with_delta_pip
    # wires the delta-spark pip package into the builder.
    spark = configure_spark_with_delta_pip(SparkSession.builder).getOrCreate()
    try:
        # header=True: treat the first CSV row as column names, not data.
        # NOTE(review): the s3a:// endpoint/credentials are assumed to come
        # from the mounted spark-defaults.conf — confirm against compose.
        df = spark.read.format("csv").load("s3a://data/raw/", header=True)
        # Write snappy-compressed parquet. The original specified the
        # compression twice (via .option() AND the parquet() kwarg);
        # once is sufficient.
        df.write.mode("overwrite").parquet("s3a://data/lake", compression="snappy")
    finally:
        # Always release the session, even if the read/write fails,
        # so the one-shot container exits cleanly.
        spark.stop()

0 comments on commit 84c8525

Please sign in to comment.