From dda47595f90cae6338cbd057c6aa38908313a81c Mon Sep 17 00:00:00 2001 From: steelbear Date: Sun, 11 Aug 2024 23:15:22 +0900 Subject: [PATCH] W5M1 - Data Analysis using RDD --- missions/W5/NYC-TLC-with-RDD.ipynb | 473 ++++++++++++++++++ missions/W5/spark-softeer/Dockerfile | 39 ++ missions/W5/spark-softeer/README.md | 18 + .../config/fairscheduler.xml.template | 31 ++ .../config/log4j2.properties.template | 69 +++ .../config/metrics.properties.template | 210 ++++++++ .../spark-softeer/config/spark-defaults.conf | 28 ++ .../config/spark-defaults.conf.template | 27 + missions/W5/spark-softeer/config/spark-env.sh | 88 ++++ .../config/spark-env.sh.template | 84 ++++ missions/W5/spark-softeer/config/workers | 20 + .../W5/spark-softeer/config/workers.template | 19 + missions/W5/spark-softeer/docker-compose.yml | 48 ++ missions/W5/spark-softeer/start.sh | 15 + 14 files changed, 1169 insertions(+) create mode 100644 missions/W5/NYC-TLC-with-RDD.ipynb create mode 100644 missions/W5/spark-softeer/Dockerfile create mode 100644 missions/W5/spark-softeer/README.md create mode 100644 missions/W5/spark-softeer/config/fairscheduler.xml.template create mode 100644 missions/W5/spark-softeer/config/log4j2.properties.template create mode 100644 missions/W5/spark-softeer/config/metrics.properties.template create mode 100644 missions/W5/spark-softeer/config/spark-defaults.conf create mode 100644 missions/W5/spark-softeer/config/spark-defaults.conf.template create mode 100755 missions/W5/spark-softeer/config/spark-env.sh create mode 100755 missions/W5/spark-softeer/config/spark-env.sh.template create mode 100644 missions/W5/spark-softeer/config/workers create mode 100644 missions/W5/spark-softeer/config/workers.template create mode 100644 missions/W5/spark-softeer/docker-compose.yml create mode 100644 missions/W5/spark-softeer/start.sh diff --git a/missions/W5/NYC-TLC-with-RDD.ipynb b/missions/W5/NYC-TLC-with-RDD.ipynb new file mode 100644 index 00000000..f54b5ad0 --- /dev/null +++ b/missions/W5/NYC-TLC-with-RDD.ipynb @@ -0,0 +1,473 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import col, min, max, sum, to_date" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Spark Session 시작하기" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "spark = SparkSession.builder.remote(\"sc://localhost\").appName(\"W5M1\").getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NYC 택시 데이터 가져오기" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|\n", + 
"+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "| 1| 2024-05-01 00:59:15| 2024-05-01 01:23:50| 1| 6.1| 1| N| 138| 145| 1| 28.2| 7.75| 0.5| 5.0| 0.0| 1.0| 42.45| 0.0| 1.75|\n", + "| 2| 2024-04-30 23:58:26| 2024-05-01 00:29:42| 1| 11.23| 1| N| 138| 249| 1| 46.4| 6.0| 0.5| 8.72| 0.0| 1.0| 66.87| 2.5| 1.75|\n", + "| 2| 2024-05-01 00:57:17| 2024-05-01 01:14:15| 1| 9.02| 1| N| 138| 170| 1| 35.9| 6.0| 0.5| 10.57| 6.94| 1.0| 65.16| 2.5| 1.75|\n", + "| 2| 2024-05-01 00:24:47| 2024-05-01 00:48:51| 1| 6.53| 1| N| 87| 133| 1| 30.3| 1.0| 0.5| 7.06| 0.0| 1.0| 42.36| 2.5| 0.0|\n", + "| 2| 2024-05-01 00:11:20| 2024-05-01 00:52:10| 1| 14.38| 1| N| 161| 165| 1| 61.8| 1.0| 0.5| 0.0| 0.0| 1.0| 66.8| 2.5| 0.0|\n", + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df = spark.read.parquet('/Users/admin/Projects/HMG_Softeer_DE/missions/W5/spark-softeer/userdata/NYC-TLC/yellow_tripdata_2024-05.parquet')\n", + "taxi_trip_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- VendorID: integer (nullable = true)\n", + " |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)\n", + " |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)\n", + " |-- passenger_count: long (nullable = true)\n", + " |-- trip_distance: double (nullable = true)\n", + " |-- RatecodeID: long (nullable = true)\n", + " |-- store_and_fwd_flag: string (nullable = true)\n", + " |-- PULocationID: integer (nullable = true)\n", + " |-- DOLocationID: integer (nullable = true)\n", + " |-- payment_type: long (nullable = true)\n", + " |-- fare_amount: double (nullable = true)\n", + " |-- extra: double (nullable = true)\n", + " |-- mta_tax: double (nullable = true)\n", + " |-- tip_amount: double (nullable = true)\n", + " |-- tolls_amount: double (nullable = true)\n", + " |-- improvement_surcharge: double (nullable = true)\n", + " |-- total_amount: double (nullable = true)\n", + " |-- congestion_surcharge: double (nullable = true)\n", + " |-- Airport_fee: double (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 데이터 전처리\n", + "## 결측치 제거" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "numerical_columns = ['passenger_count', 'trip_distance', 'fare_amount', \n", + " 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', \n", + " 'improvement_surcharge', 'total_amount', 'congestion_surcharge', \n", + " 'Airport_fee']\n", + "minmax_columns = []\n", + "\n", + "for column_name in numerical_columns:\n", + " minmax_columns.append(min(column_name))\n", + " minmax_columns.append(max(column_name))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "|min(passenger_count)|max(passenger_count)|min(trip_distance)|max(trip_distance)|min(fare_amount)|max(fare_amount)|min(extra)|max(extra)|min(mta_tax)|max(mta_tax)|min(tip_amount)|max(tip_amount)|min(tolls_amount)|max(tolls_amount)|min(improvement_surcharge)|max(improvement_surcharge)|min(total_amount)|max(total_amount)|min(congestion_surcharge)|max(congestion_surcharge)|min(Airport_fee)|max(Airport_fee)|\n", + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "| 0| 9| 0.0| 249151.44| -866.69| 334076.32| -7.5| 65.99| -0.5| 4.0| -92.35| 437.84| -46.14| 528.56| -1.0| 1.0| -867.69| 334145.3| -2.5| 2.5| -1.75| 1.75|\n", + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.select(*minmax_columns).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "taxi_trip_df = (taxi_trip_df.filter(col('fare_amount') > 0)\n", + " .filter(col('extra') > 0)\n", + " .filter(col('mta_tax') > 0)\n", + " .filter(col('tip_amount') > 0)\n", + " .filter(col('tolls_amount') > 0)\n", + " .filter(col('improvement_surcharge') > 0)\n", + " .filter(col('total_amount') > 0)\n", + " .filter(col('congestion_surcharge') > 0)\n", + " .filter(col('Airport_fee') > 0)\n", + " .filter(col('trip_distance') > 0)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "|min(passenger_count)|max(passenger_count)|min(trip_distance)|max(trip_distance)|min(fare_amount)|max(fare_amount)|min(extra)|max(extra)|min(mta_tax)|max(mta_tax)|min(tip_amount)|max(tip_amount)|min(tolls_amount)|max(tolls_amount)|min(improvement_surcharge)|max(improvement_surcharge)|min(total_amount)|max(total_amount)|min(congestion_surcharge)|max(congestion_surcharge)|min(Airport_fee)|max(Airport_fee)|\n", + 
"+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "| 0| 6| 0.02| 39.66| 17.0| 244.5| 0.02| 11.75| 0.5| 0.5| 0.01| 110.0| 1.0| 63.94| 1.0| 1.0| 43.36| 314.28| 2.5| 2.5| 1.75| 1.75|\n", + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.select(*minmax_columns).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 필요한 columns만 고르기" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------+------------+\n", + "|tpep_pickup_datetime|trip_distance|total_amount|\n", + "+--------------------+-------------+------------+\n", + "| 2024-05-01 00:57:17| 9.02| 65.16|\n", + "| 2024-05-01 00:14:05| 8.53| 59.31|\n", + "| 2024-05-01 00:03:08| 27.5| 107.49|\n", + "| 2024-05-01 00:44:01| 11.0| 74.74|\n", + "| 2024-05-01 00:14:19| 8.32| 59.79|\n", + "+--------------------+-------------+------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df = taxi_trip_df.select('tpep_pickup_datetime', 'trip_distance', 'total_amount')\n", + "taxi_trip_df.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 날짜 기준으로 그룹화" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "taxi_trip_group_by_date_df = taxi_trip_df.withColumn('tpep_pickup_date', to_date(col('tpep_pickup_datetime'))).groupBy('tpep_pickup_date')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "84378" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "taxi_trip_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------+\n", + "|sum(total_amount)|\n", + "+-----------------+\n", + "| 6862441.78999949|\n", + "+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.select(sum(col('total_amount'))).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "average of trip distance: 11.696007727132685 miles\n" + ] + } + ], + "source": [ + "num_of_trips = taxi_trip_df.count()\n", + "total_trip_distance = taxi_trip_df.select(sum(col('trip_distance'))).collect()[0]['sum(trip_distance)']\n", + "\n", + "print('average of trip distance: ', total_trip_distance / num_of_trips, 'miles')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------+-----+\n", + "|tpep_pickup_date|count|\n", + "+----------------+-----+\n", + "| 2024-05-01| 2874|\n", + "| 2024-05-02| 3058|\n", + "| 2024-05-03| 3015|\n", + "| 2024-05-04| 1431|\n", + "| 2024-05-05| 2632|\n", + "| 2024-05-06| 3575|\n", + "| 2024-05-07| 3043|\n", + "| 2024-05-08| 3203|\n", + "| 2024-05-09| 3242|\n", + "| 2024-05-10| 2898|\n", + "| 2024-05-11| 1297|\n", + "| 2024-05-12| 2587|\n", + "| 2024-05-13| 3963|\n", + "| 2024-05-14| 3465|\n", + "| 2024-05-15| 3303|\n", + "| 2024-05-16| 3496|\n", + "| 2024-05-17| 2844|\n", + "| 2024-05-18| 1368|\n", + "| 2024-05-19| 2760|\n", + "| 2024-05-20| 3423|\n", + "| 2024-05-21| 2947|\n", + "| 2024-05-22| 2845|\n", + "| 2024-05-23| 2615|\n", + "| 2024-05-24| 2576|\n", + "| 2024-05-25| 1060|\n", + "| 2024-05-26| 1232|\n", + "| 2024-05-27| 1873|\n", + "| 2024-05-28| 3492|\n", + "| 2024-05-29| 2801|\n", + "| 2024-05-30| 2947|\n", + "| 2024-05-31| 2513|\n", + "+----------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "count_by_date_df = taxi_trip_group_by_date_df.count().orderBy('tpep_pickup_date')\n", + "count_by_date_df.show(31)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------+--------------------+\n", + "|tpep_pickup_date|total_amount_by_date|\n", + "+----------------+--------------------+\n", + "| 2024-05-01| 234102.3800000013|\n", + "| 2024-05-02| 253174.17000000217|\n", + "| 2024-05-03| 245122.76000000146|\n", + "| 2024-05-04| 108908.40000000017|\n", + "| 2024-05-05| 201508.48000000062|\n", + "| 2024-05-06| 292839.31000000227|\n", + "| 2024-05-07| 251233.74000000185|\n", + "| 2024-05-08| 265208.1300000018|\n", + "| 2024-05-09| 266810.56000000174|\n", + "| 2024-05-10| 242295.62000000122|\n", + "| 2024-05-11| 100074.6800000003|\n", + "| 2024-05-12| 191414.2600000006|\n", + "| 2024-05-13| 321041.43000000203|\n", + "| 2024-05-14| 290568.03000000113|\n", + "| 2024-05-15| 276198.3200000021|\n", + "| 2024-05-16| 293009.5300000017|\n", + "| 2024-05-17| 239929.89000000103|\n", + "| 2024-05-18| 108224.19000000009|\n", + "| 2024-05-19| 208423.63999999972|\n", + "| 2024-05-20| 281833.060000002|\n", + "| 2024-05-21| 246523.13000000146|\n", + "| 2024-05-22| 233623.27000000144|\n", + "| 2024-05-23| 218459.9300000015|\n", + "| 2024-05-24| 212158.6700000009|\n", + "| 2024-05-25| 80535.12000000005|\n", + "| 2024-05-26| 90027.91000000008|\n", + "| 2024-05-27| 138572.9199999999|\n", + "| 2024-05-28| 281179.2100000025|\n", + "| 2024-05-29| 231462.55000000205|\n", + "| 2024-05-30| 248300.18000000023|\n", + "| 2024-05-31| 209678.3200000011|\n", + "+----------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "total_amount_by_date_df = (taxi_trip_group_by_date_df.sum()\n", + " .select('tpep_pickup_date', 'sum(total_amount)')\n", + " .withColumnRenamed('sum(total_amount)', 'total_amount_by_date')\n", + " .orderBy('tpep_pickup_date'))\n", + "total_amount_by_date_df.show(31)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 결과 저장하기" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "count_by_date_df.write.csv('/Users/admin/Projects/HMG_Softeer_DE/missions/W5/NYC-taxi-count_by_date', header=True)\n", + 
"total_amount_by_date_df.write.csv('/Users/admin/Projects/HMG_Softeer_DE/missions/W5/NYC-taxi-total_amount_by_date', header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv-310", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/missions/W5/spark-softeer/Dockerfile b/missions/W5/spark-softeer/Dockerfile new file mode 100644 index 00000000..998f39a3 --- /dev/null +++ b/missions/W5/spark-softeer/Dockerfile @@ -0,0 +1,39 @@ +FROM ubuntu:jammy + +EXPOSE 8080 + +RUN apt update -y && apt upgrade -y +RUN DEBIAN_FRONTEND=noninteractive apt install wget sudo openjdk-11-jdk -y + +RUN useradd sparkuser -s /bin/bash -d /home/sparkuser -p spark +RUN echo "sparkuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +RUN mkdir /home/sparkuser +RUN chown sparkuser:sparkuser /home/sparkuser + +USER sparkuser +WORKDIR /home/sparkuser + +RUN wget https://bootstrap.pypa.io/get-pip.py +RUN sudo chown sparkuser:sparkuser get-pip.py +RUN python3 get-pip.py +RUN rm get-pip.py +RUN python3 -m pip install pandas matplotlib pyspark pyarrow grpcio protobuf grpcio-status + +RUN wget https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz +RUN tar -zxvf spark-3.5.1-bin-hadoop3.tgz +RUN rm spark-3.5.1-bin-hadoop3.tgz +RUN mv spark-3.5.1-bin-hadoop3 spark +RUN sudo mv spark / + +WORKDIR /spark +ADD ./start.sh . +ADD config/fairscheduler.xml* /spark/conf +ADD config/log4j2.properties* /spark/conf +ADD config/metrics.properties* /spark/conf +ADD config/spark-defaults.conf* /spark/conf +ADD config/spark-env.sh* /spark/conf +ADD config/workers* /spark/conf +RUN sudo chown sparkuser:sparkuser ./start.sh +RUN sudo chmod +rwx ./start.sh +RUN echo "export PATH=/spark/bin:$PATH" >> ~/.bashrc \ No newline at end of file diff --git a/missions/W5/spark-softeer/README.md b/missions/W5/spark-softeer/README.md new file mode 100644 index 00000000..db2ae0e4 --- /dev/null +++ b/missions/W5/spark-softeer/README.md @@ -0,0 +1,18 @@ +# HMG Softeer Spark Container + +## Run Spark Cluster +```bash +# pwd => missions/W4/spark-softeer +docker compose up +``` + +## Submit a job to Spark +| Docker Container 외부에서 submit 한다면, driver 환경에 Python 버전이 3.10이여야 합니다. +```bash +docker exec spark-softeer-master-1 bin/spark-submit --files file --master spark://localhost:7077 client-application [args] + +# example +docker exec spark-softeer-master-1 bin/spark-submit --files userdata/pg74102.txt --master spark://localhost:7077 userdata/wordcount.py + +# output is stored in userdata/output_wordcount directory. 
+``` \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/fairscheduler.xml.template b/missions/W5/spark-softeer/config/fairscheduler.xml.template new file mode 100644 index 00000000..385b2e77 --- /dev/null +++ b/missions/W5/spark-softeer/config/fairscheduler.xml.template @@ -0,0 +1,31 @@ + + + + + + + FAIR + 1 + 2 + + + FIFO + 2 + 3 + + diff --git a/missions/W5/spark-softeer/config/log4j2.properties.template b/missions/W5/spark-softeer/config/log4j2.properties.template new file mode 100644 index 00000000..ab96e03b --- /dev/null +++ b/missions/W5/spark-softeer/config/log4j2.properties.template @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +rootLogger.level = info +rootLogger.appenderRef.stdout.ref = console + +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex + +# Set the default spark-shell/spark-sql log level to WARN. When running the +# spark-shell/spark-sql, the log level for these classes is used to overwrite +# the root logger's log level, so that the user can have different defaults +# for the shell and regular Spark apps. 
+logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny +appender.console.filter.1.onMismatch = neutral diff --git a/missions/W5/spark-softeer/config/metrics.properties.template b/missions/W5/spark-softeer/config/metrics.properties.template new file mode 100644 index 00000000..f52d33fd --- /dev/null +++ b/missions/W5/spark-softeer/config/metrics.properties.template @@ -0,0 +1,210 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# syntax: [instance].sink|source.[name].[options]=[value] + +# This file configures Spark's internal metrics system. The metrics system is +# divided into instances which correspond to internal components. +# Each instance can be configured to report its metrics to one or more sinks. +# Accepted values for [instance] are "master", "worker", "executor", "driver", +# and "applications". A wildcard "*" can be used as an instance name, in +# which case all instances will inherit the supplied property. +# +# Within an instance, a "source" specifies a particular set of grouped metrics. +# there are two kinds of sources: +# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will +# collect a Spark component's internal state. Each instance is paired with a +# Spark source that is added automatically. +# 2. Common sources, like JvmSource, which will collect low level state. 
+# These can be added through configuration options and are then loaded +# using reflection. +# +# A "sink" specifies where metrics are delivered to. Each instance can be +# assigned one or more sinks. +# +# The sink|source field specifies whether the property relates to a sink or +# source. +# +# The [name] field specifies the name of source or sink. +# +# The [options] field is the specific property of this source or sink. The +# source or sink is responsible for parsing this property. +# +# Notes: +# 1. To add a new sink, set the "class" option to a fully qualified class +# name (see examples below). +# 2. Some sinks involve a polling period. The minimum allowed polling period +# is 1 second. +# 3. Wildcard properties can be overridden by more specific properties. +# For example, master.sink.console.period takes precedence over +# *.sink.console.period. +# 4. A metrics specific configuration +# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be +# added to Java properties using -Dspark.metrics.conf=xxx if you want to +# customize metrics system. You can also put the file in ${SPARK_HOME}/conf +# and it will be loaded automatically. +# 5. The MetricsServlet sink is added by default as a sink in the master, +# worker and driver, and you can send HTTP requests to the "/metrics/json" +# endpoint to get a snapshot of all the registered metrics in JSON format. +# For master, requests to the "/metrics/master/json" and +# "/metrics/applications/json" endpoints can be sent separately to get +# metrics snapshots of the master instance and applications. This +# MetricsServlet does not have to be configured. +# 6. The metrics system can also be configured using Spark configuration +# parameters. The relevant parameter names are formed by adding the +# prefix "spark.metrics.conf." to the configuration entries detailed in +# this file (see examples below). + +## List of available common sources and their properties. + +# org.apache.spark.metrics.source.JvmSource +# Note: Currently, JvmSource is the only available common source. +# It can be added to an instance by setting the "class" option to its +# fully qualified class name (see examples below). + +## List of available sinks and their properties. + +# org.apache.spark.metrics.sink.ConsoleSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period + +# org.apache.spark.metrics.sink.CSVSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period +# directory /tmp Where to store CSV files + +# org.apache.spark.metrics.sink.GangliaSink +# Name: Default: Description: +# host NONE Hostname or multicast group of the Ganglia server, +# must be set +# port NONE Port of the Ganglia server(s), must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# ttl 1 TTL of messages sent by Ganglia +# dmax 0 Lifetime in seconds of metrics (0 never expired) +# mode multicast Ganglia network mode ('unicast' or 'multicast') + +# org.apache.spark.metrics.sink.JmxSink + +# org.apache.spark.metrics.sink.MetricsServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# sample false Whether to show entire set of samples for histograms +# ('false' or 'true') +# +# * Default path is /metrics/json for all instances except the master. 
The +# master has two paths: +# /metrics/applications/json # App information +# /metrics/master/json # Master information + +# org.apache.spark.metrics.sink.PrometheusServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# +# * Default path is /metrics/prometheus for all instances except the master. The +# master has two paths: +# /metrics/applications/prometheus # App information +# /metrics/master/prometheus # Master information + +# org.apache.spark.metrics.sink.GraphiteSink +# Name: Default: Description: +# host NONE Hostname of the Graphite server, must be set +# port NONE Port of the Graphite server, must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# prefix EMPTY STRING Prefix to prepend to every metric's name +# protocol tcp Protocol ("tcp" or "udp") to use +# regex NONE Optional filter to send only metrics matching this regex string + +# org.apache.spark.metrics.sink.StatsdSink +# Name: Default: Description: +# host 127.0.0.1 Hostname or IP of StatsD server +# port 8125 Port of StatsD server +# period 10 Poll period +# unit seconds Units of poll period +# prefix EMPTY STRING Prefix to prepend to metric name + +## Examples +# Enable JmxSink for all instances by class name +#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink + +# Enable ConsoleSink for all instances by class name +#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink + +# Enable StatsdSink for all instances by class name +#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink +#*.sink.statsd.prefix=spark + +# Polling period for the ConsoleSink +#*.sink.console.period=10 +# Unit of the polling period for the ConsoleSink +#*.sink.console.unit=seconds + +# Polling period for the ConsoleSink specific for the master instance +#master.sink.console.period=15 +# Unit of the polling period for the ConsoleSink specific for the master +# instance +#master.sink.console.unit=seconds + +# Enable CsvSink for all instances by class name +#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink + +# Polling period for the CsvSink +#*.sink.csv.period=1 +# Unit of the polling period for the CsvSink +#*.sink.csv.unit=minutes + +# Polling directory for CsvSink +#*.sink.csv.directory=/tmp/ + +# Polling period for the CsvSink specific for the worker instance +#worker.sink.csv.period=10 +# Unit of the polling period for the CsvSink specific for the worker instance +#worker.sink.csv.unit=minutes + +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for the Slf4JSink +#*.sink.slf4j.period=1 +# Unit of the polling period for the Slf4jSink +#*.sink.slf4j.unit=minutes + +# Example configuration for Graphite sink +#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink +#*.sink.graphite.host= +#*.sink.graphite.port= +#*.sink.graphite.period=10 +#*.sink.graphite.unit=seconds +#*.sink.graphite.prefix= + +# Enable JvmSource for instance master, worker, driver and executor +#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +# Example configuration for PrometheusServlet +#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet +#*.sink.prometheusServlet.path=/metrics/prometheus 
+#master.sink.prometheusServlet.path=/metrics/master/prometheus +#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/missions/W5/spark-softeer/config/spark-defaults.conf b/missions/W5/spark-softeer/config/spark-defaults.conf new file mode 100644 index 00000000..a444df45 --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-defaults.conf @@ -0,0 +1,28 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" +spark.master spark://master:7077 \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/spark-defaults.conf.template b/missions/W5/spark-softeer/config/spark-defaults.conf.template new file mode 100644 index 00000000..19cba6e7 --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-defaults.conf.template @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
+ +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/missions/W5/spark-softeer/config/spark-env.sh b/missions/W5/spark-softeer/config/spark-env.sh new file mode 100755 index 00000000..198636df --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-env.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. + +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in any mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) + +# Options read in any cluster manager using HDFS +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files + +# Options read in YARN client/cluster mode +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. 
"-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). +# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS + +# Options for beeline +# - SPARK_BEELINE_OPTS, to set config properties only for the beeline cli (e.g. "-Dx=y") +# - SPARK_BEELINE_MEMORY, Memory for beeline (e.g. 1000M, 2G) (Default: 1G) + +export SPARK_HOME=/spark +export SPARK_CONF_DIR=$SPARK_HOME/conf + +export SPARK_LOCAL_DIRS=/tmp +export SPARK_WORKER_DIR=/tmp +export SPARK_VERSION=3.5.1 \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/spark-env.sh.template b/missions/W5/spark-softeer/config/spark-env.sh.template new file mode 100755 index 00000000..d65cd8d1 --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-env.sh.template @@ -0,0 +1,84 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. 
+ +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in any mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) + +# Options read in any cluster manager using HDFS +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files + +# Options read in YARN client/cluster mode +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). 
+# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS + +# Options for beeline +# - SPARK_BEELINE_OPTS, to set config properties only for the beeline cli (e.g. "-Dx=y") +# - SPARK_BEELINE_MEMORY, Memory for beeline (e.g. 1000M, 2G) (Default: 1G) + +export SPARK_LOCAL_DIRS=/tmp +export SPARK_WORKER_DIR=/tmp \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/workers b/missions/W5/spark-softeer/config/workers new file mode 100644 index 00000000..11aa72ed --- /dev/null +++ b/missions/W5/spark-softeer/config/workers @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. +worker1 +worker2 \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/workers.template b/missions/W5/spark-softeer/config/workers.template new file mode 100644 index 00000000..be42a638 --- /dev/null +++ b/missions/W5/spark-softeer/config/workers.template @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. +localhost \ No newline at end of file diff --git a/missions/W5/spark-softeer/docker-compose.yml b/missions/W5/spark-softeer/docker-compose.yml new file mode 100644 index 00000000..2b4cd683 --- /dev/null +++ b/missions/W5/spark-softeer/docker-compose.yml @@ -0,0 +1,48 @@ +version: "2" +services: + master: + build: . + hostname: master + ports: + - 4040:4040 + - 7077:7077 + - 8080:8080 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh master + connect: + depends_on: + - master + build: . + hostname: connect + ports: + - 15002:15002 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh connect + worker1: + depends_on: + - master + build: . 
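+    # worker1 publishes port 8081 so its Spark worker web UI is reachable from the host; worker2 is not exposed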
+ hostname: worker1 + ports: + - 8081:8081 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh worker + worker2: + depends_on: + - master + build: . + hostname: worker2 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh worker \ No newline at end of file diff --git a/missions/W5/spark-softeer/start.sh b/missions/W5/spark-softeer/start.sh new file mode 100644 index 00000000..d73f73b4 --- /dev/null +++ b/missions/W5/spark-softeer/start.sh @@ -0,0 +1,15 @@ +#!/bin/bash + + +if [ $1 = "master" ]; then + LOG_FILE=$(./sbin/start-master.sh | sed "s/.\+logging to //") +elif [ $1 = "connect" ]; then + LOG_FILE=$(./sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:3.5.1 --master spark://master:7077 | sed "s/.\+logging to //") +elif [ $1 = "worker" ]; then + LOG_FILE=$(./sbin/start-worker.sh spark://master:7077 | sed "s/.\+logging to //") +else + echo "Usage: $0 [master|connect|worker]" + exit 1 +fi + +tail -f $LOG_FILE \ No newline at end of file