From dda47595f90cae6338cbd057c6aa38908313a81c Mon Sep 17 00:00:00 2001 From: steelbear Date: Sun, 11 Aug 2024 23:15:22 +0900 Subject: [PATCH] W5M1 - Data Analysis using RDD --- missions/W5/NYC-TLC-with-RDD.ipynb | 473 ++++++++++++++++++ missions/W5/spark-softeer/Dockerfile | 39 ++ missions/W5/spark-softeer/README.md | 18 + .../config/fairscheduler.xml.template | 31 ++ .../config/log4j2.properties.template | 69 +++ .../config/metrics.properties.template | 210 ++++++++ .../spark-softeer/config/spark-defaults.conf | 28 ++ .../config/spark-defaults.conf.template | 27 + missions/W5/spark-softeer/config/spark-env.sh | 88 ++++ .../config/spark-env.sh.template | 84 ++++ missions/W5/spark-softeer/config/workers | 20 + .../W5/spark-softeer/config/workers.template | 19 + missions/W5/spark-softeer/docker-compose.yml | 48 ++ missions/W5/spark-softeer/start.sh | 15 + 14 files changed, 1169 insertions(+) create mode 100644 missions/W5/NYC-TLC-with-RDD.ipynb create mode 100644 missions/W5/spark-softeer/Dockerfile create mode 100644 missions/W5/spark-softeer/README.md create mode 100644 missions/W5/spark-softeer/config/fairscheduler.xml.template create mode 100644 missions/W5/spark-softeer/config/log4j2.properties.template create mode 100644 missions/W5/spark-softeer/config/metrics.properties.template create mode 100644 missions/W5/spark-softeer/config/spark-defaults.conf create mode 100644 missions/W5/spark-softeer/config/spark-defaults.conf.template create mode 100755 missions/W5/spark-softeer/config/spark-env.sh create mode 100755 missions/W5/spark-softeer/config/spark-env.sh.template create mode 100644 missions/W5/spark-softeer/config/workers create mode 100644 missions/W5/spark-softeer/config/workers.template create mode 100644 missions/W5/spark-softeer/docker-compose.yml create mode 100644 missions/W5/spark-softeer/start.sh diff --git a/missions/W5/NYC-TLC-with-RDD.ipynb b/missions/W5/NYC-TLC-with-RDD.ipynb new file mode 100644 index 00000000..f54b5ad0 --- /dev/null +++ b/missions/W5/NYC-TLC-with-RDD.ipynb @@ -0,0 +1,473 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.functions import col, min, max, sum, to_date" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Spark Session 시작하기" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "spark = SparkSession.builder.remote(\"sc://localhost\").appName(\"W5M1\").getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NYC 택시 데이터 가져오기" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|\n", + 
"+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "| 1| 2024-05-01 00:59:15| 2024-05-01 01:23:50| 1| 6.1| 1| N| 138| 145| 1| 28.2| 7.75| 0.5| 5.0| 0.0| 1.0| 42.45| 0.0| 1.75|\n", + "| 2| 2024-04-30 23:58:26| 2024-05-01 00:29:42| 1| 11.23| 1| N| 138| 249| 1| 46.4| 6.0| 0.5| 8.72| 0.0| 1.0| 66.87| 2.5| 1.75|\n", + "| 2| 2024-05-01 00:57:17| 2024-05-01 01:14:15| 1| 9.02| 1| N| 138| 170| 1| 35.9| 6.0| 0.5| 10.57| 6.94| 1.0| 65.16| 2.5| 1.75|\n", + "| 2| 2024-05-01 00:24:47| 2024-05-01 00:48:51| 1| 6.53| 1| N| 87| 133| 1| 30.3| 1.0| 0.5| 7.06| 0.0| 1.0| 42.36| 2.5| 0.0|\n", + "| 2| 2024-05-01 00:11:20| 2024-05-01 00:52:10| 1| 14.38| 1| N| 161| 165| 1| 61.8| 1.0| 0.5| 0.0| 0.0| 1.0| 66.8| 2.5| 0.0|\n", + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df = spark.read.parquet('/Users/admin/Projects/HMG_Softeer_DE/missions/W5/spark-softeer/userdata/NYC-TLC/yellow_tripdata_2024-05.parquet')\n", + "taxi_trip_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- VendorID: integer (nullable = true)\n", + " |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)\n", + " |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)\n", + " |-- passenger_count: long (nullable = true)\n", + " |-- trip_distance: double (nullable = true)\n", + " |-- RatecodeID: long (nullable = true)\n", + " |-- store_and_fwd_flag: string (nullable = true)\n", + " |-- PULocationID: integer (nullable = true)\n", + " |-- DOLocationID: integer (nullable = true)\n", + " |-- payment_type: long (nullable = true)\n", + " |-- fare_amount: double (nullable = true)\n", + " |-- extra: double (nullable = true)\n", + " |-- mta_tax: double (nullable = true)\n", + " |-- tip_amount: double (nullable = true)\n", + " |-- tolls_amount: double (nullable = true)\n", + " |-- improvement_surcharge: double (nullable = true)\n", + " |-- total_amount: double (nullable = true)\n", + " |-- congestion_surcharge: double (nullable = true)\n", + " |-- Airport_fee: double (nullable = true)\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 데이터 전처리\n", + "## 결측치 제거" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "numerical_columns = ['passenger_count', 'trip_distance', 'fare_amount', \n", + " 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', \n", + " 'improvement_surcharge', 'total_amount', 'congestion_surcharge', \n", + " 'Airport_fee']\n", + "minmax_columns = []\n", + "\n", + "for column_name in numerical_columns:\n", + " minmax_columns.append(min(column_name))\n", + " minmax_columns.append(max(column_name))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "|min(passenger_count)|max(passenger_count)|min(trip_distance)|max(trip_distance)|min(fare_amount)|max(fare_amount)|min(extra)|max(extra)|min(mta_tax)|max(mta_tax)|min(tip_amount)|max(tip_amount)|min(tolls_amount)|max(tolls_amount)|min(improvement_surcharge)|max(improvement_surcharge)|min(total_amount)|max(total_amount)|min(congestion_surcharge)|max(congestion_surcharge)|min(Airport_fee)|max(Airport_fee)|\n", + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "| 0| 9| 0.0| 249151.44| -866.69| 334076.32| -7.5| 65.99| -0.5| 4.0| -92.35| 437.84| -46.14| 528.56| -1.0| 1.0| -867.69| 334145.3| -2.5| 2.5| -1.75| 1.75|\n", + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.select(*minmax_columns).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "taxi_trip_df = (taxi_trip_df.filter(col('fare_amount') > 0)\n", + " .filter(col('extra') > 0)\n", + " .filter(col('mta_tax') > 0)\n", + " .filter(col('tip_amount') > 0)\n", + " .filter(col('tolls_amount') > 0)\n", + " .filter(col('improvement_surcharge') > 0)\n", + " .filter(col('total_amount') > 0)\n", + " .filter(col('congestion_surcharge') > 0)\n", + " .filter(col('Airport_fee') > 0)\n", + " .filter(col('trip_distance') > 0)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "|min(passenger_count)|max(passenger_count)|min(trip_distance)|max(trip_distance)|min(fare_amount)|max(fare_amount)|min(extra)|max(extra)|min(mta_tax)|max(mta_tax)|min(tip_amount)|max(tip_amount)|min(tolls_amount)|max(tolls_amount)|min(improvement_surcharge)|max(improvement_surcharge)|min(total_amount)|max(total_amount)|min(congestion_surcharge)|max(congestion_surcharge)|min(Airport_fee)|max(Airport_fee)|\n", + 
"+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "| 0| 6| 0.02| 39.66| 17.0| 244.5| 0.02| 11.75| 0.5| 0.5| 0.01| 110.0| 1.0| 63.94| 1.0| 1.0| 43.36| 314.28| 2.5| 2.5| 1.75| 1.75|\n", + "+--------------------+--------------------+------------------+------------------+----------------+----------------+----------+----------+------------+------------+---------------+---------------+-----------------+-----------------+--------------------------+--------------------------+-----------------+-----------------+-------------------------+-------------------------+----------------+----------------+\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.select(*minmax_columns).show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 필요한 columns만 고르기" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------+------------+\n", + "|tpep_pickup_datetime|trip_distance|total_amount|\n", + "+--------------------+-------------+------------+\n", + "| 2024-05-01 00:57:17| 9.02| 65.16|\n", + "| 2024-05-01 00:14:05| 8.53| 59.31|\n", + "| 2024-05-01 00:03:08| 27.5| 107.49|\n", + "| 2024-05-01 00:44:01| 11.0| 74.74|\n", + "| 2024-05-01 00:14:19| 8.32| 59.79|\n", + "+--------------------+-------------+------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df = taxi_trip_df.select('tpep_pickup_datetime', 'trip_distance', 'total_amount')\n", + "taxi_trip_df.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 날짜 기준으로 그룹화" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "taxi_trip_group_by_date_df = taxi_trip_df.withColumn('tpep_pickup_date', to_date(col('tpep_pickup_datetime'))).groupBy('tpep_pickup_date')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "84378" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "taxi_trip_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------+\n", + "|sum(total_amount)|\n", + "+-----------------+\n", + "| 6862441.78999949|\n", + "+-----------------+\n", + "\n" + ] + } + ], + "source": [ + "taxi_trip_df.select(sum(col('total_amount'))).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "average of trip distance: 11.696007727132685 miles\n" + ] + } + ], + "source": [ + "num_of_trips = taxi_trip_df.count()\n", + "total_trip_distance = taxi_trip_df.select(sum(col('trip_distance'))).collect()[0]['sum(trip_distance)']\n", + "\n", + "print('average of trip distance: ', total_trip_distance / num_of_trips, 'miles')" + ] + }, + { + "cell_type": "code", + 
"execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------+-----+\n", + "|tpep_pickup_date|count|\n", + "+----------------+-----+\n", + "| 2024-05-01| 2874|\n", + "| 2024-05-02| 3058|\n", + "| 2024-05-03| 3015|\n", + "| 2024-05-04| 1431|\n", + "| 2024-05-05| 2632|\n", + "| 2024-05-06| 3575|\n", + "| 2024-05-07| 3043|\n", + "| 2024-05-08| 3203|\n", + "| 2024-05-09| 3242|\n", + "| 2024-05-10| 2898|\n", + "| 2024-05-11| 1297|\n", + "| 2024-05-12| 2587|\n", + "| 2024-05-13| 3963|\n", + "| 2024-05-14| 3465|\n", + "| 2024-05-15| 3303|\n", + "| 2024-05-16| 3496|\n", + "| 2024-05-17| 2844|\n", + "| 2024-05-18| 1368|\n", + "| 2024-05-19| 2760|\n", + "| 2024-05-20| 3423|\n", + "| 2024-05-21| 2947|\n", + "| 2024-05-22| 2845|\n", + "| 2024-05-23| 2615|\n", + "| 2024-05-24| 2576|\n", + "| 2024-05-25| 1060|\n", + "| 2024-05-26| 1232|\n", + "| 2024-05-27| 1873|\n", + "| 2024-05-28| 3492|\n", + "| 2024-05-29| 2801|\n", + "| 2024-05-30| 2947|\n", + "| 2024-05-31| 2513|\n", + "+----------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "count_by_date_df = taxi_trip_group_by_date_df.count().orderBy('tpep_pickup_date')\n", + "count_by_date_df.show(31)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------+--------------------+\n", + "|tpep_pickup_date|total_amount_by_date|\n", + "+----------------+--------------------+\n", + "| 2024-05-01| 234102.3800000013|\n", + "| 2024-05-02| 253174.17000000217|\n", + "| 2024-05-03| 245122.76000000146|\n", + "| 2024-05-04| 108908.40000000017|\n", + "| 2024-05-05| 201508.48000000062|\n", + "| 2024-05-06| 292839.31000000227|\n", + "| 2024-05-07| 251233.74000000185|\n", + "| 2024-05-08| 265208.1300000018|\n", + "| 2024-05-09| 266810.56000000174|\n", + "| 2024-05-10| 242295.62000000122|\n", + "| 2024-05-11| 100074.6800000003|\n", + "| 2024-05-12| 191414.2600000006|\n", + "| 2024-05-13| 321041.43000000203|\n", + "| 2024-05-14| 290568.03000000113|\n", + "| 2024-05-15| 276198.3200000021|\n", + "| 2024-05-16| 293009.5300000017|\n", + "| 2024-05-17| 239929.89000000103|\n", + "| 2024-05-18| 108224.19000000009|\n", + "| 2024-05-19| 208423.63999999972|\n", + "| 2024-05-20| 281833.060000002|\n", + "| 2024-05-21| 246523.13000000146|\n", + "| 2024-05-22| 233623.27000000144|\n", + "| 2024-05-23| 218459.9300000015|\n", + "| 2024-05-24| 212158.6700000009|\n", + "| 2024-05-25| 80535.12000000005|\n", + "| 2024-05-26| 90027.91000000008|\n", + "| 2024-05-27| 138572.9199999999|\n", + "| 2024-05-28| 281179.2100000025|\n", + "| 2024-05-29| 231462.55000000205|\n", + "| 2024-05-30| 248300.18000000023|\n", + "| 2024-05-31| 209678.3200000011|\n", + "+----------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "total_amount_by_date_df = (taxi_trip_group_by_date_df.sum()\n", + " .select('tpep_pickup_date', 'sum(total_amount)')\n", + " .withColumnRenamed('sum(total_amount)', 'total_amount_by_date')\n", + " .orderBy('tpep_pickup_date'))\n", + "total_amount_by_date_df.show(31)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 결과 저장하기" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "count_by_date_df.write.csv('/Users/admin/Projects/HMG_Softeer_DE/missions/W5/NYC-taxi-count_by_date', header=True)\n", + 
"total_amount_by_date_df.write.csv('/Users/admin/Projects/HMG_Softeer_DE/missions/W5/NYC-taxi-total_amount_by_date', header=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "spark.stop()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv-310", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/missions/W5/spark-softeer/Dockerfile b/missions/W5/spark-softeer/Dockerfile new file mode 100644 index 00000000..998f39a3 --- /dev/null +++ b/missions/W5/spark-softeer/Dockerfile @@ -0,0 +1,39 @@ +FROM ubuntu:jammy + +EXPOSE 8080 + +RUN apt update -y && apt upgrade -y +RUN DEBIAN_FRONTEND=noninteractive apt install wget sudo openjdk-11-jdk -y + +RUN useradd sparkuser -s /bin/bash -d /home/sparkuser -p spark +RUN echo "sparkuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +RUN mkdir /home/sparkuser +RUN chown sparkuser:sparkuser /home/sparkuser + +USER sparkuser +WORKDIR /home/sparkuser + +RUN wget https://bootstrap.pypa.io/get-pip.py +RUN sudo chown sparkuser:sparkuser get-pip.py +RUN python3 get-pip.py +RUN rm get-pip.py +RUN python3 -m pip install pandas matplotlib pyspark pyarrow grpcio protobuf grpcio-status + +RUN wget https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz +RUN tar -zxvf spark-3.5.1-bin-hadoop3.tgz +RUN rm spark-3.5.1-bin-hadoop3.tgz +RUN mv spark-3.5.1-bin-hadoop3 spark +RUN sudo mv spark / + +WORKDIR /spark +ADD ./start.sh . +ADD config/fairscheduler.xml* /spark/conf +ADD config/log4j2.properties* /spark/conf +ADD config/metrics.properties* /spark/conf +ADD config/spark-defaults.conf* /spark/conf +ADD config/spark-env.sh* /spark/conf +ADD config/workers* /spark/conf +RUN sudo chown sparkuser:sparkuser ./start.sh +RUN sudo chmod +rwx ./start.sh +RUN echo "export PATH=/spark/bin:$PATH" >> ~/.bashrc \ No newline at end of file diff --git a/missions/W5/spark-softeer/README.md b/missions/W5/spark-softeer/README.md new file mode 100644 index 00000000..db2ae0e4 --- /dev/null +++ b/missions/W5/spark-softeer/README.md @@ -0,0 +1,18 @@ +# HMG Softeer Spark Container + +## Run Spark Cluster +```bash +# pwd => missions/W4/spark-softeer +docker compose up +``` + +## Submit a job to Spark +| Docker Container 외부에서 submit 한다면, driver 환경에 Python 버전이 3.10이여야 합니다. +```bash +docker exec spark-softeer-master-1 bin/spark-submit --files file --master spark://localhost:7077 client-application [args] + +# example +docker exec spark-softeer-master-1 bin/spark-submit --files userdata/pg74102.txt --master spark://localhost:7077 userdata/wordcount.py + +# output is stored in userdata/output_wordcount directory. 
+``` \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/fairscheduler.xml.template b/missions/W5/spark-softeer/config/fairscheduler.xml.template new file mode 100644 index 00000000..385b2e77 --- /dev/null +++ b/missions/W5/spark-softeer/config/fairscheduler.xml.template @@ -0,0 +1,31 @@ + + + + + + + FAIR + 1 + 2 + + + FIFO + 2 + 3 + + diff --git a/missions/W5/spark-softeer/config/log4j2.properties.template b/missions/W5/spark-softeer/config/log4j2.properties.template new file mode 100644 index 00000000..ab96e03b --- /dev/null +++ b/missions/W5/spark-softeer/config/log4j2.properties.template @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +rootLogger.level = info +rootLogger.appenderRef.stdout.ref = console + +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex + +# Set the default spark-shell/spark-sql log level to WARN. When running the +# spark-shell/spark-sql, the log level for these classes is used to overwrite +# the root logger's log level, so that the user can have different defaults +# for the shell and regular Spark apps. 
+logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny +appender.console.filter.1.onMismatch = neutral diff --git a/missions/W5/spark-softeer/config/metrics.properties.template b/missions/W5/spark-softeer/config/metrics.properties.template new file mode 100644 index 00000000..f52d33fd --- /dev/null +++ b/missions/W5/spark-softeer/config/metrics.properties.template @@ -0,0 +1,210 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# syntax: [instance].sink|source.[name].[options]=[value] + +# This file configures Spark's internal metrics system. The metrics system is +# divided into instances which correspond to internal components. +# Each instance can be configured to report its metrics to one or more sinks. +# Accepted values for [instance] are "master", "worker", "executor", "driver", +# and "applications". A wildcard "*" can be used as an instance name, in +# which case all instances will inherit the supplied property. +# +# Within an instance, a "source" specifies a particular set of grouped metrics. +# there are two kinds of sources: +# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will +# collect a Spark component's internal state. Each instance is paired with a +# Spark source that is added automatically. +# 2. Common sources, like JvmSource, which will collect low level state. 
+# These can be added through configuration options and are then loaded +# using reflection. +# +# A "sink" specifies where metrics are delivered to. Each instance can be +# assigned one or more sinks. +# +# The sink|source field specifies whether the property relates to a sink or +# source. +# +# The [name] field specifies the name of source or sink. +# +# The [options] field is the specific property of this source or sink. The +# source or sink is responsible for parsing this property. +# +# Notes: +# 1. To add a new sink, set the "class" option to a fully qualified class +# name (see examples below). +# 2. Some sinks involve a polling period. The minimum allowed polling period +# is 1 second. +# 3. Wildcard properties can be overridden by more specific properties. +# For example, master.sink.console.period takes precedence over +# *.sink.console.period. +# 4. A metrics specific configuration +# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be +# added to Java properties using -Dspark.metrics.conf=xxx if you want to +# customize metrics system. You can also put the file in ${SPARK_HOME}/conf +# and it will be loaded automatically. +# 5. The MetricsServlet sink is added by default as a sink in the master, +# worker and driver, and you can send HTTP requests to the "/metrics/json" +# endpoint to get a snapshot of all the registered metrics in JSON format. +# For master, requests to the "/metrics/master/json" and +# "/metrics/applications/json" endpoints can be sent separately to get +# metrics snapshots of the master instance and applications. This +# MetricsServlet does not have to be configured. +# 6. The metrics system can also be configured using Spark configuration +# parameters. The relevant parameter names are formed by adding the +# prefix "spark.metrics.conf." to the configuration entries detailed in +# this file (see examples below). + +## List of available common sources and their properties. + +# org.apache.spark.metrics.source.JvmSource +# Note: Currently, JvmSource is the only available common source. +# It can be added to an instance by setting the "class" option to its +# fully qualified class name (see examples below). + +## List of available sinks and their properties. + +# org.apache.spark.metrics.sink.ConsoleSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period + +# org.apache.spark.metrics.sink.CSVSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period +# directory /tmp Where to store CSV files + +# org.apache.spark.metrics.sink.GangliaSink +# Name: Default: Description: +# host NONE Hostname or multicast group of the Ganglia server, +# must be set +# port NONE Port of the Ganglia server(s), must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# ttl 1 TTL of messages sent by Ganglia +# dmax 0 Lifetime in seconds of metrics (0 never expired) +# mode multicast Ganglia network mode ('unicast' or 'multicast') + +# org.apache.spark.metrics.sink.JmxSink + +# org.apache.spark.metrics.sink.MetricsServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# sample false Whether to show entire set of samples for histograms +# ('false' or 'true') +# +# * Default path is /metrics/json for all instances except the master. 
The +# master has two paths: +# /metrics/applications/json # App information +# /metrics/master/json # Master information + +# org.apache.spark.metrics.sink.PrometheusServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# +# * Default path is /metrics/prometheus for all instances except the master. The +# master has two paths: +# /metrics/applications/prometheus # App information +# /metrics/master/prometheus # Master information + +# org.apache.spark.metrics.sink.GraphiteSink +# Name: Default: Description: +# host NONE Hostname of the Graphite server, must be set +# port NONE Port of the Graphite server, must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# prefix EMPTY STRING Prefix to prepend to every metric's name +# protocol tcp Protocol ("tcp" or "udp") to use +# regex NONE Optional filter to send only metrics matching this regex string + +# org.apache.spark.metrics.sink.StatsdSink +# Name: Default: Description: +# host 127.0.0.1 Hostname or IP of StatsD server +# port 8125 Port of StatsD server +# period 10 Poll period +# unit seconds Units of poll period +# prefix EMPTY STRING Prefix to prepend to metric name + +## Examples +# Enable JmxSink for all instances by class name +#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink + +# Enable ConsoleSink for all instances by class name +#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink + +# Enable StatsdSink for all instances by class name +#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink +#*.sink.statsd.prefix=spark + +# Polling period for the ConsoleSink +#*.sink.console.period=10 +# Unit of the polling period for the ConsoleSink +#*.sink.console.unit=seconds + +# Polling period for the ConsoleSink specific for the master instance +#master.sink.console.period=15 +# Unit of the polling period for the ConsoleSink specific for the master +# instance +#master.sink.console.unit=seconds + +# Enable CsvSink for all instances by class name +#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink + +# Polling period for the CsvSink +#*.sink.csv.period=1 +# Unit of the polling period for the CsvSink +#*.sink.csv.unit=minutes + +# Polling directory for CsvSink +#*.sink.csv.directory=/tmp/ + +# Polling period for the CsvSink specific for the worker instance +#worker.sink.csv.period=10 +# Unit of the polling period for the CsvSink specific for the worker instance +#worker.sink.csv.unit=minutes + +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for the Slf4JSink +#*.sink.slf4j.period=1 +# Unit of the polling period for the Slf4jSink +#*.sink.slf4j.unit=minutes + +# Example configuration for Graphite sink +#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink +#*.sink.graphite.host= +#*.sink.graphite.port= +#*.sink.graphite.period=10 +#*.sink.graphite.unit=seconds +#*.sink.graphite.prefix= + +# Enable JvmSource for instance master, worker, driver and executor +#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +# Example configuration for PrometheusServlet +#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet +#*.sink.prometheusServlet.path=/metrics/prometheus 
+#master.sink.prometheusServlet.path=/metrics/master/prometheus +#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/missions/W5/spark-softeer/config/spark-defaults.conf b/missions/W5/spark-softeer/config/spark-defaults.conf new file mode 100644 index 00000000..a444df45 --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-defaults.conf @@ -0,0 +1,28 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" +spark.master spark://master:7077 \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/spark-defaults.conf.template b/missions/W5/spark-softeer/config/spark-defaults.conf.template new file mode 100644 index 00000000..19cba6e7 --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-defaults.conf.template @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
+ +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/missions/W5/spark-softeer/config/spark-env.sh b/missions/W5/spark-softeer/config/spark-env.sh new file mode 100755 index 00000000..198636df --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-env.sh @@ -0,0 +1,88 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. + +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in any mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) + +# Options read in any cluster manager using HDFS +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files + +# Options read in YARN client/cluster mode +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. 
"-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). +# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS + +# Options for beeline +# - SPARK_BEELINE_OPTS, to set config properties only for the beeline cli (e.g. "-Dx=y") +# - SPARK_BEELINE_MEMORY, Memory for beeline (e.g. 1000M, 2G) (Default: 1G) + +export SPARK_HOME=/spark +export SPARK_CONF_DIR=$SPARK_HOME/conf + +export SPARK_LOCAL_DIRS=/tmp +export SPARK_WORKER_DIR=/tmp +export SPARK_VERSION=3.5.1 \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/spark-env.sh.template b/missions/W5/spark-softeer/config/spark-env.sh.template new file mode 100755 index 00000000..d65cd8d1 --- /dev/null +++ b/missions/W5/spark-softeer/config/spark-env.sh.template @@ -0,0 +1,84 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. 
+ +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in any mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) + +# Options read in any cluster manager using HDFS +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files + +# Options read in YARN client/cluster mode +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). 
+# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS + +# Options for beeline +# - SPARK_BEELINE_OPTS, to set config properties only for the beeline cli (e.g. "-Dx=y") +# - SPARK_BEELINE_MEMORY, Memory for beeline (e.g. 1000M, 2G) (Default: 1G) + +export SPARK_LOCAL_DIRS=/tmp +export SPARK_WORKER_DIR=/tmp \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/workers b/missions/W5/spark-softeer/config/workers new file mode 100644 index 00000000..11aa72ed --- /dev/null +++ b/missions/W5/spark-softeer/config/workers @@ -0,0 +1,20 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. +worker1 +worker2 \ No newline at end of file diff --git a/missions/W5/spark-softeer/config/workers.template b/missions/W5/spark-softeer/config/workers.template new file mode 100644 index 00000000..be42a638 --- /dev/null +++ b/missions/W5/spark-softeer/config/workers.template @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. +localhost \ No newline at end of file diff --git a/missions/W5/spark-softeer/docker-compose.yml b/missions/W5/spark-softeer/docker-compose.yml new file mode 100644 index 00000000..2b4cd683 --- /dev/null +++ b/missions/W5/spark-softeer/docker-compose.yml @@ -0,0 +1,48 @@ +version: "2" +services: + master: + build: . + hostname: master + ports: + - 4040:4040 + - 7077:7077 + - 8080:8080 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh master + connect: + depends_on: + - master + build: . + hostname: connect + ports: + - 15002:15002 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh connect + worker1: + depends_on: + - master + build: . 
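+    # worker1 publishes port 8081 so its Spark worker web UI is reachable from the host; worker2 is not exposed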
+ hostname: worker1 + ports: + - 8081:8081 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh worker + worker2: + depends_on: + - master + build: . + hostname: worker2 + volumes: + - type: bind + source: userdata/ + target: /spark/userdata/ + command: ./start.sh worker \ No newline at end of file diff --git a/missions/W5/spark-softeer/start.sh b/missions/W5/spark-softeer/start.sh new file mode 100644 index 00000000..d73f73b4 --- /dev/null +++ b/missions/W5/spark-softeer/start.sh @@ -0,0 +1,15 @@ +#!/bin/bash + + +if [ $1 = "master" ]; then + LOG_FILE=$(./sbin/start-master.sh | sed "s/.\+logging to //") +elif [ $1 = "connect" ]; then + LOG_FILE=$(./sbin/start-connect-server.sh --packages org.apache.spark:spark-connect_2.12:3.5.1 --master spark://master:7077 | sed "s/.\+logging to //") +elif [ $1 = "worker" ]; then + LOG_FILE=$(./sbin/start-worker.sh spark://master:7077 | sed "s/.\+logging to //") +else + echo "Usage: $0 [master|connect|worker]" + exit 1 +fi + +tail -f $LOG_FILE \ No newline at end of file