# Build steps, run from inside the Dep directory.
# Abort if the directory change fails so sbt is never run from the wrong place.
cd Dep || exit 1
# Clean, then publish the artifacts locally.
sbt clean publishLocal
# Build the assembly JAR.
sbt assembly
# Use any Spark version compatible with the Scala version used in the project.
# Create the directory for Spark event logs (default: file:/tmp/spark-events).
# -p keeps this step re-runnable when the directory already exists.
mkdir -p /tmp/spark-events
# Start the Spark History Server.
# NOTE(review): the original line stopped at "sbin/" with no script name;
# start-history-server.sh is the launcher shipped in Spark's sbin/ directory —
# confirm it matches your Spark installation.
# The :? expansion aborts with a message when SPARK_HOME is unset or empty.
bash "${SPARK_HOME:?SPARK_HOME must be set}/sbin/start-history-server.sh"
# Local run against an on-disk data dump.
# Arguments passed: assembly JAR, Spark master, then (presumably) start date,
# end date, aggregation interval and the input directory.
# NOTE(review): the script file name after "scripts/generic/" appears to be
# missing, and a trailing argument (e.g. an output path) may have been lost as
# well — confirm against the script's usage before running.
# 'local[2]' is quoted so the shell cannot treat it as a glob pattern.
# The last argument has no trailing backslash, so the gcloud command below is
# a separate command and not part of this argument list.
bash scripts/generic/ \
  target/scala-2.12/AlgoSEDD-assembly-0.1-SNAPSHOT.jar \
  'local[2]' \
  2010-01-01 \
  2022-01-01 \
  '8 weeks' \
  /mnt/datastore/data/StackExchangeDataDump/2021-06-07/

# Submit the job to a Google Cloud Dataproc cluster.
# Everything after the bare "--" is passed to the job itself, not to gcloud.
gcloud dataproc jobs submit spark \
  --cluster=cluster-5cff \
  --class=pl.epsilondeltalimit.analyzer.StackExchangeDataDumpAnalyzerSingle \
  --jars=gs://stack-exchange-data-dump-analyzer-single/StackExchangeDataDumpAnalyzerSingle-0.1-SNAPSHOT-jar-with-dependencies.jar \
  --region=europe-west3 \
  --driver-log-levels root=DEBUG \
  -- 2010-01-01 2021-01-01 '13 weeks' gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump/ gs://stack-exchange-data-dump-analyzer-single/output/
# relative popularity
#bash scripts/plot/generic/ \
# <csv result file> \
# <tag name> \
# <aggregation interval> \
# <y axis max> \
# <optional >
#bash scripts/ \
# output/tag\=print-quality/part-00000-9b6e8399-3e48-4a97-a355-4b239b975515.c000.csv \
# print-quality \
# 8weeks \
# 1.0 \
# 0.1
# entries count
bash scripts/plot/generic/ \
target/output/\=star-wars/part-00000-28150a61-065b-4e24-8435-3b71b4911bcf.c000.csv \
star-wars \
8weeks \