From d12ddc905fd20950f5261c4d2cb29beec920a253 Mon Sep 17 00:00:00 2001 From: gilh Date: Fri, 10 Nov 2023 17:25:41 +0000 Subject: [PATCH] Working upto npld and bypm crawlers --- ingest/fc/fc-crawl/docker-compose.yml | 18 ++++---- ingest/fc/fc-kafka-ui/deploy.sh | 3 -- ingest/fc/fc-kafka-ui/docker-compose.yml | 6 +++ ingest/fc/fc-kafka/docker-compose.yml | 4 ++ ingest/fc/prod/deploy-fc-crawl.sh | 55 +++++++++++++++--------- ingest/fc/prod/deploy-fc-kafka-ui.sh | 3 ++ ingest/fc/prod/deploy-fc-kafka.sh | 9 ++-- ingest/fc/prod/env-aws-fc2023-prod.sh | 19 +++++++- ingest/fc/prod/kafka-create-topics.sh | 10 ++--- 9 files changed, 82 insertions(+), 45 deletions(-) delete mode 100644 ingest/fc/fc-kafka-ui/deploy.sh create mode 100755 ingest/fc/prod/deploy-fc-kafka-ui.sh diff --git a/ingest/fc/fc-crawl/docker-compose.yml b/ingest/fc/fc-crawl/docker-compose.yml index f5d58b4..577359c 100644 --- a/ingest/fc/fc-crawl/docker-compose.yml +++ b/ingest/fc/fc-crawl/docker-compose.yml @@ -19,9 +19,9 @@ services: - "WEBRENDER_WARC_PREFIX=BL-NPLD-WEBRENDER" - "CDXSERVER_ENDPOINT=${CDXSERVER_ENDPOINT}" volumes: - - "${STORAGE_PATH}/heritrix/output:/heritrix/output" - - "${TMP_STORAGE_PATH}/heritrix/npld/state:/heritrix/state" - - "${STORAGE_PATH}/surts/npld:/shared" # Shared configuration - surts file held here. + - "${HERITRIX_OUTPUT_PATH}:/heritrix/output" + - "${NPLD_STATE_PATH}:/heritrix/state" + - "${SURTS_NPLD_PATH}:/shared" # Shared configuration - surts file held here. 
deploy: replicas: 1 stop_grace_period: 5m # Give the H3 instances some time to shut down neatly following SIGTERM @@ -33,7 +33,7 @@ services: - default - kafka - # UKWA Heritrix for NPLD + # UKWA Heritrix for BYPM bypm-heritrix-worker: user: ${H3_UID} image: ukwa/heritrix:${HERITRIX_VERSION} @@ -50,9 +50,9 @@ services: - "WEBRENDER_WARC_PREFIX=BL-BYPM-WEBRENDER" - "CDXSERVER_ENDPOINT=${CDXSERVER_ENDPOINT}" volumes: - - "${STORAGE_PATH}/heritrix/output:/heritrix/output" - - "${TMP_STORAGE_PATH}/heritrix/bypm/state:/heritrix/state" - - "${STORAGE_PATH}/surts/bypm:/shared" # Shared configuration - surts file held here. + - "${HERITRIX_OUTPUT_PATH}:/heritrix/output" + - "${BYPM_STATE_PATH}:/heritrix/state" + - "${SURTS_BYPM_PATH}:/shared" # Shared configuration - surts file held here. deploy: replicas: 1 stop_grace_period: 5m # Give the H3 instances some time to shut down neatly following SIGTERM @@ -100,8 +100,8 @@ services: - "KAFKA_ACKS=1" - "CDXSERVER_ENDPOINT=${CDXSERVER_ENDPOINT}" volumes: - - "${STORAGE_PATH}/heritrix/wren:/output/warcs" - - "/mnt/localraid6/fc/warcprox:/host" # FAST disk for dedup db + - "${HERITRIX_WREN_PATH}:/output/warcs" + - "${WARCPROX_PATH}:/host" # FAST disk for dedup db ulimits: nproc: 2000 # See https://github.com/internetarchive/warcprox/blob/2.x/warcprox/warcproxy.py#L413 deploy: diff --git a/ingest/fc/fc-kafka-ui/deploy.sh b/ingest/fc/fc-kafka-ui/deploy.sh deleted file mode 100644 index 9275d24..0000000 --- a/ingest/fc/fc-kafka-ui/deploy.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/sh -docker stack deploy -c docker-compose.yml fc_ui_kafka - diff --git a/ingest/fc/fc-kafka-ui/docker-compose.yml b/ingest/fc/fc-kafka-ui/docker-compose.yml index fd1da55..9ac2cdf 100644 --- a/ingest/fc/fc-kafka-ui/docker-compose.yml +++ b/ingest/fc/fc-kafka-ui/docker-compose.yml @@ -4,11 +4,17 @@ services: kafka-ui: image: provectuslabs/kafka-ui:latest + ulimits: + nofile: + soft: 65536 + hard: 65536 ports: - 9000:8080 environment: + - 
"KAFKA_CLUSTERS_0_NAME=fc-kafka" - "KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=kafka:9092" - "DYNAMIC_CONFIG_ENABLED=true" + - "LOGGING_LEVEL_ROOT=info" networks: - kafka diff --git a/ingest/fc/fc-kafka/docker-compose.yml b/ingest/fc/fc-kafka/docker-compose.yml index ed2a5d1..633a484 100644 --- a/ingest/fc/fc-kafka/docker-compose.yml +++ b/ingest/fc/fc-kafka/docker-compose.yml @@ -15,6 +15,10 @@ services: kafka: image: wurstmeister/kafka:2.12-2.1.0 + ulimits: + nofile: + soft: 65536 + hard: 65536 ports: - target: 9094 published: 9094 diff --git a/ingest/fc/prod/deploy-fc-crawl.sh b/ingest/fc/prod/deploy-fc-crawl.sh index d3d46ba..445932a 100755 --- a/ingest/fc/prod/deploy-fc-crawl.sh +++ b/ingest/fc/prod/deploy-fc-crawl.sh @@ -1,28 +1,41 @@ -#!/bin/bash - -# Fail and halt execution on errors +#!/bin/sh set -e - -if [[ "$1" != "" ]]; then - ENV_TAG="$1" -else - echo "You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh." - exit 1 -fi +ENVFILE=$1 +DEBUG=1 -source ./env-${ENV_TAG}.sh +# read environment file +if [[ "${ENVFILE}" == "" ]]; then + echo "ERROR: You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh." + exit 1 +fi +if ! [[ -f ${ENVFILE} ]]; then + echo "ERROR: argument [${ENVFILE}] environment file missing" + exit 1 +fi +source ./${ENVFILE} -echo Using UID $H3_UID for Heritrix -mkdir -p ${STORAGE_PATH}/heritrix/output -mkdir -p ${STORAGE_PATH}/heritrix/wren -mkdir -p ${STORAGE_PATH}/surts/npld -mkdir -p ${STORAGE_PATH}/surts/bypm -mkdir -p ${TMP_STORAGE_PATH}/heritrix/npld/state -mkdir -p ${TMP_STORAGE_PATH}/heritrix/bypm/state -mkdir -p ${CDX_STORAGE_PATH} -mkdir -p /tmp/webrender -mkdir -p ${STORAGE_PATH}/prometheus-data +# check STORAGE_PATH exists, create any missing sub-directories +if ! 
[[ -d ${STORAGE_PATH} ]]; then + echo "ERROR: STORAGE_PATH [${STORAGE_PATH}] defined in [${ENVFILE}] missing" + exit 1 +fi +for _d in ${HERITRIX_OUTPUT_PATH} ${HERITRIX_WREN_PATH} ${SURTS_NPLD_PATH} ${SURTS_BYPM_PATH} ${NPLD_STATE_PATH} ${BYPM_STATE_PATH} ${CDX_STORAGE_PATH} ${TMP_WEBRENDER_PATH} ${PROMETHEUS_DATA_PATH} ${WARCPROX_PATH}; do + [[ ${DEBUG} ]] && echo -e "DEBUG]\t _d:\t [${_d}]" + if [[ "${_d}" == "" ]]; then + echo "ERROR: No directory defined" + exit 1 + fi + if ! [[ -d ${_d} ]]; then + [[ ${DEBUG} ]] && echo -e "DEBUG]\t making dir [${_d}]" + mkdir -p ${_d} || { + echo "ERROR: failed to make directory [${_d}]" + exit 1 + } + fi +done +# NOTE(review): removed stray debug 'exit' here - it stopped the script before the stack deploy below ever ran
+# start FC crawler stack docker stack deploy -c ../fc-crawl/docker-compose.yml fc_crawl diff --git a/ingest/fc/prod/deploy-fc-kafka-ui.sh b/ingest/fc/prod/deploy-fc-kafka-ui.sh new file mode 100755 index 0000000..a5f1fe2 --- /dev/null +++ b/ingest/fc/prod/deploy-fc-kafka-ui.sh @@ -0,0 +1,3 @@ +#!/bin/sh +docker stack deploy -c ../fc-kafka-ui/docker-compose.yml fc_ui_kafka + diff --git a/ingest/fc/prod/deploy-fc-kafka.sh b/ingest/fc/prod/deploy-fc-kafka.sh index 3037f4c..b97903f 100755 --- a/ingest/fc/prod/deploy-fc-kafka.sh +++ b/ingest/fc/prod/deploy-fc-kafka.sh @@ -1,13 +1,12 @@ #!/bin/sh set -e +ENVFILE=$1 DEBUG=1 # read environment file -if [[ "$1" != "" ]]; then - ENVFILE="$1" -else - echo "ERROR You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh." +if [[ "${ENVFILE}" == "" ]]; then + echo "ERROR: You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh." exit 1 fi if ! [[ -f ${ENVFILE} ]]; then @@ -22,7 +21,7 @@ if !
[[ -d ${STORAGE_PATH} ]]; then echo "ERROR: STORAGE_PATH [${STORAGE_PATH}] defined in [${ENVFILE}] missing" exit 1 fi -for _d in ${TMP_STORAGE_PATH} ${CDX_STORAGE_PATH} ${ZK_DATA_PATH} ${ZK_DATALOG_PATH} ${KAFKA_PATH}; do +for _d in ${TMP_STORAGE_PATH} ${ZK_DATA_PATH} ${ZK_DATALOG_PATH} ${KAFKA_PATH}; do if [[ "${_d}" == "" ]]; then echo "ERROR: No directory defined" exit 1 diff --git a/ingest/fc/prod/env-aws-fc2023-prod.sh b/ingest/fc/prod/env-aws-fc2023-prod.sh index 35722de..fbf017e 100644 --- a/ingest/fc/prod/env-aws-fc2023-prod.sh +++ b/ingest/fc/prod/env-aws-fc2023-prod.sh @@ -1,11 +1,26 @@ -# Common directories +#### Common directories +# kafka export STORAGE_PATH=/mnt/data/fc export TMP_STORAGE_PATH=${STORAGE_PATH}/tmp -export CDX_STORAGE_PATH=${STORAGE_PATH}/cdx export ZK_DATA_PATH=${STORAGE_PATH}/zookeeper/data export ZK_DATALOG_PATH=${STORAGE_PATH}/zookeeper/datalog export KAFKA_PATH=${STORAGE_PATH}/kafka +# + crawler +export HERITRIX_OUTPUT_PATH=${STORAGE_PATH}/heritrix/output +export HERITRIX_WREN_PATH=${STORAGE_PATH}/heritrix/wren +export SURTS_NPLD_PATH=${STORAGE_PATH}/surts/npld +export SURTS_BYPM_PATH=${STORAGE_PATH}/surts/bypm +export NPLD_STATE_PATH=${TMP_STORAGE_PATH}/heritrix/npld/state +export BYPM_STATE_PATH=${TMP_STORAGE_PATH}/heritrix/bypm/state +export CDX_STORAGE_PATH=${STORAGE_PATH}/cdx +export TMP_WEBRENDER_PATH=/tmp/webrender +export PROMETHEUS_DATA_PATH=${STORAGE_PATH}/prometheus-data +export WARCPROX_PATH=${STORAGE_PATH}/warcprox + # crawler details export CRAWL_HOST_LAN_IP=172.31.43.254 export CRAWL_HOST_WAN_IP=18.130.205.6 +export H3_UID=$(id -u) +export HERITRIX_VERSION=2.9.0 +export CDXSERVER_ENDPOINT=http://${CRAWL_HOST_LAN_IP}:8081/fc diff --git a/ingest/fc/prod/kafka-create-topics.sh b/ingest/fc/prod/kafka-create-topics.sh index 9919ae5..5c920a6 100755 --- a/ingest/fc/prod/kafka-create-topics.sh +++ b/ingest/fc/prod/kafka-create-topics.sh @@ -1,13 +1,13 @@ export KAFKA_IMAGE=wurstmeister/kafka:2.12-2.1.0 -docker run 
--net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.crawled --replication-factor 1 --partitions 16 --config compression.type=snappy +docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.crawled --replication-factor 1 --partitions 16 --config compression.type=snappy -docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy +docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy -docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.inscope.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy +docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.inscope.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy -docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.npld --replication-factor 1 --partitions 16 --config compression.type=snappy +docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.npld --replication-factor 1 --partitions 16 --config compression.type=snappy -docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.inscope.npld --replication-factor 1 --partitions 16 --config compression.type=snappy +docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic 
fc.inscope.npld --replication-factor 1 --partitions 16 --config compression.type=snappy