Working up to npld and bypm crawlers
GilHoggarth committed Nov 10, 2023
1 parent 8a66f2b commit d12ddc9
Showing 9 changed files with 82 additions and 45 deletions.
18 changes: 9 additions & 9 deletions ingest/fc/fc-crawl/docker-compose.yml
@@ -19,9 +19,9 @@ services:
- "WEBRENDER_WARC_PREFIX=BL-NPLD-WEBRENDER"
- "CDXSERVER_ENDPOINT=${CDXSERVER_ENDPOINT}"
volumes:
- "${STORAGE_PATH}/heritrix/output:/heritrix/output"
- "${TMP_STORAGE_PATH}/heritrix/npld/state:/heritrix/state"
- "${STORAGE_PATH}/surts/npld:/shared" # Shared configuration - surts file held here.
- "${HERITRIX_OUTPUT_PATH}:/heritrix/output"
- "${NPLD_STATE_PATH}:/heritrix/state"
- "${SURTS_NPLD_PATH}:/shared" # Shared configuration - surts file held here.
deploy:
replicas: 1
stop_grace_period: 5m # Give the H3 instances some time to shut down neatly following SIGTERM
@@ -33,7 +33,7 @@ services:
- default
- kafka

# UKWA Heritrix for NPLD
# UKWA Heritrix for BYPM
bypm-heritrix-worker:
user: ${H3_UID}
image: ukwa/heritrix:${HERITRIX_VERSION}
@@ -50,9 +50,9 @@
- "WEBRENDER_WARC_PREFIX=BL-BYPM-WEBRENDER"
- "CDXSERVER_ENDPOINT=${CDXSERVER_ENDPOINT}"
volumes:
- "${STORAGE_PATH}/heritrix/output:/heritrix/output"
- "${TMP_STORAGE_PATH}/heritrix/bypm/state:/heritrix/state"
- "${STORAGE_PATH}/surts/bypm:/shared" # Shared configuration - surts file held here.
- "${HERITRIX_OUTPUT_PATH}:/heritrix/output"
- "${BYPM_STATE_PATH}:/heritrix/state"
- "${SURTS_BYPM_PATH}:/shared" # Shared configuration - surts file held here.
deploy:
replicas: 1
stop_grace_period: 5m # Give the H3 instances some time to shut down neatly following SIGTERM
@@ -100,8 +100,8 @@ services:
- "KAFKA_ACKS=1"
- "CDXSERVER_ENDPOINT=${CDXSERVER_ENDPOINT}"
volumes:
- "${STORAGE_PATH}/heritrix/wren:/output/warcs"
- "/mnt/localraid6/fc/warcprox:/host" # FAST disk for dedup db
- "${HERITRIX_WREN_PATH}:/output/warcs"
- "${WARCPROX_PATH}:/host" # FAST disk for dedup db
ulimits:
nproc: 2000 # See https://github.com/internetarchive/warcprox/blob/2.x/warcprox/warcproxy.py#L413
deploy:
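In both Heritrix services above, the bind-mount sources now come from dedicated environment variables (HERITRIX_OUTPUT_PATH, NPLD_STATE_PATH, SURTS_NPLD_PATH and their BYPM/warcprox counterparts) instead of paths assembled from STORAGE_PATH and TMP_STORAGE_PATH. A minimal pre-deploy check, assuming the Compose CLI (docker compose) is available on the crawler host and the env file from this commit is used:

# Render the compose file with the exported *_PATH values substituted,
# to confirm the host paths that will be mounted before deploying the stack.
source ./env-aws-fc2023-prod.sh
docker compose -f ../fc-crawl/docker-compose.yml config | grep -B1 -A4 'volumes:'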
3 changes: 0 additions & 3 deletions ingest/fc/fc-kafka-ui/deploy.sh

This file was deleted.

6 changes: 6 additions & 0 deletions ingest/fc/fc-kafka-ui/docker-compose.yml
@@ -4,11 +4,17 @@ services:

kafka-ui:
image: provectuslabs/kafka-ui:latest
ulimits:
nofile:
soft: 65536
hard: 65536
ports:
- 9000:8080
environment:
- "KAFKA_CLUSTERS_0_NAME=fc-kafka"
- "KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS=kafka:9092"
- "DYNAMIC_CONFIG_ENABLED=true"
- "LOGGING_LEVEL_ROOT=info"
networks:
- kafka

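The raised open-file limit can be confirmed once the stack is running. A rough check, assuming the stack is deployed as fc_ui_kafka (as in deploy-fc-kafka-ui.sh below), so the task container name starts with fc_ui_kafka_kafka-ui:

# Print the nofile limit inside the running kafka-ui container;
# expected to report 65536 with the ulimits block above in effect.
docker exec "$(docker ps -q -f name=fc_ui_kafka_kafka-ui)" sh -c 'ulimit -n'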
4 changes: 4 additions & 0 deletions ingest/fc/fc-kafka/docker-compose.yml
@@ -15,6 +15,10 @@ services:

kafka:
image: wurstmeister/kafka:2.12-2.1.0
ulimits:
nofile:
soft: 65536
hard: 65536
ports:
- target: 9094
published: 9094
55 changes: 34 additions & 21 deletions ingest/fc/prod/deploy-fc-crawl.sh
@@ -1,28 +1,41 @@
#!/bin/bash

# Fail and halt execution on errors
#!/bin/sh
set -e

if [[ "$1" != "" ]]; then
ENV_TAG="$1"
else
echo "You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh."
exit 1
fi
ENVFILE=$1
DEBUG=1


source ./env-${ENV_TAG}.sh
# read environment file
if [[ "${ENVFILE}" == "" ]]; then
echo "ERROR: You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh."
exit 1
fi
if ! [[ -f ${ENVFILE} ]]; then
echo "ERROR: argument [${ENVFILE}] environment file missing"
exit 1
fi
source ./${ENVFILE}

echo Using UID $H3_UID for Heritrix

mkdir -p ${STORAGE_PATH}/heritrix/output
mkdir -p ${STORAGE_PATH}/heritrix/wren
mkdir -p ${STORAGE_PATH}/surts/npld
mkdir -p ${STORAGE_PATH}/surts/bypm
mkdir -p ${TMP_STORAGE_PATH}/heritrix/npld/state
mkdir -p ${TMP_STORAGE_PATH}/heritrix/bypm/state
mkdir -p ${CDX_STORAGE_PATH}
mkdir -p /tmp/webrender
mkdir -p ${STORAGE_PATH}/prometheus-data
# check STORAGE_PATH exists, create any missing sub-directories
if ! [[ -d ${STORAGE_PATH} ]]; then
echo "ERROR: STORAGE_PATH [${STORAGE_PATH}] defined in [${ENVFILE}] missing"
exit 1
fi
for _d in ${HERITRIX_OUTPUT_PATH} ${HERITRIX_WREN_PATH} ${SURTS_NPLD_PATH} ${SURTS_BYPM_PATH} ${NPLD_STATE_PATH} ${BYPM_STATE_PATH} ${CDX_STORAGE_PATH} ${TMP_WEBRENDER_PATH} ${PROMETHEUS_DATA_PATH} ${WARCPROX_PATH}; do
[[ ${DEBUG} ]] && echo -e "DEBUG]\t _d:\t [${_d}]"
if [[ "${_d}" == "" ]]; then
echo "ERROR: No directory defined"
exit 1
fi
if ! [[ -d ${_d} ]]; then
[[ ${DEBUG} ]] && echo -e "DEBUG]\t making dir [${_d}]"
mkdir -p ${_d} || {
echo "ERROR: failed to make directory [${_d}]"
exit 1
}
fi
done
exit

# start FC crawler stack
docker stack deploy -c ../fc-crawl/docker-compose.yml fc_crawl
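With this change the script takes the environment file itself as its first argument (ENVFILE=$1) rather than a host tag, and creates every crawl directory from the per-purpose *_PATH variables. A typical invocation would be the sketch below, using the env file added elsewhere in this commit; note that the bare exit near the end means that, as committed, the script stops after the directory checks and the docker stack deploy line does not yet run.

# Example invocation from ingest/fc/prod (env file name taken from this commit):
./deploy-fc-crawl.sh env-aws-fc2023-prod.sh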
3 changes: 3 additions & 0 deletions ingest/fc/prod/deploy-fc-kafka-ui.sh
@@ -0,0 +1,3 @@
#!/bin/sh
docker stack deploy -c ../fc-kafka-ui/docker-compose.yml fc_ui_kafka

9 changes: 4 additions & 5 deletions ingest/fc/prod/deploy-fc-kafka.sh
@@ -1,13 +1,12 @@
#!/bin/sh
set -e
ENVFILE=$1
DEBUG=1


# read environment file
if [[ "$1" != "" ]]; then
ENVFILE="$1"
else
echo "ERROR You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh."
if [[ "${ENVFILE}" == "" ]]; then
echo "ERROR: You must give an argument that specifies the deployment, e.g. crawler06 uses prod-env-crawler06.sh."
exit 1
fi
if ! [[ -f ${ENVFILE} ]]; then
@@ -22,7 +21,7 @@ if ! [[ -d ${STORAGE_PATH} ]]; then
echo "ERROR: STORAGE_PATH [${STORAGE_PATH}] defined in [${ENVFILE}] missing"
exit 1
fi
for _d in ${TMP_STORAGE_PATH} ${CDX_STORAGE_PATH} ${ZK_DATA_PATH} ${ZK_DATALOG_PATH} ${KAFKA_PATH}; do
for _d in ${TMP_STORAGE_PATH} ${ZK_DATA_PATH} ${ZK_DATALOG_PATH} ${KAFKA_PATH}; do
if [[ "${_d}" == "" ]]; then
echo "ERROR: No directory defined"
exit 1
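CDX_STORAGE_PATH drops out of this script's directory loop, leaving only the Kafka and Zookeeper paths here while deploy-fc-crawl.sh takes over the crawl-side directories. A plausible end-to-end order for these scripts, inferred from the filenames in this directory rather than stated in the commit:

# Hypothetical deployment order on the crawler host:
./deploy-fc-kafka.sh env-aws-fc2023-prod.sh    # Zookeeper + Kafka stack
./kafka-create-topics.sh                       # create the fc.* topics
./deploy-fc-kafka-ui.sh                        # Kafka UI, published on port 9000
./deploy-fc-crawl.sh env-aws-fc2023-prod.sh    # Heritrix / warcprox crawl stack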
19 changes: 17 additions & 2 deletions ingest/fc/prod/env-aws-fc2023-prod.sh
@@ -1,11 +1,26 @@
# Common directories
#### Common directories
# kafka
export STORAGE_PATH=/mnt/data/fc
export TMP_STORAGE_PATH=${STORAGE_PATH}/tmp
export CDX_STORAGE_PATH=${STORAGE_PATH}/cdx
export ZK_DATA_PATH=${STORAGE_PATH}/zookeeper/data
export ZK_DATALOG_PATH=${STORAGE_PATH}/zookeeper/datalog
export KAFKA_PATH=${STORAGE_PATH}/kafka

# + crawler
export HERITRIX_OUTPUT_PATH=${STORAGE_PATH}/heritrix/output
export HERITRIX_WREN_PATH=${STORAGE_PATH}/heritrix/wren
export SURTS_NPLD_PATH=${STORAGE_PATH}/surts/npld
export SURTS_BYPM_PATH=${STORAGE_PATH}/surts/bypm
export NPLD_STATE_PATH=${TMP_STORAGE_PATH}/heritrix/npld/state
export BYPM_STATE_PATH=${TMP_STORAGE_PATH}/heritrix/bypm/state
export CDX_STORAGE_PATH=${STORAGE_PATH}/cdx
export TMP_WEBRENDER_PATH=/tmp/webrender
export PROMETHEUS_DATA_PATH=${STORAGE_PATH}/prometheus-data
export WARCPROX_PATH=${STORAGE_PATH}/warcprox

# crawler details
export CRAWL_HOST_LAN_IP=172.31.43.254
export CRAWL_HOST_WAN_IP=18.130.205.6
export H3_UID=$(id -u)
export HERITRIX_VERSION=2.9.0
export CDXSERVER_ENDPOINT=http://${CRAWL_HOST_LAN_IP}:8081/fc
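All of the new *_PATH exports are derived from STORAGE_PATH and TMP_STORAGE_PATH, so sourcing the file is enough to check the fully expanded host paths, for example:

# Sanity-check the derived paths after sourcing the environment file.
source ./env-aws-fc2023-prod.sh
echo "${NPLD_STATE_PATH}"   # /mnt/data/fc/tmp/heritrix/npld/state
echo "${WARCPROX_PATH}"     # /mnt/data/fc/warcprox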
10 changes: 5 additions & 5 deletions ingest/fc/prod/kafka-create-topics.sh
@@ -1,13 +1,13 @@
export KAFKA_IMAGE=wurstmeister/kafka:2.12-2.1.0

docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.crawled --replication-factor 1 --partitions 16 --config compression.type=snappy
docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.crawled --replication-factor 1 --partitions 16 --config compression.type=snappy

docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy
docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy

docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.inscope.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy
docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.inscope.bypm --replication-factor 1 --partitions 16 --config compression.type=snappy

docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.npld --replication-factor 1 --partitions 16 --config compression.type=snappy
docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.tocrawl.npld --replication-factor 1 --partitions 16 --config compression.type=snappy

docker run --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.inscope.npld --replication-factor 1 --partitions 16 --config compression.type=snappy
docker run --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --create --topic fc.inscope.npld --replication-factor 1 --partitions 16 --config compression.type=snappy
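Once the topics have been created, they can be listed with the same image and network to confirm the five fc.* topics exist; a sketch using the fc_kafka_default network referenced above:

# Hypothetical verification step: list the topics registered in Zookeeper.
docker run --rm --ulimit nofile=1024:1024 --net=fc_kafka_default ${KAFKA_IMAGE} kafka-topics.sh --zookeeper zookeeper:2181 --list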

