Skip to content

Commit

Permalink
Fall 2022 changes (#67)
Browse files Browse the repository at this point in the history
Co-authored-by: David Roher <[email protected]>
  • Loading branch information
droher and droher authored Dec 30, 2022
1 parent ad54ecd commit f5a0bd4
Show file tree
Hide file tree
Showing 16 changed files with 116 additions and 84 deletions.
8 changes: 4 additions & 4 deletions .env
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
CHADWICK_VERSION=v0.9.3
BASEBALLDATABANK_VERSION=dd1a4503b9d6ec2bdda5e345ba06c867e368dd13
RETROSHEET_VERSION=e540755f22b65d2f85f4da9180d1a31754c331f9
CHADWICK_VERSION=v0.9.5
BASEBALLDATABANK_VERSION=ccb3cef05e68f0085db4ada6d4a9ebab9435b452
RETROSHEET_VERSION=48334a58f7446d59746d81aa73c3e9fa9b2676e9

EXTRACT_DIR=extract
REPO=doublewick/boxball
VERSION=2022.0.0
VERSION=2023.0.0
24 changes: 24 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ x-clickhouse:
&clickhouse
build:
context: load/clickhouse
dockerfile: ../Dockerfile
target: clickhouse
platforms:
- "linux/amd64"
args:
- VERSION
image: ${REPO}:clickhouse-${VERSION}
Expand All @@ -55,6 +59,10 @@ x-drill:
&drill
build:
context: load/drill
dockerfile: ../Dockerfile
target: drill
platforms:
- "linux/amd64"
args:
- VERSION
image: ${REPO}:drill-${VERSION}
Expand All @@ -68,6 +76,10 @@ x-postgres:
&postgres
build:
context: load/postgres
dockerfile: ../Dockerfile
target: postgres
platforms:
- "linux/amd64"
args:
- VERSION
image: ${REPO}:postgres-${VERSION}
Expand All @@ -81,6 +93,10 @@ x-postgres-cstore-fdw:
&postgres-cstore-fdw
build:
context: load/postgres_cstore_fdw
dockerfile: ../Dockerfile
target: postgres-cstore-fdw
platforms:
- "linux/amd64"
args:
- VERSION
image: ${REPO}:postgres-cstore-fdw-${VERSION}
Expand All @@ -94,6 +110,10 @@ x-mysql:
&mysql
build:
context: load/mysql
dockerfile: ../Dockerfile
target: mysql
platforms:
- "linux/amd64"
args:
- VERSION
image: ${REPO}:mysql-${VERSION}
Expand All @@ -107,6 +127,10 @@ x-sqlite:
&sqlite
build:
context: load/sqlite
dockerfile: ../Dockerfile
target: sqlite
platforms:
- "linux/amd64"
args:
- VERSION
image: ${REPO}:sqlite-${VERSION}
Expand Down
2 changes: 1 addition & 1 deletion extract/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ ARG BUILD_ENV
ARG RETROSHEET_IMAGE=get-retrosheet-${BUILD_ENV}
ARG BASEBALLDATABANK_IMAGE=get-baseballdatabank-${BUILD_ENV}

FROM python:3.10.4-alpine3.15 AS build-common
FROM python:3.11-alpine3.17 AS build-common
RUN apk add --no-cache \
parallel \
libtool \
Expand Down
69 changes: 69 additions & 0 deletions load/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
ARG VERSION
FROM doublewick/boxball:ddl-${VERSION} as ddl

FROM yandex/clickhouse-server:22.9.7.34 as clickhouse
COPY z_load.sh /docker-entrypoint-initdb.d/
COPY --chown=clickhouse:clickhouse --from=ddl /ddl/clickhouse.sql /docker-entrypoint-initdb.d/
COPY --chown=clickhouse:clickhouse --from=parquet /transform/parquet /data

FROM drill/apache-drill:1.17.0 as drill
COPY --from=parquet /transform/parquet /data

FROM mysql:8.0.31-debian as mysql
ENV MYSQL_ALLOW_EMPTY_PASSWORD=yes
COPY my.cnf /etc/mysql/conf.d/
COPY A_unzip_csvs.sh z_remove_csvs.sh /docker-entrypoint-initdb.d/
RUN apt-get update && apt-get install -y --no-install-recommends zstd zip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY --chown=mysql:mysql --from=ddl /ddl/mysql.sql /docker-entrypoint-initdb.d/
COPY --chown=mysql:mysql --from=csv /transform/csv /data

FROM postgres:15.1 as postgres
RUN apt-get update && apt-get install -y --no-install-recommends zstd zip && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
COPY A_build_conf.sql z_run_conf.sql /docker-entrypoint-initdb.d/
COPY --chown=postgres:postgres --from=ddl /ddl/postgres.sql /docker-entrypoint-initdb.d/
COPY --chown=postgres:postgres --from=csv /transform/csv /data

FROM postgres:13.2 as postgres-cstore-fdw-build
RUN apt-get update && apt-get install -y --no-install-recommends postgresql-server-dev-13 build-essential zstd libprotobuf-c-dev protobuf-c-compiler wget ca-certificates unzip make gcc libpq-dev && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
RUN wget https://github.com/citusdata/cstore_fdw/archive/master.zip -O cstore_fdw.zip && \
unzip cstore_fdw.zip && \
mv cstore_fdw-master cstore_fdw
WORKDIR /cstore_fdw
RUN make && \
make install
WORKDIR /
RUN echo "shared_preload_libraries = 'cstore_fdw'" >> "${PGDATA}/postgresql.conf"
COPY --chown=postgres:postgres --from=ddl /ddl/postgres_cstore_fdw.sql /docker-entrypoint-initdb.d/
COPY --chown=postgres:postgres --from=csv /transform/csv /data
RUN cat /docker-entrypoint-initdb.d/postgres_cstore_fdw.sql

FROM postgres-cstore-fdw-build as postgres-cstore-fdw

FROM alpine:3.17 as sqlite-build
RUN apk add --no-cache \
zstd \
sqlite
RUN sqlite3 boxball.db ".databases"
COPY --from=ddl /ddl/sqlite.sql .
COPY --from=csv /transform/csv /data
RUN echo "Decompressing files..." && \
for f in /data/**/*.csv.zst; do zstd --rm -d ${f}; done && \
echo "Building db..." && \
< sqlite.sql sqlite3 -bail -echo boxball.db && \
rm -rf /data && \
zstd --rm boxball.db


FROM python:3.11-alpine3.17 AS sqlite
RUN apk add --no-cache \
zstd \
sqlite
RUN pip install sqlite-web==0.4.1
COPY --from=build boxball.db.zst /tmp/
ENTRYPOINT zstd --rm -d /tmp/boxball.db.zst -fo /db/boxball.db && sqlite_web -H 0.0.0.0 -x /db/boxball.db
8 changes: 0 additions & 8 deletions load/clickhouse/Dockerfile

This file was deleted.

5 changes: 0 additions & 5 deletions load/drill/Dockerfile

This file was deleted.

13 changes: 0 additions & 13 deletions load/mysql/Dockerfile

This file was deleted.

11 changes: 0 additions & 11 deletions load/postgres/Dockerfile

This file was deleted.

26 changes: 0 additions & 26 deletions load/sqlite/Dockerfile

This file was deleted.

14 changes: 7 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
pyhumps==1.6.1
zstandard==0.15.2
SQLAlchemy==1.3.23
pyhumps==3.8.0
zstandard==0.19.0
SQLAlchemy==1.4.45
sqlalchemy-fdw==0.3.0
clickhouse-sqlalchemy==0.1.5
pyarrow==3.0.0
pytest==6.2.2
pytest-cov==2.11.1
clickhouse-sqlalchemy==0.2.3
pyarrow==10.0.1
pytest==7.2.0
pytest-cov==4.0.0
codacy-coverage==1.3.11
2 changes: 1 addition & 1 deletion transform/ddl.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM python:3.10-slim-bullseye AS build-common
FROM python:3.11-slim-bullseye AS build-common
COPY requirements.txt .
RUN pip install -r requirements.txt
ENV PYTHONPATH="/"
Expand Down
2 changes: 1 addition & 1 deletion transform/parquet.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG VERSION
FROM doublewick/boxball:extract-${VERSION} as extract

FROM python:3.10-slim-bullseye AS build-common
FROM python:3.11-slim-bullseye AS build-common
COPY requirements.txt .
RUN pip install -r requirements.txt
ENV PYTHONPATH="/"
Expand Down
2 changes: 1 addition & 1 deletion transform/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
SQLAlchemy==1.3.23
sqlalchemy-fdw==0.3.0
clickhouse-sqlalchemy==0.1.5
pyarrow==7.0.0
pyarrow==10.0.1
zstandard==0.17.0
4 changes: 2 additions & 2 deletions transform/src/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# How many bytes in each CSV chunk to bring into memory.
# Larger sizes result in better compression and slightly faster time,
# but don't want to risk OOM issues on small build boxes.
BUFFER_SIZE_BYTES = 1000000000
BUFFER_SIZE_BYTES = 500000000

sql_type_lookup: Dict[Type[TypeEngine], str] = {
Integer: 'int32',
Expand Down Expand Up @@ -57,7 +57,7 @@ def get_path(prefix: Path, suffix: str):
arrow_schema = pa.schema(get_fields(table))
column_names = [name for name, dtype in get_fields(table)]

read_options = pcsv.ReadOptions(column_names=column_names, block_size=1000000000)
read_options = pcsv.ReadOptions(column_names=column_names, block_size=BUFFER_SIZE_BYTES)
parse_options = pcsv.ParseOptions(newlines_in_values=True)
convert_options = pcsv.ConvertOptions(column_types=arrow_schema, timestamp_parsers=["%Y%m%d", "%Y-%m-%d"],
true_values=["1", "T"], false_values=["0", "F"], strings_can_be_null=True)
Expand Down
7 changes: 4 additions & 3 deletions transform/src/schemas/baseballdatabank.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,9 +246,10 @@ class HallOfFame(Base):
player_id = Column(String(10), primary_key=True, nullable=False)
year_id = Column(SmallInteger, primary_key=True, nullable=False)
voted_by = Column(String(64), primary_key=True, nullable=False)
ballots = Column(SmallInteger)
needed = Column(SmallInteger)
votes = Column(SmallInteger)
# The 3 below are actually ints but there are some irregular nulls
ballots = Column(String(64))
needed = Column(String(64))
votes = Column(String(64))
inducted = Column(String(1))
category = Column(String(20))
needed_note = Column(String(25))
Expand Down
3 changes: 2 additions & 1 deletion transform/src/schemas/retrosheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ class Roster(Base):
bats = Column(CHAR(1), doc="Bat handedness")
throws = Column(CHAR(1), doc="Throw handedness")
team_id = Column(CHAR(3), primary_key=True, doc="Team ID")
position = Column(String(2), doc="Primary fielding position")
# TODO: Remove duplicate roster entry(s)
position = Column(String(2), primary_key=True, doc="Primary fielding position")


class Schedule(Base):
Expand Down

0 comments on commit f5a0bd4

Please sign in to comment.