diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..8fe8573
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,5 @@
+*.csv
+.dolt/
+.sqlhistory
+*.log
+*.out
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..86ad510
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,31 @@
+FROM continuumio/anaconda3
+
+# Install a pinned dolt release. Download, unpack and clean-up share one layer
+# so the tarball never persists in the image.
+RUN wget https://github.com/dolthub/dolt/releases/download/v0.40.19/dolt-linux-amd64.tar.gz -O /tmp/dolt-linux-amd64.tar.gz && cd /tmp && tar -zxvf /tmp/dolt-linux-amd64.tar.gz && cp /tmp/dolt-linux-amd64/bin/dolt /usr/bin/ && rm -rf /tmp/*
+
+# psmisc provides the killall used by dump_qlib_bin.sh; gcc/g++ are presumably
+# needed to build qlib's native extensions (confirm before removing).
+# update and install run in one layer, and the apt lists are removed in that
+# same layer so they are not shipped in the image.
+RUN apt-get update && apt-get install -y git psmisc zip gcc g++ \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone the dolt database into /investment_data, then pull the GitHub repository
+# of the same name into that directory.
+RUN cd / && dolt clone chenditc/investment_data
+RUN cd /investment_data && git init && git pull https://github.com/chenditc/investment_data.git
+
+# Install qlib from source together with the requirements of its yahoo data
+# collector. --no-cache-dir keeps pip's download cache out of the layer.
+RUN pip install --no-cache-dir numpy && pip install --no-cache-dir --upgrade cython \
+    && cd / && git clone https://github.com/microsoft/qlib.git \
+    && cd /qlib/ && pip install --no-cache-dir . && pip install --no-cache-dir -r scripts/data_collector/yahoo/requirements.txt
+
+# The manifest is copied on its own so this layer is only rebuilt when
+# requirements.txt changes, not on every source change.
+COPY ./requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+COPY . /app
+WORKDIR /investment_data/
diff --git a/README.md b/README.md
index 630924d..52011e9 100644
--- a/README.md
+++ b/README.md
@@ -7,16 +7,20 @@ Follow https://github.com/dolthub/dolt
 
 ## Export to qlib format
 ```
-dolt sql-server -H 0.0.0.0
+docker run -v /:/output -it --rm chenditc/investment_data bash -c "bash dump_qlib_bin.sh && cp ./qlib_bin.tar.gz /output/"
+```
 
-# Run in this repo's root directory
-mkdir ./qlib/qlib_source
-python ./qlib/dump_all_to_qlib_source.py
+## Daily Update
+```
+export TUSHARE=
+bash daily_update.sh
+```
 
-# Run qlib's yahoo converter: https://github.com/microsoft/qlib/tree/main/scripts/data_collector/yahoo
-python3 ~/qlib/scripts/data_collector/yahoo/collector.py normalize_data --source_dir /mnt/investment_data/qlib/qlib_source/ --normalize_dir ./qlib_normalize --max_workers=16 --date_field_name="tradedate"
-python3 ~/qlib/scripts/dump_bin.py dump_all --csv_path ./qlib_normalize/ --qlib_dir ./qlib_bin --date_field_name=tradedate --exclude_fields=tradedate,symbol
+## Daily update and output
 ```
+docker run -v /:/output -it --rm chenditc/investment_data bash -c "bash daily_update.sh && bash dump_qlib_bin.sh && cp ./qlib_bin.tar.gz /output/"
+```
+
 
 # Initiative
 1. Try to fill in missing data by combining data from multiple data source. For example, delist company's data.
diff --git a/daily_update.sh b/daily_update.sh
index ed51ac9..fcc6142 100644
--- a/daily_update.sh
+++ b/daily_update.sh
@@ -1,3 +1,6 @@
+# Sync the local dolt database first so the start date below is computed from up-to-date data.
+dolt pull chenditc/investment_data
+
 echo "Updating index weight"
 startdate=$(dolt sql -q "select DATE_FORMAT(DATE_ADD(max(trade_date), INTERVAL 1 DAY), '%Y%m%d') from ts_index_weight" -r csv | tail -1)
 python3 tushare/dump_index_weight.py --start_date=$startdate
diff --git a/dump_qlib_bin.sh b/dump_qlib_bin.sh
new file mode 100644
index 0000000..b05cfbd
--- /dev/null
+++ b/dump_qlib_bin.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Export the dolt database to qlib's binary format and pack the result as
+# ./qlib_bin.tar.gz inside /investment_data/.
+
+# Stop if the data directory is missing instead of running the export from
+# whatever directory the caller happened to be in.
+cd /investment_data/ || exit 1
+dolt pull chenditc/investment_data
+
+# A dolt SQL server is started in the background for the dump script
+# (presumably it reads the data over SQL) and stopped once the dump is done.
+# NOTE(review): nothing here waits for the server to accept connections before
+# the dump starts - confirm the dump script tolerates that.
+dolt sql-server &
+# -p keeps a re-run from erroring when the directory already exists.
+mkdir -p ./qlib/qlib_source
+python3 ./qlib/dump_all_to_qlib_source.py
+killall dolt
+
+# Normalize the dumped source files with qlib's yahoo collector, then write
+# the qlib dataset to ./qlib_bin.
+python3 /qlib/scripts/data_collector/yahoo/collector.py normalize_data --source_dir ./qlib/qlib_source/ --normalize_dir ./qlib_normalize --max_workers=16 --date_field_name="tradedate"
+python3 /qlib/scripts/dump_bin.py dump_all --csv_path ./qlib_normalize/ --qlib_dir ./qlib_bin --date_field_name=tradedate --exclude_fields=tradedate,symbol
+
+# Second server run, this time for the index weight dump.
+dolt sql-server &
+mkdir -p ./qlib/qlib_index/
+python3 ./qlib/dump_index_weight.py
+killall dolt
+
+# Ship the csi* index files alongside the dataset's instruments.
+cp qlib/qlib_index/csi* ./qlib_bin/instruments/
+
+tar -czvf ./qlib_bin.tar.gz ./qlib_bin/
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..bdde8d8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+tushare
+sqlalchemy
+pymysql
+fire
\ No newline at end of file