diff --git a/.gitignore b/.gitignore
index 99ed3ad..c86904e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,4 +37,7 @@ Network Trash Folder
 Temporary Items
 .apdisk
 
+#Vscode
+.vscode
+
 # End of https://www.gitignore.io/api/macos,jupyternotebook
\ No newline at end of file
diff --git a/README.md b/README.md
index 007fb74..4bbac68 100644
--- a/README.md
+++ b/README.md
@@ -1,28 +1,25 @@
 # wsae-lstm
 
-Repository that aims to implement the WSAE-LSTM model and replicate the results of said model as defined in "A deep learning framework for financial time series using stacked autoencoders and long-short term memory" by Wei Bao, Jun Yue , Yulei Rao (2017).
+Repository that aims to implement the WSAE-LSTM model and replicate its results as defined in *"A deep learning framework for financial time series using stacked autoencoders and long-short term memory"* by Wei Bao, Jun Yue, Yulei Rao (2017).
 
-## Source journal article
 
 https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0180944
 
-### Source article
+## Source journal (APA format)
 
 Bao W, Yue J, Rao Y (2017). "A deep learning framework for financial time series using stacked autoencoders and long-short term memory". PLOS ONE 12(7): e0180944. https://doi.org/10.1371/journal.pone.0180944
 
-### Source article data:
+### Source journal data (saved into `data/raw` folder):
 
 DOI:10.6084/m9.figshare.5028110
 
 https://figshare.com/articles/Raw_Data/5028110
 
 ### `mlpanda/DeepLearning_Financial`
 
-Repository of an existing attempt to replicate above paper in PyTorch, checked out as git submodule for reference:
+Repository of an existing attempt to replicate the above paper in PyTorch, checked out as a git submodule for reference in the `submodules` folder:
 
 `mlpanda/DeepLearning_Financial: https://github.com/mlpanda/DeepLearning_Financial`
 
-## Misc
+## Repository structure
 
-`/documentation/citation`:
+This repository uses a directory structure based on [Cookiecutter Data Science](http://drivendata.github.io/cookiecutter-data-science/#directory-structure).
 
-.ris (EndNote, Reference Manager, ProCite, RefWork) compatible
-
-.bib (BibDesk, LaTeX) compatible
\ No newline at end of file
+Repository package requirements/dependencies are defined in `requirements.txt` for pip and/or `environment.yml` for Anaconda/conda.
\ No newline at end of file
diff --git a/data/interim/clean_data.xlsx b/data/interim/clean_data.xlsx
index 67d9899..ab3629f 100644
Binary files a/data/interim/clean_data.xlsx and b/data/interim/clean_data.xlsx differ
diff --git a/data/interim/clean_data_futures.xlsx b/data/interim/clean_data_futures.xlsx
index d297073..2cb96c1 100644
Binary files a/data/interim/clean_data_futures.xlsx and b/data/interim/clean_data_futures.xlsx differ
diff --git a/data/interim/clean_data_index.xlsx b/data/interim/clean_data_index.xlsx
index 8b2723c..e87b8c4 100644
Binary files a/data/interim/clean_data_index.xlsx and b/data/interim/clean_data_index.xlsx differ
diff --git a/docs/changelog.md b/docs/changelog.md
new file mode 100644
index 0000000..4b3422a
--- /dev/null
+++ b/docs/changelog.md
@@ -0,0 +1,7 @@
+# Changelog
+
+2019-02-06
+
+- Added `clean_dataset.py` in the `wsae_lstm` folder to clean the raw dataset; output is stored in the `data/interim` folder (refactored from `notebooks/0_data_clean_load.ipynb`)
+- Updated `README.md` with a repository structure section and other minor clarifications
+
diff --git a/notebooks/0_data_clean_load.ipynb b/notebooks/0_data_clean_load.ipynb
index 4ffc022..2732951 100644
--- a/notebooks/0_data_clean_load.ipynb
+++ b/notebooks/0_data_clean_load.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,7 +21,7 @@
     "import datetime as dt\n",
     "import xlrd\n",
     "import xlsxwriter\n",
-    "#from collections import OrderedDict\n",
+    "from collections import OrderedDict\n",
     "\n",
     "import sys\n",
     "sys.path.append('../') "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 30,
    "metadata": {
     "scrolled": false
    },
@@ -46,12 +46,12 @@
    "source": [
     "# Load in excel file and map each excel sheet to an ordered dict\n",
     "raw_xlsx_file = pd.ExcelFile(\"../data/raw/raw_data.xlsx\")\n",
-    "dict_dataframes =pd.read_excel(raw_xlsx_file,sheet_name = None)"
+    "dict_dataframes = pd.read_excel(raw_xlsx_file,sheet_name = None)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -71,7 +71,7 @@
       "collections.OrderedDict"
      ]
     },
-     "execution_count": 130,
+     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -82,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
@@ -91,7 +91,7 @@
       "dict"
      ]
     },
-     "execution_count": 131,
+     "execution_count": 33,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 34,
    "metadata": {
     "scrolled": false
    },
@@ -135,7 +135,7 @@
     " # Source: https://stackoverflow.com/a/38572808\n",
     "dict_dataframes = {k.lower(): v for k, v in dict_dataframes.items()}\n",
     "\n",
-    "# Print number of sheets in raw_data\n",
+    "# Print name + number of sheets in dict of dataframes:\n",
     "print(\"Number of sheets: \",len(dict_dataframes),\"\\n\")\n",
     "print(\"\\n\".join(list(dict_dataframes.keys())))\n",
     "#print(raw_xlsx_file.sheet_names)"
    ]
   },
@@ -160,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 133,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -174,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -188,7 +188,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
+   "execution_count": 37,
    "metadata": {
     "scrolled": true
    },
@@ -204,7 +204,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -215,7 +215,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 137,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -241,7 +241,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
+   "execution_count": 40,
    "metadata": {
     "scrolled": false
    },
@@ -374,7 +374,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 41,
    "metadata": {
     "scrolled": true
    },
@@ -387,7 +387,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 140,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -399,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 141,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [
     {
@@ -595,7 +595,7 @@
       "4 -3.531151e+07 "
      ]
     },
-     "execution_count": 141,
+     "execution_count": 43,
     "metadata": {},
    "output_type": "execute_result"
   }
@@ -606,7 +606,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 142,
+   "execution_count": 44,
    "metadata": {
     "scrolled": false
    },
@@ -674,7 +674,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 143,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -683,7 +683,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 144,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -716,7 +716,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 145,
+   "execution_count": 47,
    "metadata": {
     "scrolled": false
    },
@@ -851,7 +851,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 146,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -870,7 +870,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 150,
+   "execution_count": 49,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -890,7 +890,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
+   "execution_count": 50,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -909,7 +909,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 152,
+   "execution_count": 51,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -929,7 +929,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 153,
+   "execution_count": 52,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/wsae_lstm/clean_dataset.py b/wsae_lstm/clean_dataset.py
new file mode 100644
index 0000000..f12f5a0
--- /dev/null
+++ b/wsae_lstm/clean_dataset.py
@@ -0,0 +1,176 @@
+# Load and clean raw dataset from 'data/raw' folder
+    # Cleaned data stored in 'data/interim' folder
+
+# Imports (External)
+import numpy as np
+import pandas as pd
+import datetime as dt
+import xlrd
+import xlsxwriter
+from collections import OrderedDict
+
+import sys
+sys.path.append('../')
+
+# Load in excel file and map each excel sheet to an ordered dict
+raw_xlsx_file = pd.ExcelFile("../data/raw/raw_data.xlsx")
+dict_dataframes = pd.read_excel(raw_xlsx_file,sheet_name = None)
+#print(type(dict_dataframes))
+
+# Convert ordered dict of dataframes to a regular dict
+dict_dataframes = dict(dict_dataframes)
+#print(type(dict_dataframes))
+
+# Convert all sheet names/dict keys to lowercase using a dict comprehension
+    # Source: https://stackoverflow.com/a/38572808
+dict_dataframes = {k.lower(): v for k, v in dict_dataframes.items()}
+
+# Print name + number of sheets in dict of dataframes:
+#print("Number of sheets: ",len(dict_dataframes),"\n")
+#print("\n".join(list(dict_dataframes.keys())))
+
+# Panel A, Developing Market
+    # 'csi300 index data',
+    # 'csi300 index future data'
+    # 'nifty 50 index data'
+    # 'nifty 50 index future data'
+# Panel B, Relatively Developed Market
+    # 'hangseng index data'
+    # 'hangseng index future data'
+    # 'nikkei 225 index data'
+    # 'nikkei 225 index future data'
+# Panel C, Developed Market
+    # 's&p500 index data'
+    # 's&p500 index future data'
+    # 'djia index data'
+    # 'djia index future data'
+
+# Rename all dataframe column headers in each dataframe in dict_dataframes to lowercase
+for item in dict_dataframes:
+    dict_dataframes[item].columns = map(str.lower, dict_dataframes[item].columns)
+
+# Convert dict back to OrderedDict, reordered to match Panel A/B/C format
+    # Source: https://stackoverflow.com/a/46447976
+key_order = ['csi300 index data',
+'csi300 index future data',
+'nifty 50 index data',
+'nifty 50 index future data',
+'hangseng index data',
+'hangseng index future data',
+'nikkei 225 index data',
+'nikkei 225 index future data',
+'s&p500 index data',
+'s&p500 index future data',
+'djia index data',
+'djia index future data',
+]
+list_of_tuples = [(key, dict_dataframes[key]) for key in key_order]
+dict_dataframes = OrderedDict(list_of_tuples)
+
+# Obtain information on each sheet (row and column info)
+# for item in dict_dataframes:
+#     # Obtain number of rows in dataframe
+#     #rc=dict_dataframes[item].shape[0]
+#     # Obtain number of columns in dataframe
+#     #cc = len(dict_dataframes[item].columns)
+#     print ("=======================================")
+#     print (item,"\n")
+#     print (dict_dataframes[item].info(verbose=False))
+
+# Drop column 'matlab_time' from all dataframes in OrderedDict + rename OHLC columns for consistency
+for item in dict_dataframes:
+    for subitem in dict_dataframes[item]:
+        if 'matlab_time' in subitem:
+            print(subitem,"Dropped from ", item)
+            dict_dataframes[item].drop(subitem,axis=1, inplace=True)
+        # Rename OHLC columns for consistency
+        if 'open price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'open price':'open'},inplace=True)
+        if 'high price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'high price':'high'},inplace=True)
+        if 'low price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'low price':'low'},inplace=True)
+        if 'closing price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'closing price':'close'},inplace=True)
+        if 'close price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'close price':'close'},inplace=True)
+
+# Rename date/ntime columns to date + drop mislabeled matlab_time columns
+dict_dataframes['csi300 index data'].rename(columns={'time':'date'},inplace=True)
+dict_dataframes['csi300 index future data'].rename(columns={'num_time':'date'},inplace=True)
+
+dict_dataframes['nifty 50 index data'].drop(columns=['ntime'],axis=1, inplace=True)
+dict_dataframes['nifty 50 index future data'].drop(columns=['ntime'],axis=1, inplace=True)
+
+dict_dataframes['hangseng index data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['hangseng index data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['hangseng index future data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['nikkei 225 index data'].rename(columns={'ntime':'date'},inplace=True)
+dict_dataframes['nikkei 225 index data'].drop(columns=['time'],axis=1, inplace=True)
+
+dict_dataframes['nikkei 225 index future data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['nikkei 225 index future data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['s&p500 index data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['s&p500 index data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['djia index data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['djia index data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['djia index future data'].drop(columns=['time'],axis=1, inplace=True)
+
+# # Verify date rename + column drop/rename
+# for item in dict_dataframes:
+#     # Obtain number of rows in dataframe
+#     rc=dict_dataframes[item].shape[0]
+#     # Obtain number of columns in dataframe
+#     cc = len(dict_dataframes[item].columns)
+#     print ("=======================================")
+#     print (item,"\n")
+#     print (dict_dataframes[item].info(verbose=False))
+
+# Save cleaned data to disk (both index and futures data in one xlsx file, one sheet per dataset)
+def frames_to_excel(df_dict, path):
+    # frames_to_excel() source: https://stackoverflow.com/q/51696940
+    """Write dictionary of dataframes to separate sheets, within
+    1 file."""
+    writer = pd.ExcelWriter(path, engine='xlsxwriter')
+    for tab_name, dframe in df_dict.items():
+        dframe.to_excel(writer, sheet_name=tab_name)
+    writer.save()
+
+frames_to_excel(dict_dataframes,"../data/interim/clean_data.xlsx")
+
+# Save clean data to disk - index data only
+key_order = ['csi300 index data',
+'nifty 50 index data',
+'hangseng index data',
+'nikkei 225 index data',
+'s&p500 index data',
+'djia index data',
+]
+list_of_tuples = [(key, dict_dataframes[key]) for key in key_order]
+dict_dataframes_index = OrderedDict(list_of_tuples)
+
+frames_to_excel(dict_dataframes_index,"../data/interim/clean_data_index.xlsx")
+
+# Save clean data to disk - future data only
+key_order = [
+'csi300 index future data',
+'nifty 50 index future data',
+'hangseng index future data',
+'nikkei 225 index future data',
+'s&p500 index future data',
+'djia index future data',
+]
+list_of_tuples = [(key, dict_dataframes[key]) for key in key_order]
+dict_dataframes_futures = OrderedDict(list_of_tuples)
+
+frames_to_excel(dict_dataframes_futures,"../data/interim/clean_data_futures.xlsx")
\ No newline at end of file
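A minimal, self-contained sketch of the frames_to_excel pattern added in clean_dataset.py above: a dict of DataFrames written to a single .xlsx file, one sheet per key. The sheet names, values, and output path below are illustrative placeholders rather than files or datasets from this repository, and the context-manager form stands in for the script's explicit writer.save() call.

import pandas as pd

# Illustrative stand-in for dict_dataframes in clean_dataset.py (placeholder data only).
example_frames = {
    "example index data": pd.DataFrame({"date": ["2010-01-04"], "close": [1.0]}),
    "example index future data": pd.DataFrame({"date": ["2010-01-04"], "close": [2.0]}),
}

# One sheet per dict key, all written into a single xlsx file.
with pd.ExcelWriter("example_clean_data.xlsx", engine="xlsxwriter") as writer:
    for sheet_name, frame in example_frames.items():
        frame.to_excel(writer, sheet_name=sheet_name)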