clean_dataset.py from 0_data_clean_load.ipynb + readme update
timothyyu committed Feb 7, 2019
1 parent 1aa1bf9 commit 0b199b4
Showing 8 changed files with 225 additions and 42 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -37,4 +37,7 @@ Network Trash Folder
Temporary Items
.apdisk

# VSCode
.vscode

# End of https://www.gitignore.io/api/macos,jupyternotebook
17 changes: 7 additions & 10 deletions README.md
@@ -1,28 +1,25 @@
# wsae-lstm

Repository that aims to implement the WSAE-LSTM model and replicate the results of said model as defined in "A deep learning framework for financial time series using stacked autoencoders and long-short term memory" by Wei Bao, Jun Yue , Yulei Rao (2017).
Repository that aims to implement the WSAE-LSTM model and replicate the results of said model as defined in *"A deep learning framework for financial time series using stacked autoencoders and long-short term memory"* by Wei Bao, Jun Yue, Yulei Rao (2017).

## Source journal article
https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0180944

### Source article
## Source journal (APA format)

Bao W, Yue J, Rao Y (2017). "A deep learning framework for financial time series using stacked autoencoders and long-short term memory". PLOS ONE 12(7): e0180944. https://doi.org/10.1371/journal.pone.0180944

### Source article data:
### Source journal data (saved into the `data/raw` folder):
DOI:10.6084/m9.figshare.5028110
https://figshare.com/articles/Raw_Data/5028110

### `mlpanda/DeepLearning_Financial`

Repository of an existing attempt to replicate above paper in PyTorch, checked out as git submodule for reference:
Repository of an existing attempt to replicate the above paper in PyTorch, checked out as a git submodule in the `submodules` folder for reference:

`mlpanda/DeepLearning_Financial: https://github.com/mlpanda/DeepLearning_Financial`

## Misc
## Repository structure

`/documentation/citation`:
This repository uses a directory structure based upon [Cookiecutter Data Science](http://drivendata.github.io/cookiecutter-data-science/#directory-structure).

.ris (EndNote, Reference Manager, ProCite, RefWork) compatible

.bib (BibDesk, LaTeX) compatible
Repository package requirements/dependencies are defined in `requirements.txt` for pip and/or `environment.yml` for Anaconda/conda.
Binary file modified data/interim/clean_data.xlsx
Binary file not shown.
Binary file modified data/interim/clean_data_futures.xlsx
Binary file not shown.
Binary file modified data/interim/clean_data_index.xlsx
Binary file not shown.
7 changes: 7 additions & 0 deletions docs/changelog.md
@@ -0,0 +1,7 @@
# Changelog

2019-02-06

- `clean_dataset.py` in the `wsae_lstm` folder to clean the raw dataset; output stored in the `data/interim` folder (refactored from `notebooks/0_data_clean_load.ipynb`)
- `README.md` updated with a repository structure section and other minor clarifications
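The refactor described in the first bullet might look roughly like the sketch below. The function name, signature, default paths, and the `dropna` cleaning step are assumptions for illustration, not the actual `clean_dataset.py` API:

```python
from pathlib import Path

import pandas as pd


def clean_dataset(raw_path="data/raw/raw_data.xlsx",
                  out_path="data/interim/clean_data.xlsx"):
    """Hypothetical sketch: load every sheet of the raw workbook,
    normalize sheet names, drop fully empty rows, and write the
    cleaned workbook to data/interim."""
    # sheet_name=None loads all sheets into a {name: DataFrame} mapping
    sheets = pd.read_excel(raw_path, sheet_name=None)
    # Lowercase sheet names; drop rows where every cell is NaN
    sheets = {name.lower(): df.dropna(how="all") for name, df in sheets.items()}
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(out_path) as writer:
        for name, df in sheets.items():
            df.to_excel(writer, sheet_name=name, index=False)
    return sheets
```

Keeping the cleaning step in an importable function (rather than only in the notebook) is what lets both the notebook and downstream scripts share one code path.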

64 changes: 32 additions & 32 deletions notebooks/0_data_clean_load.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 125,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +11,7 @@
},
{
"cell_type": "code",
"execution_count": 126,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@@ -21,15 +21,15 @@
"import datetime as dt\n",
"import xlrd\n",
"import xlsxwriter\n",
"#from collections import OrderedDict\n",
"from collections import OrderedDict\n",
"\n",
"import sys\n",
"sys.path.append('../') "
]
},
{
"cell_type": "code",
"execution_count": 127,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@@ -38,20 +38,20 @@
},
{
"cell_type": "code",
"execution_count": 128,
"execution_count": 30,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Load in excel file and map each excel sheet to an ordered dict\n",
"raw_xlsx_file = pd.ExcelFile(\"../data/raw/raw_data.xlsx\")\n",
"dict_dataframes =pd.read_excel(raw_xlsx_file,sheet_name = None)"
"dict_dataframes = pd.read_excel(raw_xlsx_file,sheet_name = None)"
]
},
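The cell in the hunk above loads every sheet of the raw workbook at once via `sheet_name=None`. A minimal self-contained sketch of that behavior (the two-sheet workbook and its sheet names below are fabricated so the example runs without the repository's `data/raw/raw_data.xlsx`):

```python
import os
import tempfile

import pandas as pd

# Build a throwaway two-sheet workbook purely for illustration
path = os.path.join(tempfile.mkdtemp(), "raw_data.xlsx")
with pd.ExcelWriter(path) as writer:
    pd.DataFrame({"close": [1.0, 2.0]}).to_excel(writer, sheet_name="Panel A", index=False)
    pd.DataFrame({"close": [3.0, 4.0]}).to_excel(writer, sheet_name="Panel B", index=False)

# sheet_name=None maps every sheet name to its own DataFrame
# (an OrderedDict in older pandas, a plain dict in current releases,
# which is why the notebook's type checks show both)
dict_dataframes = pd.read_excel(pd.ExcelFile(path), sheet_name=None)
print(len(dict_dataframes))
```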
{
"cell_type": "code",
"execution_count": 129,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@@ -62,7 +62,7 @@
},
{
"cell_type": "code",
"execution_count": 130,
"execution_count": 32,
"metadata": {},
"outputs": [
{
@@ -71,7 +71,7 @@
"collections.OrderedDict"
]
},
"execution_count": 130,
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
@@ -82,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 131,
"execution_count": 33,
"metadata": {},
"outputs": [
{
@@ -91,7 +91,7 @@
"dict"
]
},
"execution_count": 131,
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
@@ -104,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 132,
"execution_count": 34,
"metadata": {
"scrolled": false
},
@@ -135,7 +135,7 @@
" # Source: https://stackoverflow.com/a/38572808\n",
"dict_dataframes = {k.lower(): v for k, v in dict_dataframes.items()}\n",
"\n",
"# Print number of sheets in raw_data\n",
"# Print name + number of sheets in dict of dataframes:\n",
"print(\"Number of sheets: \",len(dict_dataframes),\"\\n\")\n",
"print(\"\\n\".join(list(dict_dataframes.keys())))\n",
"#print(raw_xlsx_file.sheet_names)"
@@ -160,7 +160,7 @@
},
{
"cell_type": "code",
"execution_count": 133,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -174,7 +174,7 @@
},
{
"cell_type": "code",
"execution_count": 134,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@@ -188,7 +188,7 @@
},
{
"cell_type": "code",
"execution_count": 135,
"execution_count": 37,
"metadata": {
"scrolled": true
},
@@ -204,7 +204,7 @@
},
{
"cell_type": "code",
"execution_count": 136,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@@ -215,7 +215,7 @@
},
{
"cell_type": "code",
"execution_count": 137,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
@@ -241,7 +241,7 @@
},
{
"cell_type": "code",
"execution_count": 138,
"execution_count": 40,
"metadata": {
"scrolled": false
},
@@ -374,7 +374,7 @@
},
{
"cell_type": "code",
"execution_count": 139,
"execution_count": 41,
"metadata": {
"scrolled": true
},
@@ -387,7 +387,7 @@
},
{
"cell_type": "code",
"execution_count": 140,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@@ -399,7 +399,7 @@
},
{
"cell_type": "code",
"execution_count": 141,
"execution_count": 43,
"metadata": {},
"outputs": [
{
@@ -595,7 +595,7 @@
"4 -3.531151e+07 "
]
},
"execution_count": 141,
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
@@ -606,7 +606,7 @@
},
{
"cell_type": "code",
"execution_count": 142,
"execution_count": 44,
"metadata": {
"scrolled": false
},
@@ -674,7 +674,7 @@
},
{
"cell_type": "code",
"execution_count": 143,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
@@ -683,7 +683,7 @@
},
{
"cell_type": "code",
"execution_count": 144,
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@@ -716,7 +716,7 @@
},
{
"cell_type": "code",
"execution_count": 145,
"execution_count": 47,
"metadata": {
"scrolled": false
},
@@ -851,7 +851,7 @@
},
{
"cell_type": "code",
"execution_count": 146,
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
@@ -870,7 +870,7 @@
},
{
"cell_type": "code",
"execution_count": 150,
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
@@ -890,7 +890,7 @@
},
{
"cell_type": "code",
"execution_count": 151,
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
@@ -909,7 +909,7 @@
},
{
"cell_type": "code",
"execution_count": 152,
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@@ -929,7 +929,7 @@
},
{
"cell_type": "code",
"execution_count": 153,
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
