diff --git a/.gitignore b/.gitignore
index 99ed3ad..c86904e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,4 +37,7 @@ Network Trash Folder
 Temporary Items
 .apdisk
 
+#Vscode
+.vscode
+
 # End of https://www.gitignore.io/api/macos,jupyternotebook
\ No newline at end of file
diff --git a/README.md b/README.md
index 007fb74..4bbac68 100644
--- a/README.md
+++ b/README.md
@@ -1,28 +1,25 @@
 # wsae-lstm
 
-Repository that aims to implement the WSAE-LSTM model and replicate the results of said model as defined in "A deep learning framework for financial time series using stacked autoencoders and long-short term memory" by Wei Bao, Jun Yue , Yulei Rao (2017).
+Repository that aims to implement the WSAE-LSTM model and replicate its results as defined in *"A deep learning framework for financial time series using stacked autoencoders and long-short term memory"* by Wei Bao, Jun Yue, Yulei Rao (2017).
 
-## Source journal article
 
 https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0180944
 
-### Source article
+## Source journal (APA format)
 
 Bao W, Yue J, Rao Y (2017). "A deep learning framework for financial time series using stacked autoencoders and long-short term memory". PLOS ONE 12(7): e0180944. https://doi.org/10.1371/journal.pone.0180944
 
-### Source article data:
+### Source journal data (saved into `data/raw` folder):
 
 DOI:10.6084/m9.figshare.5028110
 
 https://figshare.com/articles/Raw_Data/5028110
 
 ### `mlpanda/DeepLearning_Financial`
 
-Repository of an existing attempt to replicate above paper in PyTorch, checked out as git submodule for reference:
+Repository of an existing attempt to replicate the above paper in PyTorch, checked out as a git submodule for reference in the `submodules` folder:
 
 `mlpanda/DeepLearning_Financial: https://github.com/mlpanda/DeepLearning_Financial`
 
-## Misc
+## Repository structure
 
-`/documentation/citation`:
+This repository uses a directory structure based on [Cookiecutter Data Science](http://drivendata.github.io/cookiecutter-data-science/#directory-structure).
 
-.ris (EndNote, Reference Manager, ProCite, RefWork) compatible
-
-.bib (BibDesk, LaTeX) compatible
\ No newline at end of file
+Repository package requirements/dependencies are defined in `requirements.txt` for pip and/or `environment.yml` for Anaconda/conda.
\ No newline at end of file
diff --git a/data/interim/clean_data.xlsx b/data/interim/clean_data.xlsx
index 67d9899..ab3629f 100644
Binary files a/data/interim/clean_data.xlsx and b/data/interim/clean_data.xlsx differ
diff --git a/data/interim/clean_data_futures.xlsx b/data/interim/clean_data_futures.xlsx
index d297073..2cb96c1 100644
Binary files a/data/interim/clean_data_futures.xlsx and b/data/interim/clean_data_futures.xlsx differ
diff --git a/data/interim/clean_data_index.xlsx b/data/interim/clean_data_index.xlsx
index 8b2723c..e87b8c4 100644
Binary files a/data/interim/clean_data_index.xlsx and b/data/interim/clean_data_index.xlsx differ
diff --git a/docs/changelog.md b/docs/changelog.md
new file mode 100644
index 0000000..4b3422a
--- /dev/null
+++ b/docs/changelog.md
@@ -0,0 +1,7 @@
+# Changelog
+
+2019-02-06
+
+- Added `clean_dataset.py` in the `wsae_lstm` folder to clean the raw dataset; output is stored in the `data/interim` folder (refactored from `notebooks/0_data_clean_load.ipynb`)
+- Updated `README.md` with a repository structure section and other minor clarifications
+
diff --git a/notebooks/0_data_clean_load.ipynb b/notebooks/0_data_clean_load.ipynb
index 4ffc022..2732951 100644
--- a/notebooks/0_data_clean_load.ipynb
+++ b/notebooks/0_data_clean_load.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,7 +21,7 @@
     "import datetime as dt\n",
     "import xlrd\n",
     "import xlsxwriter\n",
-    "#from collections import OrderedDict\n",
+    "from collections import OrderedDict\n",
     "\n",
     "import sys\n",
     "sys.path.append('../') "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 30,
    "metadata": {
     "scrolled": false
    },
@@ -46,12 +46,12 @@
    "source": [
     "# Load in excel file and map each excel sheet to an ordered dict\n",
     "raw_xlsx_file = pd.ExcelFile(\"../data/raw/raw_data.xlsx\")\n",
-    "dict_dataframes =pd.read_excel(raw_xlsx_file,sheet_name = None)"
+    "dict_dataframes = pd.read_excel(raw_xlsx_file,sheet_name = None)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 129,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 130,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -71,7 +71,7 @@
       "collections.OrderedDict"
      ]
     },
-     "execution_count": 130,
+     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -82,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 131,
+   "execution_count": 33,
    "metadata": {},
    "outputs": [
     {
@@ -91,7 +91,7 @@
       "dict"
      ]
     },
-     "execution_count": 131,
+     "execution_count": 33,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 132,
+   "execution_count": 34,
    "metadata": {
     "scrolled": false
    },
@@ -135,7 +135,7 @@
     " # Source: https://stackoverflow.com/a/38572808\n",
     "dict_dataframes = {k.lower(): v for k, v in dict_dataframes.items()}\n",
     "\n",
-    "# Print number of sheets in raw_data\n",
+    "# Print name + number of sheets in dict of dataframes:\n",
     "print(\"Number of sheets: \",len(dict_dataframes),\"\\n\")\n",
     "print(\"\\n\".join(list(dict_dataframes.keys())))\n",
     "#print(raw_xlsx_file.sheet_names)"
    ]
   },
@@ -160,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 133,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -174,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -188,7 +188,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
+   "execution_count": 37,
    "metadata": {
     "scrolled": true
    },
@@ -204,7 +204,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -215,7 +215,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 137,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -241,7 +241,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
+   "execution_count": 40,
    "metadata": {
     "scrolled": false
    },
@@ -374,7 +374,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 41,
    "metadata": {
     "scrolled": true
    },
@@ -387,7 +387,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 140,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -399,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 141,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [
     {
@@ -595,7 +595,7 @@
       "4 -3.531151e+07 "
      ]
     },
-     "execution_count": 141,
+     "execution_count": 43,
     "metadata": {},
    "output_type": "execute_result"
   }
@@ -606,7 +606,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 142,
+   "execution_count": 44,
    "metadata": {
     "scrolled": false
    },
@@ -674,7 +674,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 143,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -683,7 +683,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 144,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -716,7 +716,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 145,
+   "execution_count": 47,
    "metadata": {
     "scrolled": false
    },
@@ -851,7 +851,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 146,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -870,7 +870,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 150,
+   "execution_count": 49,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -890,7 +890,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
+   "execution_count": 50,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -909,7 +909,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 152,
+   "execution_count": 51,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -929,7 +929,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 153,
+   "execution_count": 52,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/wsae_lstm/clean_dataset.py b/wsae_lstm/clean_dataset.py
new file mode 100644
index 0000000..f12f5a0
--- /dev/null
+++ b/wsae_lstm/clean_dataset.py
@@ -0,0 +1,176 @@
+# Load and clean raw dataset from 'data/raw' folder
+    # Cleaned data stored in 'data/interim' folder
+
+# Imports (External)
+import numpy as np
+import pandas as pd
+import datetime as dt
+import xlrd
+import xlsxwriter
+from collections import OrderedDict
+
+import sys
+sys.path.append('../')
+
+# Load in excel file and map each excel sheet to an ordered dict
+raw_xlsx_file = pd.ExcelFile("../data/raw/raw_data.xlsx")
+dict_dataframes = pd.read_excel(raw_xlsx_file,sheet_name = None)
+#print(type(dict_dataframes))
+
+# Convert ordered dict of dataframes to a regular dict
+dict_dataframes = dict(dict_dataframes)
+#print(type(dict_dataframes))
+
+# Convert all sheet names/dict keys to lowercase using a dict comprehension
+    # Source: https://stackoverflow.com/a/38572808
+dict_dataframes = {k.lower(): v for k, v in dict_dataframes.items()}
+
+# Print name + number of sheets in dict of dataframes:
+#print("Number of sheets: ",len(dict_dataframes),"\n")
+#print("\n".join(list(dict_dataframes.keys())))
+
+# Panel A, Developing Market
+    # 'csi300 index data',
+    # 'csi300 index future data'
+    # 'nifty 50 index data'
+    # 'nifty 50 index future data'
+# Panel B, Relatively Developed Market
+    # 'hangseng index data'
+    # 'hangseng index future data'
+    # 'nikkei 225 index data'
+    # 'nikkei 225 index future data'
+# Panel C, Developed Market
+    # 's&p500 index data'
+    # 's&p500 index future data'
+    # 'djia index data'
+    # 'djia index future data'
+
+# Rename all dataframe column headers in each dataframe in dict_dataframes to lowercase
+for item in dict_dataframes:
+    dict_dataframes[item].columns = map(str.lower, dict_dataframes[item].columns)
+
+# Convert dict back to OrderedDict, reordered to match Panel A/B/C format
+    # Source: https://stackoverflow.com/a/46447976
+key_order = ['csi300 index data',
+'csi300 index future data',
+'nifty 50 index data',
+'nifty 50 index future data',
+'hangseng index data',
+'hangseng index future data',
+'nikkei 225 index data',
+'nikkei 225 index future data',
+'s&p500 index data',
+'s&p500 index future data',
+'djia index data',
+'djia index future data',
+]
+list_of_tuples = [(key, dict_dataframes[key]) for key in key_order]
+dict_dataframes = OrderedDict(list_of_tuples)
+
+# Obtain information on each sheet (row and column info)
+# for item in dict_dataframes:
+#     # Obtain number of rows in dataframe
+#     #rc=dict_dataframes[item].shape[0]
+#     # Obtain number of columns in dataframe
+#     #cc = len(dict_dataframes[item].columns)
+#     print ("=======================================")
+#     print (item,"\n")
+#     print (dict_dataframes[item].info(verbose=False))
+
+# Drop column 'matlab_time' from all dataframes in OrderedDict + rename OHLC columns for consistency
+for item in dict_dataframes:
+    for subitem in dict_dataframes[item]:
+        if 'matlab_time' in subitem:
+            print(subitem,"Dropped from ", item)
+            dict_dataframes[item].drop(subitem,axis=1, inplace=True)
+        # Rename OHLC columns for consistency
+        if 'open price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'open price':'open'},inplace=True)
+        if 'high price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'high price':'high'},inplace=True)
+        if 'low price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'low price':'low'},inplace=True)
+        if 'closing price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'closing price':'close'},inplace=True)
+        if 'close price' in subitem:
+            print(subitem,"Renamed from ", item)
+            dict_dataframes[item].rename(columns={'close price':'close'},inplace=True)
+
+# Rename date/ntime columns to date + drop mislabeled matlab_time columns
+dict_dataframes['csi300 index data'].rename(columns={'time':'date'},inplace=True)
+dict_dataframes['csi300 index future data'].rename(columns={'num_time':'date'},inplace=True)
+
+dict_dataframes['nifty 50 index data'].drop(columns=['ntime'],axis=1, inplace=True)
+dict_dataframes['nifty 50 index future data'].drop(columns=['ntime'],axis=1, inplace=True)
+
+dict_dataframes['hangseng index data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['hangseng index data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['hangseng index future data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['nikkei 225 index data'].rename(columns={'ntime':'date'},inplace=True)
+dict_dataframes['nikkei 225 index data'].drop(columns=['time'],axis=1, inplace=True)
+
+dict_dataframes['nikkei 225 index future data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['nikkei 225 index future data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['s&p500 index data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['s&p500 index data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['djia index data'].drop(columns=['time'],axis=1, inplace=True)
+dict_dataframes['djia index data'].rename(columns={'ntime':'date'},inplace=True)
+
+dict_dataframes['djia index future data'].drop(columns=['time'],axis=1, inplace=True)
+
+# # Verify date rename + column drop/rename
+# for item in dict_dataframes:
+#     # Obtain number of rows in dataframe
+#     rc=dict_dataframes[item].shape[0]
+#     # Obtain number of columns in dataframe
+#     cc = len(dict_dataframes[item].columns)
+#     print ("=======================================")
+#     print (item,"\n")
+#     print (dict_dataframes[item].info(verbose=False))
+
+# Save cleaned data to disk (both index and futures data in one xlsx file, one sheet per dataset)
+def frames_to_excel(df_dict, path):
+    # frames_to_excel() source: https://stackoverflow.com/q/51696940
+    """Write dictionary of dataframes to separate sheets, within
+    1 file."""
+    writer = pd.ExcelWriter(path, engine='xlsxwriter')
+    for tab_name, dframe in df_dict.items():
+        dframe.to_excel(writer, sheet_name=tab_name)
+    writer.save()
+
+frames_to_excel(dict_dataframes,"../data/interim/clean_data.xlsx")
+
+# Save clean data to disk - index data only
+key_order = ['csi300 index data',
+'nifty 50 index data',
+'hangseng index data',
+'nikkei 225 index data',
+'s&p500 index data',
+'djia index data',
+]
+list_of_tuples = [(key, dict_dataframes[key]) for key in key_order]
+dict_dataframes_index = OrderedDict(list_of_tuples)
+
+frames_to_excel(dict_dataframes_index,"../data/interim/clean_data_index.xlsx")
+
+# Save clean data to disk - future data only
+key_order = [
+'csi300 index future data',
+'nifty 50 index future data',
+'hangseng index future data',
+'nikkei 225 index future data',
+'s&p500 index future data',
+'djia index future data',
+]
+list_of_tuples = [(key, dict_dataframes[key]) for key in key_order]
+dict_dataframes_futures = OrderedDict(list_of_tuples)
+
+frames_to_excel(dict_dataframes_futures,"../data/interim/clean_data_futures.xlsx")
\ No newline at end of file
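A minimal, self-contained sketch of the frames_to_excel pattern added in clean_dataset.py above: a dict of DataFrames written to a single .xlsx file, one sheet per key. The sheet names, values, and output path below are illustrative placeholders rather than files or datasets from this repository, and the context-manager form stands in for the script's explicit writer.save() call.

import pandas as pd

# Illustrative stand-in for dict_dataframes in clean_dataset.py (placeholder data only).
example_frames = {
    "example index data": pd.DataFrame({"date": ["2010-01-04"], "close": [1.0]}),
    "example index future data": pd.DataFrame({"date": ["2010-01-04"], "close": [2.0]}),
}

# One sheet per dict key, all written into a single xlsx file.
with pd.ExcelWriter("example_clean_data.xlsx", engine="xlsxwriter") as writer:
    for sheet_name, frame in example_frames.items():
        frame.to_excel(writer, sheet_name=sheet_name)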