diff --git a/VERSION b/VERSION
index b82608c..8308b63 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-v0.1.0
+v0.1.1
diff --git a/notebooks/BOILERPLATE.ipynb b/notebooks/BOILERPLATE.ipynb
index a1ab830..2632d4a 100644
--- a/notebooks/BOILERPLATE.ipynb
+++ b/notebooks/BOILERPLATE.ipynb
@@ -8,17 +8,16 @@
     "\n",
     "---\n",
     "\n",
-    "\n",
     "**Objective:** The file provides a simple *boilerplate* so that you can concentrate on what is necessary and stop repeating the same tasks. The boilerplate is also configured with certain [**nbextensions**](https://gitlab.com/ZenithClown/computer-configurations-and-setups) that I personally use. Install them if required, else ignore them - they do not affect code execution or optimization in any way. For any new project, *edit* this file or use `File > Make a Copy` to get started. Some settings and configurations are already provided, as mentioned below."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-01-11T15:50:13.490916Z",
-     "start_time": "2023-01-11T15:50:13.479904Z"
+     "end_time": "2023-04-12T08:16:36.922254Z",
+     "start_time": "2023-04-12T08:16:36.904106Z"
     }
    },
    "outputs": [
@@ -26,7 +25,8 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "Current Code Version: v0.1.0-beta\n"
+     "Current Code Version: v0.1.1\n",
+     "\n"
     ]
    }
   ],
   "source": [
@@ -59,11 +59,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-01-11T15:50:34.230463Z",
-     "start_time": "2023-01-11T15:50:34.217051Z"
+     "end_time": "2023-04-12T08:16:55.883978Z",
+     "start_time": "2023-04-12T08:16:55.867980Z"
     }
    },
    "outputs": [],
@@ -75,11 +75,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2023-01-11T15:51:07.931774Z",
-     "start_time": "2023-01-11T15:51:07.926776Z"
+     "end_time": "2023-04-12T08:28:01.539456Z",
+     "start_time": "2023-04-12T08:28:01.534825Z"
     }
    },
    "outputs": [],
@@ -100,7 +100,7 @@
    },
    "outputs": [],
    "source": [
-    "from copy import deepcopy # dataframe is mutable\n",
+    "# from copy import deepcopy # dataframe is mutable\n",
     "# from tqdm import tqdm as TQ # progress bar for loops\n",
     "# from uuid import uuid4 as UUID # unique identifier for objs"
    ]
@@ -133,7 +133,7 @@
    },
    "outputs": [],
    "source": [
-    "import logging # configure logging on `global arguments` section, as file path is required"
+    "# import logging # configure logging in the `global arguments` section, as a file path is required"
    ]
   },
@@ -167,9 +167,47 @@
     "plt.style.use('default-style');\n",
     "\n",
     "pd.set_option('display.max_rows', 50) # max. rows to show\n",
-    "pd.set_option('display.max_columns', 15) # max. cols to show\n",
+    "pd.set_option('display.max_columns', 17) # max. cols to show\n",
     "np.set_printoptions(precision = 3, threshold = 15) # set np options\n",
-    "pd.options.display.float_format = '{:,.2f}'.format # float precisions"
+    "pd.options.display.float_format = '{:,.3f}'.format # float precision"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sklearn metrics for analysis can be imported as below;\n",
+    "# considering a `regression` problem, RMSE is the chosen metric\n",
+    "# for RMSE, use `squared = False` : https://stackoverflow.com/a/18623635/\n",
+    "# from sklearn.metrics import mean_squared_error as MSE"
+   ]
+  },
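For reference, a minimal sketch of the commented `MSE` import in use — the `y_true`/`y_pred` arrays below are hypothetical placeholders, and note that scikit-learn >= 1.4 also offers `root_mean_squared_error` directly:

```python
# hypothetical example of computing RMSE via `squared = False`
from sklearn.metrics import mean_squared_error as MSE

y_true = [3.0, -0.5, 2.0, 7.0]  # placeholder ground-truth values
y_pred = [2.5, 0.0, 2.0, 8.0]   # placeholder model predictions

rmse = MSE(y_true, y_pred, squared = False)  # False returns the root of the MSE
print(f"RMSE : {rmse:,.3f}")
```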
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "print(f\"TensorFlow Version: {tf.__version__}\", end = \"\\n\") # required >= 2.8\n",
+    "\n",
+    "# check physical devices, and gpu compute capability (if available)\n",
+    "if len(tf.config.list_physical_devices(device_type = \"GPU\")):\n",
+    "    # https://stackoverflow.com/q/38009682/6623589\n",
+    "    # https://stackoverflow.com/a/59179238/6623589\n",
+    "    print(\"GPU Computing Available.\", end = \" \")\n",
+    "\n",
+    "    # experimentally, get the gpu details and computation power\n",
+    "    # https://www.tensorflow.org/api_docs/python/tf/config/experimental/get_device_details\n",
+    "    device = tf.config.list_physical_devices(device_type = \"GPU\")[0] # first device only\n",
+    "    details = tf.config.experimental.get_device_details(device)\n",
+    "    # `details` is a dict with keys like `device_name` and `compute_capability`\n",
+    "    print(f\"EXPERIMENTAL : {details}\")\n",
+    "else:\n",
+    "    print(\"GPU Computing Not Available. If a `GPU` is present, check the configuration. Detected Devices:\")\n",
+    "    print(\" > \", tf.config.list_physical_devices())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -192,7 +230,11 @@
     "echo %VARNAME%\n",
     "```\n",
     "\n",
-    "Once you've setup your system with [`PYTHONPATH`](https://bic-berkeley.github.io/psych-214-fall-2016/using_pythonpath.html) as per [*python documentation*](https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH) is an important directory where any `import` statements looks for based on their order of importance. If a source code/module is not available check necessary environment variables and/or ask the administrator for the source files. For testing purpose, the module boasts the use of `src`, `utils` and `config` directories. However, these directories are available at `ROOT` level, and thus using `sys.path.append()` to add directories while importing."
+    "[`PYTHONPATH`](https://bic-berkeley.github.io/psych-214-fall-2016/using_pythonpath.html), as described in the [*Python documentation*](https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH), lists the directories that every `import` statement searches, in order of precedence. Once your system is set up with it, a module that still fails to import means you should check the relevant environment variables and/or ask the administrator for the source files. For testing purposes, this boilerplate uses the `src`, `utils` and `config` directories; since these live at the `ROOT` level, they are added with `sys.path.append()` before importing.\n",
+    "\n",
+    "**Getting Started** with **`submodules`**\n",
+    "\n",
+    "A [`submodule`](https://git-scm.com/book/en/v2/Git-Tools-Submodules) integrates a separate project into the current repository - typically useful to avoid code duplication and to manage dependent modules from a central repository. More information on initializing and using submodules is available [here](https://www.youtube.com/watch?v=gSlXo2iLBro) and at [Github-GISTS/ZenithClown](https://gist.github.com/ZenithClown)."
    ]
   },
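For reference, a typical submodule workflow looks like the following — the repository URL and target path are placeholders:

```powershell
# add a dependent project as a submodule (URL and path are placeholders)
git submodule add https://github.com/<user>/<project>.git src/<project>

# after cloning a repository that already contains submodules
git submodule update --init --recursive
```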
@@ -203,12 +245,21 @@
   {
    "cell_type": "code",
    "metadata": {},
    "outputs": [],
    "source": [
     "# append `src` sub-modules to import additional files; these directories are\n",
     "# project specific and should not be added to the environment or $PATH variable\n",
-    "sys.path.append(os.path.join(\"..\", \"src\")) # parent/source files directory\n",
     "sys.path.append(os.path.join(\"..\", \"src\", \"agents\")) # agents for reinforcement modelling\n",
     "sys.path.append(os.path.join(\"..\", \"src\", \"engine\")) # derivative engines for model control\n",
     "sys.path.append(os.path.join(\"..\", \"src\", \"models\")) # actual models for decision making tools"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# also append the `utilities` directory for additional helper code\n",
+    "sys.path.append(os.path.join(\"..\", \"utilities\"))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
@@ -230,29 +281,37 @@
    "outputs": [],
    "source": [
     "ROOT = \"..\" # the document root is one level up, and contains the whole code structure\n",
-    "DATA = join(ROOT, \"data\") # the directory contains all data files, subdirectory (if any) can also be used/defined\n",
+    "DATA = os.path.join(ROOT, \"data\") # the directory contains all data files; subdirectories (if any) can also be used/defined\n",
     "\n",
     "# the processed data directory can be used so that preprocessing steps need\n",
     "# not be run again-and-again on each kernel restart\n",
-    "PROCESSED_DATA = join(DATA, \"processed\")"
+    "PROCESSED_DATA = os.path.join(DATA, \"processed\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2022-05-07T12:02:38.898998Z",
-     "start_time": "2022-05-07T12:02:38.888970Z"
+     "end_time": "2023-04-12T08:28:13.816861Z",
+     "start_time": "2023-04-12T08:28:13.803865Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Code Execution Started on: Wed, Apr 12 2023\n"
+     ]
+    }
+   ],
    "source": [
     "# long projects can be overwhelming, and keeping track of files, outputs and\n",
     "# saved models can be tricky! to help with this, `today` can be used; for\n",
     "# instance, output can be stored at `output/<today>/` etc.\n",
     "# `today` is formatted such that it is a valid windows/*nix file/directory name\n",
-    "today = dt.strftime(dt.strptime(ctime(), \"%a %b %d %H:%M:%S %Y\"), \"%a, %b %d %Y\")\n",
+    "today = dt.datetime.now().strftime(\"%a, %b %d %Y\") # same format, without the ctime round-trip\n",
     "print(f\"Code Execution Started on: {today}\") # only date, name of the sub-directory"
    ]
   },
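An equivalent standard-library sketch of the date-stamped layout this cell enables — the `output` root and the `model.h5` artefact name are hypothetical:

```python
# build a dated output directory with pathlib (names are placeholders)
import datetime as dt
from pathlib import Path

today = dt.datetime.now().strftime("%a, %b %d %Y")  # e.g. 'Wed, Apr 12 2023'
out = Path("..") / "output" / today                 # e.g. ../output/Wed, Apr 12 2023
out.mkdir(parents = True, exist_ok = True)          # create once, ignore if present

print(out / "model.h5")  # dated path under which an artefact could be saved
```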
@@ -267,12 +326,12 @@
    },
    "outputs": [],
    "source": [
-    "OUTPUT_DIR = join(ROOT, \"output\", today)\n",
-    "makedirs(OUTPUT_DIR, exist_ok = True) # create dir if not exist\n",
+    "OUTPUT_DIR = os.path.join(ROOT, \"output\", today)\n",
+    "os.makedirs(OUTPUT_DIR, exist_ok = True) # create dir if it does not exist\n",
     "\n",
     "# also create directory for `logs`\n",
-    "LOGS_DIR = join(ROOT, \"logs\", open(\"../VERSION\", 'rt').read())\n",
-    "makedirs(LOGS_DIR, exist_ok = True)"
+    "# LOGS_DIR = os.path.join(ROOT, \"logs\", open(\"../VERSION\", 'rt').read())\n",
+    "# os.makedirs(LOGS_DIR, exist_ok = True)"
    ]
   },
@@ -286,26 +345,21 @@
    },
    "outputs": [],
    "source": [
-    "logging.basicConfig(\n",
-    "    filename = join(LOGS_DIR, f\"{today}.log\"), # change `reports` file name\n",
-    "    filemode = \"a\", # append logs to existing file, if file exists\n",
-    "    format = \"%(asctime)s - %(name)s - CLASS:%(levelname)s:%(levelno)s:L#%(lineno)d - %(message)s\",\n",
-    "    level = logging.DEBUG\n",
-    ")"
+    "# logging.basicConfig(\n",
+    "#     filename = os.path.join(LOGS_DIR, f\"{today}.log\"), # change `reports` file name\n",
+    "#     filemode = \"a\", # append logs to existing file, if file exists\n",
+    "#     format = \"%(asctime)s - %(name)s - CLASS:%(levelname)s:%(levelno)s:L#%(lineno)d - %(message)s\",\n",
+    "#     level = logging.DEBUG\n",
+    "# )"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Read Input File(s)\n",
-    "\n",
-    "A typical machine learning project revolves around six important stages (as available in [Amazon ML Life Cycle Documentation](https://docs.aws.amazon.com/wellarchitected/latest/machine-learning-lens/well-architected-machine-learning-lifecycle.html)). The notebook boilerplate is provided to address two pillars:\n",
-    "\n",
-    " 1. **Data Processing:** An integral part of any machine learning project, which is the most time consuming step! A brief introduction and best practices is available [here](https://towardsdatascience.com/introduction-to-data-preprocessing-in-machine-learning-a9fa83a5dc9d).\n",
-    " 2. **Model Development:** From understanding to deployment, this section address development (training, validating and testing) of an machine learning model.\n",
+    "## Model Development & PoC Section\n",
     "\n",
-    "![ML Life Cycle](https://docs.aws.amazon.com/wellarchitected/latest/machine-learning-lens/images/ml-lifecycle.png)"
+    "A typical machine learning project revolves around six important stages (as described in the [Amazon ML Life Cycle Documentation](https://docs.aws.amazon.com/wellarchitected/latest/machine-learning-lens/well-architected-machine-learning-lifecycle.html)). This notebook boilerplate can be used to understand the data file and to perform statistical tests and other EDA as required for any AI/ML application. Later, the study below can be grown into a *full-fledged* application using the other sections of the boilerplate."
    ]
   }
  ],
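As a starting point for the EDA this section describes, a minimal sketch — the `example.csv` file name under the `data` directory is a hypothetical placeholder:

```python
# quick-look EDA starter; `example.csv` is a hypothetical placeholder
import os
import pandas as pd

frame = pd.read_csv(os.path.join("..", "data", "example.csv"))

frame.info()               # dtypes, non-null counts and memory usage
print(frame.describe().T)  # per-column summary statistics
print(frame.isna().sum())  # missing values per column
```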