Add initial files

Snowflake-Labs · Jan 19, 2021 · f51b2da · f51b2da
commit f51b2da
Show file tree

Hide file tree

Showing 8 changed files with 1,391 additions and 0 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,50 @@
+#    - Please check the following URLs for the driver versions to pick up:
+#         All drivers: https://docs.snowflake.net/manuals/release-notes/client-change-log.html#client-changes-by-version
+#         ODBC:  https://sfc-repo.snowflakecomputing.com/odbc/linux/index.html
+#         JDBC:  https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/
+#         Spark: https://repo1.maven.org/maven2/net/snowflake/spark-snowflake_2.11
+#         Note: For Spark, the Docker image currently uses Spark 2.4 with Scala 2.11
+#    - Update lines 17 to 22 (beginning with ARG) with the correct levels to be deployed by the deploy_snowflake.sh script
+#    - For the almond & scala kernel, please check the following link:
+#         https://almond.sh/docs/quick-start-install
+#    - Note: For the jupyter scala kernel, the version can be set with the variable scala_kernel_version
+# Questions: Zohar Nissare-Houssen - [email protected]
+#
+
+# Start from a pinned core stack; the ARG values below select the driver levels to deploy
+FROM jupyter/all-spark-notebook:1c8073a927aa
+USER root
+ARG almond_version=0.10.9
+ARG scala_kernel_version=2.12.11
+ARG odbc_version=2.22.3
+ARG jdbc_version=3.12.16
+ARG spark_version=2.8.3
+ARG snowsql_version=1.2.10
+# One apt layer: update and every install (iodbc included) share the same package lists; the lists are kept in case deploy_snowflake.sh needs them
+RUN apt-get update && \
+    apt-get install -y apt-utils && \
+    apt-get install -y iodbc libffi-dev libiodbc2-dev libssl-dev vim
+# Steps that write to /home/jovyan or /opt/conda run as jovyan, so sudo and follow-up chown are not needed
+USER jovyan
+# Almond Scala kernel: -f makes curl fail the build on an HTTP error instead of saving an error page
+RUN /opt/conda/bin/curl -fLo /home/jovyan/coursier https://git.io/coursier-cli && \
+    chmod +x /home/jovyan/coursier && \
+    /home/jovyan/coursier launch --fork almond:$almond_version --scala $scala_kernel_version -- --install
+RUN /opt/conda/bin/python -m pip install --no-cache-dir --upgrade pip && \
+    /opt/conda/bin/python -m pip install --no-cache-dir --upgrade pyarrow && \
+    /opt/conda/bin/python -m pip install --no-cache-dir --upgrade "snowflake-connector-python[pandas]" && \
+    /opt/conda/bin/python -m pip install --no-cache-dir --upgrade snowflake-sqlalchemy && \
+    /opt/conda/bin/python -m pip install --no-cache-dir --upgrade plotly
+# --yes keeps both conda installs non-interactive
+RUN conda install --yes pyodbc && \
+    conda install -c conda-forge jupyterlab-plotly-extension --yes
+# The driver deployment script runs as root, after the Python/conda packages, exactly as before
+USER root
+COPY ./deploy_snowflake.sh /
+RUN chmod +x /deploy_snowflake.sh && /deploy_snowflake.sh
+# Sample notebooks are created, owned and trusted as jovyan; the image also ends as this non-root user (use --user root to override)
+USER jovyan
+RUN mkdir /home/jovyan/samples
+COPY --chown=jovyan:users ./pyodbc.ipynb ./Python.ipynb ./spark.ipynb ./SQLAlchemy.ipynb /home/jovyan/samples/
+RUN /opt/conda/bin/jupyter trust /home/jovyan/samples/pyodbc.ipynb /home/jovyan/samples/Python.ipynb \
+    /home/jovyan/samples/spark.ipynb /home/jovyan/samples/SQLAlchemy.ipynb
diff --git a/Python.ipynb b/Python.ipynb
@@ -0,0 +1,338 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import the various modules required to make a simple Snowflake connection from Python\n",
+    "import snowflake.connector\n",
+    "from snowflake.connector.converter_null import SnowflakeNoConverterToPython\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Modify this cell to include information about your demo account\n",
+    "ACCOUNT = 'xxxx'\n",
+    "USER = 'xxxx'\n",
+    "PASSWORD = 'xxxx'\n",
+    "\n",
+    "con = snowflake.connector.connect(\n",
+    "  user=USER,\n",
+    "  password=PASSWORD,\n",
+    "  account=ACCOUNT\n",
+    "    ,converter_class=SnowflakeNoConverterToPython\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a variable called sql and specify a query that it will store\n",
+    "sql = \"select * from sales.public.customer limit 10000\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<snowflake.connector.cursor.SnowflakeCursor at 0x7f689ce3e5f8>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Specify the virtual warehouse and role we want to use\n",
+    "con.cursor().execute(\"USE WAREHOUSE xxxx\")\n",
+    "con.cursor().execute(\"USE role xxxx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Execute the query using the Python connector\n",
+    "#%%time\n",
+    "res = con.cursor().execute(sql).fetchall()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 10000 entries, 0 to 9999\n",
+      "Data columns (total 8 columns):\n",
+      "C_CUSTKEY       10000 non-null object\n",
+      "C_NAME          10000 non-null object\n",
+      "C_ADDRESS       10000 non-null object\n",
+      "C_NATIONKEY     10000 non-null object\n",
+      "C_PHONE         10000 non-null object\n",
+      "C_ACCTBAL       10000 non-null object\n",
+      "C_MKTSEGMENT    10000 non-null object\n",
+      "C_COMMENT       10000 non-null object\n",
+      "dtypes: object(8)\n",
+      "memory usage: 625.1+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Run that same query, but this time use the read_sql method\n",
+    "# in the Pandas data frame object\n",
+    "#%%time\n",
+    "df = pd.read_sql(sql, con)\n",
+    "df.info()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>C_CUSTKEY</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>C_MKTSEGMENT</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>AUTOMOBILE</td>\n",
+       "      <td>2043</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>BUILDING</td>\n",
+       "      <td>1938</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>FURNITURE</td>\n",
+       "      <td>2060</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>HOUSEHOLD</td>\n",
+       "      <td>1989</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>MACHINERY</td>\n",
+       "      <td>1970</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              C_CUSTKEY\n",
+       "C_MKTSEGMENT           \n",
+       "AUTOMOBILE         2043\n",
+       "BUILDING           1938\n",
+       "FURNITURE          2060\n",
+       "HOUSEHOLD          1989\n",
+       "MACHINERY          1970"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Count the customer keys (C_CUSTKEY) in each market segment\n",
+    "df.groupby('C_MKTSEGMENT')[['C_CUSTKEY']].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "C_CUSTKEY       False\n",
+       "C_NAME          False\n",
+       "C_ADDRESS       False\n",
+       "C_NATIONKEY     False\n",
+       "C_PHONE         False\n",
+       "C_ACCTBAL       False\n",
+       "C_MKTSEGMENT    False\n",
+       "C_COMMENT       False\n",
+       "dtype: bool"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Check to see if any of the columns have null values\n",
+    "pd.isnull(df).any()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "list"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(res)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "('5050001', 'Customer#005050001', 'h2Q2lfB QpSuOt32ZDV7S8RsTKgedv4w9s9wa', '18', '28-680-716-8960', '4571.61', 'AUTOMOBILE', 'e thinly bold ideas. carefully final pinto beans cajole across')\n"
+     ]
+    }
+   ],
+   "source": [
+    "print (res[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "AUTOMOBILE has occured 1974 times\n",
+      "BUILDING has occured 1964 times\n",
+      "MACHINERY has occured 1989 times\n",
+      "HOUSEHOLD has occured 2025 times\n",
+      "FURNITURE has occured 2048 times\n"
+     ]
+    }
+   ],
+   "source": [
+    "unique_cust_key = []\n",
+    "z = []\n",
+    "for x in res:\n",
+    "    z.append((x[0],x[6]))\n",
+    "\n",
+    "for x in z:\n",
+    "    if x not in unique_cust_key:\n",
+    "        unique_cust_key.append(x)\n",
+    "    \n",
+    "# initialize an empty list \n",
+    "unique_list = []\n",
+    "\n",
+    "# traverse for all elements \n",
+    "for x in unique_cust_key:\n",
+    "    # check if exists in unique_list or not \n",
+    "    if x[1] not in unique_list:\n",
+    "        unique_list.append(x[1])\n",
+    "       \n",
+    "def countX(lst, x):\n",
+    "    count = 0\n",
+    "    for y in lst:\n",
+    "        if (y[1] == x):\n",
+    "            count = count + 1\n",
+    "    return count\n",
+    "\n",
+    "for a in unique_list:\n",
+    "    print('{} has occured {} times'.format(a, countX(unique_cust_key, a))) \n",
+    "    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}