-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit f51b2da
Showing
8 changed files
with
1,391 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# - Please check the following URLs for the driver versions to pick up: | ||
# All drivers: https://docs.snowflake.net/manuals/release-notes/client-change-log.html#client-changes-by-version | ||
# ODBC: https://sfc-repo.snowflakecomputing.com/odbc/linux/index.html | ||
# JDBC: https://repo1.maven.org/maven2/net/snowflake/snowflake-jdbc/ | ||
# Spark: https://repo1.maven.org/maven2/net/snowflake/spark-snowflake_2.11 | ||
# Note: For Spark, the docker currently uses Spark 2.4 with Scala 2.11 | ||
# - Update lines 17 to 22 (beginning with ARG) with the correct levels to be deployed by the deploy_snowflake.sh script | ||
# - For the almond & scala kernel, please check the following link: | ||
# https://almond.sh/docs/quick-start-install | ||
# - Note: For the jupyter scala kernel, the version can be set with the variable scala_kernel_version | ||
# Questions: Zohar Nissare-Houssen - [email protected] | ||
# | ||
|
||
# Start from the following core stack & driver levels versions
FROM jupyter/all-spark-notebook:1c8073a927aa
USER root
ARG almond_version=0.10.9
ARG scala_kernel_version=2.12.11
ARG odbc_version=2.22.3
ARG jdbc_version=3.12.16
ARG spark_version=2.8.3
ARG snowsql_version=1.2.10

# Base OS packages. apt-utils is installed first, as in the original ordering.
RUN apt-get update && \
    apt-get install -y apt-utils && \
    apt-get install -y libssl-dev libffi-dev && \
    apt-get install -y vim

# Download the coursier launcher and use it to install the almond Scala kernel.
# -f makes curl fail the build on an HTTP error instead of saving the error
# page as the "coursier" binary.
# NOTE(review): the relative output path assumes the image's working directory
# is /home/jovyan, which the chown/chmod below relies on - confirm.
RUN sudo -u jovyan /opt/conda/bin/curl -fLo coursier https://git.io/coursier-cli
RUN chown -R jovyan:users /home/jovyan/coursier && chmod +x /home/jovyan/coursier
RUN sudo -u jovyan /home/jovyan/coursier launch --fork almond:$almond_version --scala $scala_kernel_version -- --install

# Python client libraries, installed as jovyan in the same order as before.
# The extras specifier is quoted so the shell cannot treat [pandas] as a glob.
RUN sudo -u jovyan /opt/conda/bin/python -m pip install --upgrade pip && \
    sudo -u jovyan /opt/conda/bin/python -m pip install --upgrade pyarrow && \
    sudo -u jovyan /opt/conda/bin/python -m pip install --upgrade "snowflake-connector-python[pandas]" && \
    sudo -u jovyan /opt/conda/bin/python -m pip install --upgrade snowflake-sqlalchemy && \
    sudo -u jovyan /opt/conda/bin/python -m pip install --upgrade plotly

# --yes on both conda commands: there is no terminal to answer the
# confirmation prompt during a docker build.
RUN conda install --yes pyodbc
RUN conda install --yes -c conda-forge jupyterlab-plotly-extension

# Refresh the package index in the same layer as the install so a cached
# "apt-get update" layer can never serve a stale index to this step.
RUN apt-get update && \
    apt-get install -y iodbc libiodbc2-dev libssl-dev

# Run the driver deployment script shipped next to this Dockerfile.
COPY ./deploy_snowflake.sh /
RUN chmod +x /deploy_snowflake.sh && /deploy_snowflake.sh

# Sample notebooks: copy them in, hand them to jovyan, and mark them trusted.
RUN mkdir /home/jovyan/samples
COPY ./pyodbc.ipynb ./Python.ipynb ./spark.ipynb ./SQLAlchemy.ipynb /home/jovyan/samples/
RUN chown -R jovyan:users /home/jovyan/samples
RUN sudo -u jovyan /opt/conda/bin/jupyter trust \
        /home/jovyan/samples/pyodbc.ipynb \
        /home/jovyan/samples/Python.ipynb \
        /home/jovyan/samples/spark.ipynb \
        /home/jovyan/samples/SQLAlchemy.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,338 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Import the various modules required to make a simple Snowflake connection from Python\n",
"import getpass\n",
"import os\n",
"\n",
"import pandas as pd\n",
"import snowflake.connector\n",
"from snowflake.connector.converter_null import SnowflakeNoConverterToPython"
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [
 "# Account details come from the environment so that no credentials are saved in\n",
 "# the notebook. Anything that is not set is asked for interactively, and the\n",
 "# password prompt does not echo what is typed.\n",
 "ACCOUNT = os.environ.get('SNOWFLAKE_ACCOUNT') or input('Snowflake account: ')\n",
 "USER = os.environ.get('SNOWFLAKE_USER') or input('Snowflake user: ')\n",
 "PASSWORD = os.environ.get('SNOWFLAKE_PASSWORD') or getpass.getpass('Snowflake password: ')\n",
 "\n",
 "# The connection keeps the SnowflakeNoConverterToPython converter class.\n",
 "con = snowflake.connector.connect(\n",
 "    user=USER,\n",
 "    password=PASSWORD,\n",
 "    account=ACCOUNT,\n",
 "    converter_class=SnowflakeNoConverterToPython,\n",
 ")"
]
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Create a variable called sql and specify a query that it will store\n", | ||
"sql = \"select * from sales.public.customer limit 10000\"" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"<snowflake.connector.cursor.SnowflakeCursor at 0x7f689ce3e5f8>" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Specify the virtual warehouse and role we want to use\n", | ||
"con.cursor().execute(\"USE WAREHOUSE xxxx\")\n", | ||
"con.cursor().execute(\"USE role xxxx\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Execute the query using the Python connector\n", | ||
"#%%time\n", | ||
"res = con.cursor().execute(sql).fetchall()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"<class 'pandas.core.frame.DataFrame'>\n", | ||
"RangeIndex: 10000 entries, 0 to 9999\n", | ||
"Data columns (total 8 columns):\n", | ||
"C_CUSTKEY 10000 non-null object\n", | ||
"C_NAME 10000 non-null object\n", | ||
"C_ADDRESS 10000 non-null object\n", | ||
"C_NATIONKEY 10000 non-null object\n", | ||
"C_PHONE 10000 non-null object\n", | ||
"C_ACCTBAL 10000 non-null object\n", | ||
"C_MKTSEGMENT 10000 non-null object\n", | ||
"C_COMMENT 10000 non-null object\n", | ||
"dtypes: object(8)\n", | ||
"memory usage: 625.1+ KB\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Run that same query, but this time use the read_sql method\n", | ||
"# in the Pandas data frame object\n", | ||
"#%%time\n", | ||
"df = pd.read_sql(sql, con)\n", | ||
"df.info()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>C_CUSTKEY</th>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>C_MKTSEGMENT</th>\n", | ||
" <th></th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <td>AUTOMOBILE</td>\n", | ||
" <td>2043</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>BUILDING</td>\n", | ||
" <td>1938</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>FURNITURE</td>\n", | ||
" <td>2060</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>HOUSEHOLD</td>\n", | ||
" <td>1989</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>MACHINERY</td>\n", | ||
" <td>1970</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" C_CUSTKEY\n", | ||
"C_MKTSEGMENT \n", | ||
"AUTOMOBILE 2043\n", | ||
"BUILDING 1938\n", | ||
"FURNITURE 2060\n", | ||
"HOUSEHOLD 1989\n", | ||
"MACHINERY 1970" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [
 "# Count the distinct customers (unique C_CUSTKEY values) in each market segment.\n",
 "# nunique() counts distinct keys; count() would count every non-null row.\n",
 "df.groupby('C_MKTSEGMENT')[['C_CUSTKEY']].nunique()"
]
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"C_CUSTKEY False\n", | ||
"C_NAME False\n", | ||
"C_ADDRESS False\n", | ||
"C_NATIONKEY False\n", | ||
"C_PHONE False\n", | ||
"C_ACCTBAL False\n", | ||
"C_MKTSEGMENT False\n", | ||
"C_COMMENT False\n", | ||
"dtype: bool" | ||
] | ||
}, | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"# Check to see if any of the columns have null values\n", | ||
"pd.isnull(df).any()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"list" | ||
] | ||
}, | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"type(res)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"('5050001', 'Customer#005050001', 'h2Q2lfB QpSuOt32ZDV7S8RsTKgedv4w9s9wa', '18', '28-680-716-8960', '4571.61', 'AUTOMOBILE', 'e thinly bold ideas. carefully final pinto beans cajole across')\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print (res[0])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"AUTOMOBILE has occured 1974 times\n", | ||
"BUILDING has occured 1964 times\n", | ||
"MACHINERY has occured 1989 times\n", | ||
"HOUSEHOLD has occured 2025 times\n", | ||
"FURNITURE has occured 2048 times\n" | ||
] | ||
} | ||
], | ||
"source": [
 "def countX(lst, x):\n",
 "    \"\"\"Return how many (key, segment) pairs in lst have a segment equal to x.\"\"\"\n",
 "    count = 0\n",
 "    for y in lst:\n",
 "        if y[1] == x:\n",
 "            count = count + 1\n",
 "    return count\n",
 "\n",
 "# Distinct (C_CUSTKEY, C_MKTSEGMENT) pairs in first-seen order: columns 0 and 6\n",
 "# of each fetched row. dict.fromkeys removes duplicates in a single pass and\n",
 "# keeps insertion order (Python 3.7+), instead of scanning a list for every row.\n",
 "unique_cust_key = list(dict.fromkeys((x[0], x[6]) for x in res))\n",
 "\n",
 "# Market segments in first-seen order\n",
 "unique_list = list(dict.fromkeys(x[1] for x in unique_cust_key))\n",
 "\n",
 "for a in unique_list:\n",
 "    print('{} has occured {} times'.format(a, countX(unique_cust_key, a)))"
]
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.7.3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.