diff --git a/CHANGE_LOG.txt b/CHANGE_LOG.txt index f7f6666..6f127ea 100644 --- a/CHANGE_LOG.txt +++ b/CHANGE_LOG.txt @@ -103,4 +103,8 @@ Fixing issues when getting displayHTML on some environments Version 0.0.19 -------------- Adding functions.daydiff as a direct replacement for datediff -Fixing some test_cases \ No newline at end of file +Fixing some test_cases + +Version 0.0.20 +-------------- +Changing the implementation for notebook integration for dataframe to a more standard approach diff --git a/README.md b/README.md index 4b5660e..37434f8 100644 --- a/README.md +++ b/README.md @@ -405,11 +405,17 @@ if tables.count() > 5: ``` If you dont specify a name you can still access the last result using `__df`. -The standard IPython display does not render snowpark `dataframe`. You can use `df.show()` but by default something like: `display(df)` will just print: `` +> NOTE: By default only 50 rows are displayed. You can customize this limit, for example to 100 rows, with: +``` +import snowpark_extensions +snowpark_extensions.rows_limit = 100 +``` + +You can configure Jupyter to run some imports and initialization code at the start of a notebook by creating a file called `startup.ipy` in the `~/.ipython/profile_default/startup` directory. -You can configure Jupyter to run some imports and initialization code at the start of a notebook by creating a file called `startup.ipy` in the `~/.ipython/profile_default/startup` directory. Any code written in this file will be executed when you start a new Jupyter notebook. +Any code written in this file will be executed when you start a new Jupyter notebook. 
-An [example startup.ipy](./startup.ipy) is provided +An [example startup.ipy](https://github.com/MobilizeNet/snowpark-extensions-py/blob/main/startup.ipy) is provided ## Running notebooks in Snowpark diff --git a/example_notebook_with_snowpark.ipynb b/example_notebook_with_snowpark.ipynb index 28e3723..87281cf 100644 --- a/example_notebook_with_snowpark.ipynb +++ b/example_notebook_with_snowpark.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -40,23 +40,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "Failed to execute query [queryID: 01a9e504-0405-9751-0000-1c55029dae1a] select asdfad\n", - "000904 (42000): SQL compilation error: error line 1 at position 7\n", + "Failed to execute query [queryID: 01a9f02a-0405-90fd-0000-1c5502a00ac6] SELECT count(1) AS \"COUNT(LITERAL())\" FROM (select asdfad) LIMIT 1\n", + "000904 (42000): SQL compilation error: error line 1 at position 51\n", "invalid identifier 'ASDFAD'\n" ] }, { "data": { "text/html": [ - "
Error: 000904 (42000): SQL compilation error: error line 1 at position 7\n", + "
Error: 000904 (42000): SQL compilation error: error line 1 at position 51\n", "invalid identifier 'ASDFAD'
" ], "text/plain": [ - "" + "" ] }, + "execution_count": 4, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ @@ -66,7 +67,28 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql tables\n", + "select 'a1' A, 'b1' B union\n", + "select 'a2' , 'b2'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "if tables.count() > 100:\n", + " print(\"There are many tables here\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -95,7 +117,7 @@ "" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -103,33 +125,14 @@ } ], "source": [ - "%%sql tables\n", - "select 'a1' A, 'b1' B union\n", - "select 'a2' , 'b2'" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if tables.count() > 100:\n", - " print(\"There are many tables here\")" + "display(tables)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Output restricted to 10 rows\n" - ] - }, { "data": { "text/html": [ @@ -137,286 +140,26 @@ " \n", " \n", " \n", - " TABLE_CATALOG\n", - " TABLE_SCHEMA\n", - " TABLE_NAME\n", - " TABLE_OWNER\n", - " TABLE_TYPE\n", - " IS_TRANSIENT\n", - " CLUSTERING_KEY\n", - " ROW_COUNT\n", - " BYTES\n", - " RETENTION_TIME\n", - " SELF_REFERENCING_COLUMN_NAME\n", - " REFERENCE_GENERATION\n", - " USER_DEFINED_TYPE_CATALOG\n", - " USER_DEFINED_TYPE_SCHEMA\n", - " USER_DEFINED_TYPE_NAME\n", - " IS_INSERTABLE_INTO\n", - " IS_TYPED\n", - " COMMIT_ACTION\n", - " CREATED\n", - " LAST_ALTERED\n", - " AUTO_CLUSTERING_ON\n", - " COMMENT\n", + " A\n", + " B\n", " \n", " \n", " \n", " \n", " 0\n", - " DEMODB\n", - " AUDIT_STAGE\n", - " T_AUDIT_ACCESS_DATASET_FACT\n", - " ACCOUNTADMIN\n", - " 
BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:09:39.907000-07:00\n", - " 2022-09-29 07:09:40.017000-07:00\n", - " NO\n", - " None\n", + " a1\n", + " b1\n", " \n", " \n", " 1\n", - " DEMODB\n", - " AUDIT_STAGE\n", - " T_AUDIT_ACCESS_QUERY_FACT\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:09:40.420000-07:00\n", - " 2022-09-29 07:09:40.542000-07:00\n", - " NO\n", - " None\n", - " \n", - " \n", - " 2\n", - " DEMODB\n", - " AUDIT_STAGE\n", - " T_AUDIT_QUERY_DATASET_GROUP\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:09:40.469000-07:00\n", - " 2022-09-29 07:09:40.587000-07:00\n", - " NO\n", - " None\n", - " \n", - " \n", - " 3\n", - " DEMODB\n", - " CJC\n", - " V_WORKFORCE_INIT_ALL_JOB_RUNS\n", - " ACCOUNTADMIN\n", - " VIEW\n", - " None\n", - " None\n", - " NaN\n", - " NaN\n", - " NaN\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:10:54.934000-07:00\n", - " 2022-09-29 07:10:55.665000-07:00\n", - " NO\n", - " None\n", - " \n", - " \n", - " 4\n", - " DEMODB\n", - " DBMS_SQL\n", - " MYEMPLOYEES\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2021-11-24 21:02:07.868000-08:00\n", - " 2022-10-05 10:10:26.911000-07:00\n", - " NO\n", - " None\n", - " \n", - " \n", - " 5\n", - " DEMODB\n", - " BIAUDIT_STAGE\n", - " T_BI4_ADS_SERVER_TYPE_STR\n", - " 
ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:09:32.081000-07:00\n", - " 2022-09-29 07:09:32.441000-07:00\n", - " NO\n", - " Source server type str data\n", - " \n", - " \n", - " 6\n", - " DEMODB\n", - " DIM\n", - " L_BLOOD_URGENCY\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:10:32.102000-07:00\n", - " 2022-09-29 07:10:33.220000-07:00\n", - " NO\n", - " Signifies the urgency associated with the blood test order\n", - " \n", - " \n", - " 7\n", - " DEMODB\n", - " DIM\n", - " L_DNURSE_EPISODE_OUTCOME\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:10:19.989000-07:00\n", - " 2022-09-29 07:10:21.669000-07:00\n", - " NO\n", - " Dimension table for holding reference information for the reasons for which the patient was discharged from the episode of care.\n", - " \n", - " \n", - " 8\n", - " DEMODB\n", - " DIM\n", - " L_BLOOD_RETURN_REASON\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:10:40.229000-07:00\n", - " 2022-09-29 07:10:41.441000-07:00\n", - " NO\n", - " Reason for returning a previously issued blood donation product eg. 
Recalled by QA\n", - " \n", - " \n", - " 9\n", - " DEMODB\n", - " DIM\n", - " L_MCN_RESULT_ACTION\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:10:29.768000-07:00\n", - " 2022-09-29 07:10:30.883000-07:00\n", - " NO\n", - " Any actions detailed based on the test results. Developed originally for MCN datamart.\n", + " a2\n", + " b2\n", " \n", " \n", "" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -429,24 +172,43 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "50" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import snowpark_extensions\n", + "display(snowpark_extensions.rows_limit)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# an 'extended' display\n", - "from snowpark_extensions import display" + "snowpark_extensions.rows_limit = 5" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Output restricted to 10 rows\n" + "There are 1361 rows. 
Showing only 5 \n" ] }, { @@ -509,56 +271,6 @@ " \n", " 1\n", " DEMODB\n", - " AUDIT_STAGE\n", - " T_AUDIT_ACCESS_QUERY_FACT\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:09:40.420000-07:00\n", - " 2022-09-29 07:09:40.542000-07:00\n", - " NO\n", - " None\n", - " \n", - " \n", - " 2\n", - " DEMODB\n", - " AUDIT_STAGE\n", - " T_AUDIT_QUERY_DATASET_GROUP\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:09:40.469000-07:00\n", - " 2022-09-29 07:09:40.587000-07:00\n", - " NO\n", - " None\n", - " \n", - " \n", - " 3\n", - " DEMODB\n", " CJC\n", " V_WORKFORCE_INIT_ALL_JOB_RUNS\n", " ACCOUNTADMIN\n", @@ -582,32 +294,7 @@ " None\n", " \n", " \n", - " 4\n", - " DEMODB\n", - " DBMS_SQL\n", - " MYEMPLOYEES\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2021-11-24 21:02:07.868000-08:00\n", - " 2022-10-05 10:10:26.911000-07:00\n", - " NO\n", - " None\n", - " \n", - " \n", - " 5\n", + " 2\n", " DEMODB\n", " BIAUDIT_STAGE\n", " T_BI4_ADS_SERVER_TYPE_STR\n", @@ -632,35 +319,10 @@ " Source server type str data\n", " \n", " \n", - " 6\n", - " DEMODB\n", - " DIM\n", - " L_BLOOD_URGENCY\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:10:32.102000-07:00\n", - " 2022-09-29 07:10:33.220000-07:00\n", - " NO\n", - " Signifies the urgency associated with the blood test order\n", - " \n", - " \n", - " 7\n", + " 
3\n", " DEMODB\n", " DIM\n", - " L_DNURSE_EPISODE_OUTCOME\n", + " L_MCN_SCORE\n", " ACCOUNTADMIN\n", " BASE TABLE\n", " NO\n", @@ -676,13 +338,13 @@ " YES\n", " YES\n", " None\n", - " 2022-09-29 07:10:19.989000-07:00\n", - " 2022-09-29 07:10:21.669000-07:00\n", + " 2022-09-29 07:09:56.537000-07:00\n", + " 2022-09-29 07:09:57.953000-07:00\n", " NO\n", - " Dimension table for holding reference information for the reasons for which the patient was discharged from the episode of care.\n", + " Holds MCN score information sourced from NMCN CLEFT system\n", " \n", " \n", - " 8\n", + " 4\n", " DEMODB\n", " DIM\n", " L_BLOOD_RETURN_REASON\n", @@ -706,44 +368,21 @@ " NO\n", " Reason for returning a previously issued blood donation product eg. Recalled by QA\n", " \n", - " \n", - " 9\n", - " DEMODB\n", - " DIM\n", - " L_MCN_RESULT_ACTION\n", - " ACCOUNTADMIN\n", - " BASE TABLE\n", - " NO\n", - " None\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " YES\n", - " YES\n", - " None\n", - " 2022-09-29 07:10:29.768000-07:00\n", - " 2022-09-29 07:10:30.883000-07:00\n", - " NO\n", - " Any actions detailed based on the test results. 
Developed originally for MCN datamart.\n", - " \n", " \n", "" ], "text/plain": [ - "" + "" ] }, + "execution_count": 13, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "display(tables)" + "%%sql\n", + "select * from information_schema.tables" ] } ], diff --git a/runner b/runner index 6b39793..cb76244 100755 --- a/runner +++ b/runner @@ -26,7 +26,7 @@ session.sql(f"CREATE STAGE IF NOT EXISTS {args.stage}").show() session.file.put(args.notebook,f'@{args.stage}',auto_compress=False,overwrite=True) -packages=["snowflake-snowpark-python","nbconvert","nbformat","ipython","jinja2==3.0.3","plotly"] #"stack_data","pexpect","pickleshare","backcall","decorator","prompt_toolkit","jedi" +packages=["snowflake-snowpark-python","nbconvert","nbformat","ipython","jinja2==3.0.3","plotly"] packages.extend(set(filter(None, args.packages.split(',')))) print(f"Using packages [magenta]{packages}") diff --git a/setup.py b/setup.py index 2fbd263..2c4c178 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() -VERSION = '0.0.19' +VERSION = '0.0.20' setup(name='snowpark_extensions', version=VERSION, diff --git a/snowpark_extensions/__init__.py b/snowpark_extensions/__init__.py index 2c6cee3..cd216b0 100644 --- a/snowpark_extensions/__init__.py +++ b/snowpark_extensions/__init__.py @@ -6,9 +6,10 @@ from .session_builder_extensions import * from .types_extensions import * from .column_extensions import * -from .utils import display +rows_limit = 50 + def get_display_html() -> None: import inspect for frame in inspect.getouterframes(inspect.currentframe()): @@ -36,6 +37,29 @@ def instructions(): except: from IPython.display import display, HTML output_cell_output = lambda x: display(HTML(x)) + + def snowflake_dataframe_formatter(df): + # Format the dataframe as a table using pandas + from snowflake.snowpark.exceptions import SnowparkSQLException 
+ try: + count = df.count() + if count > rows_limit: + print(f"There are {count} rows. Showing only {rows_limit} ") + return df.limit(rows_limit).to_pandas().to_html() + except SnowparkSQLException as sce: + error_msg = sce.message + formatted = error_message_template.replace("@error", error_msg) + return formatted + except Exception as ex: + error_message = str(ex) + return f"
{error_message}
" + + + + # Register the display hook + from snowflake.snowpark import DataFrame + get_ipython().display_formatter.formatters['text/html'].for_type(DataFrame, snowflake_dataframe_formatter) + from IPython.core.magic import (Magics, magics_class, cell_magic) @magics_class class SnowparkMagics(Magics): @@ -51,22 +75,12 @@ def sql(self, line, cell): name = None if line and line.strip(): name = line.strip().split(" ")[0] - from snowflake.snowpark.exceptions import SnowparkSQLException - try: - df = session.sql(res) - html = df.to_pandas().to_html() - output_cell_output(html) - if name: - self.shell.user_ns[name] = df - else: - self.shell.user_ns["__df"] = df - except SnowparkSQLException as sce: - error_msg = sce.message - formatted = error_message_template.replace("@error", error_msg) - output_cell_output(formatted) - except Exception as ex: - error_message = str(ex) - output_cell_output(f"
{error_message}
") + df = session.sql(res) + if name: + self.shell.user_ns[name] = df + else: + self.shell.user_ns["__df"] = df + return df else: return "No session was found. You can setup one by running: session = Session.builder.from_env().getOrCreate()" magics = SnowparkMagics(ipython) diff --git a/snowpark_extensions/utils.py b/snowpark_extensions/utils.py index eafaa96..6a1a218 100644 --- a/snowpark_extensions/utils.py +++ b/snowpark_extensions/utils.py @@ -149,20 +149,3 @@ def schema_str_to_schema(schema_as_str): datatype = map_string_type_to_datatype(type) schema_fields.append(StructField(name,datatype)) return StructType(schema_fields) - -from snowflake.snowpark import DataFrame -def display(data:DataFrame, limit:int=10): - from IPython.display import display as _display, HTML - if isinstance(data, DataFrame): - try: - output_cell_output = displayHTML - except: - output_cell_output = lambda x: _display(HTML(x)) - try: - print(f"Output restricted to {limit} rows") - html = data.limit(limit).to_pandas().to_html() - output_cell_output(html) - except Exception as e: - print(f"Error displaying dataframe {e}") - else: - _display(data) \ No newline at end of file