0.0.26 (#29)
* progress on table function changes

* reorganize functions

* save progress

* upgrade version and notes in change log
orellabac authored Feb 9, 2023
1 parent 84cb043 commit 3d45121
Showing 19 changed files with 618 additions and 411 deletions.
8 changes: 8 additions & 0 deletions CHANGE_LOG.txt
@@ -129,3 +129,11 @@ Fixing an issue with the current implementation of applyInPandas
Version 0.0.25
--------------
Change in implementation of regexp_split to support different regular expression cases

Version 0.0.26
--------------
- Changes in the implementation of explode / explode_outer / array_zip / flatten
to take advantage of changes in the snowpark lib.
- Adding a stack method similar in functionality to unpivot
- Removing the dependency on shortuuid
- Adding extensions for DataFrameReader
54 changes: 53 additions & 1 deletion README.md
@@ -87,7 +87,7 @@ order by start_time desc;
| DataFrame.groupby.applyInPandas| Maps each group of the current DataFrame using a pandas UDF and returns the result as a DataFrame. |
| DataFrame.replace | Extends replace to allow using a regex |
| DataFrame.groupBy.pivot | Extends the Snowpark groupBy to add a pivot operator |
| DataFrame.stack | An operator similar in functionality to unpivot |

### Examples

@@ -193,6 +193,58 @@ df.group_by("ID").applyInPandas(
------------------------------
```

### stack

Assuming you have a DataFrame like:

```
+-------+---------+-----+---------+----+
|   Name|Analytics|   BI|Ingestion|  ML|
+-------+---------+-----+---------+----+
| Mickey|     null|12000|     null|8000|
| Martin|     null| 5000|     null|null|
|  Jerry|     null| null|     1000|null|
|  Riley|     null| null|     null|9000|
| Donald|     1000| null|     null|null|
|   John|     null| null|     1000|null|
|Patrick|     null| null|     null|1000|
|  Emily|     8000| null|     3000|null|
|   Arya|    10000| null|     2000|null|
+-------+---------+-----+---------+----+
```

```python
from snowflake.snowpark.functions import col, lit

df.select("NAME",
          df.stack(4,
                   lit('Analytics'), "ANALYTICS",
                   lit('BI'), "BI",
                   lit('Ingestion'), "INGESTION",
                   lit('ML'), "ML").alias("Project", "Cost_To_Project")) \
  .filter(col("Cost_To_Project").is_not_null()) \
  .orderBy("NAME", "Project")
```

That will return:
```
-------------------------------------------
|"NAME" |"PROJECT" |"COST_TO_PROJECT" |
-------------------------------------------
|Arya |Analytics |10000 |
|Arya |Ingestion |2000 |
|Donald |Analytics |1000 |
|Emily |Analytics |8000 |
|Emily |Ingestion |3000 |
|Jerry |Ingestion |1000 |
|John |Ingestion |1000 |
|Martin |BI |5000 |
|Mickey |BI |12000 |
|Mickey |ML |8000 |
|Patrick |ML |1000 |
|Riley |ML |9000 |
-------------------------------------------
```
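Conceptually, `stack` unpivots a fixed set of columns into (key, value) rows, the way `unpivot` does. A minimal pure-Python sketch of the semantics (not the Snowpark API; the helper name and column names are just for illustration):

```python
# Sketch of the stack/unpivot transformation: for each input row, emit one
# output row per non-null value column, tagging it with the column name.
def stack_rows(rows, value_cols, id_col="NAME"):
    out = []
    for row in rows:
        for c in value_cols:
            if row.get(c) is not None:  # mirrors the is_not_null() filter above
                out.append({id_col: row[id_col],
                            "PROJECT": c,
                            "COST_TO_PROJECT": row[c]})
    # mirrors orderBy("NAME", "Project")
    return sorted(out, key=lambda r: (r[id_col], r["PROJECT"]))

rows = [
    {"NAME": "Arya",   "Analytics": 10000, "BI": None,  "Ingestion": 2000, "ML": None},
    {"NAME": "Mickey", "Analytics": None,  "BI": 12000, "Ingestion": None, "ML": 8000},
]
print(stack_rows(rows, ["Analytics", "BI", "Ingestion", "ML"]))
```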

## DataFrameReader Extensions

| Name | Description |
|--------------------------------|-------------------------------------------------------------------------------------|
| DataFrameReader.format | Specifies the format of the file to load |
| DataFrameReader.load | Loads a DataFrame from a file, uploading the local files to a stage if needed |

### Example


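A minimal sketch, assuming a working Snowpark session and that the reader extensions follow the familiar `format(...).load(...)` pattern (the file name `data.csv` is a placeholder):

```python
from snowflake.snowpark import Session
import snowpark_extensions

session = Session.builder.from_snowsql().from_env().getOrCreate()
# load() uploads the local file to a stage if needed, then reads it
df = session.read.format("csv").load("data.csv")
df.show()
```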
## Functions Extensions

| Name | Description |
Expand Down
6 changes: 6 additions & 0 deletions extras/README.md
@@ -0,0 +1,6 @@
# Snowpark Extensions Extras

These "extras" are experimental extensions meant to exercise some Snowpark capabilities.
They are marked experimental because they may require additional testing or apply only in some scenarios.


15 changes: 15 additions & 0 deletions extras/notebooks/runner/README.md
@@ -0,0 +1,15 @@
# Notebook Runner

The notebook runner is a small example that allows you to run a notebook from within Snowflake.

The runner script will:
1. connect to Snowflake,
2. upload the notebook,
3. publish a stored procedure,
4. run the stored procedure,
5. save the results of the notebook, and
6. download the results as an HTML file.

This script can also be used to publish a permanent stored procedure that can then run any notebook that is already on a stage,
or to schedule a task to run a notebook.
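A hypothetical invocation, using the flags defined by the script's argument parser (`--connection` accepts `snowsql`, `env`, or `key=value` pairs; the notebook name is a placeholder):

```shell
python runner.py --notebook example1.ipynb --stage NOTEBOOK_RUN --connection snowsql
```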

54 changes: 54 additions & 0 deletions extras/notebooks/runner/example1.ipynb
@@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from snowflake.snowpark import Session\n",
"import snowpark_extensions\n",
"# will try to set up credentials from the snowsql CLI if present, or from SNOW_xxx or SNOWSQL_xxx variables\n",
"# if no configuration can be retrieved you will receive an error\n",
"session = Session.builder.from_snowsql().from_env().getOrCreate()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = session.createDataFrame([('oneAtwoBthreeC',)], ['s',])\n",
"df.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16 (default, Jan 10 2023, 15:23:34) \n[GCC 9.4.0]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "9ac03a0a6051494cc606d484d27d20fce22fb7b4d169f583271e11d5ba46a56e"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
46 changes: 37 additions & 9 deletions runner → extras/notebooks/runner/runner.py
@@ -4,33 +4,61 @@

import argparse
from rich import print
import shortuuid
import os

from snowflake.snowpark import Session
from snowflake.snowpark.functions import sproc
import snowpark_extensions

print("[cyan]Snowpark Extensions Utilities")
print("[cyan]Snowpark Extensions Extras")
print("[cyan]Notebook Runner")
print("[cyan]=============================")
print("This tool will connect using snowconfig file")
arguments = argparse.ArgumentParser()
arguments.add_argument("--notebook",help="Jupyter Notebook to run",required=True)
arguments.add_argument("--notebook",help="Jupyter Notebook to run")
arguments.add_argument("--registerproc",default="",help="Register a stored proc that can then be used to run notebooks")
arguments.add_argument("--stage",help="stage",default="NOTEBOOK_RUN")
arguments.add_argument("--packages",help="packages",default="")
arguments.add_argument("--imports" ,help="imports" ,default="")
arguments.add_argument("--connection",dest="connection_args",nargs="*",required=True,help="Connect options, for example snowsql, snowsql connection,env")


args = arguments.parse_args()
session = Session.builder.from_snowsql().getOrCreate()
print(args)
session = None
try:
if len(args.connection_args) >= 1:
first_arg = args.connection_args[0]
rest_args = args.connection_args[1:]
if first_arg == "snowsql":
session = Session.builder.from_snowsql(*rest_args).create()
elif first_arg == "env":
session = Session.builder.from_env().create()
else:
connection_args={}
for arg in args.connection_args:
key, value = arg.split("=")
connection_args[key] = value
session = Session.builder.configs(connection_args).create()
except Exception as e:
print(e)
print("[red] An error happened while trying to connect")
exit(1)
if not session:
print("[red] Not connected. Aborting")
exit(2)
session.sql(f"CREATE STAGE IF NOT EXISTS {args.stage}").show()
session.file.put(args.notebook,f'@{args.stage}',auto_compress=False,overwrite=True)

print(f"Uploading notebook to stage {args.stage}")
session.file.put(f"file://{args.notebook}",f'@{args.stage}',auto_compress=False,overwrite=True)
print(f"Notebook uploaded")

packages=["snowflake-snowpark-python","nbconvert","nbformat","ipython","jinja2==3.0.3","plotly"]
packages.extend(set(filter(None, args.packages.split(','))))
print(f"Using packages [magenta]{packages}")

@sproc(replace=True,is_permanent=False,packages=packages,imports=["@test/snowpark_extensions.zip","@test/shortuuid.zip"]) #,"@test/IPython.zip"
imports=[]
if args.imports:
imports.extend(args.imports.split(','))
is_permanent=False
@sproc(name=args.registerproc,replace=True,is_permanent=is_permanent,packages=packages,imports=[])
def run_notebook(session:Session,stage:str,notebook_filename:str) -> str:
# (c) Matthew Wardrop 2019; Licensed under the MIT license
#
1 change: 0 additions & 1 deletion requirements.txt
@@ -1,6 +1,5 @@
snowflake-snowpark-python[pandas]
pandas
shortuuid
rich
nest_asyncio
jinja2
9 changes: 0 additions & 9 deletions runner.bat

This file was deleted.

10 changes: 3 additions & 7 deletions setup.py
@@ -5,7 +5,7 @@
this_directory = Path(__file__).parent
long_description = (this_directory / "README.md").read_text()

VERSION = '0.0.25'
VERSION = '0.0.26'

setup(name='snowpark_extensions',
version=VERSION,
@@ -14,12 +14,8 @@
long_description_content_type='text/markdown',
url='http://github.com/MobilizeNet/snowpark-extensions-py',
author='mauricio.rojas',
install_requires=['snowflake-snowpark-python[pandas]>=1.1.0',
'shortuuid', 'nest_asyncio', 'jinja2', 'rich'],
install_requires=['snowflake-snowpark-python[pandas]==1.1.0',
'nest_asyncio', 'jinja2', 'rich'],
author_email='[email protected]',
packages=['snowpark_extensions'],
scripts=[
'runner',
'runner.bat'
],
zip_safe=False)
1 change: 1 addition & 0 deletions snowpark_extensions/__init__.py
@@ -2,6 +2,7 @@


from .dataframe_extensions import *
from .dataframe_reader_extensions import *
from .functions_extensions import *
from .session_builder_extensions import *
from .types_extensions import *