Snowflake-Labs · sfc-gh-jvenegasvega-1 · Apr 24, 2023 · Apr 25, 2023 · sfc-gh-kjimenezmorales · Apr 25, 2023
diff --git a/snowpark_extensions/dataframe_reader_extensions.py b/snowpark_extensions/dataframe_reader_extensions.py
@@ -9,6 +9,8 @@
     import logging    
     DataFrameReader.___extended = True
     DataFrameReader.__option = DataFrameReader.option
+    DataFrameReader.__csv = DataFrameReader.csv
+
     def _option(self, key: str, value: Any) -> "DataFrameReader":
         key = key.upper()
         if key == "SEP" or key == "DELIMITER":
@@ -42,28 +44,17 @@ def _load(self,path: Union[str, List[str], None] = None, format: Optional[str] =
         self.format(format)
         if schema:
             self.schema(schema)
-        files = []
-        if isinstance(path,list):
-            files.extend(path)
-        else:
-            files.append(path)
+        files = get_file_paths(path)
         session = context.get_active_session()
         if stage is None:
             stage = f'{session.get_fully_qualified_current_schema()}.{_generate_prefix("TEMP_STAGE")}'
             session.sql(f'create TEMPORARY stage if not exists {stage}').show()
         stage_files = [x for x in path if x.startswith("@")]
         if len(stage_files) > 1:
             raise Exception("Currently only one staged file can be specified. You can use a pattern if you want to specify several files")
-        print(f"Uploading files using stage {stage}")
-        for file in files:
-            if file.startswith("file://"): # upload local file
-                session.file.put(file,stage)
-            elif file.startswith("@"): #ignore it is on an stage
-                return self._read_semi_structured_file(file,format)
-            else: #assume it is file too
-                session.file.put(f"file://{file}",f"@{stage}")
+        stage = get_stage(self, session, files, stage)
         if self._file_type == "csv":
-            return self.csv(f"@{stage}")
+            return self.__csv(f"@{stage}")
         return self._read_semi_structured_file(f"@{stage}",format)
 
     def _format(self, file_type: str) -> "DataFrameReader":
@@ -72,7 +63,50 @@ def _format(self, file_type: str) -> "DataFrameReader":
             self._file_type = file_type
         else:
             raise Exception(f"Unsupported file format {file_type}")
+
+    def _csv(self,path: Union[str, List[str]],schema: Optional[Union[StructType, str]] = None,sep: Optional[str] = None,encoding: Optional[str] = None,quote: Optional[str] = None,
+             escape: Optional[str] = None,comment: Optional[str] = None,header: Optional[Union[bool, str]] = None,inferSchema: Optional[Union[bool, str]] = None,
+             ignoreLeadingWhiteSpace: Optional[Union[bool, str]] = None,ignoreTrailingWhiteSpace: Optional[Union[bool, str]] = None,nullValue: Optional[str] = None,
+             nanValue: Optional[str] = None,positiveInf: Optional[str] = None,negativeInf: Optional[str] = None,dateFormat: Optional[str] = None,timestampFormat: Optional[str] = None,
+             maxColumns: Optional[Union[int, str]] = None,maxCharsPerColumn: Optional[Union[int, str]] = None,maxMalformedLogPerPartition: Optional[Union[int, str]] = None,
+             mode: Optional[str] = None,columnNameOfCorruptRecord: Optional[str] = None,multiLine: Optional[Union[bool, str]] = None,charToEscapeQuoteEscaping: Optional[str] = None,
+             samplingRatio: Optional[Union[float, str]] = None,enforceSchema: Optional[Union[bool, str]] = None,emptyValue: Optional[str] = None,locale: Optional[str] = None,
+             lineSep: Optional[str] = None,pathGlobFilter: Optional[Union[bool, str]] = None,recursiveFileLookup: Optional[Union[bool, str]] = None,modifiedBefore: Optional[Union[bool, str]] = None,
+             modifiedAfter: Optional[Union[bool, str]] = None,unescapedQuoteHandling: Optional[str] = None) -> "DataFrame":
+        """Stage the file(s) at `path`, apply every non-None keyword argument as a reader option, and read the stage with the original csv()."""
+        options = {name: val for name, val in locals().items() if val is not None and name not in ("self", "path", "schema")}
+        if schema:
+            self.schema(schema)
+        session = context.get_active_session()
+        temp_stage = f'{session.get_fully_qualified_current_schema()}.{_generate_prefix("TEMP_STAGE")}'
+        session.sql(f'create TEMPORARY stage if not exists {temp_stage}').show()
+        # get_stage uploads the local files and returns the value that is read back below.
+        source_stage = get_stage(self, session, get_file_paths(path), temp_stage)
+        reader = self
+        for name, val in options.items():
+            # The value returned by each option() call is the reader used for the next one.
+            reader = reader.option(name, val)
+        return reader.__csv(f"@{source_stage}")
 
+    def get_file_paths(path: Union[str, List[str]]):
+        """Return `path` unchanged when it is a list, otherwise wrap it in a one-element list."""
+        if not isinstance(path, list):
+            return [path]
+        return path
+
+    def get_stage(self, session, files: List[str], stage: str):
+        """Upload local `files` to `stage`; return the stage location (no leading "@") that callers should read."""
+        print(f"Uploading files using stage {stage}")
+        for file in files:
+            if file.startswith("@"):
+                # Already staged: return its location so the callers' f"@{stage}" reads it (was: returned a DataFrame).
+                return file[1:]
+            # Paths without a "file://" prefix are treated as local files too.
+            session.file.put(file if file.startswith("file://") else f"file://{file}", f"@{stage}")
+        return stage
+
     DataFrameReader.format = _format
     DataFrameReader.load   = _load
-    DataFrameReader.option = _option
+    DataFrameReader.option = _option
+    DataFrameReader.csv = _csv
+
diff --git a/tests/test_dataframe_reader_extensions.py b/tests/test_dataframe_reader_extensions.py
@@ -1,10 +1,26 @@
 import pytest
-from snowflake.snowpark import Session, Row
+from snowflake.snowpark import Session, Row, DataFrameReader
 from snowflake.snowpark.types import *
 import snowpark_extensions
 
 def test_load():    
     session = Session.builder.from_snowsql().getOrCreate()
+    csv_paths = ["./tests/data/test1_0.csv", "./tests/data/test1_1.csv"]
+    # Both files are passed to load() in one call, with the format named explicitly.
+    loaded = session.read.load(csv_paths, schema=get_schema(), format="csv", sep=",", header="true")
+    rows = loaded.collect()
+    # The assertion covers the rows of both files combined.
+    assert len(rows) == 10
+
+def test_csv():
+    """Read two local CSV files through the reader's csv() and check the combined row count."""
+    csv_paths = ["./tests/data/test1_0.csv", "./tests/data/test1_1.csv"]
+    reader = Session.builder.from_snowsql().getOrCreate().read
+    frame = reader.csv(csv_paths, schema=get_schema(), sep=",", header="true")
+    rows = frame.collect()
+    assert len(rows) == 10
+
+def get_schema():
     schema = StructType([ \
     StructField("case_id",       StringType()), \
     StructField("province",      StringType()), \
@@ -13,11 +29,6 @@ def test_load():
     StructField("infection_case",StringType()), \
     StructField("confirmed",     IntegerType()), \
     StructField("latitude",      FloatType()), \
-    StructField("cilongitudety", FloatType()) \
+    StructField("longitude", FloatType()) \
     ])
-    cases = session.read.load(["./tests/data/test1_0.csv","./tests/data/test1_1.csv"],
-                        schema=schema,
-                        format="csv", 
-                        sep=",",
-                        header="true")
-    assert 10 == len(cases.collect())
+    return schema