Spark 3.4 (#113)
george-zubrienko authored May 30, 2023
1 parent d22e286 commit 63e52f9
Showing 8 changed files with 395 additions and 339 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -4,4 +4,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence, these owners will be
# requested for review when someone opens a pull request.
-* @jrbentzon @george-zubrienko
+* @SneaksAndData/platform-engineering
6 changes: 3 additions & 3 deletions .github/workflows/build.yaml
@@ -13,10 +13,10 @@ jobs:
if: ${{ github.ref != 'refs/heads/main' }}

container:
-image: esdcrproduction.azurecr.io/spark:v3.1.1-bitnami-a0a12a1d-python-3.9.15
+image: esdcrdevelopment.azurecr.io/spark:v3.1.1-5-g28008d0-bitnami-c31d3b46-python-3.9.16
credentials:
-username: ${{ secrets.AZCR_PROD_USER }}
-password: ${{ secrets.AZCR_PROD_TOKEN }}
+username: ${{ secrets.AZCR_DEV_USER }}
+password: ${{ secrets.AZCR_DEV_TOKEN }}
options: -u root -w /opt/bitnami/spark --mount type=tmpfs,destination=/home/spark

steps:
2 changes: 1 addition & 1 deletion .github/workflows/prepare_release.yaml
@@ -16,4 +16,4 @@ jobs:
uses: SneaksAndData/github-actions/[email protected]
with:
major_v: 1
-minor_v: 0
+minor_v: 1
702 changes: 382 additions & 320 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -9,10 +9,10 @@ readme = "README.md"
repository = 'https://github.com/SneaksAndData/spark-utils'

[tool.poetry.dependencies]
python = "^3.9"
hadoop-fs-wrapper = "~0.5.2"
python = ">=3.9, <3.12"
hadoop-fs-wrapper = "~0.6.0"
cryptography = "~36.0"
delta-spark = "~2.1.1"
delta-spark = "~2.4.0"

kubernetes = { version = "24.2.0", optional = true }

12 changes: 3 additions & 9 deletions spark_utils/common/spark_session_provider.py
@@ -37,7 +37,6 @@
V1Container,
V1ContainerPort,
V1EnvVar,
-V1ResourceRequirements,
V1PodSecurityContext,
V1NodeAffinity,
V1NodeSelector,
@@ -62,7 +61,7 @@ class SparkSessionProvider:
def __init__(
self,
*,
delta_lake_version="2.12:2.1.0",
delta_lake_version="2.12:2.4.0",
hive_metastore_config: Optional[HiveMetastoreConfig] = None,
additional_packages: Optional[List[str]] = None,
additional_configs: Optional[Dict[str, str]] = None,
@@ -86,8 +85,8 @@ def __init__(
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
.config("spark.jars.ivy", os.path.join(tempfile.gettempdir(), ".ivy2"))
.config("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "CORRECTED")
.config("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
.config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
.config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
)

if hive_metastore_config:
@@ -152,7 +151,6 @@ def configure_for_k8s(
.config("spark.kubernetes.driver.pod.name", spark_config.driver_name or os.getenv("SPARK_DRIVER_NAME"))
.config("spark.app.name", spark_config.application_name)
.config("spark.kubernetes.executor.podNamePrefix", executor_name)
.config("spark.kubernetes.executor.limit.cores", 1)
.config("spark.driver.host", spark_config.driver_ip or os.getenv("SPARK_DRIVER_IP"))
.config("spark.kubernetes.namespace", spark_config.k8s_namespace)
.config("spark.kubernetes.container.image", spark_config.spark_image)
@@ -171,10 +169,6 @@
image=spark_config.spark_image,
ports=[V1ContainerPort(name="http", container_port=8080, protocol="TCP")],
env=[V1EnvVar(name="SPARK_WORKER_WEBUI_PORT", value="8080")],
-resources=V1ResourceRequirements(
-    limits={"cpu": 1, "memory": f"{spark_config.default_executor_memory}Mi"},
-    requests={"cpu": 1, "memory": f"{spark_config.default_executor_memory}Mi"},
-),
)
],
restart_policy="Never",
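For reference, a minimal usage sketch under the new defaults above. The import path and constructor keyword arguments match this diff; the get_session() accessor name and the extra package are assumptions not shown here.

# Sketch only: constructor arguments taken from the diff, accessor name assumed.
from spark_utils.common.spark_session_provider import SparkSessionProvider

provider = SparkSessionProvider(
    # New default matching delta-spark ~2.4.0 (Spark 3.4 line); 2.1.0 was the old value.
    delta_lake_version="2.12:2.4.0",
    # Hypothetical extra Maven coordinate, just to illustrate the parameter.
    additional_packages=["org.apache.hadoop:hadoop-azure:3.3.4"],
)

# Hypothetical accessor; the provider builds the session with the non-legacy
# spark.sql.parquet.*RebaseModeInWrite settings shown in the diff.
spark = provider.get_session()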
2 changes: 1 addition & 1 deletion spark_utils/common/spark_udf.py
@@ -33,5 +33,5 @@ def getfromstr_uuid(input_str):
:param input_str: Any string value
:return: UUID that only matches provided input
"""
null_namespace = type("", (), dict(bytes=b""))()
null_namespace = type("", (), {"bytes": b""})()
return str(uuid.uuid3(null_namespace, input_str))
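The rewritten line builds an anonymous object whose only attribute is bytes, which is all uuid.uuid3 reads from its namespace argument; a standalone sketch of the same trick:

import uuid

# Anonymous object mimicking a UUID namespace with empty bytes,
# so the resulting UUID depends solely on the input string.
null_namespace = type("", (), {"bytes": b""})()

# Deterministic: the same string always maps to the same UUID.
assert uuid.uuid3(null_namespace, "order-123") == uuid.uuid3(null_namespace, "order-123")
print(uuid.uuid3(null_namespace, "order-123"))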
2 changes: 1 addition & 1 deletion test/test_delta_log.py
@@ -27,4 +27,4 @@ def test_cache_clear(spark_session: SparkSession, test_base_path: str):
_ = DeltaLog.for_table(spark_session, f"file:///{test_base_path}/delta_log/table2")
cleared_cache_read = time.monotonic_ns() - start

-assert non_cached_read / cached_read > 100 and non_cached_read / cleared_cache_read > 2
+assert non_cached_read / cached_read > 10 and non_cached_read / cleared_cache_read < 2
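The relaxed bound keeps the same timing-ratio pattern but only requires a 10x cache speedup, which is less sensitive to CI timing noise; a minimal sketch of that pattern outside Spark, with expensive_read as a hypothetical stand-in for DeltaLog.for_table:

import time

_cache = {}

def expensive_read(key: str) -> str:
    # Simulate a slow, uncached read (e.g. parsing a Delta transaction log),
    # then serve subsequent calls from an in-memory cache.
    if key not in _cache:
        time.sleep(0.05)
        _cache[key] = f"snapshot-of-{key}"
    return _cache[key]

start = time.monotonic_ns()
expensive_read("table1")
non_cached_read = time.monotonic_ns() - start

start = time.monotonic_ns()
expensive_read("table1")
cached_read = time.monotonic_ns() - start

# Only a 10x speedup is required, mirroring the relaxed assertion above.
assert non_cached_read / cached_read > 10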

