Spark 3.4 (#113)
george-zubrienko authored May 30, 2023
1 parent d22e286 commit 63e52f9
Showing 8 changed files with 395 additions and 339 deletions.
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -4,4 +4,4 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence, these owners will be
# requested for review when someone opens a pull request.
-* @jrbentzon @george-zubrienko
+* @SneaksAndData/platform-engineering
6 changes: 3 additions & 3 deletions .github/workflows/build.yaml
@@ -13,10 +13,10 @@ jobs:
if: ${{ github.ref != 'refs/heads/main' }}

container:
-image: esdcrproduction.azurecr.io/spark:v3.1.1-bitnami-a0a12a1d-python-3.9.15
+image: esdcrdevelopment.azurecr.io/spark:v3.1.1-5-g28008d0-bitnami-c31d3b46-python-3.9.16
credentials:
-username: ${{ secrets.AZCR_PROD_USER }}
-password: ${{ secrets.AZCR_PROD_TOKEN }}
+username: ${{ secrets.AZCR_DEV_USER }}
+password: ${{ secrets.AZCR_DEV_TOKEN }}
options: -u root -w /opt/bitnami/spark --mount type=tmpfs,destination=/home/spark

steps:
2 changes: 1 addition & 1 deletion .github/workflows/prepare_release.yaml
@@ -16,4 +16,4 @@ jobs:
uses: SneaksAndData/github-actions/[email protected]
with:
major_v: 1
-minor_v: 0
+minor_v: 1
702 changes: 382 additions & 320 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions pyproject.toml
@@ -9,10 +9,10 @@ readme = "README.md"
repository = 'https://github.com/SneaksAndData/spark-utils'

[tool.poetry.dependencies]
python = "^3.9"
hadoop-fs-wrapper = "~0.5.2"
python = ">=3.9, <3.12"
hadoop-fs-wrapper = "~0.6.0"
cryptography = "~36.0"
delta-spark = "~2.1.1"
delta-spark = "~2.4.0"

kubernetes = { version = "24.2.0", optional = true }

12 changes: 3 additions & 9 deletions spark_utils/common/spark_session_provider.py
@@ -37,7 +37,6 @@
V1Container,
V1ContainerPort,
V1EnvVar,
-V1ResourceRequirements,
V1PodSecurityContext,
V1NodeAffinity,
V1NodeSelector,
@@ -62,7 +61,7 @@ class SparkSessionProvider:
def __init__(
self,
*,
delta_lake_version="2.12:2.1.0",
delta_lake_version="2.12:2.4.0",
hive_metastore_config: Optional[HiveMetastoreConfig] = None,
additional_packages: Optional[List[str]] = None,
additional_configs: Optional[Dict[str, str]] = None,
@@ -86,8 +85,8 @@ def __init__(
.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
.config("spark.jars.ivy", os.path.join(tempfile.gettempdir(), ".ivy2"))
.config("spark.sql.legacy.parquet.datetimeRebaseModeInWrite", "CORRECTED")
.config("spark.sql.legacy.parquet.int96RebaseModeInWrite", "CORRECTED")
.config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
.config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
)

if hive_metastore_config:
@@ -152,7 +151,6 @@ def configure_for_k8s(
.config("spark.kubernetes.driver.pod.name", spark_config.driver_name or os.getenv("SPARK_DRIVER_NAME"))
.config("spark.app.name", spark_config.application_name)
.config("spark.kubernetes.executor.podNamePrefix", executor_name)
.config("spark.kubernetes.executor.limit.cores", 1)
.config("spark.driver.host", spark_config.driver_ip or os.getenv("SPARK_DRIVER_IP"))
.config("spark.kubernetes.namespace", spark_config.k8s_namespace)
.config("spark.kubernetes.container.image", spark_config.spark_image)
@@ -171,10 +169,6 @@
image=spark_config.spark_image,
ports=[V1ContainerPort(name="http", container_port=8080, protocol="TCP")],
env=[V1EnvVar(name="SPARK_WORKER_WEBUI_PORT", value="8080")],
-resources=V1ResourceRequirements(
-    limits={"cpu": 1, "memory": f"{spark_config.default_executor_memory}Mi"},
-    requests={"cpu": 1, "memory": f"{spark_config.default_executor_memory}Mi"},
-),
)
],
restart_policy="Never",
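For reference, a minimal usage sketch under the new defaults above. The import path and constructor keyword arguments match this diff; the get_session() accessor name and the extra package are assumptions not shown here.

# Sketch only: constructor arguments taken from the diff, accessor name assumed.
from spark_utils.common.spark_session_provider import SparkSessionProvider

provider = SparkSessionProvider(
    # New default matching delta-spark ~2.4.0 (Spark 3.4 line); 2.1.0 was the old value.
    delta_lake_version="2.12:2.4.0",
    # Hypothetical extra Maven coordinate, just to illustrate the parameter.
    additional_packages=["org.apache.hadoop:hadoop-azure:3.3.4"],
)

# Hypothetical accessor; the provider builds the session with the non-legacy
# spark.sql.parquet.*RebaseModeInWrite settings shown in the diff.
spark = provider.get_session()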
2 changes: 1 addition & 1 deletion spark_utils/common/spark_udf.py
@@ -33,5 +33,5 @@ def getfromstr_uuid(input_str):
:param input_str: Any string value
:return: UUID that only matches provided input
"""
null_namespace = type("", (), dict(bytes=b""))()
null_namespace = type("", (), {"bytes": b""})()
return str(uuid.uuid3(null_namespace, input_str))
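The rewritten line builds an anonymous object whose only attribute is bytes, which is all uuid.uuid3 reads from its namespace argument; a standalone sketch of the same trick:

import uuid

# Anonymous object mimicking a UUID namespace with empty bytes,
# so the resulting UUID depends solely on the input string.
null_namespace = type("", (), {"bytes": b""})()

# Deterministic: the same string always maps to the same UUID.
assert uuid.uuid3(null_namespace, "order-123") == uuid.uuid3(null_namespace, "order-123")
print(uuid.uuid3(null_namespace, "order-123"))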
2 changes: 1 addition & 1 deletion test/test_delta_log.py
@@ -27,4 +27,4 @@ def test_cache_clear(spark_session: SparkSession, test_base_path: str):
_ = DeltaLog.for_table(spark_session, f"file:///{test_base_path}/delta_log/table2")
cleared_cache_read = time.monotonic_ns() - start

-assert non_cached_read / cached_read > 100 and non_cached_read / cleared_cache_read > 2
+assert non_cached_read / cached_read > 10 and non_cached_read / cleared_cache_read < 2
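The relaxed bound keeps the same timing-ratio pattern but only requires a 10x cache speedup, which is less sensitive to CI timing noise; a minimal sketch of that pattern outside Spark, with expensive_read as a hypothetical stand-in for DeltaLog.for_table:

import time

_cache = {}

def expensive_read(key: str) -> str:
    # Simulate a slow, uncached read (e.g. parsing a Delta transaction log),
    # then serve subsequent calls from an in-memory cache.
    if key not in _cache:
        time.sleep(0.05)
        _cache[key] = f"snapshot-of-{key}"
    return _cache[key]

start = time.monotonic_ns()
expensive_read("table1")
non_cached_read = time.monotonic_ns() - start

start = time.monotonic_ns()
expensive_read("table1")
cached_read = time.monotonic_ns() - start

# Only a 10x speedup is required, mirroring the relaxed assertion above.
assert non_cached_read / cached_read > 10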

