From 658e22031e044efff297833dac45e6181b0bcebf Mon Sep 17 00:00:00 2001 From: Sebastian Espinosa <40347293+sebastian2296@users.noreply.github.com> Date: Fri, 5 Jan 2024 07:21:50 -0500 Subject: [PATCH] Feat/add humanizeBytes and showDeltaFileSizes functions (#82) * feat: add humanize_bytes function (MrPowers#42) * feat: add showDeltaFileSizes function (MrPowers#43) * fix: reuse deltaFileSizes logic and use loweCamelCase convention --- README.md | 26 +++++++++++++++ .../scala/mrpowers/jodie/DeltaHelpers.scala | 21 +++++++++++- .../mrpowers/jodie/DeltaHelperSpec.scala | 32 +++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index afe3935..0893b44 100644 --- a/README.md +++ b/README.md @@ -483,6 +483,32 @@ Map("size_in_bytes" -> 1320, "number_of_files" -> 2, "average_file_size_in_bytes" -> 660) ``` +## Show Delta File Sizes + +The function `showDeltaFileSizes` displays the total size, the average file size and the number of files of a Delta table in a human-readable format. + +Suppose you have the following table, partitioned by `col1`: + +``` ++----+----+----+ +|col1|col2|col3| ++----+----+----+ +| 1| A| A| +| 2| A| B| ++----+----+----+ +``` + +Running `DeltaHelpers.showDeltaFileSizes` prints the following to the console: + +`"The delta table contains 2 files with a size of 1.32 kB. The average file size is 660 B"` + +## Humanize Bytes + +The function `humanizeBytes` formats a number representing a count of bytes into a human-readable string.
+``` +DeltaHelpers.humanizeBytes(1234567890) // "1.23 GB" +DeltaHelpers.humanizeBytes(1234567890000L) // "1.23 TB" +``` ## Delta Table File Size Distribution The function `deltaFileSizeDistributionInMB` returns a `DataFrame` that contains the following stats in megabytes about file sizes in a Delta Table: diff --git a/src/main/scala/mrpowers/jodie/DeltaHelpers.scala b/src/main/scala/mrpowers/jodie/DeltaHelpers.scala index 6c4e06d..faed7e3 100644 --- a/src/main/scala/mrpowers/jodie/DeltaHelpers.scala +++ b/src/main/scala/mrpowers/jodie/DeltaHelpers.scala @@ -218,7 +218,7 @@ object DeltaHelpers { } } - def deltaFileSizes(deltaTable: DeltaTable) = { + def deltaFileSizes(deltaTable: DeltaTable): Map[String, Double] = { val details: Row = deltaTable.detail().select("numFiles", "sizeInBytes").collect()(0) val (sizeInBytes, numberOfFiles) = (details.getAs[Long]("sizeInBytes"), details.getAs[Long]("numFiles")) @@ -527,4 +527,23 @@ object DeltaHelpers { duplicateRecords.isEmpty } + /** Formats the byte count `n` with a decimal (power-of-1000) unit and two decimals, e.g. 1234567890 -> "1.23 GB"; values below 900 are rounded to whole bytes ("660 B"). */ + def humanizeBytes(n: Double): String = { + val units: Seq[(String, Double)] = Seq(("PB", 1e15), ("TB", 1e12), ("GB", 1e9), ("MB", 1e6), ("kB", 1e3)) + // A unit applies once n reaches 90% of it (950000 -> "0.95 MB"); Locale.ROOT keeps "." as the decimal separator on every JVM. + units.collectFirst { + case (suffix, k) if n >= k * 0.9 => "%.2f %s".formatLocal(java.util.Locale.ROOT, n / k, suffix) + }.getOrElse("%.0f B".formatLocal(java.util.Locale.ROOT, n)) + } + + /** Prints the number of files, the total size and the average file size of `deltaTable` to stdout in human-readable form. */ + def showDeltaFileSizes(deltaTable: DeltaTable): Unit = { + val rawFileSizes = deltaFileSizes(deltaTable) + // deltaFileSizes reports every metric as a Double; render the file count as a whole number ("2", not "2.0"). + val humanizedNumberOfFiles = rawFileSizes("number_of_files").toLong.toString + val humanizedSizeInBytes = humanizeBytes(rawFileSizes("size_in_bytes")) + val humanizedAverageFileSize = humanizeBytes(rawFileSizes("average_file_size_in_bytes")) + println( s"The delta table contains ${humanizedNumberOfFiles} files with a size of ${humanizedSizeInBytes}." 
+ + s" The average file size is ${humanizedAverageFileSize}") + } } diff --git a/src/test/scala/mrpowers/jodie/DeltaHelperSpec.scala b/src/test/scala/mrpowers/jodie/DeltaHelperSpec.scala index 8c2d69b..aff4ef2 100644 --- a/src/test/scala/mrpowers/jodie/DeltaHelperSpec.scala +++ b/src/test/scala/mrpowers/jodie/DeltaHelperSpec.scala @@ -52,6 +52,38 @@ class DeltaHelperSpec actual("number_of_files") should equal(0) actual("average_file_size_in_bytes") should equal(0) } + + it("should match delta file sizes in a human readable fashion") { + val path = (os.pwd / "tmp" / "delta-table").toString() + createBaseDeltaTable(path, rows) + val deltaTable = DeltaTable.forPath(path) + val fileSizes = DeltaHelpers.deltaFileSizes(deltaTable) + // deltaFileSizes reports every metric as a Double: byte-valued entries are humanized, + // while the file count is rendered as a whole number. + val humanize: ((String, Double)) => (String, String) = { + case (key, value) if key contains "size_in_bytes" => + val humanizedVal = DeltaHelpers.humanizeBytes(value) + val humanizedKey = "humanized_" + key + humanizedKey -> humanizedVal + case (key, value) => + val humanizedKey = "humanized_" + key + humanizedKey -> value.toLong.toString + } + val actual = fileSizes.map(humanize) + + // Use matchers: a bare `==` expression is evaluated and discarded, so it can never fail the test. + actual("humanized_size_in_bytes") should equal("1.09 kB") + actual("humanized_number_of_files") should equal("1") + actual("humanized_average_file_size_in_bytes") should equal("1.09 kB") + } + + it("should display delta file sizes in a human readable fashion") { + val path = (os.pwd / "tmp" / "delta-table").toString() + createBaseDeltaTable(path, rows) + val deltaTable = DeltaTable.forPath(path) + // Smoke test only: the summary is written to stdout, so this just checks that the call completes. + DeltaHelpers.showDeltaFileSizes(deltaTable) + } } describe("remove duplicate records from delta table") { it("should remove duplicates successful") {