Skip to content

Commit

Permalink
Feat/add humanizeBytes and showDeltaFileSizes functions (#82)
Browse files Browse the repository at this point in the history
* feat: add humanize_bytes function (MrPowers#42)

* feat: add showDeltaFileSizes function (MrPowers#43)

* fix: reuse deltaFileSizes logic and use loweCamelCase convention
  • Loading branch information
sebastian2296 authored Jan 5, 2024
1 parent d265948 commit 658e220
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 1 deletion.
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,32 @@ Map("size_in_bytes" -> 1320,
"number_of_files" -> 2,
"average_file_size_in_bytes" -> 660)
```
## Show Delta File Sizes

The function `showDeltaFileSizes` prints the total size, the average file size and the number of files of a Delta table in a human-readable format.

Suppose you have the following table, partitioned by `col1`:

```
+----+----+----+
|col1|col2|col3|
+----+----+----+
| 1| A| A|
| 2| A| B|
+----+----+----+
```

Running `DeltaHelpers.showDeltaFileSizes` on this table prints the following to the console:

`"The delta table contains 2 files with a size of 1.32 kB. The average file size is 660 B"`

## Humanize Bytes

The function `humanizeBytes` formats a number of bytes (passed as a `Double`) into a human-readable string.
```
DeltaHelpers.humanizeBytes(1234567890) // "1.23 GB"
DeltaHelpers.humanizeBytes(1234567890000L) // "1.23 TB"
```

## Delta Table File Size Distribution
The function `deltaFileSizeDistributionInMB` returns a `DataFrame` that contains the following stats in megabytes about file sizes in a Delta Table:
Expand Down
21 changes: 20 additions & 1 deletion src/main/scala/mrpowers/jodie/DeltaHelpers.scala
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ object DeltaHelpers {
}
}

def deltaFileSizes(deltaTable: DeltaTable) = {
def deltaFileSizes(deltaTable: DeltaTable): Map[String, Double] = {
val details: Row = deltaTable.detail().select("numFiles", "sizeInBytes").collect()(0)
val (sizeInBytes, numberOfFiles) =
(details.getAs[Long]("sizeInBytes"), details.getAs[Long]("numFiles"))
Expand Down Expand Up @@ -527,4 +527,23 @@ object DeltaHelpers {
duplicateRecords.isEmpty
}

/**
 * Formats a byte count as a human-readable string using decimal (SI) units.
 *
 * The largest unit among PB, TB, GB, MB and kB is used as soon as `n` reaches
 * 90% of that unit (e.g. 950 bytes is rendered as "0.95 kB"); the value is then
 * printed with two decimals. Anything smaller is printed as whole bytes ("660 B").
 *
 * The output always uses "." as the decimal separator, independent of the JVM
 * default locale.
 *
 * @param n number of bytes
 * @return the formatted size, e.g. "1.23 GB"
 */
def humanizeBytes(n: Double): String = {
  // Ordered from largest to smallest so the first match is the largest applicable unit.
  val units: Seq[(String, Double)] =
    Seq(("PB", 1e15), ("TB", 1e12), ("GB", 1e9), ("MB", 1e6), ("kB", 1e3))
  // The f-interpolator formats with the default locale, which would produce
  // "1,23 GB" on e.g. a German JVM; Locale.ROOT keeps the output stable.
  val locale = java.util.Locale.ROOT

  units
    .collectFirst {
      case (unit, size) if n >= size * 0.9 => "%.2f %s".formatLocal(locale, n / size, unit)
    }
    .getOrElse("%.0f B".formatLocal(locale, n))
}

/**
 * Prints a one-line, human-readable summary of a Delta table's file statistics
 * (number of files, total size and average file size) to the console.
 *
 * The raw numbers come from `deltaFileSizes`; the two sizes are formatted with
 * `humanizeBytes`.
 *
 * @param deltaTable the Delta table to describe
 */
def showDeltaFileSizes(deltaTable: DeltaTable): Unit = {
  val rawFileSizes = deltaFileSizes(deltaTable)

  // deltaFileSizes reports every statistic as a Double; go through Long so the
  // file count is printed as "2" rather than "2.0".
  val humanizedNumberOfFiles = rawFileSizes("number_of_files").toLong.toString
  val humanizedSizeInBytes = humanizeBytes(rawFileSizes("size_in_bytes"))
  val humanizedAverageFileSize = humanizeBytes(rawFileSizes("average_file_size_in_bytes"))

  println(
    s"The delta table contains $humanizedNumberOfFiles files with a size of $humanizedSizeInBytes." +
      s" The average file size is $humanizedAverageFileSize"
  )
}
}
32 changes: 32 additions & 0 deletions src/test/scala/mrpowers/jodie/DeltaHelperSpec.scala
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,38 @@ class DeltaHelperSpec
actual("number_of_files") should equal(0)
actual("average_file_size_in_bytes") should equal(0)
}

// Checks that the statistics returned by deltaFileSizes can be rendered in a
// human-readable form: sizes through humanizeBytes, the file count as a whole number.
it("should match delta file sizes in a human readable fashion") {
  val path = (os.pwd / "tmp" / "delta-table").toString()
  createBaseDeltaTable(path, rows)
  val deltaTable = DeltaTable.forPath(path)
  val fileSizes = DeltaHelpers.deltaFileSizes(deltaTable)

  val actual: Map[String, String] = fileSizes.map {
    // Matches both "size_in_bytes" and "average_file_size_in_bytes".
    case (key, value) if key.contains("size_in_bytes") =>
      s"humanized_$key" -> DeltaHelpers.humanizeBytes(value)
    // Remaining entry is the file count; it is a Double, so drop the ".0".
    case (key, value) =>
      s"humanized_$key" -> value.toLong.toString
  }

  // Expected values are the ones the original (non-asserting) comparisons used.
  actual("humanized_size_in_bytes") should equal("1.09 kB")
  actual("humanized_number_of_files") should equal("1")
  actual("humanized_average_file_size_in_bytes") should equal("1.09 kB")
}

// Verifies that showDeltaFileSizes actually prints its summary sentence,
// instead of only checking that the call does not throw.
it("should display delta file sizes in a human readable fashion") {
  val path = (os.pwd / "tmp" / "delta-table").toString()
  createBaseDeltaTable(path, rows)
  val deltaTable = DeltaTable.forPath(path)

  // println writes to Console.out, so redirect it for the duration of the call.
  val captured = new java.io.ByteArrayOutputStream()
  Console.withOut(captured) {
    DeltaHelpers.showDeltaFileSizes(deltaTable)
  }
  val printed = captured.toString

  printed should include("The delta table contains ")
  printed should include(" files with a size of ")
  printed should include("The average file size is ")
}
}
describe("remove duplicate records from delta table") {
it("should remove duplicates successful") {
Expand Down

0 comments on commit 658e220

Please sign in to comment.