Problem
You want to get the full size of a Delta table or partition on storage, rather than the size of only the current snapshot.
Note
For instructions on getting the size of a table’s current snapshot, refer to the KB article Find the size of a table snapshot.
Cause
You want to understand the total size of the stored data to better evaluate cost management, data lifecycle management, and table optimization choices.
Solution
Use the following Scala command. To get the size of the table, pass the root path. To get the size of a specific partition, specify the path to the partition.
%scala
/**
 * Computes the combined size of every file found beneath `pathToTable`.
 *
 * The directory tree is walked with `dbutils.fs.ls`; each non-directory entry
 * contributes its `size` to the total. The total is divided by 1024 * 1024 to
 * express it in MB, printed with two decimal places, and returned.
 *
 * @param pathToTable root path of the table, or the path of a single partition
 * @return the summed file size in MB
 */
def findSizes(pathToTable: String): Double = {
  // Depth-first walk: keep plain files, descend into anything flagged as a directory.
  def collectFiles(dir: String): Seq[com.databricks.backend.daemon.dbutils.FileInfo] =
    dbutils.fs.ls(dir).flatMap { entry =>
      if (!entry.isDir) Seq(entry)
      else collectFiles(entry.path)
    }

  // Divisor used to turn a byte count into MB.
  val bytesPerMB = 1024.0 * 1024.0

  // Add up the size of every file discovered under the starting path.
  val totalBytes = collectFiles(pathToTable).map(_.size).sum
  val sizeInMB = totalBytes / bytesPerMB

  // Report the result to the notebook output and hand it back to the caller.
  println(f"Size of the table is $sizeInMB%.2f MB")
  sizeInMB
}
// findSizes already prints a formatted "Size of the table is ... MB" line, so the
// returned value is captured here instead of being echoed again with print, which
// emits no trailing newline and would run the raw number into the next output line.
val tableSizeInMB = findSizes("dbfs:/<path-to-delta-table>") // Pass the root path to get the size of the entire table.
val partitionSizeInMB = findSizes("dbfs:/<path-to-delta-table/partition>") // Pass the path to the partition to get the size of that specific partition.