
Commit

Built site for gh-pages
Quarto GHA Workflow Runner committed Sep 5, 2024
1 parent 28d8c89 commit cdbcec9
Showing 26 changed files with 3,947 additions and 65 deletions.
2 changes: 1 addition & 1 deletion .nojekyll
@@ -1 +1 @@
b0d584ea
ab06fa03
61 changes: 61 additions & 0 deletions _tex/bibliography.bib
@@ -61,3 +61,64 @@ @misc{LopezPangeoShowcase2024
note = {Accessed 2024-08-08}
}

@misc{Mozilla-latency-2024,
month = {5},
title = {Understanding latency},
author = {Mozilla MDN},
year = {2024},
url = {https://developer.mozilla.org/en-US/docs/Web/Performance/Understanding_latency},
}

@misc{h5cloud2023,
author = {ICESAT-2 HackWeek, H5Cloud Contributors},
title = {h5cloud: Tools for cloud-based analysis of HDF5 data},
year = {2023},
url = {https://github.com/ICESAT-2HackWeek/h5cloud},
note = {GitHub repository},
version = {v1.0.0},
}

@misc{scott-2020,
author = {Scott, Colin},
title = {Numbers every programmer should know},
year = {2020},
url = {https://colin-scott.github.io/personal_website/research/interactive_latency.html},
}

@software{The_HDF_Group_Hierarchical_Data_Format,
author = {{The HDF Group}},
title = {{Hierarchical Data Format, version 5}},
url = {https://github.com/HDFGroup/hdf5}
}

@ARTICLE{Hoyer2017-su,
title = "xarray: {N-D} labeled Arrays and Datasets in Python",
author = "Hoyer, Stephan and Hamman, Joe",
abstract = "xarray is an open source project and Python package that
provides a toolkit and data structures for N-dimensional labeled
arrays. Our approach combines an application programing
interface (API) inspired by pandas with the Common Data Model
for self-described scientific data. Key features of the xarray
package include label-based indexing and arithmetic,
interoperability with the core scientific Python packages (e.g.,
pandas, NumPy, Matplotlib), out-of-core computation on datasets
that don't fit into memory, a wide range of serialization and
input/output (I/O) options, and advanced multi-dimensional data
manipulation tools such as group-by and resampling. xarray, as a
data model and analytics toolkit, has been widely adopted in the
geoscience community but is also used more broadly for
multi-dimensional data analysis in physics, machine learning and
finance.",
journal = "J. Open Res. Softw.",
publisher = "Ubiquity Press, Ltd.",
volume = 5,
number = 1,
pages = "10",
month = apr,
year = 2017,
copyright = "http://creativecommons.org/licenses/by/4.0",
language = "en"
}



Binary file modified _tex/figures/figure-2.png
Binary file modified _tex/figures/figure-3.png
Binary file added _tex/figures/figure-5.png
206 changes: 168 additions & 38 deletions _tex/paper.tex
@@ -299,21 +299,36 @@ \section{Problem}\label{problem}
\subsubsection{\texorpdfstring{\textbf{Metadata
fragmentation}}{Metadata fragmentation}}\label{metadata-fragmentation}

By default, file-level metadata associated with a dataset is stored in
chunks of 4kb. This produces a lot of fragmentation across the file
especially for data with many variables and nested groups.
When working with large datasets, especially those that include numerous
variables and nested groups, the storage of file-level metadata can
become a challenge. By default, metadata associated with each dataset is
stored in chunks of 4 kilobytes (KB). This chunking mechanism was
originally intended to optimize storage efficiency and access speed for
the disk hardware available more than 20 years ago. In datasets with many
variables or complex hierarchical structures, these 4KB chunks can lead
to significant fragmentation.

Fragmentation occurs when this metadata is spread out across multiple
non-contiguous chunks within the file. This results in inefficiencies
when accessing or modifying data because compatible libraries need to
read from multiple, scattered locations in the file. Over time, as the
dataset grows and evolves, this fragmentation can compound, leading to
degraded performance and increased storage overhead. In particular,
operations that involve reading or writing metadata, such as opening a
file, checking attributes, or modifying variables, can become slower and
more resource-intensive.
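
To make the cost of this fragmentation concrete, the following minimal
sketch (the file name and the counting wrapper are hypothetical, and it
assumes \texttt{h5py} is installed) counts how many separate reads the
HDF5 library issues just to traverse a file's metadata; on a fragmented
file the count grows with the number of groups and variables.

\begin{verbatim}
import h5py

class CountingReader:
    """File-like wrapper that counts the reads h5py issues."""
    def __init__(self, path):
        self._f = open(path, "rb")
        self.read_count = 0
    def read(self, size=-1):
        self.read_count += 1
        return self._f.read(size)
    def readinto(self, buf):
        self.read_count += 1
        return self._f.readinto(buf)
    def __getattr__(self, name):
        # Delegate seek(), tell(), etc. to the wrapped file.
        return getattr(self._f, name)

reader = CountingReader("ATL03_example.h5")  # hypothetical granule
with h5py.File(reader, "r") as f:
    f.visit(lambda name: None)  # touch every group and dataset
print(reader.read_count, "reads to traverse the metadata")
\end{verbatim}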

\subsubsection{\texorpdfstring{\textbf{Global API
Lock}}{Global API Lock}}\label{global-api-lock}

Because of the historical complexity of operations with the HDF5 format,
there has been a necessity to make the library thread-safe and similarly
to what happens in the Python language, the simplest mechanism to
implement this is to have a global API lock. This global lock is not as
big of an issue when we read data from local disk but it becomes a major
bottleneck when we read data over the network because each read is
sequential and latency in the cloud is exponentially bigger than local
access.
Because of the historical complexity of operations with the HDF5 format
(The HDF Group, n.d.), the library had to be made thread-safe, and, much
as in the Python interpreter, the simplest mechanism to implement this is
a global API lock. This global lock is not a major issue when we read
data from local disk, but it becomes a major bottleneck when we read data
over the network, because each read is sequential and latency in the
cloud is orders of magnitude higher than for local access (Mozilla MDN,
2024; Scott, 2020).
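
A minimal sketch of the effect (hypothetical file name and ATL03-style
group paths, assuming \texttt{h5py} is installed): even when reads are
issued from several threads, the global lock serializes every call into
the HDF5 library, so the requests do not overlap.

\begin{verbatim}
import time
from concurrent.futures import ThreadPoolExecutor
import h5py

BEAMS = ["gt1l", "gt1r", "gt2l", "gt2r", "gt3l", "gt3r"]

def read_heights(f, beam):
    # Each call must acquire h5py's global lock around the C API.
    return f[beam + "/heights/h_ph"][:]

with h5py.File("ATL03_example.h5", "r") as f:  # hypothetical granule
    t0 = time.perf_counter()
    with ThreadPoolExecutor(max_workers=len(BEAMS)) as pool:
        heights = list(pool.map(lambda b: read_heights(f, b), BEAMS))
    print("elapsed seconds:", round(time.perf_counter() - t0, 2))
\end{verbatim}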

\begin{center}\rule{0.5\linewidth}{0.5pt}\end{center}

@@ -332,43 +347,44 @@ \subsubsection{\texorpdfstring{\textbf{Global API

\end{figure*}%

\subsubsection{Background and data
selection}\label{background-and-data-selection}
\subsubsection{\texorpdfstring{\textbf{Background and data
selection}}{Background and data selection}}\label{background-and-data-selection}

As a result of community feedback and ``hack weeks'' organized by NSIDC
and UW eScience Institute in 2023, NSIDC started the Cloud Optimized
Format Investigation (COFI) project to improve access to HDF5 from the
ICESat-2 mission. A spaceborne lidar that retrieves surface topography
of the Earth's ice sheets, land and (oceans Neumann et al., 2019).
Because of its complexity, large size and importance for cryospheric
studies we targeted the ATL03 dataset. ATL03 core data are geolocated
photon heights from the ICESat-2 ATLAS instrument. Each file contains
1003 geophysical variables in 6 data groups. Although our research was
focused on this dataset, most of our findings are applicable to any
dataset stored in HDF5 and NetCDF4.
and UW eScience Institute in 2023 (ICESAT-2 HackWeek, 2023), NSIDC
started the Cloud Optimized Format Investigation (COFI) project to
improve access to HDF5 from the ICESat-2 mission, a spaceborne lidar
that retrieves surface topography of the Earth's ice sheets, land and
oceans (Neumann et al., 2019). Because of its complexity, large size and
importance for cryospheric studies we targeted the ATL03 dataset. ATL03
core data are geolocated photon heights from the ICESat-2 ATLAS
instrument. Each file contains 1003 geophysical variables in 6 data
groups. Although our research focused on this dataset, most of our
findings apply to any dataset stored in HDF5 or NetCDF4.

\section{Methodology}\label{methodology}

We tested access times to original and cloud-optimized small (1 GB),
medium (2 GB) and large (7 GB) HDF5 ATL03 files {[}list files tested{]}
stored in AWS S3 buckets in region us-west-2, the region hosting NASA's
Earthdata Cloud archives. Files were accessed using Python tools
commonly used by Earth scientists: h5py and Xarray. h5py is a Python
wrapper around the HDF5 C API. xarray\footnote{\texttt{h5py} is a
We tested access times to original and different configurations of
cloud-optimized HDF5 ATL03 files {[}list files tested{]} stored in AWS
S3 buckets in region us-west-2, the region hosting NASA's Earthdata
Cloud archives. Files were accessed using Python tools commonly used by
Earth scientists: h5py and Xarray (Hoyer \& Hamman, 2017). h5py is a
Python wrapper around the HDF5 C API. Xarray\footnote{\texttt{h5py} is a
dependency of Xarray} is a widely used Python package for working with
n-dimensional data. We also tested access times using h5coro, a Python
package optimized for reading HDF5 files from S3 buckets, and kerchunk,
a tool that creates an efficient lookup table for file chunks to allow
performant partial reads of files.
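
As an illustration of how these tools are combined (a sketch only: the
bucket and object names are hypothetical, and it assumes
\texttt{s3fs}/\texttt{fsspec}, \texttt{h5py}, \texttt{xarray} with the
\texttt{h5netcdf} engine, and \texttt{kerchunk} are installed):

\begin{verbatim}
import fsspec
import h5py
import xarray as xr
from kerchunk.hdf import SingleHdf5ToZarr

url = "s3://example-bucket/ATL03_example.h5"  # hypothetical granule
fs = fsspec.filesystem("s3", anon=False)

# h5py accepts any file-like object, so it can read over S3
# through an fsspec file.
with fs.open(url, mode="rb") as remote_file:
    with h5py.File(remote_file, mode="r") as h5:
        photon_heights = h5["gt1l/heights/h_ph"][:1000]

# Xarray reads the same object through the h5netcdf engine.
with fs.open(url, mode="rb") as remote_file:
    ds = xr.open_dataset(remote_file, group="gt1l/heights",
                         engine="h5netcdf")

# kerchunk builds a chunk-level lookup table ("sidecar") that
# later reads can reuse for partial access.
with fs.open(url, mode="rb") as remote_file:
    references = SingleHdf5ToZarr(remote_file, url).translate()
\end{verbatim}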

HDF5 ATL03 files were originally cloud optimized by ``repacking'' them,
The test files were originally cloud optimized by ``repacking'' them,
using a relatively new feature in the HDF5 C API called ``paged
aggregation''. Page aggregation does 2 things: it collects file-level
metadata from datasets and stores it on dedicated metadata blocks in the
file; and it forces the library to write data using fixed-size blocks.
Aggregation allows client libraries to read file metadata with only a
few requests and uses the page size used in the aggregation as the
minimal request size, overriding the 1 request per chunk behavior.
file; and it forces the library to write both data and metadata using
these fixed-size pages. Aggregation allows client libraries to read file
metadata with only a few requests and makes the aggregation page size the
minimal request size, overriding the default one-request-per-chunk
behavior.
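
The repacking itself can be reproduced with standard HDF5 tooling. The
sketch below (hypothetical file names and an 8 MiB page size; it assumes
the \texttt{h5repack} utility from HDF5 1.10.1 or later is on the
\texttt{PATH} and that a recent \texttt{h5py} exposes the file-space
creation options) shows the two routes: repacking an existing file and
creating a new file with paged aggregation enabled.

\begin{verbatim}
import subprocess
import h5py

PAGE = 8 * 1024 * 1024  # 8 MiB aggregation page size

# Route 1: repack an existing file with the "PAGE" file-space
# strategy and the chosen page size.
subprocess.run(
    ["h5repack", "-S", "PAGE", "-G", str(PAGE),
     "ATL03_original.h5", "ATL03_paged_8mb.h5"],  # hypothetical
    check=True,
)

# Route 2: write a new file with the same settings from h5py.
with h5py.File("new_paged_file.h5", "w",
               fs_strategy="page", fs_page_size=PAGE) as f:
    f.create_dataset("example", data=range(100000),
                     chunks=(100000,), compression="gzip")
\end{verbatim}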

\begin{figure*}

@@ -383,7 +399,19 @@ \section{Methodology}\label{methodology}

\end{figure*}%

\section{Results}\label{results}
As we can see in Figure~\ref{fig-2}, when we cloud optimize a file using
paged aggregation there are some considerations and behaviors that we had
to take into account. The first thing to observe is that page aggregation
will --as we mentioned-- consolidate the file-level metadata at the front
of the file and will add information to the so-called
superblock\footnote{The HDF5 superblock is a crucial component of the
HDF5 file format, acting as the starting point for accessing all data
within the file. It stores important metadata such as the version of
the file format, pointers to the root group, and addresses for
locating different file components.}. The next thing to notice is that
a single page size is used across the board for both metadata and data;
as of October 2024 and version 1.14 of the HDF5 library, the page size
cannot dynamically adjust to the total metadata size.
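
One way to verify how a given file was written is to inspect its
file-creation property list. The sketch below uses \texttt{h5py}'s
low-level wrappers around the HDF5 file-space API; the getter names and
the returned tuple are our assumption based on the C functions
\texttt{H5Pget\_file\_space\_strategy} and
\texttt{H5Pget\_file\_space\_page\_size}, so treat it as illustrative.

\begin{verbatim}
import h5py
from h5py import h5f

# Hypothetical repacked granule.
with h5py.File("ATL03_paged_8mb.h5", "r") as f:
    fcpl = f.id.get_create_plist()  # file-creation property list
    # Assumed wrappers for the HDF5 file-space getters:
    strategy, persist, threshold = fcpl.get_file_space_strategy()
    page_size = fcpl.get_file_space_page_size()
    print("paged:", strategy == h5f.FSPACE_STRATEGY_PAGE)
    print("page size (bytes):", page_size)
\end{verbatim}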

\begin{figure*}

@@ -393,7 +421,89 @@

}

\caption{\label{fig-3}Benchmarks show that cloud optimizing ATL03 files
\caption{\label{fig-3}File-level metadata and data packing inside
aggregated pages leaves unused space that can considerably increase the
file size.}

\end{figure*}%

This one-page-size-for-all approach simplifies how the HDF5 API reads
the file (if instructed to use it), but it also brings unused page space
and chunk over-reads. In the case of the ICESat-2 ATL03 dataset, the data
itself is partitioned so that each granule represents a segment of the
satellite orbit, and within the file the most relevant datasets are
chunked using 10,000 items per chunk. With float-32 values and a fast
compression setting, the resulting chunk size is on average under 40KB,
which is really small for an HTTP request, especially when chunks have to
be read sequentially. Because of these considerations, we opted to test
different page sizes as well as increased chunk sizes. The following
table describes the different configurations used in our tests.

\begin{longtable}[]{@{}
>{\raggedright\arraybackslash}p{(\columnwidth - 12\tabcolsep) * \real{0.1589}}
>{\raggedright\arraybackslash}p{(\columnwidth - 12\tabcolsep) * \real{0.3444}}
>{\raggedright\arraybackslash}p{(\columnwidth - 12\tabcolsep) * \real{0.1457}}
>{\raggedright\arraybackslash}p{(\columnwidth - 12\tabcolsep) * \real{0.0993}}
>{\raggedright\arraybackslash}p{(\columnwidth - 12\tabcolsep) * \real{0.0728}}
>{\raggedright\arraybackslash}p{(\columnwidth - 12\tabcolsep) * \real{0.0728}}
>{\raggedright\arraybackslash}p{(\columnwidth - 12\tabcolsep) * \real{0.1060}}@{}}
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
prefix
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
description
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
\% file size increase
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
\textasciitilde km per chunk
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
shape
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
page size
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
avg\_chunk\_size
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
original & original file from ATL03 v6 (1GB and 7GB) & 0 & 1.5km &
(10000,) & N/A & 35KB \\
original-kerchunk & kerchunk sidecar of the original file & N/A & 1.5km
& (10000,) & N/A & 35KB \\
page-only-4mb & paged-aggregated file with 4MB per page &
\textasciitilde1\% & 1.5km & (10000,) & 4MB & 35KB \\
page-only-8mb & paged-aggregated file with 8MB per page &
\textasciitilde1\% & 1.5km & (10000,) & 8MB & 35KB \\
rechunked-4mb & page-aggregated and bigger chunk sizes &
\textasciitilde1\% & 10km & (100000,) & 4MB & 400KB \\
rechunked-8mb & page-aggregated and bigger chunk sizes &
\textasciitilde1\% & 10km & (100000,) & 8MB & 400KB \\
rechunked-8mb-kerchunk & kerchunk sidecar of the last paged-aggregated
file & N/A & 10km & (100000,) & 8MB & 400KB \\
\end{longtable}

This table shows the different configurations we used for our tests at
2 file sizes. It's worth noting that we encountered a few outlier cases
where compression and chunk sizes led page aggregation to increase the
file size by approximately 10\%, which was above the desired value for
NSIDC (5\% max). We tested these files using the most common libraries
that handle HDF5 and 2 different I/O drivers that support remote access
to AWS S3: fsspec and the native S3 (ros3) driver. The results of our
testing are explained in the next section and the code to reproduce the
results is in the attached notebooks.
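
For reference, this is roughly how the two access paths differ. The
sketch uses hypothetical bucket names, credentials and page size; the
\texttt{ros3} driver is only available when HDF5 and h5py are built with
it, and the \texttt{page\_buf\_size} argument, our way of ``telling the
library'' about the page size, assumes a recent \texttt{h5py} and a
page-aggregated file.

\begin{verbatim}
import fsspec
import h5py

PAGE = 8 * 1024 * 1024  # matches the aggregation page size

# Driver 1: fsspec/s3fs file-like object. Align the block size
# with the page size so each HTTP range request covers whole
# pages, and enable HDF5's page buffer.
fs = fsspec.filesystem("s3", anon=False,
                       default_block_size=PAGE)
s3_url = "s3://example-bucket/ATL03_paged_8mb.h5"  # hypothetical
with fs.open(s3_url, mode="rb") as remote_file:
    with h5py.File(remote_file, mode="r",
                   page_buf_size=PAGE) as h5:
        heights = h5["gt1l/heights/h_ph"][:]

# Driver 2: the HDF5 native read-only S3 (ros3) driver.
https_url = ("https://example-bucket.s3.us-west-2."
             "amazonaws.com/ATL03_paged_8mb.h5")  # hypothetical
with h5py.File(https_url, mode="r", driver="ros3",
               aws_region=b"us-west-2",
               secret_id=b"<access-key-id>",
               secret_key=b"<secret-access-key>") as h5:
    heights = h5["gt1l/heights/h_ph"][:]
\end{verbatim}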

\section{Results}\label{results}

\begin{figure*}

\centering{

\includegraphics{figures/figure-5.png}

}

\caption{\label{fig-5}Benchmarks show that cloud optimizing ATL03 files
improved access times by at least an order of magnitude when used with
aligned I/O patterns, that is, when the reading library is told about the
cloud optimization and the page size.}
@@ -511,13 +621,25 @@ \section{Discussion}\label{discussion}
Side effects on different access patterns, e.g.~Kerchunk
\end{enumerate}

\section*{References}\label{references}
\addcontentsline{toc}{section}{References}
\section{References}\label{references}

\phantomsection\label{refs}
\begin{CSLReferences}{1}{0}
\vspace{1em}

\bibitem[\citeproctext]{ref-Hoyer2017-su}
Hoyer, S., \& Hamman, J. (2017). Xarray: {N-D} labeled arrays and
datasets in python. \emph{J. Open Res. Softw.}, \emph{5}(1), 10.

\bibitem[\citeproctext]{ref-h5cloud2023}
ICESAT-2 HackWeek, H5Cloud Contributors. (2023). h5cloud: Tools for
cloud-based analysis of HDF5 data (Version v1.0.0). Retrieved from
\url{https://github.com/ICESAT-2HackWeek/h5cloud}

\bibitem[\citeproctext]{ref-Mozilla-latency-2024}
Mozilla MDN. (2024, May). Understanding latency. Retrieved from
\url{https://developer.mozilla.org/en-US/docs/Web/Performance/Understanding_latency}

\bibitem[\citeproctext]{ref-NEUMANN2019111325}
Neumann, T. A., Martino, A. J., Markus, T., Bae, S., Bock, M. R.,
Brenner, A. C., et al. (2019). The ice, cloud, and land elevation
@@ -526,6 +648,14 @@ \section*{References}\label{references}
Environment}, \emph{233}, 111325.
\url{https://doi.org/10.1016/j.rse.2019.111325}

\bibitem[\citeproctext]{ref-scott-2020}
Scott, C. (2020). Numbers every programmer should know. Retrieved from
\url{https://colin-scott.github.io/personal_website/research/interactive_latency.html}

\bibitem[\citeproctext]{ref-The_HDF_Group_Hierarchical_Data_Format}
The HDF Group. (n.d.). {Hierarchical Data Format, version 5}. Retrieved
from \url{https://github.com/HDFGroup/hdf5}

\end{CSLReferences}


Binary file modified figures/figure-2.png
Binary file modified figures/figure-3.png
Binary file added figures/figure-5.png
