report.tex

\documentclass{article}
\usepackage{graphicx} % Required for inserting images
\usepackage{booktabs}
\usepackage[a4paper, margin=1in]{geometry} % Adjust margins here

\title{AE Report: Parallel Cluster-BFS and Applications to Shortest Paths}
\author{Letong Wang}
\date{September 2024}

\begin{document}

\maketitle

\section{Experiment 1}
\begin{table}[htbp]
  \centering
  \footnotesize
  \input{figs_and_tables/exp1_table.tex}
  \caption{\small\textbf{
    Tested graphs and microbenchmarks on different BFS algorithms from a cluster of vertices with size 64. 
  }
  The numbers endup with `$\times$' are speedups, higher is better. Others are running time, lower is better. The columns ``AIY'', ``Ligra'' in related work and ``Final'' show the speedup over the ``Seq-BFS''. ``AIY'' is referred to a sequential cluster BFS baseline, ``Ligra'' is referred to a parallel single BFS baseline, and ``Final'' is referred to our parallel C-BFS.  The ``self-speedup'' is the speedup running the algorithm in parallel over running it in sequential. 
  \label{table:microbenchmark}
  }
\end{table}
Table~\ref{table:microbenchmark} is referred to the `Table 1' in the original paper.

\begin{figure}[htbp]
  \centering
  \includegraphics[width=\columnwidth]{figs_and_tables/exp1_figure.pdf}
  \caption{\small\textbf{Speedup of parallel Ligra BFSs and parallel C-BFS over the standard sequential BFS on cluster with size 64.} $y$-axis is the speedup over sequential regular BFS in log-scale, higher is better. Each group of bars represents a graph, except the last group, which represents the average across all graphs. The numbers on the bar are the speedup of corresponding algorithms over the standard sequential algorithm.  
  \label{fig:par_compare}
  }
\end{figure} 
Figure~\ref{fig:par_compare} is referred to the `Figure 3' in the original paper. Essentially, the bars ``AIY'' mean the speedup that can be achieved by applying bit-level par- allelism on a cluster-BFS in the sequential setting. Similarly, the bars ``Ligra'' provide the speedup that can be achieved by applying thread-level parallelism for running $k = 64$ regular (non-cluster) BFS. 
Our algorithm combines the strengths of both bit-level and thread-level parallelism. Our solution is always better than any of the baselines (``AIY'' or  ``Ligra'').

\section{Experiment 2}
\begin{figure}[htbp]
  \centering
  \includegraphics[width=0.4\columnwidth]{figs_and_tables/exp2_figure.pdf}
  \caption{\small \textbf{The running time of C-BFS on various cluster diameter $d$.}
  The $y$-axis shows the relative running time over $d=2$. The $x$-axis shows the cluster diameter $d$. 
  }
  \label{fig:ccbfs_d}
\end{figure} 
Figure~\ref{fig:ccbfs_d} if referred to the `Figure 4' in the original paper. It tested how $d$ affects the running time of C-BFS.  It shows the running time increases as $d$ grows, especially when $d$ is small. 

\begin{table}[htbp]
  \centering
  \footnotesize
  \input{figs_and_tables/exp2_table.tex}
  \caption{
    \small\textbf{The parallel C-BFS time (seconds) for one cluster with size 64 on different cluster diameter $d$.} 
    \label{table:BFS_d}
  }
\end{table}
Table~\ref{table:BFS_d} is referred to the `Table 3' in the Appendix. It shows full experiment results of Figure~\ref{fig:ccbfs_d} for all the graphs.

\section{Experiment 3}
\begin{figure}
  \centering
  \includegraphics[width=\columnwidth]{figs_and_tables/exp3_figure.pdf}    \caption{\small \textbf{Tradeoffs between index size and distortion/construction time}
  The $x$-axis is the memory limits per vertex in bytes, and is in log-scale. The $y$-axis on the left shows the ($1+\epsilon$) distortion.  The $y$-axis on the right shows the preprocessing time. For both preprocessing time and
  distortion, lower is better. For the algorithms compared here, `plain' is the regular LL, others are the C-BFS-based LL that choose clusters with size $w$ as landmarks.
  \label{fig:approx}
  }
\end{figure} 

Figure~\ref{fig:approx} is referred to the `Figure 5' in the original paper. It studied the trade-off between proprocessing time and distortion on three representative graphs. For both preprocessing time and distortion, lower is better.  In general, larger $w$ results in faster preprocessing but larger distortion. 

\section{Experiment 4}

\begin{table}[htbp]
  \centering
  \footnotesize
  \input{figs_and_tables/exp4_small_table.tex}
  \caption{
    \small\textbf{The index construction time, (1$+\epsilon$) distortion, and query time for ADO
    based on landmark labeling.} The ``Plain'' is the plain LL algorithm that each landmark is a single vertex. The ``$w=64$'' and ``$w=8$'' C-BFS-based LL that landmarks are in clusters with size $w$.
    The memory budget is 1024 bytes per vertex. For both index time and $\epsilon$, lower is better. 
    \label{table:ApproxLL}
  }
\end{table}

\begin{table}[htbp]
  \centering
  \small
  \input{figs_and_tables/exp4_full_table.tex}
  \caption{
    \small\textbf{The index construction time, (1$+\epsilon$) distortion, and query time for ADO
    based on landmark labeling.} The ``Plain'' is the normal LL algorithm that each landmark is a single vertex. Others are C-BFS-based LL that landmarks are in clusters with size $w$.
    The memory budget is 1024 bytes per vertex. For both index time and $\epsilon$, lower is better. 
    \label{table:ApproxLL_full}
  }
\end{table}

Table~\ref{table:ApproxLL_full} if a full version of Table~\ref{table:ApproxLL}.  Table~\ref{table:ApproxLL} is referred to the  `Table 2' in the original paper. Table~\ref{table:ApproxLL_full} is referred to `Table 5' in the Appendix.  

On all 18 tested graphs, C-BFS with $w=64$ always gives a lower running time. When $w=8$, the running time is higher than $w=64$, but still mostly faster than the plain version. 

Regarding distortion, $w=8$ generally gives better accuracy than $w=64$, but it can also be worse in several instances. Both $w=8$ and $w = 64$ are more accurate than the plain version on most of the 18 graphs. 

The Index Time may be different from the paper, but the distortion $\epsilon$ should be the same.  Note that the distortion for graphs FT and SD are a little bit different from the original paper, that's because we use different ground truth (previously choosing 10000 different pairs, now choosing 100000 different pairs.)
\section{Experiment 5}

\begin{table}[htbp]
  \centering
  \footnotesize
  \input{figs_and_tables/exp5_table.tex}
  \caption{
    \small\textbf{The Approximate Landmark Labeling Time and Distortion for cluster with size 64 and memory limits 1024 bytes per vertex on different cluster diameter $d$.} 
    \label{table:ApproxLL_d}
  }
\end{table}

Table~\ref{table:ApproxLL_d} is referred to the `Table 4' in the original paper. It shows with the same memory limits, the construction time is reversely proportional to $d$, since larger $d$ leads to fewer sources and clusters. However, for distortion, other than the graph OK, larger $d$ always leads to lower accuracy.  

\end{document}