diff --git a/full_report/Latex_report/Capstone5_report_v1.tex b/full_report/Latex_report/Capstone5_report_v1.tex deleted file mode 100644 index 05383d1..0000000 --- a/full_report/Latex_report/Capstone5_report_v1.tex +++ /dev/null @@ -1,179 +0,0 @@ -% THIS TEMPLATE IS A WORK IN PROGRESS -% Adapted from an original template by faculty at Reykjavik University, Iceland - -\documentclass{scrartcl} -\input{File_Setup.tex} -\usepackage{graphicx,epsfig} -\hypersetup{ - colorlinks = true, %Colours links instead of ugly boxes - urlcolor = blue, %Colour for external hyper links - linkcolor = blue, %Colour of internal links - citecolor = red, %Colour of citations - setpagesize = false, - linktocpage = true, -} -\graphicspath{ {fig/} } - - - -\renewenvironment{abstract}{ - \centering - \textbf{Abstract} - \vspace{0.5cm} - \par\itshape - \begin{minipage}{0.7\linewidth}}{\end{minipage} - \noindent\ignorespaces -} -% ------------------------------------------------------------------------------------------------------------------------ - -\begin{document} -%Title of the report, name of coworkers and dates (of experiment and of report). -\begin{titlepage} - \centering - \includegraphics[width=0.6\textwidth]{GW_logo.eps}\par - \vspace{2cm} - %%%% COMMENT OUT irrelevant lines below: Data Science OR Computer Science OR none - {\scshape\LARGE Data Science Program \par} - \vspace{1cm} - {\scshape\Large Capstone Report - Spring 2024\par} - %{\large \today\par} - \vspace{1.5cm} - %%%% PROJECT TITLE - {\huge\bfseries Vector vs. Graph Database for Retrieval-Augmented Generation\par} - \vspace{2cm} - %%%% AUTHOR(S) - {\Large\itshape Arjun Bingly,\\ Sanchit Vijay,\\ Erika Pham,\\Kunal Inglunkar}\par - \vspace{1.5cm} - supervised by\par - %%%% SUPERVISOR(S) - Amir Jafari - - \vfill - \begin{abstract} - We introduce an implementation of Retrieval-Augmented Generation (RAG) as part of an end-to-end, self-hostable, semantic-based search engine for internal documents. RAG’s ability to understand context and producing relevant quality responses to prompts is crucial to producing a semantic-based search engine. - - Traditional RAG implementation uses a vector database; but we see the potential for graph databases, owing to its relational capabilities (revise this). We also present a performance comparison between vector and graph databases for a RAG pipeline. - \end{abstract} - \vfill -% Bottom of the page -\end{titlepage} -\tableofcontents -\newpage -% ------------------------------------------------------------------------------------------------------------------------ -\section{Introduction} -- Idea: build a locally-hostable semantic based search engine using RAG. -- What is RAG? Insert the diagram here. -- Vector DB vs Graph DB: pros and cons -- Goal: compare performance of this pipeline on vector db vs graph db -% ------------------------------------------------------------------------------------------------------------------------ -\section{Problem Statement} -Our current main challenges include: -1. Parsing tables in PDF documents accurately. -2. Traditional performance evaluation metrics (list them) for RAG are not informative on our process (add why?). -3. Implementation of graph database for RAG is difficult, existing literature and experiments employ non-open sourced products such as OpenAI (cite) which we lack resources for. 
- -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Related Work} - - -It's an edited and updated version of your literature review. Here are a few examples of how to insert citations like~\cite{byzantine-pki}, \cite{atomic-mcast-tcs01}, and also~\cite{sybilattack}, or even~\cite{psn-fail, verisign-fail}. - -Talk about the graph LLM papers here and how that inspires implementation. Also cite git repos. -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Solution and Methodology} -\subsection{Overview: insert diagram here} - -\begin{figure}[H] - \begin{center} - \includegraphics[scale=0.7]{basic_RAG_pipeline.drawio.svg} - \end{center} - \caption{Basic Retrieval-Augmented Generation (RAG) Pipeline (better illustration coming)} - \label{fig:ascent} -\end{figure} - -\subsection {PDF Parser:} -Parsing PDF documents presents a significant challenge due to their complex structure. PDFs often contain unstructured data, which lacks a predefined organization, making accurate recognition and processing arduous. A notable difficulty arises when handling tables, as PDFs do not inherently understand table columns, complicating the task of recognizing table layouts. This complexity is particularly evident in documents like tax forms, which feature intricate nested table structures. Additionally, scanned PDFs require Optical Character Recognition (OCR) tools to convert images back into text, introducing another layer of complexity. - -Our approach involved experimenting with various packages and strategies to develop a program capable of parsing and processing PDF documents. Despite our efforts, we encountered limitations in parsing tables, where the results were inconsistent. -\subsubsection{Unstructured IO:} -This open-source library facilitates the processing of diverse document types. Utilizing its partition_pdf() function, we were able to segment a PDF document into distinct elements, enhancing the parsing process. Unstructured IO also supports "FigureCaptions" identification, potentially improving the contextual understanding of the model. We adopted their "hi-res" strategy, which converts PDF pages into images and then applying the OCR tool PyTesseract to extract text. -While the output for plain text was satisfactory, the library struggled with more complex documents, such as tax forms and bank statements, yielding inadequate results. - -\subsubsection{PDFPlumber, Unstructured IO, and PyTesseract: (add more details on method here)} -To address these challenges, we integrated PDFPlumber for parsing table elements, PyTesseract for image-based text extraction, and Unstructured IO for processing other text content. PDFPlumber demonstrated superior layout detection capabilities, offering higher accuracy in parsing tables from non-scanned documents compared to our previous method. However, it underperformed with scanned documents and exhibited inconsistent results across various PDF files. - -3. LLM implementation: -4. Vector DB implementation + evaluation: insert table on evaluation metrics in the results section -5. Graph DB implementation - - -% ------------------------------------------------------------------------------------------------------------------------ -\section{Results and Discussion} - -The results section details your metrics and experiments for the assessment of your solution. 
It then provides experimental validation for your approach with visual aids such as data tables and graphs. In particular, it allows you to compare your idea with other approaches you've tested, for example solutions you've mentioned in your related work section. - -\subsection{Experimentation protocol} - -It is of the utmost importance to describe how you came up with the measurements and results that support your evaluation. - -\subsection{Data tables} - -Every data table should be numbered, have a brief description as its title, and specify the units used. - -As an example, Table~\ref{tab:my_label} compares the average latencies of native application calls to networked services. The experiments were conducted on an Apple MacBook Air 2010 with a CPU speed of 1.4GHz and a bus speed of 800MHz. Each data point is a mean over 20 instances of each call, after discarding both the lowest and the highest measurement. - -\begin{table}[ht] - \centering - \begin{tabular}{llr} -\hline -\multicolumn{2}{c}{Network Applications} \\ -\cline{1-2} -Service & Protocol & Latency (\si{\ms}) \\ -\hline -DNS & UDP & \SI{13.65}{\ms} \\ - & TCP & \SI{0.01}{\ms} \\ -NTP & UDP & \SI{92.50}{\ms} \\ -SMTP & TCP & \SI{33.33}{\ms} \\ -HTTP & TCP & \SI{8.99}{\ms} \\ -\hline -\end{tabular} - \caption{Comparison of latencies between services running on \texttt{localhost}.} - \label{tab:my_label} -\end{table} - -\subsection{Graphs} - -Graphs are often the most important information in your report; you should design and plot them with great care. A graph contains a lot of information in a short space. Graphs should be numbered and have a title. Their axes should be labelled, with the quantities and units specified. Make sure that individual data points (your measurements) stand out clearly. And of course, always associate your graph with text that explains your results, and outlines the conclusions you draw from these results. - -\begin{figure} - \begin{center} - \includegraphics[scale=0.9]{perf-plot-1.pdf} - \end{center} - \caption{Probability of including [k] faulty/malicious nodes in the service} - \label{graph:faulty-proportion-plot} -\end{figure} - -For example, Figure~\ref{graph:faulty-proportion-plot} compares the efficiency of three different service architectures in eliminating adversarial behaviors. Every data point gives the probability that $k$ faulty/malicious nodes managed to participate in a computation that involves 32 nodes. In the absence of at least one reliable node ($k = 32$), the failure will go undetected ; but the results show that this case is extremely unlikely, regardless of the architecture. The most significant result pertains to $k = 16$: the reliable nodes detect the failure, but cannot reach a majority to recover. The graph shows that the \texttt{CORPS 5\%} architecture is much more resilient than the \texttt{DHT 30\%} architecture, by a magnitude of $10^{11}$. - -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Discussion} -1. -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Conclusion} - - -\bibliographystyle{IEEEtran} -\bibliography{references} - - -%------ To create Appendix with additional stuff -------% -%\newpage -%\appendix -%\section{Appendix} -%Put data files, CAD drawings, additional sketches, etc. 
- -\end{document} \ No newline at end of file diff --git a/full_report/Latex_report/Capstone5_report_v2.synctex(busy) b/full_report/Latex_report/Capstone5_report_v2.synctex(busy) deleted file mode 100644 index e69de29..0000000 diff --git a/full_report/Latex_report/Capstone5_report_v3.tex b/full_report/Latex_report/Capstone5_report_v3.tex deleted file mode 100644 index 1d03488..0000000 --- a/full_report/Latex_report/Capstone5_report_v3.tex +++ /dev/null @@ -1,211 +0,0 @@ -% THIS TEMPLATE IS A WORK IN PROGRESS -% Adapted from an original template by faculty at Reykjavik University, Iceland - -\documentclass{scrartcl} -\input{File_Setup.tex} -\usepackage{graphicx,epsfig} -\hypersetup{ - colorlinks = true, %Colours links instead of ugly boxes - urlcolor = blue, %Colour for external hyper links - linkcolor = blue, %Colour of internal links - citecolor = red, %Colour of citations - setpagesize = false, - linktocpage = true, -} -\graphicspath{ {fig/} } - - - -\renewenvironment{abstract}{ - \centering - \textbf{Abstract} - \vspace{0.5cm} - \par\itshape - \begin{minipage}{0.7\linewidth}}{\end{minipage} - \noindent\ignorespaces -} -% ------------------------------------------------------------------------------------------------------------------------ - -\begin{document} -%Title of the report, name of coworkers and dates (of experiment and of report). -\begin{titlepage} - \centering - \includegraphics[width=0.6\textwidth]{GW_logo.eps}\par - \vspace{2cm} - %%%% COMMENT OUT irrelevant lines below: Data Science OR Computer Science OR none - {\scshape\LARGE Data Science Program \par} - \vspace{1cm} - {\scshape\Large Capstone Report - Spring 2024\par} - %{\large \today\par} - \vspace{1.5cm} - %%%% PROJECT TITLE - {\huge\bfseries Vector vs. Graph Database for Retrieval-Augmented Generation\par} - \vspace{2cm} - %%%% AUTHOR(S) - {\Large\itshape Arjun Bingly,\\ Sanchit Vijay,\\ Erika Pham,\\Kunal Inglunkar}\par - \vspace{1.5cm} - supervised by\par - %%%% SUPERVISOR(S) - Amir Jafari - - \vfill - \begin{abstract} - We introduce an implementation of Retrieval-Augmented Generation (RAG) that retrieves from graph databases as part of an end-to-end, self-hostable, semantic-based search engine for internal documents. RAG’s ability to understand context and producing relevant quality responses to prompts is crucial to producing a semantic-based search engine. Traditional RAG implementation uses a vector database; but we see the potential for graph databases, owing to its complex relational capabilities (revise this). We also present a performance comparison between vector and graph databases for a RAG pipeline. - \end{abstract} - \vfill -% Bottom of the page -\end{titlepage} -\tableofcontents -\newpage -% ------------------------------------------------------------------------------------------------------------------------ -\section{Introduction} -Retriever-Augmented Generation (RAG) is a method natural language processing (NLP) that combines the retrieval of informational documents from a large database (the "retriever" part) and the generation of coherent, contextual text based on the information retrieved (the "generation" part). -It was introduced as an enhancement to Large Language Models (LLMs), as RAG provides the LLM with real-time data access, preserves data privacy, and mitigate "hallucination" (cite paper). RAG is therefore ideal for semantic-based search engines; it improves their ability to understand and respond to queries with high relevance, accuracy, and personalization. 
-Traditionally, RAG implementations uses vector databases for its retrieval process. As RAG uses vector embeddings for its processes, vector database is the optimal choice for ease of retrieval and efficiency in similarity search. -However, since RAG simply outputs the closest vector in relation to the query vector, it leaves room for error if the database does not contain relevant information to the input prompt. This makes RAG overly reliant on the quality of the data and the embedding process. Additionally, while vector databases are scalable, the computational resources required for its maintenance can be expensive. -Graph database presents a very promising possibility due to its complex relational network - in theory, this could solve vector DB's limitation. -There is limited existing literature comparing the performance of vector versus graph database in a RAG implementation. This paper aims to experiment with graph database for RAG, and compare its performance to traditional implementation which uses vector databases. - -% ------------------------------------------------------------------------------------------------------------------------ -\section{Problem Statement} -Our current main challenges include: -1. Parsing tables in PDF documents accurately. -2. Traditional performance evaluation metrics (list them) for RAG are not informative on our process (add why?). -3. Implementation of graph database for RAG is difficult, existing literature and experiments employ non-open sourced products such as OpenAI (cite) which we lack resources for. - -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Related Work} -The original inspiration was to create an end-to-end, open-sourced, self-hostable search engine. Companies that need an internal search engine would benefit from this implementation, as it requires no access to online resources. -https://github.com/michaelthwan/searchGPT and https://github.com/GerevAI/gerev#integrations are open-sourced packages for LLM-powered semantic search engines. However, they leverage APIs, which makes the process reliant on online resources. We aim to localize all processes to ensure self-hostability. -% ------------------------------------------------------------------------------------------------------------------------ -\section{Solution and Methodology} -\subsection{RAG Pipeline} -\subsubsection{Overview} -Figure 1 shows a traditional RAG pipeline. As the name implies, the process is two-part: retrieval and generation. -The input query and documents are first preprocessed into vectors through the embedding process. -The pipeline then retrieves data relevant to the query, performing a similarity search in the vector database. Once the retrieval process is complete, RAG utilizes an LLM to understand and preserve context. Then, RAG system integrates the retrieved information with the original query to provide a richer context for the generation phase. -In the generation step, the augmented query is processed by the LLM, which synthesizes the information into a coherent and contextually appropriate response. The final output is then post-processed, if necessary, to ensure it meets the required specifications, such as correctness, coherence, and relevance. 
-\begin{figure}[H] - \begin{center} - \includegraphics[scale=0.7]{basic_RAG_pipeline.drawio.svg} - \end{center} - \caption{Figure 1: Basic Retrieval-Augmented Generation (RAG) Pipeline (better illustration coming)} - \label{fig:ascent} -\end{figure} - -RAG provides several advantages and solutions to LLMs caveats: -\begin{itemize} - \item 1. Empowering LLM solutions with real-time data access - LLMs are typically trained on vast datasets that may quickly become outdated as new information emerges. RAG technology addresses this limitation by allowing LLMs to access and incorporate real-time data into their responses. Through the retrieval component, RAG systems can query up-to-date databases or the internet to find the most current information, ensuring that the generated output reflects the latest developments. - \item 2. Preserving data privacy - RAG can retrieve information from a controlled, secure dataset or environment rather than relying on direct access to private data. By designing the retrieval component to operate within privacy-preserving parameters, RAG can ensure that the LLM will not directly access or expose sensitive data. - \item 3. Mitigating LLM hallucinations - "Hallucination" in the context of LLMs refers to the generation of plausible but inaccurate or entirely fabricated information. This is a known challenge with LLMs, where the model might confidently produce incorrect data or statements.(cite) RAG helps mitigate this issue by grounding the LLM's responses in retrieved documents that are verified or deemed reliable. By leveraging external sources of information, RAG reduces the model's reliance on potentially flawed internal representations and biases, leading to more accurate outputs. -\end{itemize} - -\subsubsection{RAG Document Chains} - -Document chains are used in Retrieval-Augmented Generation (RAG) to effectively utilize retrieved documents. These chains serve various purposes, including efficient document processing, task decomposition, and improved accuracy. - -\textbf{Stuff Chain} - -This is the simplest form of document chain. It involves putting all relevant data into the prompt. Given \(n\) documents, it concatenates the documents with a separator, usually \verb|\n\n|. -The advantage of this method is \textit{it only requires one call to the LLM}, and the model has access to all the information at once. -However, one downside is \textit{most LLMs can only handle a certain amount of context}. For large or multiple documents, stuffing may result in a prompt that exceeds the context limit. -Additionally, this method is \textit{only suitable for smaller amounts of data}. When working with larger data, alternative approaches should be used. - -\begin{figure}[H] - \centering - \includegraphics[width=0.8\textwidth]{path/to/stuff_chain_image.jpg} - \caption{Illustration of Stuff Chain} -\end{figure} -\href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{Source} - -\textbf{Refine Chain} - -The Refine Documents Chain uses an iterative process to generate a response by analyzing each input document and updating its answer accordingly. -It passes all non-document inputs, the current document, and the latest intermediate answer to an LLM chain to obtain a new answer for each document. -This chain is ideal for tasks that involve analyzing more documents than can fit in the model’s context, as it \textit{only passes a single document to the LLM at a time}. 
-However, this also means it makes significantly more LLM calls than other chains, such as the Stuff Documents Chain. It may \textit{perform poorly for tasks that require cross-referencing between documents} or detailed information from multiple documents. -Pros of this method include \textit{incorporating more relevant context and potentially less data loss} than the MapReduce Documents Chain. However, \textit{it requires many more LLM calls and the calls are not independent}, meaning they cannot be paralleled like the MapReduce Documents Chain. -There may also be dependencies on the order in which the documents are analyzed, thus it might be ideal to provide documents in order of similarity. - -\begin{figure}[H] - \centering - \includegraphics[width=0.8\textwidth]{path/to/refine_chain_image.jpg} - \caption{Illustration of the Refine Chain method.} -\end{figure} -\href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{Source} - -\textbf{Map Reduce Chain} - -To process \textit{large amounts of data efficiently}, the MapReduceDocumentsChain method is used. -This involves applying an LLM chain to each document individually (in the Map step), producing a new document. Then, all the new documents are passed to a separate combine documents chain to get a single output (in the Reduce step). If necessary, the mapped documents can be compressed before passing them to the combine documents chain. -This compression step is performed recursively. -This method requires an initial prompt on each chunk of data. -For summarization tasks, this could be a summary of that chunk, while for question-answering tasks, it could be an answer based solely on that chunk. Then, a different prompt is run to combine all the initial outputs. -The pros of this method are that \textit{it can scale to larger documents and handle more documents} than the StuffDocumentsChain. Additionally, \textit{the calls to the LLM on individual documents are independent and can be parallelized}. -The cons are that it \textit{requires many more calls to the LLM} than the StuffDocumentsChain and \textit{loses some information during the final combining call}. - -\begin{figure}[H] - \centering - \includegraphics[width=0.8\textwidth]{path/to/map_reduce_chain_image.jpg} - \caption{Illustration of the Map Reduce Chain method.} -\end{figure} -\href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{Source} - -\subsubsection{Propmting} -Prompting strategies differ from model to model.For example, the Llama model takes system prompts. -(add example here) -\subsubsection{Other Hyperparameters} -\begin{itemize} - \item \textbf{Chunk Sizes} --- generally, the smallest chunk size you can get away with. - \item \textbf{Similarity Score} --- e.g., cosine similarity, a measure used to determine how similar two documents or vectors are. - \item \textbf{Embedding} --- a representation of text in a high-dimensional vector space, which allows for capturing the semantic meaning of words or phrases. -\end{itemize} - -\subsection {PDF Parser} -Parsing PDF documents presents a significant challenge due to their complex structure. PDFs often contain unstructured data, which lacks a predefined organization, making accurate recognition and processing arduous. A notable difficulty arises when handling tables, as PDFs do not inherently understand table columns, complicating the task of recognizing table layouts. 
This complexity is particularly evident in documents like tax forms, which feature intricate nested table structures. Additionally, scanned PDFs require Optical Character Recognition (OCR) tools to convert images back into text, introducing another layer of complexity. -Our approach involved experimenting with various packages and strategies to develop a program capable of parsing and processing PDF documents. Despite our efforts, we encountered limitations in parsing tables, where the results were inconsistent. - -\subsubsection{Unstructured IO} -This open-source library facilitates the processing of diverse document types. Utilizing its partition_pdf() function, we were able to segment a PDF document into distinct elements, enhancing the parsing process. Unstructured IO also supports "FigureCaptions" identification, potentially improving the contextual understanding of the model. We adopted their "hi-res" strategy, which converts PDF pages into images and then applying the OCR tool PyTesseract to extract text. -While the output for plain text was satisfactory, the library struggled with more complex documents, such as tax forms and bank statements, yielding inadequate results. -(add example of output-original text versus output text) -\subsubsection{PDFPlumber, Unstructured IO, and PyTesseract: (add more details on method here)} -To address these challenges, we integrated PDFPlumber for parsing table elements, PyTesseract for image-based text extraction, and Unstructured IO for processing other text content. PDFPlumber demonstrated superior layout detection capabilities, offering higher accuracy in parsing tables from non-scanned documents compared to our previous method. However, it underperformed with scanned documents and exhibited inconsistent results across various PDF files. -(add example of output - original text vs output text) - -\subesection{LLM implementation} -(outline only, needs revision) Quantize model first; currently cannot run without quantization. -Use llama.cpp which provides quantization -Have text user interface (TUI) for users to easily download the model from HuggingFace and quantize -Tested our implementation with Llama2 7b & 13b, Mixtral 8x7b, Gemma 13b -Could use any other model - -\subsection{Graph DB implementation} -(to be added) -% ------------------------------------------------------------------------------------------------------------------------ -\section{Results and Discussion} - -\subsection{Experimentation protocol} - -\subsection{Data tables} -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Discussion} -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Conclusion} - -\bibliographystyle{IEEEtran} -\bibliography{references} -(add graph papers and Lewis et al. here) -%------ To create Appendix with additional stuff -------% -%\newpage -%\appendix -%\section{Appendix} -%Put data files, CAD drawings, additional sketches, etc. 
- -\end{document} \ No newline at end of file diff --git a/full_report/Latex_report/Capstone5_report_v3.txt b/full_report/Latex_report/Capstone5_report_v3.txt deleted file mode 100644 index 810bfe9..0000000 --- a/full_report/Latex_report/Capstone5_report_v3.txt +++ /dev/null @@ -1,211 +0,0 @@ -% THIS TEMPLATE IS A WORK IN PROGRESS -% Adapted from an original template by faculty at Reykjavik University, Iceland - -\documentclass{scrartcl} -\input{File_Setup.tex} -\usepackage{graphicx,epsfig} -\hypersetup{ - colorlinks = true, %Colours links instead of ugly boxes - urlcolor = blue, %Colour for external hyper links - linkcolor = blue, %Colour of internal links - citecolor = red, %Colour of citations - setpagesize = false, - linktocpage = true, -} -\graphicspath{ {fig/} } - - - -\renewenvironment{abstract}{ - \centering - \textbf{Abstract} - \vspace{0.5cm} - \par\itshape - \begin{minipage}{0.7\linewidth}}{\end{minipage} - \noindent\ignorespaces -} -% ------------------------------------------------------------------------------------------------------------------------ - -\begin{document} -%Title of the report, name of coworkers and dates (of experiment and of report). -\begin{titlepage} - \centering - \includegraphics[width=0.6\textwidth]{GW_logo.eps}\par - \vspace{2cm} - %%%% COMMENT OUT irrelevant lines below: Data Science OR Computer Science OR none - {\scshape\LARGE Data Science Program \par} - \vspace{1cm} - {\scshape\Large Capstone Report - Spring 2024\par} - %{\large \today\par} - \vspace{1.5cm} - %%%% PROJECT TITLE - {\huge\bfseries Vector vs. Graph Database for Retrieval-Augmented Generation\par} - \vspace{2cm} - %%%% AUTHOR(S) - {\Large\itshape Arjun Bingly,\\ Sanchit Vijay,\\ Erika Pham,\\Kunal Inglunkar}\par - \vspace{1.5cm} - supervised by\par - %%%% SUPERVISOR(S) - Amir Jafari - - \vfill - \begin{abstract} - We introduce an implementation of Retrieval-Augmented Generation (RAG) that retrieves from graph databases as part of an end-to-end, self-hostable, semantic-based search engine for internal documents. RAG’s ability to understand context and producing relevant quality responses to prompts is crucial to producing a semantic-based search engine. Traditional RAG implementation uses a vector database; but we see the potential for graph databases, owing to its complex relational capabilities (revise this). We also present a performance comparison between vector and graph databases for a RAG pipeline. - \end{abstract} - \vfill -% Bottom of the page -\end{titlepage} -\tableofcontents -\newpage -% ------------------------------------------------------------------------------------------------------------------------ -\section{Introduction} -Retriever-Augmented Generation (RAG) is a method natural language processing (NLP) that combines the retrieval of informational documents from a large database (the "retriever" part) and the generation of coherent, contextual text based on the information retrieved (the "generation" part). -It was introduced as an enhancement to Large Language Models (LLMs), as RAG provides the LLM with real-time data access, preserves data privacy, and mitigate "hallucination" (cite paper). RAG is therefore ideal for semantic-based search engines; it improves their ability to understand and respond to queries with high relevance, accuracy, and personalization. -Traditionally, RAG implementations uses vector databases for its retrieval process. 
As RAG uses vector embeddings for its processes, vector database is the optimal choice for ease of retrieval and efficiency in similarity search. -However, since RAG simply outputs the closest vector in relation to the query vector, it leaves room for error if the database does not contain relevant information to the input prompt. This makes RAG overly reliant on the quality of the data and the embedding process. Additionally, while vector databases are scalable, the computational resources required for its maintenance can be expensive. -Graph database presents a very promising possibility due to its complex relational network - in theory, this could solve vector DB's limitation. -There is limited existing literature comparing the performance of vector versus graph database in a RAG implementation. This paper aims to experiment with graph database for RAG, and compare its performance to traditional implementation which uses vector databases. - -% ------------------------------------------------------------------------------------------------------------------------ -\section{Problem Statement} -Our current main challenges include: -1. Parsing tables in PDF documents accurately. -2. Traditional performance evaluation metrics (list them) for RAG are not informative on our process (add why?). -3. Implementation of graph database for RAG is difficult, existing literature and experiments employ non-open sourced products such as OpenAI (cite) which we lack resources for. - -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Related Work} -The original inspiration was to create an end-to-end, open-sourced, self-hostable search engine. Companies that need an internal search engine would benefit from this implementation, as it requires no access to online resources. -https://github.com/michaelthwan/searchGPT and https://github.com/GerevAI/gerev#integrations are open-sourced packages for LLM-powered semantic search engines. However, they leverage APIs, which makes the process reliant on online resources. We aim to localize all processes to ensure self-hostability. -% ------------------------------------------------------------------------------------------------------------------------ -\section{Solution and Methodology} -\subsection{RAG Pipeline} -\subsubsection{Overview} -Figure 1 shows a traditional RAG pipeline. As the name implies, the process is two-part: retrieval and generation. -The input query and documents are first preprocessed into vectors through the embedding process. -The pipeline then retrieves data relevant to the query, performing a similarity search in the vector database. Once the retrieval process is complete, RAG utilizes an LLM to understand and preserve context. Then, RAG system integrates the retrieved information with the original query to provide a richer context for the generation phase. -In the generation step, the augmented query is processed by the LLM, which synthesizes the information into a coherent and contextually appropriate response. The final output is then post-processed, if necessary, to ensure it meets the required specifications, such as correctness, coherence, and relevance. 
-\begin{figure}[H] - \begin{center} - \includegraphics[scale=0.7]{basic_RAG_pipeline.drawio.svg} - \end{center} - \caption{Figure 1: Basic Retrieval-Augmented Generation (RAG) Pipeline (better illustration coming)} - \label{fig:ascent} -\end{figure} - -RAG provides several advantages and solutions to LLMs caveats: -\begin{itemize} - \item 1. Empowering LLM solutions with real-time data access - LLMs are typically trained on vast datasets that may quickly become outdated as new information emerges. RAG technology addresses this limitation by allowing LLMs to access and incorporate real-time data into their responses. Through the retrieval component, RAG systems can query up-to-date databases or the internet to find the most current information, ensuring that the generated output reflects the latest developments. - \item 2. Preserving data privacy - RAG can retrieve information from a controlled, secure dataset or environment rather than relying on direct access to private data. By designing the retrieval component to operate within privacy-preserving parameters, RAG can ensure that the LLM will not directly access or expose sensitive data. - \item 3. Mitigating LLM hallucinations - "Hallucination" in the context of LLMs refers to the generation of plausible but inaccurate or entirely fabricated information. This is a known challenge with LLMs, where the model might confidently produce incorrect data or statements.(cite) RAG helps mitigate this issue by grounding the LLM's responses in retrieved documents that are verified or deemed reliable. By leveraging external sources of information, RAG reduces the model's reliance on potentially flawed internal representations and biases, leading to more accurate outputs. -\end{itemize} - -\subsubsection{RAG Document Chains} - -Document chains are used in Retrieval-Augmented Generation (RAG) to effectively utilize retrieved documents. These chains serve various purposes, including efficient document processing, task decomposition, and improved accuracy. - -\textbf{Stuff Chain} - -This is the simplest form of document chain. It involves putting all relevant data into the prompt. Given \(n\) documents, it concatenates the documents with a separator, usually \verb|\n\n|. -The advantage of this method is \textit{it only requires one call to the LLM}, and the model has access to all the information at once. -However, one downside is \textit{most LLMs can only handle a certain amount of context}. For large or multiple documents, stuffing may result in a prompt that exceeds the context limit. -Additionally, this method is \textit{only suitable for smaller amounts of data}. When working with larger data, alternative approaches should be used. - -\begin{figure}[H] - \centering - \includegraphics[width=0.8\textwidth]{path/to/stuff_chain_image.jpg} - \caption{Illustration of Stuff Chain} -\end{figure} -\href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{Source} - -\textbf{Refine Chain} - -The Refine Documents Chain uses an iterative process to generate a response by analyzing each input document and updating its answer accordingly. -It passes all non-document inputs, the current document, and the latest intermediate answer to an LLM chain to obtain a new answer for each document. -This chain is ideal for tasks that involve analyzing more documents than can fit in the model’s context, as it \textit{only passes a single document to the LLM at a time}. 
-However, this also means it makes significantly more LLM calls than other chains, such as the Stuff Documents Chain. It may \textit{perform poorly for tasks that require cross-referencing between documents} or detailed information from multiple documents. -Pros of this method include \textit{incorporating more relevant context and potentially less data loss} than the MapReduce Documents Chain. However, \textit{it requires many more LLM calls and the calls are not independent}, meaning they cannot be paralleled like the MapReduce Documents Chain. -There may also be dependencies on the order in which the documents are analyzed, thus it might be ideal to provide documents in order of similarity. - -\begin{figure}[H] - \centering - \includegraphics[width=0.8\textwidth]{path/to/refine_chain_image.jpg} - \caption{Illustration of the Refine Chain method.} -\end{figure} -\href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{Source} - -\textbf{Map Reduce Chain} - -To process \textit{large amounts of data efficiently}, the MapReduceDocumentsChain method is used. -This involves applying an LLM chain to each document individually (in the Map step), producing a new document. Then, all the new documents are passed to a separate combine documents chain to get a single output (in the Reduce step). If necessary, the mapped documents can be compressed before passing them to the combine documents chain. -This compression step is performed recursively. -This method requires an initial prompt on each chunk of data. -For summarization tasks, this could be a summary of that chunk, while for question-answering tasks, it could be an answer based solely on that chunk. Then, a different prompt is run to combine all the initial outputs. -The pros of this method are that \textit{it can scale to larger documents and handle more documents} than the StuffDocumentsChain. Additionally, \textit{the calls to the LLM on individual documents are independent and can be parallelized}. -The cons are that it \textit{requires many more calls to the LLM} than the StuffDocumentsChain and \textit{loses some information during the final combining call}. - -\begin{figure}[H] - \centering - \includegraphics[width=0.8\textwidth]{path/to/map_reduce_chain_image.jpg} - \caption{Illustration of the Map Reduce Chain method.} -\end{figure} -\href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{Source} - -\subsubsection{Propmting} -Prompting strategies differ from model to model.For example, the Llama model takes system prompts. -(add example here) -\subsubsection{Other Hyperparameters} -\begin{itemize} - \item \textbf{Chunk Sizes} --- generally, the smallest chunk size you can get away with. - \item \textbf{Similarity Score} --- e.g., cosine similarity, a measure used to determine how similar two documents or vectors are. - \item \textbf{Embedding} --- a representation of text in a high-dimensional vector space, which allows for capturing the semantic meaning of words or phrases. -\end{itemize} - -\subsection {PDF Parser} -Parsing PDF documents presents a significant challenge due to their complex structure. PDFs often contain unstructured data, which lacks a predefined organization, making accurate recognition and processing arduous. A notable difficulty arises when handling tables, as PDFs do not inherently understand table columns, complicating the task of recognizing table layouts. 
This complexity is particularly evident in documents like tax forms, which feature intricate nested table structures. Additionally, scanned PDFs require Optical Character Recognition (OCR) tools to convert images back into text, introducing another layer of complexity. -Our approach involved experimenting with various packages and strategies to develop a program capable of parsing and processing PDF documents. Despite our efforts, we encountered limitations in parsing tables, where the results were inconsistent. - -\subsubsection{Unstructured IO} -This open-source library facilitates the processing of diverse document types. Utilizing its partition_pdf() function, we were able to segment a PDF document into distinct elements, enhancing the parsing process. Unstructured IO also supports "FigureCaptions" identification, potentially improving the contextual understanding of the model. We adopted their "hi-res" strategy, which converts PDF pages into images and then applying the OCR tool PyTesseract to extract text. -While the output for plain text was satisfactory, the library struggled with more complex documents, such as tax forms and bank statements, yielding inadequate results. -(add example of output-original text versus output text) -\subsubsection{PDFPlumber, Unstructured IO, and PyTesseract: (add more details on method here)} -To address these challenges, we integrated PDFPlumber for parsing table elements, PyTesseract for image-based text extraction, and Unstructured IO for processing other text content. PDFPlumber demonstrated superior layout detection capabilities, offering higher accuracy in parsing tables from non-scanned documents compared to our previous method. However, it underperformed with scanned documents and exhibited inconsistent results across various PDF files. -(add example of output - original text vs output text) - -\subesection{LLM implementation} -(outline only, needs revision) Quantize model first; currently cannot run without quantization. -Use llama.cpp which provides quantization -Have text user interface (TUI) for users to easily download the model from HuggingFace and quantize -Tested our implementation with Llama2 7b & 13b, Mixtral 8x7b, Gemma 13b -Could use any other model - -\subsection{Graph DB implementation} -(to be added) -% ------------------------------------------------------------------------------------------------------------------------ -\section{Results and Discussion} - -\subsection{Experimentation protocol} - -\subsection{Data tables} -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Discussion} -% ------------------------------------------------------------------------------------------------------------------------ - -\section{Conclusion} - -\bibliographystyle{IEEEtran} -\bibliography{references} -(add graph papers and Lewis et al. here) -%------ To create Appendix with additional stuff -------% -%\newpage -%\appendix -%\section{Appendix} -%Put data files, CAD drawings, additional sketches, etc. 
- -\end{document} \ No newline at end of file diff --git a/full_report/Latex_report/grag_report_v1.pdf b/full_report/Latex_report/grag_report_v1.pdf new file mode 100755 index 0000000..0c655e4 Binary files /dev/null and b/full_report/Latex_report/grag_report_v1.pdf differ diff --git a/full_report/Latex_report/grag_report_v1.tex b/full_report/Latex_report/grag_report_v1.tex new file mode 100755 index 0000000..b4c10e3 --- /dev/null +++ b/full_report/Latex_report/grag_report_v1.tex @@ -0,0 +1,320 @@ +% THIS TEMPLATE IS A WORK IN PROGRESS +% Adapted from an original template by faculty at Reykjavik University, Iceland + +\documentclass{scrartcl} +\input{File_Setup.tex} +\usepackage{graphicx,epsfig} +\usepackage{listings} +\usepackage{subcaption} +\hypersetup{ + colorlinks = true, %Colours links instead of ugly boxes + urlcolor = blue, %Colour for external hyper links + linkcolor = blue, %Colour of internal links + citecolor = red, %Colour of citations + setpagesize = false, + linktocpage = true, +} +\graphicspath{ {fig/} } + + + +\renewenvironment{abstract}{ + \centering + \textbf{Abstract} + \vspace{0.5cm} + \par + \begin{minipage}{0.7\linewidth}}{\end{minipage} + \noindent\ignorespaces +} +% ------------------------------------------------------------------------------------------------------------------------ + +\begin{document} +%Title of the report, name of coworkers and dates (of experiment and of report). +\begin{titlepage} + \centering + \includegraphics[width=0.6\textwidth]{GW_logo.eps}\par + \vspace{2cm} + %%%% COMMENT OUT irrelevant lines below: Data Science OR Computer Science OR none + {\scshape\LARGE Data Science Program \par} + \vspace{1cm} + {\scshape\Large Capstone Report - Spring 2024\par} + %{\large \today\par} + \vspace{1.5cm} + %%%% PROJECT TITLE + {\huge\bfseries GRAG - Open Sourced Python Package Implementation of Retrieval-Augmented Generation\par} + \vspace{2cm} + %%%% AUTHOR(S) + {\Large\itshape Arjun Bingly,\\ Sanchit Vijay,\\ Erika Pham,\\Kunal Inglunkar}\par + \vspace{1.5cm} + supervised by\par + %%%% SUPERVISOR(S) + Amir Jafari + +\newpage + \vfill + \begin{abstract} + This report introduces GRAG (Good RAG), an open-sourced Python package providing an end-to-end implementation of Retrieval-Augmented Generation (RAG). + The package provides easy integration with various LLMs locally, and support for vector databases such as Chroma and DeepLake. It also provides a simple GUI implementation. This report details GRAG and its features. + Future work includes enhancement of the PDF parsing features, possible integration for other document types, as well as testing GRAG performance on graphs versus a traditional vector database and producing an evaluation suite. + Our documentation can be accessed at \url{https://g-rag.org/} and Git repo at \url{https://github.com/arjbingly/grag}. + \vfill + \end{abstract} + \end{titlepage} +% Bottom of the page + + +\tableofcontents +\newpage +% ------------------------------------------------------------------------------------------------------------------------ +\section{Introduction} + +Figure 1 shows a basic Retrieval-Augmented Generation (RAG) pipeline. As the name implies, the process is two-part: retrieval and generation. +The input query and documents are first preprocessed into vectors through the embedding process. +The pipeline then retrieves data relevant to the query, performing a similarity search in the vector database. Once the retrieval process is complete, RAG utilizes an LLM to understand and preserve context. 
Then, the RAG system integrates the retrieved information with the original query to provide a richer context for the generation phase.
In the generation step, the augmented query is processed by a large language model (LLM), which synthesizes the information into a coherent and contextually appropriate response. The final output is then post-processed, if necessary, to ensure it meets the required specifications, such as correctness, coherence, and relevance.

\begin{figure}[H]
    \begin{center}
        \includegraphics[scale=0.7]{capstone_report/fig/basic_RAG_pipeline.png}
    \end{center}
    \caption{Basic Retrieval-Augmented Generation (RAG) Pipeline}
    \label{fig:ascent}
\end{figure}

RAG provides several advantages and addresses several LLM caveats:
\begin{itemize}
    \item Empowering LLM solutions with real-time data access:

    LLMs are typically trained on vast datasets that quickly become outdated as new information emerges. RAG addresses this limitation by allowing LLMs to access and incorporate real-time data into their responses. Through the retrieval component, RAG systems can query up-to-date databases or the internet for the most current information, ensuring that the generated output reflects the latest developments.
    \item Preserving data privacy:

    RAG can retrieve information from a controlled, secure dataset or environment rather than relying on direct access to private data. By designing the retrieval component to operate within privacy-preserving parameters, RAG ensures that the LLM does not directly access or expose sensitive data.
    \item Mitigating LLM hallucinations:

    "Hallucination" in the context of LLMs refers to the generation of plausible but inaccurate or entirely fabricated information. This is a known challenge with LLMs, where the model might confidently produce incorrect data or statements (cite). RAG helps mitigate this issue by grounding the LLM's responses in retrieved documents that are verified or deemed reliable. By leveraging external sources of information, RAG reduces the model's reliance on potentially flawed internal representations and biases, leading to more accurate outputs.
\end{itemize}

RAG has become very popular since its introduction in Lewis et al. 2020 \cite{lewis2021retrievalaugmented}, with its best-known application, ChatGPT (powered by GPT-4), creating a sizeable and long-term impact across industry and academia.
Its versatility across fields has fueled further research and development of RAG implementations. Our package, GRAG (Good RAG), aims to add to this body of work.

% ------------------------------------------------------------------------------------------------------------------------
\section{Problem Statement}

While RAG APIs such as OpenAI's are very powerful, they usually have usage limits, which are a barrier to extensive commercial use.
Dependence on APIs can also limit customization and integration with existing software, which is not ideal for institutions or individuals with specific application needs.
Furthermore, storing sensitive data on external servers, together with data usage policies that depend on the API provider, raises data privacy concerns.
\newline
\newline
GRAG aims to provide a customizable, easy-to-use, end-to-end, open-sourced solution that resolves the cost and data privacy issues. The package can be deployed locally, giving full control over data storage and maximizing personalization.
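To make the kind of locally hosted pipeline GRAG targets concrete, the sketch below walks through the retrieve-then-generate loop of Figure 1 in plain Python. It is an illustrative sketch only, not GRAG's API: \texttt{embed} and \texttt{generate} stand in for whichever embedding model and local LLM are configured, and the document chunks are kept in memory rather than in a vector store.

\begin{lstlisting}[language=Python]
import numpy as np

def cosine_sim(a, b):
    # Similarity score between two embedding vectors.
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def rag_answer(query, chunks, embed, generate, top_k=3):
    # Retrieval: embed the query and rank chunks by similarity.
    # (In practice, chunk embeddings are precomputed and stored in a vector DB.)
    q_vec = embed(query)
    ranked = sorted(chunks, key=lambda c: cosine_sim(q_vec, embed(c)), reverse=True)
    context = "\n\n".join(ranked[:top_k])
    # Augmentation + generation: stuff the retrieved context into the prompt.
    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    return generate(prompt)
\end{lstlisting}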
+ +% ------------------------------------------------------------------------------------------------------------------------ + +% ------------------------------------------------------------------------------------------------------------------------ +\section{Features} + +\subsection {PDF Parser} + +Parsing PDF documents presents a significant challenge due to their complex structure. PDFs often contain unstructured data, which lacks a predefined organization, making accurate recognition and processing arduous. A notable difficulty arises when handling tables, as PDFs do not inherently understand table columns, complicating the task of recognizing table layouts. This complexity is particularly evident in documents like tax forms, which feature intricate nested table structures. Additionally, scanned PDFs require Optical Character Recognition (OCR) tools to convert images back into text, introducing another layer of complexity. +\newline +\newline +Initially, we tried only using \textit{unstructured.io}\cite{unstrio} and \textit{pdfplumber}\cite{pdfplumber}, respectively. Neither libraries could consistently parse all PDF files with high accuracy. +The current strategy is to primarily use the \textit{unstructured.io} library for partitioning and parsing. +For documents containing more complex table structures, such as nested tables or tax forms, \textit{pdfplumber} and \textit{pytesseract}\cite{pytesseract} are deployed. +The table structures on the documents are detected, then cropped out before the contained text is extracted from the tables. +\newline +\newpage +Below is an example of parsing a tax form, with a complex layout. We see that the output is fairly accurate in this case. The parsing tool is able to identify the layout and extract the information well. +\newline +\newline +\begin{figure}[h!] + \centering + \begin{subfigure}[b]{\textwidth} + \includegraphics[width=\textwidth]{capstone_report/fig/og_form.png} + \caption{Original Text} + \label{fig:image1} + \end{subfigure} + + \begin{subfigure}[b]{\textwidth} + \includegraphics[width=\textwidth]{capstone_report/fig/og_form_output.png} + \caption{Parsed Text} + \label{fig:image2} + \end{subfigure} + \caption{Tax Form Parsing Example} + \label{fig:images} +\end{figure} + +\newline +\newline + +This result is not consistent across all types of PDFs, however. Figure 6 (Appendix) is an example of how the tool extracts and parses a two-column paper. The layout is not accurate, and the text position is not in the right place. More experimentation is needed for a robust, consistent parsing tool for PDFs. + + +\subsection{Vector Stores} + +Vector store or vector database is a type of database that stores data in high-dimensional vectors. This is a crucial component of RAG, storing embeddings for both retrieval and generation processes. +Currently, GRAG supports Chroma\cite{chroma} and DeepLake\cite{deeplake}. By default, our embedding model is instructor-xl, but any HuggingFace embeddings can be used. + +\subsection{LLMs} +As explained above, cost and data privacy concerns mean we could not use OpenAI APIs. To run models locally, \textit{llama.cpp} is the best implementation as it uses a low-level language, with extensive backend support, such as CUDA. +Currently, GRAG provides two options to run LLMs locally: +\newline +\newline +1. Run LLMs using HuggingFace\cite{huggingface}: +\newline +This is the easiest way to get started, but does not offer as much flexibility. 
Within GRAG, if using a config file (\textit{config.ini}), the user only needs to change \textit{model\_name} to the HuggingFace repo id.
If the model is gated, the user also needs to provide an authentication token.
\newline
\newline
2. Run LLMs using llama.cpp \cite{langchain2023llamacpp}:
\newline
\textit{Llama.cpp} offers great flexibility, but its repository is not user-friendly. To streamline the process, we provide a script/cookbook for ease of implementation.
\newline
\newline
GRAG has been tested with Llama2 7b \& 13b, Mixtral 8x7b, and Gemma 13b, but it could support any model from HuggingFace.
\newpage
\subsection{Multi-vector Retriever}

We built a retriever that employs the LangChain \textit{multi-vector retriever} tool.
Instead of representing each document as a single vector, each document is represented by multiple vectors, with each vector capturing a different aspect of the text.
For example, the tool can split a document into smaller chunks and embed each of them, which helps the embeddings carry semantic context more faithfully.
Our retriever fetches multiple candidate chunks and returns the most similar ones from the vector database.

\subsection{RAG Features}

\subsubsection{RAG Document Chains}

Document chains are used in Retrieval-Augmented Generation (RAG) to effectively utilize retrieved documents. These chains serve various purposes, including efficient document processing, task decomposition, and improved accuracy.
\newline
\newline

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{capstone_report/fig/stuff_chain_langchain.jpg}
    \caption{Illustration of Stuff Chain \href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{(LangChain)}}
\end{figure}

\textbf{Stuff Chain}
\newline
\newline
This is the simplest form of document chain. It involves putting all relevant data into the prompt. Given \(n\) documents, it concatenates the documents with a separator, usually \verb|\n\n|.
The advantage of this method is that \textit{it only requires one call to the LLM}, and the model has access to all the information at once.
\newline
\newline
However, most LLMs can only handle a certain amount of context. For large or multiple documents, stuffing may result in a prompt that exceeds the context limit.
Additionally, this method is only suitable for smaller amounts of data. When working with larger data, alternative approaches should be used.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\textwidth]{capstone_report/fig/refine_chain_langchain.jpg}
    \caption{Illustration of the Refine Chain method \href{https://readmedium.com/en/https:/ogre51.medium.com/types-of-chains-in-langchain-823c8878c2e9}{(LangChain)}}
\end{figure}

\textbf{Refine Chain}
\newline
\newline
The Refine Documents Chain uses an iterative process to generate a response by analyzing each input document and updating its answer accordingly.
It passes all non-document inputs, the current document, and the latest intermediate answer to an LLM chain to obtain a new answer for each document.
\newline
\newline
This chain is ideal for tasks that involve analyzing more documents than can fit in the model's context, as it only passes one document to the LLM at a time.
\newline
However, this also means it makes significantly more LLM calls than other chains, such as the stuff chain. It may perform poorly for tasks that require cross-referencing between documents or detailed information from multiple documents.
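The two chains described above reduce to a few lines of orchestration logic. The sketch below is illustrative only; it assumes a generic \texttt{llm} callable that maps a prompt string to a completion and is not the LangChain implementation itself.

\begin{lstlisting}[language=Python]
def stuff_chain(question, documents, llm):
    # One LLM call: concatenate every document into a single prompt.
    context = "\n\n".join(documents)
    return llm(f"Context:\n{context}\n\nQuestion: {question}\nAnswer:")

def refine_chain(question, documents, llm):
    # One LLM call per document: each call refines the previous answer.
    answer = llm(f"Document:\n{documents[0]}\n\nQuestion: {question}\nAnswer:")
    for doc in documents[1:]:
        answer = llm(
            f"Question: {question}\n"
            f"Existing answer: {answer}\n"
            f"New document:\n{doc}\n"
            "Refine the existing answer using the new document."
        )
    return answer
\end{lstlisting}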
+
+\subsubsection{Prompting}
+
+In addition to model-specific prompting, GRAG provides cookbooks for implementing custom generic and few-shot prompts. To use them, the user needs to download and quantize a model and ingest the document(s) of choice. As an example, we ingested the United States Constitution to demonstrate custom prompting. Figure~\ref{fig:customprompt} shows an example of a custom prompt output.
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=\textwidth]{capstone_report/fig/custom_prompt_cookbook.png}
+    \caption{Custom Prompt Cookbook Example}
+    \label{fig:customprompt}
+\end{figure}
+\newpage
+\noindent ``Query: What is the first amendment?'' shows the input question.
+
+The generated text is: ``The First Amendment protects citizens from government interference with their fundamental rights, including freedom of speech, religion, the press, assembly, and petition. It also prohibits the government from establishing a national religion or abridging the right to bear arms without proper justification. Additionally, it provides for due process of law and a speedy trial by an impartial jury in criminal cases.''
+
+The module output also lists three ``Sources'', indicating where in the ingested documents the pipeline found the information used in the generated answer.
+
+\subsubsection{Other Hyperparameters}
+\begin{itemize}
+    \item \textbf{Chunk Size} --- the size of the text chunks that are embedded and stored; generally, the smallest chunk size that still works well.
+    \item \textbf{Similarity Score} --- e.g., cosine similarity, a measure used to determine how similar two documents or vectors are.
+    \item \textbf{Embedding} --- a representation of text in a high-dimensional vector space, which allows the semantic meaning of words or phrases to be captured.
+\end{itemize}
+
+\subsection{GUI}
+GRAG provides a RAG GUI cookbook that can be run using Streamlit. To run it, the user needs to download and quantize the LLM models and ingest the documents they would like to use. As an example, we loaded the United States Constitution as a demo. The ``Temperature'' toggle controls how ``creative'' the generated answer is, and the ``Top-k'' toggle controls how many relevant chunks are retrieved.
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.8\textwidth]{capstone_report/fig/gui.png}
+    \caption{U.S. Constitution GUI Demo}
+\end{figure}
+
+% ------------------------------------------------------------------------------------------------------------------------
+
+\section{Challenges \& Future Work}
+
+\subsection{Evaluation}
+When evaluating a RAG pipeline, we assess the performance of both the retrieval and the generation processes. For retrieval, we measure how relevant the retrieved information is using precision/recall metrics. For generation, we examine the quality of the generated answer given the retrieved content.
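+
+As a simple illustration of the retrieval side, precision and recall at \(k\) can be computed directly from the retrieved chunk ids, provided the chunks relevant to each query are known. The snippet below is a minimal, framework-agnostic sketch using hypothetical chunk ids; it is not part of GRAG.
+\begin{verbatim}
+# Minimal sketch of retrieval precision/recall at k (illustrative only).
+# `retrieved` is the ranked list of chunk ids returned by the vector store;
+# `relevant` is the set of chunk ids known to answer the query.
+
+def precision_recall_at_k(retrieved, relevant, k):
+    top_k = retrieved[:k]
+    hits = sum(1 for chunk_id in top_k if chunk_id in relevant)
+    precision = hits / k
+    recall = hits / len(relevant) if relevant else 0.0
+    return precision, recall
+
+# Example: 2 of the top-4 retrieved chunks are relevant, out of 3 relevant overall.
+print(precision_recall_at_k(["c7", "c2", "c9", "c4"], {"c2", "c4", "c5"}, k=4))
+# -> (0.5, 0.6666666666666666)
+\end{verbatim}
+In practice, the main difficulty is obtaining a ground-truth set of relevant chunks for each query.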
+
+For generation evaluation, traditional metrics such as ROUGE and BLEU score performance by counting n-gram overlaps between the generated text and a reference text. These metrics are quite limited and do not correlate well with human evaluation \cite{sai2020survey}. Recently, LLMs have been proposed as a solution for reference-free, end-to-end evaluation of question-answering pipelines such as RAG: an LLM can be used to judge whether a generated answer is correct based on its understanding of the context rather than on reference texts.
+
+Currently, the most promising framework is Ragas \cite{es2023ragas}. However, it requires access to OpenAI models to assess performance, which is problematic for our project given our limited resources.
+
+\subsection{Document Parsing}
+While we have made improvements in parsing PDF files, the results are not consistent across different PDF layouts. More experimentation is needed to produce more robust and accurate results. We also plan to implement parsing for more file types, such as HTML.
+
+\subsection{Graphs Implementation}
+Traditionally, RAG implementations use vector databases for the retrieval process. Since RAG operates on vector embeddings, a vector database is a natural choice, offering straightforward retrieval and efficient similarity search.
+However, because retrieval simply returns the closest vectors by cosine similarity, it leaves room for error when the database does not contain information relevant to the input prompt. This makes RAG heavily reliant on the quality of the data and of the embedding process.
+
+Graphs present a promising alternative because they encode explicit relationships between entities; in theory, this could mitigate the vector database's limitations and its reliance on cosine similarity alone.
+There is limited existing literature comparing the performance of vector versus graph structures in a RAG implementation. Our next step is to implement a graph database backend for GRAG and compare its performance against vector databases.
+\newpage
+% ------------------------------------------------------------------------------------------------------------------------
+
+\bibliographystyle{IEEEtran}
+\bibliography{references}
+
+
+
+%------ To create Appendix with additional stuff -------%
+\newpage
+\appendix
+\section{Appendix}
+%Put data files, CAD drawings, additional sketches, etc.
+\begin{figure}[h!]
+    \centering
+    \begin{subfigure}[b]{0.8\textwidth}
+        \includegraphics[width=\textwidth]{capstone_report/fig/og_text.png}
+        \caption{Original Text}
+        \label{fig:twocol-original}
+    \end{subfigure}
+
+    \begin{subfigure}[b]{0.8\textwidth}
+        \includegraphics[width=\textwidth]{capstone_report/fig/og_text_output.png}
+        \caption{Parsed Text}
+        \label{fig:twocol-parsed}
+    \end{subfigure}
+    \caption{Double-column Parsing Example}
+    \label{fig:twocol}
+\end{figure}
+\end{document}
\ No newline at end of file
diff --git a/full_report/Latex_report/references.bib b/full_report/Latex_report/references.bib
index 8859e14..802b229 100644
--- a/full_report/Latex_report/references.bib
+++ b/full_report/Latex_report/references.bib
@@ -1,64 +1,80 @@
-%% This BibTeX bibliography file was created using BibDesk.
-%% http://bibdesk.sourceforge.net/
+@misc{sai2020survey,
+ title={A Survey of Evaluation Metrics Used for NLG Systems},
+ author={Ananya B. Sai and Akash Kumar Mohankumar and Mitesh M.
Khapra}, + year={2020}, + eprint={2008.12009}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@misc{es2023ragas, + title={RAGAS: Automated Evaluation of Retrieval Augmented Generation}, + author={Shahul Es and Jithin James and Luis Espinosa-Anke and Steven Schockaert}, + year={2023}, + eprint={2309.15217}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} -@article{byzantine-pki, - Author = {Vivek Pathak and Liviu Iftode}, - Journal = {Computer Networks}, - Number = {4}, - Pages = {579--596}, - Title = {Byzantine fault tolerant public key authentication in peer-to-peer systems}, - Url = {http://dx.doi.org/10.1016/j.comnet.2005.07.007}, - Volume = {50}, - Year = {2006}} +@misc{lewis2021retrievalaugmented, + title={Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks}, + author={Patrick Lewis and Ethan Perez and Aleksandra Piktus and Fabio Petroni and Vladimir Karpukhin and Naman Goyal and Heinrich Küttler and Mike Lewis and Wen-tau Yih and Tim Rocktäschel and Sebastian Riedel and Douwe Kiela}, + year={2021}, + eprint={2005.11401}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@misc{langchain2023llamacpp, + author = {LangChain}, + title = {LlamaCPP Integration}, + year = {2023}, + howpublished = {\url{https://python.langchain.com/v0.1/docs/integrations/llms/llamacpp/}}, + note = {Accessed: Jan 20, 2024} +} -@article{atomic-mcast-tcs01, - Author = {Rachid Guerraoui}, - Journal = {Theoretical Computer Science}, - Pages = {297--316}, - Title = {Genuine Atomic Multicast in Asynchronous Distributed Systems}, - Volume = {254}, - Year = {2001}} +@misc{unstrio, + author = {Unstructured}, + title = {unstructured.io}, + year = {2024}, + howpublished = {\url{https://unstructured.io/}}, + note = {Accessed: Jan 20, 2024} +} +@misc{pytesseract, + author = {Samuel Hoffstaetter}, + title = {pytesseract: Python bindings for Google Tesseract OCR}, + year = {2018}, + howpublished = {\url{https://github.com/h/pytesseract}}, + note = {Accessed: Jan 20, 2024} +} -@inproceedings{sybilattack, - Address = {London, {UK}}, - Author = {Douceur, John R.}, - Booktitle = {Revised Papers from the First International Workshop on Peer-to-Peer Systems}, - Isbn = {3-540-44179-4}, - Lccn = {2646}, - Month = {7-8 March}, - Pages = {251--260}, - Publisher = {Springer, Berlin}, - Series = {{IPTPS} '01}, - Title = {The Sybil Attack}, - Year = {2002}} +@misc{pdfplumber, + author = {Jeremy Singer-Vine}, + title = {pdfplumber}, + year = {2024}, + howpublished = {\url{https://github.com/jsvine/pdfplumber}}, + note = {Accessed: Jan 20, 2024} +} -@article{asymencription, - Author = {Simmons, Gustavus J.}, - Doi = {10.1145/356789.356793}, - Issn = {0360-0300}, - Journal = {{ACM} Comput. 
Surv.}, - Lccn = {0115}, - Month = dec, - Number = {4}, - Pages = {305--330}, - Title = {Symmetric and Asymmetric Encryption}, - Volume = {11}, - Year = {1979}, - Bdsk-Url-1 = {http://dx.doi.org/10.1145/356789.356793}} +@misc{huggingface, + author = {HuggingFace}, + title = {HuggingFace}, + year = {2024}, + howpublished = {\url{https://huggingface.co/}}, + note = {Accessed: Jan 20, 2024} +} -@misc{psn-fail, - Author = {Shane Richmond and Christopher Williams}, - Howpublished = {http://www.telegraph.co.uk/technology/news/8475728/ Millions-of-internet-users-hit-by-massive-Sony-PlayStation-data-theft.html}, - Note = {The Telegraph}, - Title = {Millions of internet users hit by massive Sony PlayStation data theft}, - Year = {2011}} +@misc{chroma, + author = {Chroma}, + title = {Chroma}, + year = {2023}, + howpublished = {\url{https://docs.trychroma.com/}}, + note = {Accessed: Jan 20, 2024}} -@misc{verisign-fail, - Author = {Joseph Menn}, - Howpublished = {http://www.reuters.com/article/2012/02/02/us-hacking-verisign-idUSTRE8110Z820120202}, - Note = {Reuters}, - Title = {Key Internet operator VeriSign hit by hackers}, - Year = {2012}} \ No newline at end of file +@misc{deeplake, + author = {deeplake}, + title = {deeplake}, + year = {2022}, + howpublished = {\url{https://docs.deeplake.ai/en/latest/deeplake.html}}, + note = {Accessed: Jan 20, 2024}} \ No newline at end of file