From c9ae5a1beebd82c463c46c34b21b5ce491019975 Mon Sep 17 00:00:00 2001
From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com>
Date: Mon, 20 May 2024 21:35:35 +0300
Subject: [PATCH] [FSTORE-1404] LLM PDF Tutorial (#266)
* LLM PDF Search Tutorial using RAG and Fine-Tuning
---
README.md | 4 +-
.../llm_pdfs/1_feature_backfill.ipynb | 285 +++++++++++++
.../llm_pdfs/1a_feature_pipeline.py | 69 ++++
.../llm_pdfs/1b_dataset_generation.ipynb | 290 ++++++++++++++
.../llm_pdfs/2_training_pipeline.ipynb | 376 ++++++++++++++++++
.../llm_pdfs/3_inference_pipeline.ipynb | 366 +++++++++++++++++
advanced_tutorials/llm_pdfs/app.py | 127 ++++++
advanced_tutorials/llm_pdfs/config.py | 16 +
.../functions/connect_to_google_drive.py | 19 +
.../llm_pdfs/functions/llm_chain.py | 133 +++++++
.../llm_pdfs/functions/pdf_preprocess.py | 96 +++++
.../llm_pdfs/functions/prompt_engineering.py | 151 +++++++
.../llm_pdfs/functions/text_preprocess.py | 73 ++++
advanced_tutorials/llm_pdfs/requirements.txt | 22 +
14 files changed, 2026 insertions(+), 1 deletion(-)
create mode 100644 advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb
create mode 100644 advanced_tutorials/llm_pdfs/1a_feature_pipeline.py
create mode 100644 advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb
create mode 100644 advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb
create mode 100644 advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb
create mode 100644 advanced_tutorials/llm_pdfs/app.py
create mode 100644 advanced_tutorials/llm_pdfs/config.py
create mode 100644 advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py
create mode 100644 advanced_tutorials/llm_pdfs/functions/llm_chain.py
create mode 100644 advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py
create mode 100644 advanced_tutorials/llm_pdfs/functions/prompt_engineering.py
create mode 100644 advanced_tutorials/llm_pdfs/functions/text_preprocess.py
create mode 100644 advanced_tutorials/llm_pdfs/requirements.txt
diff --git a/README.md b/README.md
index dd8e87cc..01793905 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ In order to understand the tutorials you need to be familiar with general concep
- [Iris](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/iris): Classify iris flower species.
- [Loan Approval](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/loan_approval): Predict loan approvals.
- Advanced Tutorials:
- - [Air Quality](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/air_quality): Predict the Air Quality value (PM2.5) in Europe and USA using weather features and air quality features of the previous days.
+ - [Air Quality](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/air_quality): Creating an air quality AI assistant that displays and explains air quality indicators for specific dates or periods, using Function Calling for LLMs and a RAG approach without a vector database.
- [Bitcoin](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/bitcoin): Predict Bitcoin price using timeseries features and tweets sentiment analysis.
- [Citibike](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/citibike): Predict the number of citibike users on each citibike station in the New York City.
- [Credit Scores](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/credit_scores): Predict clients' repayment abilities.
@@ -50,6 +50,8 @@ In order to understand the tutorials you need to be familiar with general concep
- [NYC Taxi Fares](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/nyc_taxi_fares): Predict the fare amount for a taxi ride in New York City given the pickup and dropoff locations.
- [Recommender System](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/recommender-system): Build a recommender system for fashion items.
- [TimeSeries](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/timeseries): Timeseries price prediction.
+ - [LLM PDF](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/llm_pdfs): An AI assistant that utilizes a Retrieval-Augmented Generation (RAG) system to provide accurate answers to user questions by retrieving relevant context from PDF documents.
+ - [Fraud Cheque Detection](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/fraud_cheque_detection): Building an AI assistant that detects fraudulent scanned cheque images and generates explanations for the fraud classification, using a fine-tuned open-source LLM.
- [Keras model and Sklearn Transformation Functions with Hopsworks Model Registry](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/transformation_functions/keras): How to register Sklearn Transformation Functions and Keras model in the Hopsworks Model Registry, how to retrieve them and then use in training and inference pipelines.
- [PyTorch model and Sklearn Transformation Functions with Hopsworks Model Registry](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/transformation_functions/pytorch): How to register Sklearn Transformation Functions and PyTorch model in the Hopsworks Model Registry, how to retrieve them and then use in training and inference pipelines.
   - [Sklearn Transformation Functions With Hopsworks Model Registry](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/transformation_functions/sklearn): How to register sklearn.pipeline with transformation functions and classifier in Hopsworks Model Registry and use it in training and inference pipelines.
diff --git a/advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb b/advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb
new file mode 100644
index 00000000..652fec9e
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb
@@ -0,0 +1,285 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "82622ee3",
+ "metadata": {},
+ "source": [
+ "## 📝 Imports "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ade7fe1f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install -r requirements.txt -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7ab771e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import PyPDF2\n",
+ "import pandas as pd\n",
+ "from sentence_transformers import SentenceTransformer\n",
+ "\n",
+ "from functions.pdf_preprocess import (\n",
+ " download_files_to_folder, \n",
+ " process_pdf_file,\n",
+ ")\n",
+ "from functions.text_preprocess import process_text_data\n",
+ "import config\n",
+ "\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7e8f1796",
+ "metadata": {},
+ "source": [
+ "## 💾 Download files from Google Drive "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ea8c756e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Call the function to download files\n",
+ "new_files = download_files_to_folder(\n",
+ " config.FOLDER_ID, \n",
+ " config.DOWNLOAD_PATH,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f783e27e",
+ "metadata": {},
+ "source": [
+ "## 🧬 Text Extraction "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0b3b6715",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize an empty list\n",
+ "document_text = []\n",
+ "\n",
+ "for file in new_files:\n",
+ " process_pdf_file(\n",
+ " file, \n",
+ " document_text, \n",
+ " config.DOWNLOAD_PATH,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "348b723e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a DataFrame\n",
+ "columns = [\"file_name\", \"file_link\", \"page_number\", \"text\"]\n",
+ "df_text = pd.DataFrame(\n",
+ " data=document_text,\n",
+ " columns=columns,\n",
+ ")\n",
+ "# Display the DataFrame\n",
+ "df_text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "62a70763",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Process text data using the process_text_data function\n",
+ "df_text_processed = process_text_data(df_text)\n",
+ "\n",
+ "# Display the processed DataFrame\n",
+ "df_text_processed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10f9ea36",
+ "metadata": {},
+ "source": [
+ "## ⚙️ Embeddings Creation "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9805c689",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the SentenceTransformer model\n",
+ "model = SentenceTransformer(\n",
+ " config.MODEL_SENTENCE_TRANSFORMER,\n",
+ ").to(config.DEVICE)\n",
+ "model.device"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1b7a89a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Generate embeddings for the 'text' column using the SentenceTransformer model\n",
+ "df_text_processed['embeddings'] = pd.Series(\n",
+ " model.encode(df_text_processed['text']).tolist(),\n",
+ ")\n",
+ "\n",
+ "# Create a new column 'context_id' with values ranging from 0 to the number of rows in the DataFrame\n",
+ "df_text_processed['context_id'] = [*range(df_text_processed.shape[0])]\n",
+ "\n",
+ "# Display the resulting DataFrame with the added 'embeddings' and 'context_id' columns\n",
+ "df_text_processed"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d2bced31",
+ "metadata": {},
+ "source": [
+ "## 🔮 Connecting to Hopsworks Feature Store "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7caf764d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hopsworks\n",
+ "\n",
+ "project = hopsworks.login()\n",
+ "\n",
+ "fs = project.get_feature_store() "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0ed9ac69",
+ "metadata": {},
+ "source": [
+ "## 🪄 Feature Group Creation "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f5e486b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from hsfs import embedding\n",
+ "\n",
+ "# Create the Embedding Index\n",
+ "emb = embedding.EmbeddingIndex()\n",
+ "\n",
+ "emb.add_embedding(\n",
+ " \"embeddings\", \n",
+ " model.get_sentence_embedding_dimension(),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6e32b548",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'documents_fg' feature group\n",
+ "documents_fg = fs.get_or_create_feature_group(\n",
+ " name=\"documents_fg\",\n",
+ " embedding_index=emb,\n",
+ " primary_key=['context_id'],\n",
+ " version=1,\n",
+ " description='Information from various files, presenting details like file names, source links, and structured text excerpts from different pages and paragraphs.',\n",
+ " online_enabled=True,\n",
+ ")\n",
+ "\n",
+ "documents_fg.insert(df_text_processed)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d39a9ed6",
+ "metadata": {},
+ "source": [
+ "## 🪄 Feature View Creation \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a7bc2f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'documents' feature view\n",
+ "feature_view = fs.get_or_create_feature_view(\n",
+ " name=\"documents\",\n",
+ " version=1,\n",
+ " description='Chunked context for RAG system',\n",
+ " query=documents_fg.select([\"file_name\", \"file_link\", \"page_number\", \"paragraph\", \"text\"]),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "708b9a5f",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/llm_pdfs/1a_feature_pipeline.py b/advanced_tutorials/llm_pdfs/1a_feature_pipeline.py
new file mode 100644
index 00000000..84c90c8d
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/1a_feature_pipeline.py
@@ -0,0 +1,69 @@
+import PyPDF2
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+
+from functions.pdf_preprocess import download_files_to_folder, process_pdf_file
+from functions.text_preprocess import process_text_data
+import config
+
+import hopsworks
+
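+# Incremental feature pipeline: download only new PDFs from Google Drive,
+# chunk and embed them, then append the rows to the existing feature group.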
+def pipeline():
+ # Call the function to download files
+ new_files = download_files_to_folder(
+ config.FOLDER_ID,
+ config.DOWNLOAD_PATH,
+ )
+
+ if len(new_files) == 0:
+ print('⛳️ Your folder is up to date!')
+ return
+
+ # Initialize an empty list
+ document_text = []
+
+ for file in new_files:
+ process_pdf_file(
+ file,
+ document_text,
+ config.DOWNLOAD_PATH,
+ )
+
+    # Create a DataFrame matching the four fields appended by process_pdf_file
+    columns = ["file_name", "file_link", "page_number", "text"]
+ df_text = pd.DataFrame(
+ data=document_text,
+ columns=columns,
+ )
+
+ # Process text data using the process_text_data function
+ df_text_processed = process_text_data(df_text)
+
+ # Retrieve a SentenceTransformer
+ model = SentenceTransformer(
+ config.MODEL_SENTENCE_TRANSFORMER,
+ ).to(config.DEVICE)
+
+ # Generate embeddings for the 'text' column using the SentenceTransformer model
+ df_text_processed['embeddings'] = pd.Series(
+ model.encode(df_text_processed['text']).tolist(),
+ )
+
+    # Create a new column 'context_id' with sequential values from 0 to number of rows - 1
+ df_text_processed['context_id'] = [*range(df_text_processed.shape[0])]
+
+
+ project = hopsworks.login()
+
+ fs = project.get_feature_store()
+
+ documents_fg = fs.get_feature_group(
+ name="documents_fg",
+ version=1,
+ )
+
+ documents_fg.insert(df_text_processed)
+ return
+
+if __name__ == '__main__':
+ pipeline()
diff --git a/advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb b/advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb
new file mode 100644
index 00000000..d2fd826e
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb
@@ -0,0 +1,290 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0279e128",
+ "metadata": {},
+ "source": [
+ "## 📝 Imports "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e8efd4e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from openai import OpenAI\n",
+ "import getpass\n",
+ "import json\n",
+ "import pandas as pd\n",
+ "import json_repair\n",
+ "from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d389343",
+ "metadata": {},
+ "source": [
+ "## ⚙️ Settings "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "270b84fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.environ[\"OPENAI_API_KEY\"] = os.getenv(\"OPENAI_API_KEY\") or getpass.getpass('🔑 Enter your OpenAI API key: ')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d58f52ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "client = OpenAI(\n",
+ " api_key=os.environ[\"OPENAI_API_KEY\"],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c16fbf15",
+ "metadata": {},
+ "source": [
+ "## 🔮 Connecting to Hopsworks Feature Store "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3a8916cf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hopsworks\n",
+ "\n",
+ "project = hopsworks.login()\n",
+ "\n",
+ "fs = project.get_feature_store() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32f2bbae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve the 'documents' feature view\n",
+ "feature_view = fs.get_feature_view(\n",
+ " name='documents',\n",
+ " version=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f60460ab",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize batch scoring for feature view\n",
+ "feature_view.init_batch_scoring()\n",
+ "\n",
+ "# Get batch data from the feature view\n",
+ "data = feature_view.get_batch_data()\n",
+ "\n",
+ "# Filter data to include only rows where the 'text' column length is greater than 2500\n",
+ "data_filtered = data[data.text.str.len() > 2500]\n",
+ "\n",
+ "# Display the filtered data\n",
+ "data_filtered"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d3d2fcb2",
+ "metadata": {},
+ "source": [
+ "## 🪄 Dataset Generation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "80d80597",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generate_questions(context):\n",
+ "\n",
+ " instruction = \"\"\"\n",
+ " The given text is the result of the text extraction from the PDF files. \n",
+ " Generate 3 meaningful questions on the text and the respective answers.\n",
+ " Reply strictly in the JSON format:\n",
+ " {\n",
+ " \"questions\": [\"question1\", \"question2\", \"question3\"],\n",
+ " \"answers\": [\"answer1\", \"answer2\", \"answer3\"]\n",
+ " }\n",
+ "\n",
+ " Ensure that the lists of questions and answers are complete and properly formatted. \n",
+ " DO NOT include any additional information or characters outside the specified JSON format. \n",
+ " The response must consist only of the requested JSON structure. \n",
+ " If the generated content does not meet the specified format, please make the necessary adjustments to ensure compliance.\"\"\"\n",
+ "\n",
+ " prompt = f\"\\nContext: {context}\\nQuestion: {instruction}\"\n",
+ "\n",
+ " # Create a chatbot\n",
+ " completion = client.chat.completions.create(\n",
+ " model=\"gpt-3.5-turbo\",\n",
+ " # Pre-define conversation messages for the possible roles \n",
+ " messages=[\n",
+ " {\"role\": \"user\", \"content\": prompt},\n",
+ " ]\n",
+ " )\n",
+ " response = json_repair.loads(completion.choices[0].message.content)\n",
+ " \n",
+ " response['context'] = context\n",
+ " \n",
+ " return response\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8d3642f6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Generate question-answer pairs\n",
+ "generated_questions = [\n",
+ " generate_questions(text)\n",
+ " for text \n",
+ " in tqdm(data_filtered['text'])\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d1f1cc46",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a DataFrame from the generated_questions\n",
+ "df = pd.DataFrame(generated_questions)\n",
+ "\n",
+ "# Display the first few rows of the DataFrame\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7f906442",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Explode the DataFrame to expand lists in specified columns ('questions' and 'answers')\n",
+ "df_expanded = df.explode(['questions', 'answers']).reset_index(drop=True)\n",
+ "\n",
+ "# Reset the index to create a new default integer index\n",
+ "df_expanded.reset_index(inplace=True)\n",
+ "\n",
+ "# Rename the 'index' column to 'record_id' for clarity\n",
+ "df_expanded.rename(columns={'index': 'record_id'}, inplace=True)\n",
+ "\n",
+ "# Display the expanded DataFrame\n",
+ "df_expanded"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4fe81b9f",
+ "metadata": {},
+ "source": [
+ "## 🪄 CQA Feature Group Creation "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0a84b387",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'cqa_fg' feature group\n",
+ "cqa_fg = fs.get_or_create_feature_group(\n",
+ " name=\"cqa_fg\",\n",
+ " version=1,\n",
+ " description='Context-Question-Response Data',\n",
+ " primary_key=['record_id'],\n",
+ ")\n",
+ "\n",
+ "cqa_fg.insert(df_expanded)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ed251e4",
+ "metadata": {},
+ "source": [
+ "## 🪄 CQA Feature View Creation "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed7146f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get or create the 'cqa' feature view\n",
+ "feature_view = fs.get_or_create_feature_view(\n",
+ " name=\"cqa\",\n",
+ " version=1,\n",
+ " query=cqa_fg.select([\"context\", \"questions\", \"responses\"]),\n",
+ " description='Context-Question-Response pairs for model fine-tuning',\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "02f6f11a",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb b/advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb
new file mode 100644
index 00000000..4177c447
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb
@@ -0,0 +1,376 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "cc6015d0",
+ "metadata": {},
+ "source": [
+ "## 📝 Imports "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9ba30ecb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from datasets import Dataset\n",
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
+ "from peft import LoraConfig\n",
+ "from transformers import TrainingArguments\n",
+ "from trl import SFTTrainer\n",
+ "\n",
+ "from functions.prompt_engineering import generate_prompt\n",
+ "import config"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1270e5f8",
+ "metadata": {},
+ "source": [
+ "## 🔮 Connecting to Hopsworks Feature Store "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e517b1cd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hopsworks\n",
+ "\n",
+ "project = hopsworks.login()\n",
+ "\n",
+ "fs = project.get_feature_store() \n",
+ "mr = project.get_model_registry()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "86043802",
+ "metadata": {},
+ "source": [
+ "## 🪝 Feature View Retrieval "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4007db72",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve the 'cqa' feature view\n",
+ "feature_view = fs.get_feature_view(\n",
+ " name='cqa',\n",
+ " version=1,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "83b00e9e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize batch scoring for the feature view\n",
+ "feature_view.init_batch_scoring()\n",
+ "\n",
+ "# Get batch data from the feature view\n",
+ "data = feature_view.get_batch_data()\n",
+ "\n",
+ "# Display the first three rows of the batch data\n",
+ "data.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "64dab547",
+ "metadata": {},
+ "source": [
+ "## 🗄️ Dataset Creation "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "594f4e1a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Generate prompts for each record in the DataFrame using context, questions, and responses\n",
+ "prompts = data.apply(\n",
+ " lambda record: generate_prompt(record['context'], record['questions']) + f'\\n### RESPONSE:\\n{record[\"responses\"]}', \n",
+ " axis=1,\n",
+ ").tolist()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6bd1e493",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a dataset from a dictionary with a single column named \"text\" containing prompts\n",
+ "dataset = Dataset.from_dict({\n",
+ " \"text\": prompts,\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0756b8e7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(dataset[10]['text'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc161e58",
+ "metadata": {},
+ "source": [
+ "## ⬇️ Model Loading "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "62477b0e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the tokenizer for Mistral-7B-Instruct model\n",
+ "tokenizer = AutoTokenizer.from_pretrained(\n",
+ " config.MODEL_ID,\n",
+ ")\n",
+ "\n",
+ "# Set the pad token to the unknown token to handle padding\n",
+ "tokenizer.pad_token = tokenizer.unk_token\n",
+ "\n",
+ "# Set the padding side to \"right\" to prevent warnings during tokenization\n",
+ "tokenizer.padding_side = \"right\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0153d320",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# BitsAndBytesConfig int-4 config\n",
+ "bnb_config = BitsAndBytesConfig(\n",
+ " load_in_4bit=True, \n",
+ " bnb_4bit_use_double_quant=True, \n",
+ " bnb_4bit_quant_type=\"nf4\", \n",
+ " bnb_4bit_compute_dtype=torch.bfloat16,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f8a4d9ee",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the Mistral-7B-Instruct model with quantization configuration\n",
+ "model = AutoModelForCausalLM.from_pretrained(\n",
+ " config.MODEL_ID,\n",
+ " device_map=\"auto\",\n",
+ " quantization_config=bnb_config,\n",
+ ")\n",
+ "\n",
+ "# Configure the pad token ID in the model to match the tokenizer's pad token ID\n",
+ "model.config.pad_token_id = tokenizer.pad_token_id"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "93c7ba90",
+ "metadata": {},
+ "source": [
+ "## ⚙️ Configuration "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "18d24668",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "peft_config = LoraConfig(\n",
+ " lora_alpha=64,\n",
+ " lora_dropout=0.1,\n",
+ " r=32,\n",
+ " bias=\"none\",\n",
+ " task_type=\"CAUSAL_LM\", \n",
+ " target_modules=[\n",
+ " \"q_proj\",\n",
+ " \"k_proj\",\n",
+ " \"v_proj\",\n",
+ " \"o_proj\",\n",
+ " \"gate_proj\",\n",
+ " \"up_proj\",\n",
+ " \"down_proj\",\n",
+ " \"lm_head\",\n",
+ " ],\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ebade183",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training_arguments = TrainingArguments(\n",
+ " output_dir=\"mistral7b_finetuned\", # directory to save and repository id\n",
+ " num_train_epochs=3, # number of training epochs\n",
+ " per_device_train_batch_size=3, # batch size per device during training\n",
+ " gradient_accumulation_steps=2, # number of steps before performing a backward/update pass\n",
+ " gradient_checkpointing=True, # use gradient checkpointing to save memory\n",
+ " optim=\"adamw_torch_fused\", # use fused adamw optimizer\n",
+ " logging_steps=10, # log every 10 steps\n",
+ " save_strategy=\"epoch\", # save checkpoint every epoch\n",
+ " learning_rate=2e-4, # learning rate, based on QLoRA paper\n",
+ " bf16=True, # use bfloat16 precision\n",
+ " tf32=True, # use tf32 precision\n",
+ " max_grad_norm=0.3, # max gradient norm based on QLoRA paper\n",
+ " warmup_ratio=0.03, # warmup ratio based on QLoRA paper\n",
+ " lr_scheduler_type=\"constant\", # use constant learning rate scheduler\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "36e79a43",
+ "metadata": {},
+ "source": [
+ "## 🏃🏻♂️ Training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13af595e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the Supervised Fine-tuning Trainer\n",
+ "trainer = SFTTrainer(\n",
+ " model=model,\n",
+ " train_dataset=dataset,\n",
+ " peft_config=peft_config,\n",
+ " max_seq_length=4096,\n",
+ " tokenizer=tokenizer,\n",
+ " args=training_arguments,\n",
+ " dataset_text_field='text',\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e2c9a416",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Train the model\n",
+ "trainer.train()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "85e840c2",
+ "metadata": {},
+ "source": [
+ "## 💾 Saving Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "75940ca2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the trained model\n",
+ "trainer.save_model()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bfaae161",
+ "metadata": {},
+ "source": [
+ "## 🗄️ Model Registry"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2ff14642",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a Python model in the model registry\n",
+ "model_llm = mr.python.create_model(\n",
+ " name=\"mistral_model\", \n",
+ " description=\"Mistral Fine-tuned Model\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fbce3ba9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Save the model directory with the fine-tuned model to the model registry\n",
+ "model_llm.save(training_arguments.output_dir)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ecc9b1d0",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb b/advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb
new file mode 100644
index 00000000..9e2f00dd
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb
@@ -0,0 +1,366 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "be60a8be",
+ "metadata": {},
+ "source": [
+ "## 📝 Imports "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f95e1e54",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sentence_transformers import SentenceTransformer\n",
+ "from FlagEmbedding import FlagReranker\n",
+ "\n",
+ "from functions.llm_chain import get_llm_chain\n",
+ "from functions.prompt_engineering import get_context_and_source\n",
+ "import config\n",
+ "\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f3a2715",
+ "metadata": {},
+ "source": [
+ "## 🔮 Connecting to Hopsworks Feature Store "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d292081d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import hopsworks\n",
+ "\n",
+ "project = hopsworks.login()\n",
+ "\n",
+ "fs = project.get_feature_store()\n",
+ "mr = project.get_model_registry()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "733aa65d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve the 'documents' feature view\n",
+ "feature_view = fs.get_feature_view(\n",
+ " name=\"documents\", \n",
+ " version=1,\n",
+ ") \n",
+ "\n",
+ "# Initialize serving\n",
+ "feature_view.init_serving(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1e562e9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get the Mistral model from Model Registry\n",
+ "mistral_model = mr.get_model(\n",
+ " name=\"mistral_model\",\n",
+ " version=1,\n",
+ ")\n",
+ "\n",
+ "# Download the Mistral model files to a local directory\n",
+ "saved_model_dir = mistral_model.download()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0235999b",
+ "metadata": {},
+ "source": [
+ "## ⛓️ LLM Chain "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bc70c06b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "llm_chain = get_llm_chain(saved_model_dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e6b5249d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "session_id = {\n",
+ " \"configurable\": {\"session_id\": \"default\"}\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d9377ab5",
+ "metadata": {},
+ "source": [
+ "## 🗄️ Sentence Transformer Loading "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "89b5ce52",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the Sentence Transformer\n",
+ "sentence_transformer = SentenceTransformer(\n",
+ " config.MODEL_SENTENCE_TRANSFORMER,\n",
+ ").to(config.DEVICE)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "40126e56",
+ "metadata": {},
+ "source": [
+ "## 🧬 Reranking "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72cfcbd2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_reranker():\n",
+ " reranker = FlagReranker(\n",
+ " 'BAAI/bge-reranker-large', \n",
+ " use_fp16=True,\n",
+ " ) \n",
+ " return reranker"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "491e3847",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve a reranker\n",
+ "reranker = get_reranker()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c739dd2d",
+ "metadata": {},
+ "source": [
+ "## 🗄️ Context Retrieval "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "987d3108",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# User Question Example\n",
+ "user_input = 'What are the best risk reporting practices?' "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "02199904",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Retrieve reranked context and source\n",
+ "context, source = get_context_and_source(\n",
+ " user_input, \n",
+ " sentence_transformer,\n",
+ " feature_view, \n",
+ " reranker,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "decf4d3d",
+ "metadata": {},
+ "source": [
+ "## 🚀 Model Inference "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "622bfb9a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Generate model response\n",
+ "model_output = llm_chain.invoke({\n",
+ " \"context\": context, \n",
+ " \"question\": user_input,\n",
+ " },\n",
+ " session_id,\n",
+ ")\n",
+ "\n",
+ "print(model_output.split('### RESPONSE:\\n')[-1] + source)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5711145e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_input = 'What is Adaptability?'\n",
+ "\n",
+ "context, source = get_context_and_source(\n",
+ " user_input, \n",
+ " sentence_transformer,\n",
+ " feature_view, \n",
+ " reranker,\n",
+ ")\n",
+ "\n",
+ "model_output = llm_chain.invoke({\n",
+ " \"context\": context, \n",
+ " \"question\": user_input,\n",
+ " },\n",
+ " session_id,\n",
+ ")\n",
+ "\n",
+ "print(model_output.split('### RESPONSE:\\n')[-1] + source)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "044e9b15",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_input = 'What is a risk management?'\n",
+ "\n",
+ "context, source = get_context_and_source(\n",
+ " user_input, \n",
+ " sentence_transformer,\n",
+ " feature_view, \n",
+ " reranker,\n",
+ ")\n",
+ "\n",
+ "model_output = llm_chain.invoke({\n",
+ " \"context\": context, \n",
+ " \"question\": user_input,\n",
+ " },\n",
+ " session_id,\n",
+ ")\n",
+ "\n",
+ "print(model_output.split('### RESPONSE:\\n')[-1] + source)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "02be4b75",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_input = 'What is the purpose of maintaining an up-to-date data-flow diagram?'\n",
+ "\n",
+ "context, source = get_context_and_source(\n",
+ " user_input, \n",
+ " sentence_transformer,\n",
+ " feature_view, \n",
+ " reranker,\n",
+ ")\n",
+ "\n",
+ "model_output = llm_chain.invoke({\n",
+ " \"context\": context, \n",
+ " \"question\": user_input,\n",
+ " },\n",
+ " session_id,\n",
+ ")\n",
+ "\n",
+ "print(model_output.split('### RESPONSE:\\n')[-1] + source)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "43a409ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "user_input = 'Why are security and privacy controls important?'\n",
+ "\n",
+ "context, source = get_context_and_source(\n",
+ " user_input, \n",
+ " sentence_transformer,\n",
+ " feature_view, \n",
+ " reranker,\n",
+ ")\n",
+ "\n",
+ "model_output = llm_chain.invoke({\n",
+ " \"context\": context, \n",
+ " \"question\": user_input,\n",
+ " },\n",
+ " session_id,\n",
+ ")\n",
+ "\n",
+ "print(model_output.split('### RESPONSE:\\n')[-1] + source)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "108ca3db",
+ "metadata": {},
+ "source": [
+ "---"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/advanced_tutorials/llm_pdfs/app.py b/advanced_tutorials/llm_pdfs/app.py
new file mode 100644
index 00000000..4db37f4c
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/app.py
@@ -0,0 +1,127 @@
+import streamlit as st
+import hopsworks
+from sentence_transformers import SentenceTransformer
+from FlagEmbedding import FlagReranker
+from functions.prompt_engineering import get_context_and_source
+from functions.llm_chain import get_llm_chain
+import config
+import warnings
+warnings.filterwarnings('ignore')
+
+st.title("💬 AI assistant")
+
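+# Streamlit reruns this script on every user interaction; st.cache_resource
+# keeps heavyweight objects (connections, models) alive across reruns.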
+@st.cache_resource()
+def connect_to_hopsworks():
+ # Initialize Hopsworks feature store connection
+ project = hopsworks.login()
+ fs = project.get_feature_store()
+ mr = project.get_model_registry()
+
+ # Retrieve the 'documents' feature view
+ feature_view = fs.get_feature_view(
+ name="documents",
+ version=1,
+ )
+
+ # Initialize serving
+ feature_view.init_serving(1)
+
+ # Get the Mistral model from Model Registry
+ mistral_model = mr.get_model(
+ name="mistral_model",
+ version=1,
+ )
+
+ # Download the Mistral model files to a local directory
+ saved_model_dir = mistral_model.download()
+
+ return feature_view, saved_model_dir
+
+
+@st.cache_resource()
+def get_models(saved_model_dir):
+
+ # Load the Sentence Transformer
+ sentence_transformer = SentenceTransformer(
+ config.MODEL_SENTENCE_TRANSFORMER,
+ ).to(config.DEVICE)
+
+ llm_chain = get_llm_chain(saved_model_dir)
+
+ return sentence_transformer, llm_chain
+
+
+@st.cache_resource()
+def get_reranker():
+ reranker = FlagReranker(
+ 'BAAI/bge-reranker-large',
+ use_fp16=True,
+ )
+ return reranker
+
+
+def predict(user_query, sentence_transformer, feature_view, reranker, llm_chain):
+
+ st.write('⚙️ Generating Response...')
+
+ session_id = {
+ "configurable": {"session_id": "default"}
+ }
+
+ # Retrieve reranked context and source
+ context, source = get_context_and_source(
+ user_query,
+ sentence_transformer,
+ feature_view,
+ reranker,
+ )
+
+ # Generate model response
+ model_output = llm_chain.invoke({
+ "context": context,
+ "question": user_query,
+ },
+ session_id,
+ )
+
+ return model_output.split('### RESPONSE:\n')[-1] + source
+
+
+# Retrieve the feature view and the saved_model_dir
+feature_view, saved_model_dir = connect_to_hopsworks()
+
+# Load and retrieve the sentence_transformer and llm_chain
+sentence_transformer, llm_chain = get_models(saved_model_dir)
+
+# Retrieve the reranking model
+reranker = get_reranker()
+
+# Initialize chat history
+if "messages" not in st.session_state:
+ st.session_state.messages = []
+
+# Display chat messages from history on app rerun
+for message in st.session_state.messages:
+ with st.chat_message(message["role"]):
+ st.markdown(message["content"])
+
+# React to user input
+if user_query := st.chat_input("How can I help you?"):
+ # Display user message in chat message container
+ st.chat_message("user").markdown(user_query)
+ # Add user message to chat history
+ st.session_state.messages.append({"role": "user", "content": user_query})
+
+ response = predict(
+ user_query,
+ sentence_transformer,
+ feature_view,
+ reranker,
+ llm_chain,
+ )
+
+ # Display assistant response in chat message container
+ with st.chat_message("assistant"):
+ st.markdown(response)
+ # Add assistant response to chat history
+ st.session_state.messages.append({"role": "assistant", "content": response})
diff --git a/advanced_tutorials/llm_pdfs/config.py b/advanced_tutorials/llm_pdfs/config.py
new file mode 100644
index 00000000..1b1ee098
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/config.py
@@ -0,0 +1,16 @@
+import torch
+
+# The unique identifier for the Google Drive folder where your PDF files are stored
+FOLDER_ID = '{YOUR_FOLDER_ID}'
+
+# The local directory path where downloaded data will be saved.
+DOWNLOAD_PATH = "data"
+
+# The identifier of the pre-trained sentence transformer model for producing sentence embeddings.
+MODEL_SENTENCE_TRANSFORMER = 'all-MiniLM-L6-v2'
+
+# The computing device to be used for model inference and training.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# The identifier for the Mistral-7B-Instruct model
+MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.2'
diff --git a/advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py b/advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py
new file mode 100644
index 00000000..a20a164c
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py
@@ -0,0 +1,19 @@
+from oauth2client import client, file, tools
+
+
+# Define path variables
+credentials_file_path = '../credentials/credentials.json'
+clientsecret_file_path = '../credentials/client_secret.json'
+
+# Define API scope
+SCOPE = 'https://www.googleapis.com/auth/drive'
+
+# Define store
+store = file.Storage(credentials_file_path)
+credentials = store.get()
+# Get access token
+if not credentials or credentials.invalid:
+ flow = client.flow_from_clientsecrets(clientsecret_file_path, SCOPE)
+ credentials = tools.run_flow(flow, store)
\ No newline at end of file
diff --git a/advanced_tutorials/llm_pdfs/functions/llm_chain.py b/advanced_tutorials/llm_pdfs/functions/llm_chain.py
new file mode 100644
index 00000000..46e56952
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/functions/llm_chain.py
@@ -0,0 +1,133 @@
+import os
+import getpass
+import torch
+import transformers
+from peft import AutoPeftModelForCausalLM
+from transformers import AutoTokenizer
+from langchain.llms import HuggingFacePipeline
+from langchain.prompts import PromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.runnables.history import RunnableWithMessageHistory
+from langchain_community.chat_message_histories import ChatMessageHistory
+from langchain_core.chat_history import BaseChatMessageHistory
+
+
+def load_llm(model_dir) -> tuple:
+ """
+ Load the LLM and its corresponding tokenizer.
+
+ Args:
+ model_dir (str): Path to the pre-trained fine-tuned model.
+
+ Returns:
+ tuple: A tuple containing the tokenizer and loaded model.
+ """
+ # Setup the HuggingFace API Key
+ os.environ["HF_API_KEY"] = os.getenv("HF_API_KEY") or getpass.getpass('🔑 Enter your HuggingFace API key: ')
+
+ # Load a model from the saved model directory
+ model_llm = AutoPeftModelForCausalLM.from_pretrained(
+ model_dir,
+ device_map="auto",
+ torch_dtype=torch.float16,
+ token=os.environ["HF_API_KEY"],
+ )
+
+ # Load the tokenizer from the saved model directory
+ tokenizer = AutoTokenizer.from_pretrained(
+ model_dir,
+ token=os.environ["HF_API_KEY"],
+ )
+
+ # Set the pad token to the end-of-sequence token
+ tokenizer.pad_token = tokenizer.eos_token
+
+ # Set the padding side to "right" to remove warnings
+ tokenizer.padding_side = "right"
+
+ # Print device
+ print(f'⛳️ Device: {model_llm.device}')
+ return tokenizer, model_llm
+
+
+def get_prompt_template():
+ # Define a template for generating prompts
+ prompt_template = """
+ [INST]
+ Instruction: Prioritize brevity and clarity in responses.
+ Avoid unnecessary repetition and keep answers concise, adhering to a maximum of 750 characters.
+ Eliminate redundant phrases and sentences.
+ If details are repeated, provide them only once for better readability.
+ Focus on delivering key information without unnecessary repetition.
+ If a concept is already conveyed, there's no need to restate it. Ensure responses remain clear and to the point.
+ Make sure you do not repeat any sentences in your answer.
+ [/INST]
+
+ Previous conversation:
+ {chat_history}
+
+ ### CONTEXT:
+
+ {context}
+
+ ### QUESTION:
+ [INST]{question}[/INST]"""
+ return prompt_template
+
+
+def get_llm_chain(model_dir):
+ """
+ Initializes and returns a language model chain for text generation using Hugging Face's transformers library.
+
+ Parameters:
+ - model_dir (str): Path to the pre-trained fine-tuned model.
+
+ Returns:
+ - LLMChain: A configured chain consisting of a Hugging Face pipeline for text generation and prompt handling.
+ """
+
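+    # This closure resolves global_chat_history lazily, so every session_id
+    # shares the single history object created further down in this function.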
+ def get_global_history(session_id: str) -> BaseChatMessageHistory:
+ return global_chat_history
+
+ # Load LLM and its corresponding tokenizer
+ tokenizer, model = load_llm(model_dir)
+
+ # Create a text generation pipeline using the loaded model and tokenizer
+ text_generation_pipeline = transformers.pipeline(
+ model=model, # The pre-trained language model for text generation
+ tokenizer=tokenizer, # The tokenizer corresponding to the language model
+ task="text-generation", # Specify the task as text generation
+ temperature=0.2, # Controls the randomness of the generation (higher values for more randomness)
+ repetition_penalty=1.5, # Controls the penalty for repeating tokens in generated text
+ return_full_text=True, # Return the full generated text instead of just the generated tokens
+ max_new_tokens=750, # Limit the maximum number of newly generated tokens
+ pad_token_id=tokenizer.eos_token_id, # Use the end-of-sequence token as the padding token
+ do_sample=True, # Enable sampling during text generation
+ )
+
+ # Create a Hugging Face pipeline for Mistral LLM using the text generation pipeline
+ mistral_llm = HuggingFacePipeline(
+ pipeline=text_generation_pipeline,
+ )
+
+ # Create prompt from prompt template
+ prompt = PromptTemplate(
+ input_variables=["context", "question", "chat_history"],
+ template=get_prompt_template(),
+ )
+
+ # Create the runnable sequence
+ runnable = prompt | mistral_llm | StrOutputParser()
+
+ # Initialize a global chat history (shared for all invocations)
+ global_chat_history = ChatMessageHistory()
+
+ # Create the RunnableWithMessageHistory using the global history
+ llm_chain = RunnableWithMessageHistory(
+ runnable,
+ get_global_history,
+ input_messages_key="question",
+ history_messages_key="chat_history",
+ )
+
+ return llm_chain
diff --git a/advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py b/advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py
new file mode 100644
index 00000000..495207df
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py
@@ -0,0 +1,96 @@
+from pydrive.auth import GoogleAuth
+from pydrive.drive import GoogleDrive
+import PyPDF2
+import os
+from typing import List, Dict, Union
+
+def download_files_to_folder(folder_id: str, download_path: str) -> List:
+ """
+ Download files from a specified Google Drive folder to a local folder.
+
+ Parameters:
+ - folder_id (str): The ID of the Google Drive folder.
+ - download_path (str): The local folder path where files will be downloaded.
+
+ Returns:
+ - List: A list containing information about newly downloaded files.
+ """
+ # Authenticate with Google Drive
+ gauth = GoogleAuth()
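+    # Reuse a previously saved OAuth token if one exists; otherwise fall back
+    # to the browser-based consent flow below.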
+ gauth.LoadCredentialsFile("credentials/credentials.json")
+
+ if gauth.credentials is None:
+ gauth.LocalWebserverAuth()
+ elif gauth.access_token_expired:
+ gauth.Refresh()
+ else:
+ # Initialize the saved creds
+ gauth.Authorize()
+
+ # Save the current credentials to a file
+ gauth.SaveCredentialsFile("credentials/credentials.json")
+
+ drive = GoogleDrive(gauth)
+
+ # Create the local folder if it doesn't exist
+ if not os.path.exists(download_path):
+ os.makedirs(download_path)
+
+ # List files in the specified Google Drive folder
+ file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList()
+
+ # Initialize a list to store information about new files
+ new_files = []
+ print('⛳️ Loading...')
+
+ # Iterate through each file in the list
+ for file in file_list:
+ # Check if the file already exists locally
+ local_file_path = os.path.join(download_path, file["title"])
+
+ if not os.path.isfile(local_file_path):
+ # Download the file content and save it to the local folder
+ file.GetContentFile(local_file_path)
+
+ # Append information about the downloaded file to the list
+ new_files.append(file)
+
+ # Print the list of newly downloaded files
+ if len(new_files) == 0:
+ print("⛳️ There are no new files")
+ return new_files
+
+ print("⛳️ Newly downloaded files:")
+ for file in new_files:
+ print("title: %s, id: %s" % (file["title"], file["id"]))
+
+ return new_files
+
+
+def process_pdf_file(file_info: Dict,
+ document_text: List,
+ pdfs_path: str = 'data/') -> List:
+ """
+ Process content of a PDF file and append information to the document_text list.
+
+ Parameters:
+ - file_info (Dict): Information about the PDF file.
+ - document_text (List): List containing document information.
+ - pdfs_path (str): Path to the folder containing PDF files (default is 'data/').
+
+ Returns:
+ - List: Updated document_text list.
+ """
+ file_title = file_info["title"]
+
+ if file_title.split('.')[-1] == 'pdf':
+ print(f'⛳️ File Name: {file_title}')
+
+ pdf_path = os.path.join(pdfs_path, file_title)
+ pdf_reader = PyPDF2.PdfReader(pdf_path)
+ pages_amount = len(pdf_reader.pages)
+ print(f'Amount of pages: {pages_amount}')
+
+ for i, page in enumerate(pdf_reader.pages):
+ document_text.append([file_title, file_info['embedLink'], i+1, page.extract_text()])
+ return document_text
diff --git a/advanced_tutorials/llm_pdfs/functions/prompt_engineering.py b/advanced_tutorials/llm_pdfs/functions/prompt_engineering.py
new file mode 100644
index 00000000..a4a0b979
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/functions/prompt_engineering.py
@@ -0,0 +1,151 @@
+from typing import List, Tuple
+from sentence_transformers import SentenceTransformer
+
+def get_source(neighbors: List[Tuple[str, str, int, int]]) -> str:
+ """
+ Generates a formatted string for the sources of the provided context.
+
+ Args:
+ neighbors (List[Tuple[str, str, int, int]]): List of tuples representing document information.
+
+ Returns:
+ str: Formatted string containing document names, links, pages, and paragraphs.
+ """
+ return '\n\nReferences:\n' + '\n'.join(
+ [
+            f' - {neighbor[0]} ({neighbor[1]}): Page: {neighbor[2]}, Paragraph: {neighbor[3]}'
+ for neighbor
+ in neighbors
+ ]
+ )
+
+def get_context(neighbors: List[Tuple[str]]) -> str:
+ """
+ Generates a formatted string for the context based on the provided neighbors.
+
+ Args:
+ neighbors (List[Tuple[str]]): List of tuples representing context information.
+
+ Returns:
+ str: Formatted string containing context information.
+ """
+ return '\n\n'.join([neighbor[-1] for neighbor in neighbors])
+
+
+def generate_prompt(context: str, question: str) -> str:
+ """
+ Generates a prompt for the AI assistant based on context and question.
+
+ Args:
+ context (str): Formatted string containing context information.
+ question (str): The question to be included in the prompt.
+
+ Returns:
+ str: Formatted prompt for the AI assistant.
+ """
+ prompt_template = """
+[INST]
+Instruction: You are an AI assistant specialized in regulatory documents.
+Your role is to provide accurate and informative answers based on the given context.
+[/INST]
+
+### CONTEXT:
+
+{context}
+
+### QUESTION:
+[INST]{question}[/INST]
+ """
+
+ return prompt_template.format(
+ context=context,
+ question=question,
+ )
+
+
+def get_neighbors(query: str, sentence_transformer: SentenceTransformer, feature_view, k: int = 10) -> List[Tuple]:
+    """
+    Get the k closest neighbors for a given query using sentence embeddings.
+
+    Parameters:
+    - query (str): The input query string.
+    - sentence_transformer (SentenceTransformer): The sentence transformer model.
+    - feature_view (FeatureView): The feature view for retrieving neighbors.
+    - k (int, optional): Number of neighbors to retrieve. Default is 10.
+
+    Returns:
+    - List[Tuple]: A list of feature tuples for the closest neighbor contexts.
+ """
+ question_embedding = sentence_transformer.encode(query)
+
+ # Retrieve closest neighbors
+ neighbors = feature_view.find_neighbors(
+ question_embedding,
+ k=k,
+ )
+
+ return neighbors
+
+
+def rerank(query: str, neighbors: List[Tuple], reranker, k: int = 3) -> List[Tuple]:
+    """
+    Rerank a list of neighbors based on a reranking model.
+
+    Parameters:
+    - query (str): The input query string.
+    - neighbors (List[Tuple]): List of neighbor feature tuples.
+    - reranker (Reranker): The reranking model.
+    - k (int, optional): Number of top-ranked neighbors to return. Default is 3.
+
+    Returns:
+    - List[Tuple]: The top-ranked neighbor tuples after reranking.
+ """
+ # Compute scores for each context using the reranker
+ scores = [reranker.compute_score([query, context[-1]]) for context in neighbors]
+
+ combined_data = [*zip(scores, neighbors)]
+
+ # Sort contexts based on the scores in descending order
+ sorted_data = sorted(combined_data, key=lambda x: x[0], reverse=True)
+
+ # Return the top-k ranked contexts
+ return [context for score, context in sorted_data][:k]
+
+
+def get_context_and_source(user_query: str, sentence_transformer: SentenceTransformer,
+ feature_view, reranker) -> Tuple[str, str]:
+ """
+ Retrieve context and source based on user query using a combination of embedding, feature view, and reranking.
+
+ Parameters:
+ - user_query (str): The user's input query string.
+ - sentence_transformer (SentenceTransformer): The sentence transformer model.
+ - feature_view (FeatureView): The feature view for retrieving neighbors.
+ - reranker (Reranker): The reranking model.
+
+ Returns:
+ - Tuple[str, str]: A tuple containing the retrieved context and source.
+ """
+ # Retrieve closest neighbors
+ neighbors = get_neighbors(
+ user_query,
+ sentence_transformer,
+ feature_view,
+ k=10,
+ )
+
+ # Rerank the neighbors to get top-k
+ context_reranked = rerank(
+ user_query,
+ neighbors,
+ reranker,
+ k=3,
+ )
+
+ # Retrieve context
+ context = get_context(context_reranked)
+
+ # Retrieve source
+ source = get_source(context_reranked)
+
+ return context, source
diff --git a/advanced_tutorials/llm_pdfs/functions/text_preprocess.py b/advanced_tutorials/llm_pdfs/functions/text_preprocess.py
new file mode 100644
index 00000000..47cbd21d
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/functions/text_preprocess.py
@@ -0,0 +1,73 @@
+import pandas as pd
+from typing import List
+
+def split_page(document: str) -> List[str]:
+ """
+    Splits a page's text into a list of paragraphs on blank-line ('\n \n') separators.
+
+ Parameters:
+ - document (str): The input document to be split.
+
+ Returns:
+ - List[str]: A list of paragraphs.
+ """
+ return document.split('\n \n')
+
+
+def get_paragraphs(data: pd.DataFrame) -> pd.DataFrame:
+ """
+ Explodes the 'text' column in the DataFrame, adds a 'paragraph' column indicating the index
+ of the element in the list grouped by file_name and page_number.
+
+ Parameters:
+ - data (pd.DataFrame): The input DataFrame containing 'file_name', 'page_number', and 'text' columns.
+
+ Returns:
+ - pd.DataFrame: The modified DataFrame with an added 'paragraph' column.
+ """
+ # Explode the list to separate rows
+ data_text_exploded = data.explode('text')
+
+ # Add a 'paragraph' column indicating the index of the element in the list
+ data_text_exploded['paragraph'] = data_text_exploded.groupby(
+ ['file_name', 'page_number']
+ ).cumcount() + 1
+
+ return data_text_exploded
+
+
+def process_text_data(df: pd.DataFrame) -> pd.DataFrame:
+ """
+ Processes text data by applying the split_page, get_paragraphs functions.
+
+ Parameters:
+ - df (pd.DataFrame): The input DataFrame containing 'file_name' and 'text' columns.
+
+ Returns:
+ - pd.DataFrame: The processed DataFrame with 'file_name', 'page_number', 'paragraph', and 'text' columns.
+ """
+ # Apply split_page function to split text into paragraphs
+ df['text'] = df['text'].apply(split_page)
+
+ # Apply get_paragraphs function to explode the list and add paragraph numbers
+ df = get_paragraphs(df)
+
+ # Apply strip to remove leading and trailing spaces
+ df['text'] = df['text'].str.strip()
+
+ # Filter rows where the length of the 'text' column is greater than 500
+ df = df[df['text'].str.len() > 500]
+
+ # Set a regex pattern to identify rows with 5 or more consecutive dots or dashes
+ pattern_to_remove = r'(\.{5,}|\-{5,})'
+
+    # Remove rows matching the pattern; copy to avoid a SettingWithCopyWarning below
+    df_filtered = df[~df['text'].str.contains(pattern_to_remove, regex=True)].copy()
+
+ # Reset index
+ df_filtered.reset_index(drop=True, inplace=True)
+
+ # Reorder columns for better readability
+ df_filtered = df_filtered[['file_name', 'file_link', 'page_number', 'paragraph', 'text']]
+
+ return df_filtered
diff --git a/advanced_tutorials/llm_pdfs/requirements.txt b/advanced_tutorials/llm_pdfs/requirements.txt
new file mode 100644
index 00000000..8c00f616
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/requirements.txt
@@ -0,0 +1,22 @@
+google-api-python-client==2.114.0
+httplib2==0.22.0
+oauth2client==4.1.3
+pydrive==1.3.1
+PyPDF2==3.0.1
+pandas==2.1.4
+sentence-transformers==2.2.2
+accelerate==0.26.1
+peft==0.7.1
+bitsandbytes==0.40.2
+transformers==4.36.2
+flask-sqlalchemy==3.1.1
+trl==0.7.9
+langchain==0.1.1
+pyopenssl==23.3.0
+FlagEmbedding
+streamlit==1.30.0
+openai==1.9.0
+getpass4==0.0.14.1
+json_repair==0.6.1
+protobuf==3.20.0
+hopsworks