From c9ae5a1beebd82c463c46c34b21b5ce491019975 Mon Sep 17 00:00:00 2001 From: Maksym Zhytnikov <63515947+Maxxx-zh@users.noreply.github.com> Date: Mon, 20 May 2024 21:35:35 +0300 Subject: [PATCH] [FSTORE-1404] LLM PDF Tutorial (#266) * LLM PDF Search Tutorial using RAG and Fine-Tuning --- README.md | 4 +- .../llm_pdfs/1_feature_backfill.ipynb | 285 +++++++++++++ .../llm_pdfs/1a_feature_pipeline.py | 69 ++++ .../llm_pdfs/1b_dataset_generation.ipynb | 290 ++++++++++++++ .../llm_pdfs/2_training_pipeline.ipynb | 376 ++++++++++++++++++ .../llm_pdfs/3_inference_pipeline.ipynb | 366 +++++++++++++++++ advanced_tutorials/llm_pdfs/app.py | 127 ++++++ advanced_tutorials/llm_pdfs/config.py | 16 + .../functions/connect_to_google_drive.py | 19 + .../llm_pdfs/functions/llm_chain.py | 133 +++++++ .../llm_pdfs/functions/pdf_preprocess.py | 96 +++++ .../llm_pdfs/functions/prompt_engineering.py | 151 +++++++ .../llm_pdfs/functions/text_preprocess.py | 73 ++++ advanced_tutorials/llm_pdfs/requirements.txt | 22 + 14 files changed, 2026 insertions(+), 1 deletion(-) create mode 100644 advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb create mode 100644 advanced_tutorials/llm_pdfs/1a_feature_pipeline.py create mode 100644 advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb create mode 100644 advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb create mode 100644 advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb create mode 100644 advanced_tutorials/llm_pdfs/app.py create mode 100644 advanced_tutorials/llm_pdfs/config.py create mode 100644 advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py create mode 100644 advanced_tutorials/llm_pdfs/functions/llm_chain.py create mode 100644 advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py create mode 100644 advanced_tutorials/llm_pdfs/functions/prompt_engineering.py create mode 100644 advanced_tutorials/llm_pdfs/functions/text_preprocess.py create mode 100644 advanced_tutorials/llm_pdfs/requirements.txt diff --git a/README.md b/README.md index dd8e87cc..01793905 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ In order to understand the tutorials you need to be familiar with general concep - [Iris](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/iris): Classify iris flower species. - [Loan Approval](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/loan_approval): Predict loan approvals. - Advanced Tutorials: - - [Air Quality](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/air_quality): Predict the Air Quality value (PM2.5) in Europe and USA using weather features and air quality features of the previous days. + - [Air Quality](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/air_quality): Creating an air quality AI assistant that displays and explains air quality indicators for specific dates or periods, using Function Calling for LLMs and a RAG approach without a vector database. - [Bitcoin](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/bitcoin): Predict Bitcoin price using timeseries features and tweets sentiment analysis. - [Citibike](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/citibike): Predict the number of citibike users on each citibike station in the New York City. - [Credit Scores](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/credit_scores): Predict clients' repayment abilities. 
@@ -50,6 +50,8 @@ In order to understand the tutorials you need to be familiar with general concep - [NYC Taxi Fares](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/nyc_taxi_fares): Predict the fare amount for a taxi ride in New York City given the pickup and dropoff locations. - [Recommender System](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/recommender-system): Build a recommender system for fashion items. - [TimeSeries](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/timeseries): Timeseries price prediction. + - [LLM PDF](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/llm_pdfs): An AI assistant that utilizes a Retrieval-Augmented Generation (RAG) system to provide accurate answers to user questions by retrieving relevant context from PDF documents. + - [Fraud Cheque Detection](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/fraud_cheque_detection): Building an AI assistant that detects fraudulent scanned cheque images and generates explanations for the fraud classification, using a fine-tuned open-source LLM. - [Keras model and Sklearn Transformation Functions with Hopsworks Model Registry](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/transformation_functions/keras): How to register Sklearn Transformation Functions and Keras model in the Hopsworks Model Registry, how to retrieve them and then use in training and inference pipelines. - [PyTorch model and Sklearn Transformation Functions with Hopsworks Model Registry](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/transformation_functions/pytorch): How to register Sklearn Transformation Functions and PyTorch model in the Hopsworks Model Registry, how to retrieve them and then use in training and inference pipelines. - [Sklearn Transformation Functions With Hopsworks Model Registy](https://github.com/logicalclocks/hopsworks-tutorials/tree/master/advanced_tutorials/transformation_functions/sklearn): How to register sklearn.pipeline with transformation functions and classifier in Hopsworks Model Registry and use it in training and inference pipelines. 
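
Taken together, the files added below implement a single query path: embed the user question, retrieve and rerank PDF chunks from the feature view, then prompt the fine-tuned LLM with the reranked context. The following condensed sketch of what app.py and 3_inference_pipeline.ipynb do may help orient the reader; it assumes the documents feature view and the registered mistral_model produced by the pipelines below already exist in your Hopsworks project.

import hopsworks
from sentence_transformers import SentenceTransformer
from FlagEmbedding import FlagReranker
from functions.prompt_engineering import get_context_and_source
from functions.llm_chain import get_llm_chain
import config

project = hopsworks.login()
fs = project.get_feature_store()
mr = project.get_model_registry()

# Feature view holding the embedded PDF chunks (built in 1_feature_backfill.ipynb)
feature_view = fs.get_feature_view(name="documents", version=1)
feature_view.init_serving(1)

# Fine-tuned Mistral model registered by 2_training_pipeline.ipynb
saved_model_dir = mr.get_model(name="mistral_model", version=1).download()

sentence_transformer = SentenceTransformer(config.MODEL_SENTENCE_TRANSFORMER).to(config.DEVICE)
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True)
llm_chain = get_llm_chain(saved_model_dir)

user_query = "What are the best risk reporting practices?"
context, source = get_context_and_source(user_query, sentence_transformer, feature_view, reranker)
model_output = llm_chain.invoke(
    {"context": context, "question": user_query},
    {"configurable": {"session_id": "default"}},
)
print(model_output.split('### RESPONSE:\n')[-1] + source)
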
diff --git a/advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb b/advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb new file mode 100644 index 00000000..652fec9e --- /dev/null +++ b/advanced_tutorials/llm_pdfs/1_feature_backfill.ipynb @@ -0,0 +1,285 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "82622ee3", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ade7fe1f", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -r requirements.txt -q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ab771e2", + "metadata": {}, + "outputs": [], + "source": [ + "import PyPDF2\n", + "import pandas as pd\n", + "from sentence_transformers import SentenceTransformer\n", + "\n", + "from functions.pdf_preprocess import (\n", + " download_files_to_folder, \n", + " process_pdf_file,\n", + ")\n", + "from functions.text_preprocess import process_text_data\n", + "import config\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "7e8f1796", + "metadata": {}, + "source": [ + "## 💾 Download files from Google Drive " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea8c756e", + "metadata": {}, + "outputs": [], + "source": [ + "# Call the function to download files\n", + "new_files = download_files_to_folder(\n", + " config.FOLDER_ID, \n", + " config.DOWNLOAD_PATH,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f783e27e", + "metadata": {}, + "source": [ + "## 🧬 Text Extraction " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b3b6715", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize an empty list\n", + "document_text = []\n", + "\n", + "for file in new_files:\n", + " process_pdf_file(\n", + " file, \n", + " document_text, \n", + " config.DOWNLOAD_PATH,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "348b723e", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a DataFrame\n", + "columns = [\"file_name\", \"file_link\", \"page_number\", \"text\"]\n", + "df_text = pd.DataFrame(\n", + " data=document_text,\n", + " columns=columns,\n", + ")\n", + "# Display the DataFrame\n", + "df_text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62a70763", + "metadata": {}, + "outputs": [], + "source": [ + "# Process text data using the process_text_data function\n", + "df_text_processed = process_text_data(df_text)\n", + "\n", + "# Display the processed DataFrame\n", + "df_text_processed" + ] + }, + { + "cell_type": "markdown", + "id": "10f9ea36", + "metadata": {}, + "source": [ + "## ⚙️ Embeddings Creation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9805c689", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the SentenceTransformer model\n", + "model = SentenceTransformer(\n", + " config.MODEL_SENTENCE_TRANSFORMER,\n", + ").to(config.DEVICE)\n", + "model.device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b7a89a", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate embeddings for the 'text' column using the SentenceTransformer model\n", + "df_text_processed['embeddings'] = pd.Series(\n", + " model.encode(df_text_processed['text']).tolist(),\n", + ")\n", + "\n", + "# Create a new column 'context_id' with values ranging from 0 to the number of rows in the DataFrame\n", + "df_text_processed['context_id'] = 
[*range(df_text_processed.shape[0])]\n", + "\n", + "# Display the resulting DataFrame with the added 'embeddings' and 'context_id' columns\n", + "df_text_processed" + ] + }, + { + "cell_type": "markdown", + "id": "d2bced31", + "metadata": {}, + "source": [ + "## 🔮 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7caf764d", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store() " + ] + }, + { + "cell_type": "markdown", + "id": "0ed9ac69", + "metadata": {}, + "source": [ + "## 🪄 Feature Group Creation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f5e486b", + "metadata": {}, + "outputs": [], + "source": [ + "from hsfs import embedding\n", + "\n", + "# Create the Embedding Index\n", + "emb = embedding.EmbeddingIndex()\n", + "\n", + "emb.add_embedding(\n", + " \"embeddings\", \n", + " model.get_sentence_embedding_dimension(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e32b548", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'documents_fg' feature group\n", + "documents_fg = fs.get_or_create_feature_group(\n", + " name=\"documents_fg\",\n", + " embedding_index=emb,\n", + " primary_key=['context_id'],\n", + " version=1,\n", + " description='Information from various files, presenting details like file names, source links, and structured text excerpts from different pages and paragraphs.',\n", + " online_enabled=True,\n", + ")\n", + "\n", + "documents_fg.insert(df_text_processed)" + ] + }, + { + "cell_type": "markdown", + "id": "d39a9ed6", + "metadata": {}, + "source": [ + "## 🪄 Feature View Creation \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a7bc2f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'documents' feature view\n", + "feature_view = fs.get_or_create_feature_view(\n", + " name=\"documents\",\n", + " version=1,\n", + " description='Chunked context for RAG system',\n", + " query=documents_fg.select([\"file_name\", \"file_link\", \"page_number\", \"paragraph\", \"text\"]),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "708b9a5f", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/llm_pdfs/1a_feature_pipeline.py b/advanced_tutorials/llm_pdfs/1a_feature_pipeline.py new file mode 100644 index 00000000..84c90c8d --- /dev/null +++ b/advanced_tutorials/llm_pdfs/1a_feature_pipeline.py @@ -0,0 +1,69 @@ +import PyPDF2 +import pandas as pd +from sentence_transformers import SentenceTransformer + +from functions.pdf_preprocess import download_files_to_folder, process_pdf_file +from functions.text_preprocess import process_text_data +import config + +import hopsworks + +def pipeline(): + # Call the function to download files + new_files = download_files_to_folder( + config.FOLDER_ID, + config.DOWNLOAD_PATH, + ) + + if len(new_files) == 0: + print('⛳️ Your folder is up to date!') + return + + # Initialize an empty list + 
document_text = []
+
+    for file in new_files:
+        process_pdf_file(
+            file, 
+            document_text, 
+            config.DOWNLOAD_PATH,
+        )
+
+    # Create a DataFrame
+    # Include 'file_link': process_pdf_file appends four values per page
+    # (file name, file link, page number, text), so four columns are required
+    columns = ["file_name", "file_link", "page_number", "text"]
+    df_text = pd.DataFrame(
+        data=document_text,
+        columns=columns,
+    )
+
+    # Process text data using the process_text_data function
+    df_text_processed = process_text_data(df_text)
+
+    # Load the SentenceTransformer model
+    model = SentenceTransformer(
+        config.MODEL_SENTENCE_TRANSFORMER,
+    ).to(config.DEVICE)
+
+    # Generate embeddings for the 'text' column using the SentenceTransformer model
+    df_text_processed['embeddings'] = pd.Series(
+        model.encode(df_text_processed['text']).tolist(),
+    )
+
+    # Create a new column 'context_id' with values ranging from 0 to the number of rows in the DataFrame
+    df_text_processed['context_id'] = [*range(df_text_processed.shape[0])]
+
+    project = hopsworks.login()
+
+    fs = project.get_feature_store()
+
+    documents_fg = fs.get_feature_group(
+        name="documents_fg",
+        version=1,
+    )
+
+    documents_fg.insert(df_text_processed)
+    return
+
+if __name__ == '__main__':
+    pipeline()
diff --git a/advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb b/advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb
new file mode 100644
index 00000000..d2fd826e
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/1b_dataset_generation.ipynb
@@ -0,0 +1,290 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0279e128",
+   "metadata": {},
+   "source": [
+    "## 📝 Imports "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8efd4e5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from openai import OpenAI\n",
+    "import getpass\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "import json_repair\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4d389343",
+   "metadata": {},
+   "source": [
+    "## ⚙️ Settings "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "270b84fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ[\"OPENAI_API_KEY\"] = os.getenv(\"OPENAI_API_KEY\") or getpass.getpass('🔑 Enter your OpenAI API key: ')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d58f52ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client = OpenAI(\n",
+    "    api_key=os.environ[\"OPENAI_API_KEY\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c16fbf15",
+   "metadata": {},
+   "source": [
+    "## 🔮 Connecting to Hopsworks Feature Store "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a8916cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import hopsworks\n",
+    "\n",
+    "project = hopsworks.login()\n",
+    "\n",
+    "fs = project.get_feature_store() "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32f2bbae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Retrieve the 'documents' feature view\n",
+    "feature_view = fs.get_feature_view(\n",
+    "    name='documents',\n",
+    "    version=1,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f60460ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize batch scoring for feature view\n",
+    "feature_view.init_batch_scoring()\n",
+    "\n",
+    "# Get batch data from the feature view\n",
+    "data = feature_view.get_batch_data()\n",
+    "\n",
+    "# Keep only chunks longer than 2500 characters; shorter chunks rarely\n",
+    "# contain enough content to support three meaningful question-answer pairs\n",
+    "data_filtered = data[data.text.str.len() > 2500]\n",
+    "\n",
+    "# Display the filtered data\n",
+    "data_filtered"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d3d2fcb2",
+   "metadata": {},
+   "source": [
+    "## 🪄 Dataset Generation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "80d80597",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_questions(context):\n",
+    "\n",
+    "    instruction = \"\"\"\n",
+    "    The given text is the result of the text extraction from the PDF files. \n",
+    "    Generate 3 meaningful questions on the text and the respective answers.\n",
+    "    Reply strictly in the JSON format:\n",
+    "    {\n",
+    "        \"questions\": [\"question1\", \"question2\", \"question3\"],\n",
+    "        \"answers\": [\"answer1\", \"answer2\", \"answer3\"]\n",
+    "    }\n",
+    "\n",
+    "    Ensure that the lists of questions and answers are complete and properly formatted. \n",
+    "    DO NOT include any additional information or characters outside the specified JSON format. \n",
+    "    The response must consist only of the requested JSON structure. \n",
+    "    If the generated content does not meet the specified format, please make the necessary adjustments to ensure compliance.\"\"\"\n",
+    "\n",
+    "    prompt = f\"\\nContext: {context}\\nQuestion: {instruction}\"\n",
+    "\n",
+    "    # Request a chat completion\n",
+    "    completion = client.chat.completions.create(\n",
+    "        model=\"gpt-3.5-turbo\",\n",
+    "        # Pre-define conversation messages for the possible roles \n",
+    "        messages=[\n",
+    "            {\"role\": \"user\", \"content\": prompt},\n",
+    "        ]\n",
+    "    )\n",
+    "    # Parse the reply, repairing minor JSON formatting issues if present\n",
+    "    response = json_repair.loads(completion.choices[0].message.content)\n",
+    "    \n",
+    "    response['context'] = context\n",
+    "    \n",
+    "    return response\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d3642f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate question-answer pairs\n",
+    "generated_questions = [\n",
+    "    generate_questions(text)\n",
+    "    for text \n",
+    "    in tqdm(data_filtered['text'])\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1f1cc46",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a DataFrame from the generated_questions\n",
+    "df = pd.DataFrame(generated_questions)\n",
+    "\n",
+    "# Display the first few rows of the DataFrame\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f906442",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Explode the DataFrame to expand lists in specified columns ('questions' and 'answers')\n",
+    "df_expanded = df.explode(['questions', 'answers']).reset_index(drop=True)\n",
+    "\n",
+    "# Reset the index to create a new default integer index\n",
+    "df_expanded.reset_index(inplace=True)\n",
+    "\n",
+    "# Rename 'index' to 'record_id' and 'answers' to 'responses', so the column\n",
+    "# names match the 'cqa' feature view selection and the training pipeline\n",
+    "df_expanded.rename(columns={'index': 'record_id', 'answers': 'responses'}, inplace=True)\n",
+    "\n",
+    "# Display the expanded DataFrame\n",
+    "df_expanded"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4fe81b9f",
+   "metadata": {},
+   "source": [
+    "## 🪄 CQA Feature Group Creation "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a84b387",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get or create the 'cqa_fg' feature group\n",
+    "cqa_fg = fs.get_or_create_feature_group(\n",
+    "    name=\"cqa_fg\",\n",
+    "    version=1,\n",
+    "    description='Context-Question-Response Data',\n",
+    "    primary_key=['record_id'],\n",
+    ")\n",
+    "\n",
+    "cqa_fg.insert(df_expanded)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ed251e4",
+   "metadata": {},
+   "source": [
+    "## 🪄 CQA Feature View Creation "
+   ]
+  },
+  {
"cell_type": "code", + "execution_count": null, + "id": "ed7146f7", + "metadata": {}, + "outputs": [], + "source": [ + "# Get or create the 'cqa' feature view\n", + "feature_view = fs.get_or_create_feature_view(\n", + " name=\"cqa\",\n", + " version=1,\n", + " query=cqa_fg.select([\"context\", \"questions\", \"responses\"]),\n", + " description='Context-Question-Response pairs for model fine-tuning',\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "02f6f11a", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb b/advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb new file mode 100644 index 00000000..4177c447 --- /dev/null +++ b/advanced_tutorials/llm_pdfs/2_training_pipeline.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cc6015d0", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ba30ecb", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from datasets import Dataset\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n", + "from peft import LoraConfig\n", + "from transformers import TrainingArguments\n", + "from trl import SFTTrainer\n", + "\n", + "from functions.prompt_engineering import generate_prompt\n", + "import config" + ] + }, + { + "cell_type": "markdown", + "id": "1270e5f8", + "metadata": {}, + "source": [ + "## 🔮 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e517b1cd", + "metadata": {}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store() \n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "markdown", + "id": "86043802", + "metadata": {}, + "source": [ + "## 🪝 Feature View Retrieval " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4007db72", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'cqa' feature view\n", + "feature_view = fs.get_feature_view(\n", + " name='cqa',\n", + " version=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83b00e9e", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize batch scoring for the feature view\n", + "feature_view.init_batch_scoring()\n", + "\n", + "# Get batch data from the feature view\n", + "data = feature_view.get_batch_data()\n", + "\n", + "# Display the first three rows of the batch data\n", + "data.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "64dab547", + "metadata": {}, + "source": [ + "## 🗄️ Dataset Creation " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "594f4e1a", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate prompts for each record in the DataFrame using context, questions, and responses\n", + "prompts = data.apply(\n", + " lambda record: generate_prompt(record['context'], record['questions']) + f'\\n### RESPONSE:\\n{record[\"responses\"]}', \n", + " 
axis=1,\n", + ").tolist()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bd1e493", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataset from a dictionary with a single column named \"text\" containing prompts\n", + "dataset = Dataset.from_dict({\n", + " \"text\": prompts,\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0756b8e7", + "metadata": {}, + "outputs": [], + "source": [ + "print(dataset[10]['text'])" + ] + }, + { + "cell_type": "markdown", + "id": "bc161e58", + "metadata": {}, + "source": [ + "## ⬇️ Model Loading " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62477b0e", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the tokenizer for Mistral-7B-Instruct model\n", + "tokenizer = AutoTokenizer.from_pretrained(\n", + " config.MODEL_ID,\n", + ")\n", + "\n", + "# Set the pad token to the unknown token to handle padding\n", + "tokenizer.pad_token = tokenizer.unk_token\n", + "\n", + "# Set the padding side to \"right\" to prevent warnings during tokenization\n", + "tokenizer.padding_side = \"right\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0153d320", + "metadata": {}, + "outputs": [], + "source": [ + "# BitsAndBytesConfig int-4 config\n", + "bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True, \n", + " bnb_4bit_use_double_quant=True, \n", + " bnb_4bit_quant_type=\"nf4\", \n", + " bnb_4bit_compute_dtype=torch.bfloat16,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8a4d9ee", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Mistral-7B-Instruct model with quantization configuration\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " config.MODEL_ID,\n", + " device_map=\"auto\",\n", + " quantization_config=bnb_config,\n", + ")\n", + "\n", + "# Configure the pad token ID in the model to match the tokenizer's pad token ID\n", + "model.config.pad_token_id = tokenizer.pad_token_id" + ] + }, + { + "cell_type": "markdown", + "id": "93c7ba90", + "metadata": {}, + "source": [ + "## ⚙️ Configuration " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18d24668", + "metadata": {}, + "outputs": [], + "source": [ + "peft_config = LoraConfig(\n", + " lora_alpha=64,\n", + " lora_dropout=0.1,\n", + " r=32,\n", + " bias=\"none\",\n", + " task_type=\"CAUSAL_LM\", \n", + " target_modules=[\n", + " \"q_proj\",\n", + " \"k_proj\",\n", + " \"v_proj\",\n", + " \"o_proj\",\n", + " \"gate_proj\",\n", + " \"up_proj\",\n", + " \"down_proj\",\n", + " \"lm_head\",\n", + " ],\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebade183", + "metadata": {}, + "outputs": [], + "source": [ + "training_arguments = TrainingArguments(\n", + " output_dir=\"mistral7b_finetuned\", # directory to save and repository id\n", + " num_train_epochs=3, # number of training epochs\n", + " per_device_train_batch_size=3, # batch size per device during training\n", + " gradient_accumulation_steps=2, # number of steps before performing a backward/update pass\n", + " gradient_checkpointing=True, # use gradient checkpointing to save memory\n", + " optim=\"adamw_torch_fused\", # use fused adamw optimizer\n", + " logging_steps=10, # log every 10 steps\n", + " save_strategy=\"epoch\", # save checkpoint every epoch\n", + " learning_rate=2e-4, # learning rate, based on QLoRA paper\n", + " bf16=True, # use bfloat16 precision\n", + " tf32=True, # use tf32 precision\n", + " 
max_grad_norm=0.3, # max gradient norm based on QLoRA paper\n", + " warmup_ratio=0.03, # warmup ratio based on QLoRA paper\n", + " lr_scheduler_type=\"constant\", # use constant learning rate scheduler\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "36e79a43", + "metadata": {}, + "source": [ + "## 🏃🏻‍♂️ Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13af595e", + "metadata": {}, + "outputs": [], + "source": [ + "# Create the Supervised Fine-tuning Trainer\n", + "trainer = SFTTrainer(\n", + " model=model,\n", + " train_dataset=dataset,\n", + " peft_config=peft_config,\n", + " max_seq_length=4096,\n", + " tokenizer=tokenizer,\n", + " args=training_arguments,\n", + " dataset_text_field='text',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2c9a416", + "metadata": {}, + "outputs": [], + "source": [ + "# Train the model\n", + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "85e840c2", + "metadata": {}, + "source": [ + "## 💾 Saving Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75940ca2", + "metadata": {}, + "outputs": [], + "source": [ + "# Save the trained model\n", + "trainer.save_model()" + ] + }, + { + "cell_type": "markdown", + "id": "bfaae161", + "metadata": {}, + "source": [ + "## 🗄️ Model Registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ff14642", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a Python model in the model registry\n", + "model_llm = mr.python.create_model(\n", + " name=\"mistral_model\", \n", + " description=\"Mistral Fine-tuned Model\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fbce3ba9", + "metadata": {}, + "outputs": [], + "source": [ + "# Save the model directory with the fine-tuned model to the model registry\n", + "model_llm.save(training_arguments.output_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "ecc9b1d0", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb b/advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb new file mode 100644 index 00000000..9e2f00dd --- /dev/null +++ b/advanced_tutorials/llm_pdfs/3_inference_pipeline.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "be60a8be", + "metadata": {}, + "source": [ + "## 📝 Imports " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f95e1e54", + "metadata": {}, + "outputs": [], + "source": [ + "from sentence_transformers import SentenceTransformer\n", + "from FlagEmbedding import FlagReranker\n", + "\n", + "from functions.llm_chain import get_llm_chain\n", + "from functions.prompt_engineering import get_context_and_source\n", + "import config\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "id": "3f3a2715", + "metadata": {}, + "source": [ + "## 🔮 Connecting to Hopsworks Feature Store " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d292081d", + "metadata": 
{}, + "outputs": [], + "source": [ + "import hopsworks\n", + "\n", + "project = hopsworks.login()\n", + "\n", + "fs = project.get_feature_store()\n", + "mr = project.get_model_registry()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "733aa65d", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve the 'documents' feature view\n", + "feature_view = fs.get_feature_view(\n", + " name=\"documents\", \n", + " version=1,\n", + ") \n", + "\n", + "# Initialize serving\n", + "feature_view.init_serving(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1e562e9", + "metadata": {}, + "outputs": [], + "source": [ + "# Get the Mistral model from Model Registry\n", + "mistral_model = mr.get_model(\n", + " name=\"mistral_model\",\n", + " version=1,\n", + ")\n", + "\n", + "# Download the Mistral model files to a local directory\n", + "saved_model_dir = mistral_model.download()" + ] + }, + { + "cell_type": "markdown", + "id": "0235999b", + "metadata": {}, + "source": [ + "## ⛓️ LLM Chain " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc70c06b", + "metadata": {}, + "outputs": [], + "source": [ + "llm_chain = get_llm_chain(saved_model_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6b5249d", + "metadata": {}, + "outputs": [], + "source": [ + "session_id = {\n", + " \"configurable\": {\"session_id\": \"default\"}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "d9377ab5", + "metadata": {}, + "source": [ + "## 🗄️ Sentence Transformer Loading " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89b5ce52", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Sentence Transformer\n", + "sentence_transformer = SentenceTransformer(\n", + " config.MODEL_SENTENCE_TRANSFORMER,\n", + ").to(config.DEVICE)" + ] + }, + { + "cell_type": "markdown", + "id": "40126e56", + "metadata": {}, + "source": [ + "## 🧬 Reranking " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72cfcbd2", + "metadata": {}, + "outputs": [], + "source": [ + "def get_reranker():\n", + " reranker = FlagReranker(\n", + " 'BAAI/bge-reranker-large', \n", + " use_fp16=True,\n", + " ) \n", + " return reranker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "491e3847", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve a reranker\n", + "reranker = get_reranker()" + ] + }, + { + "cell_type": "markdown", + "id": "c739dd2d", + "metadata": {}, + "source": [ + "## 🗄️ Context Retrieval " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "987d3108", + "metadata": {}, + "outputs": [], + "source": [ + "# User Question Example\n", + "user_input = 'What are the best risk reporting practices?' 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02199904", + "metadata": {}, + "outputs": [], + "source": [ + "# Retrieve reranked context and source\n", + "context, source = get_context_and_source(\n", + " user_input, \n", + " sentence_transformer,\n", + " feature_view, \n", + " reranker,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "decf4d3d", + "metadata": {}, + "source": [ + "## 🚀 Model Inference " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "622bfb9a", + "metadata": {}, + "outputs": [], + "source": [ + "# Generate model response\n", + "model_output = llm_chain.invoke({\n", + " \"context\": context, \n", + " \"question\": user_input,\n", + " },\n", + " session_id,\n", + ")\n", + "\n", + "print(model_output.split('### RESPONSE:\\n')[-1] + source)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5711145e", + "metadata": {}, + "outputs": [], + "source": [ + "user_input = 'What is Adaptability?'\n", + "\n", + "context, source = get_context_and_source(\n", + " user_input, \n", + " sentence_transformer,\n", + " feature_view, \n", + " reranker,\n", + ")\n", + "\n", + "model_output = llm_chain.invoke({\n", + " \"context\": context, \n", + " \"question\": user_input,\n", + " },\n", + " session_id,\n", + ")\n", + "\n", + "print(model_output.split('### RESPONSE:\\n')[-1] + source)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "044e9b15", + "metadata": {}, + "outputs": [], + "source": [ + "user_input = 'What is a risk management?'\n", + "\n", + "context, source = get_context_and_source(\n", + " user_input, \n", + " sentence_transformer,\n", + " feature_view, \n", + " reranker,\n", + ")\n", + "\n", + "model_output = llm_chain.invoke({\n", + " \"context\": context, \n", + " \"question\": user_input,\n", + " },\n", + " session_id,\n", + ")\n", + "\n", + "print(model_output.split('### RESPONSE:\\n')[-1] + source)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02be4b75", + "metadata": {}, + "outputs": [], + "source": [ + "user_input = 'What is the purpose of maintaining an up-to-date data-flow diagram?'\n", + "\n", + "context, source = get_context_and_source(\n", + " user_input, \n", + " sentence_transformer,\n", + " feature_view, \n", + " reranker,\n", + ")\n", + "\n", + "model_output = llm_chain.invoke({\n", + " \"context\": context, \n", + " \"question\": user_input,\n", + " },\n", + " session_id,\n", + ")\n", + "\n", + "print(model_output.split('### RESPONSE:\\n')[-1] + source)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43a409ea", + "metadata": {}, + "outputs": [], + "source": [ + "user_input = 'Why are security and privacy controls important?'\n", + "\n", + "context, source = get_context_and_source(\n", + " user_input, \n", + " sentence_transformer,\n", + " feature_view, \n", + " reranker,\n", + ")\n", + "\n", + "model_output = llm_chain.invoke({\n", + " \"context\": context, \n", + " \"question\": user_input,\n", + " },\n", + " session_id,\n", + ")\n", + "\n", + "print(model_output.split('### RESPONSE:\\n')[-1] + source)" + ] + }, + { + "cell_type": "markdown", + "id": "108ca3db", + "metadata": {}, + "source": [ + "---" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/advanced_tutorials/llm_pdfs/app.py b/advanced_tutorials/llm_pdfs/app.py new file mode 100644 index 00000000..4db37f4c --- /dev/null +++ b/advanced_tutorials/llm_pdfs/app.py @@ -0,0 +1,127 @@ +import streamlit as st +import hopsworks +from sentence_transformers import SentenceTransformer +from FlagEmbedding import FlagReranker +from functions.prompt_engineering import get_context_and_source +from functions.llm_chain import get_llm_chain +import config +import warnings +warnings.filterwarnings('ignore') + +st.title("💬 AI assistant") + +@st.cache_resource() +def connect_to_hopsworks(): + # Initialize Hopsworks feature store connection + project = hopsworks.login() + fs = project.get_feature_store() + mr = project.get_model_registry() + + # Retrieve the 'documents' feature view + feature_view = fs.get_feature_view( + name="documents", + version=1, + ) + + # Initialize serving + feature_view.init_serving(1) + + # Get the Mistral model from Model Registry + mistral_model = mr.get_model( + name="mistral_model", + version=1, + ) + + # Download the Mistral model files to a local directory + saved_model_dir = mistral_model.download() + + return feature_view, saved_model_dir + + +@st.cache_resource() +def get_models(saved_model_dir): + + # Load the Sentence Transformer + sentence_transformer = SentenceTransformer( + config.MODEL_SENTENCE_TRANSFORMER, + ).to(config.DEVICE) + + llm_chain = get_llm_chain(saved_model_dir) + + return sentence_transformer, llm_chain + + +@st.cache_resource() +def get_reranker(): + reranker = FlagReranker( + 'BAAI/bge-reranker-large', + use_fp16=True, + ) + return reranker + + +def predict(user_query, sentence_transformer, feature_view, reranker, llm_chain): + + st.write('⚙️ Generating Response...') + + session_id = { + "configurable": {"session_id": "default"} + } + + # Retrieve reranked context and source + context, source = get_context_and_source( + user_query, + sentence_transformer, + feature_view, + reranker, + ) + + # Generate model response + model_output = llm_chain.invoke({ + "context": context, + "question": user_query, + }, + session_id, + ) + + return model_output.split('### RESPONSE:\n')[-1] + source + + +# Retrieve the feature view and the saved_model_dir +feature_view, saved_model_dir = connect_to_hopsworks() + +# Load and retrieve the sentence_transformer and llm_chain +sentence_transformer, llm_chain = get_models(saved_model_dir) + +# Retrieve the reranking model +reranker = get_reranker() + +# Initialize chat history +if "messages" not in st.session_state: + st.session_state.messages = [] + +# Display chat messages from history on app rerun +for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + +# React to user input +if user_query := st.chat_input("How can I help you?"): + # Display user message in chat message container + st.chat_message("user").markdown(user_query) + # Add user message to chat history + st.session_state.messages.append({"role": "user", "content": user_query}) + + response = predict( + user_query, + sentence_transformer, + feature_view, + reranker, + llm_chain, + ) + + # Display assistant response in chat message container + with st.chat_message("assistant"): + st.markdown(response) + # Add assistant response to chat history + st.session_state.messages.append({"role": "assistant", "content": response}) diff --git 
a/advanced_tutorials/llm_pdfs/config.py b/advanced_tutorials/llm_pdfs/config.py new file mode 100644 index 00000000..1b1ee098 --- /dev/null +++ b/advanced_tutorials/llm_pdfs/config.py @@ -0,0 +1,16 @@ +import torch + +# The unique identifier for the Google Drive folder where your PDF files are stored +FOLDER_ID = '{YOUR_FOLDER_ID}' + +# The local directory path where downloaded data will be saved. +DOWNLOAD_PATH = "data" + +# The identifier of the pre-trained sentence transformer model for producing sentence embeddings. +MODEL_SENTENCE_TRANSFORMER = 'all-MiniLM-L6-v2' + +# The computing device to be used for model inference and training. +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +# The identifier for the Mistral-7B-Instruct model +MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.2' diff --git a/advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py b/advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py new file mode 100644 index 00000000..a20a164c --- /dev/null +++ b/advanced_tutorials/llm_pdfs/functions/connect_to_google_drive.py @@ -0,0 +1,19 @@ +from apiclient import discovery +from httplib2 import Http +from oauth2client import client, file, tools + + +# Define path variables +credentials_file_path = '../credentials/credentials.json' +clientsecret_file_path = '../credentials/client_secret.json' + +# Define API scope +SCOPE = 'https://www.googleapis.com/auth/drive' + +# Define store +store = file.Storage(credentials_file_path) +credentials = store.get() +# Get access token +if not credentials or credentials.invalid: + flow = client.flow_from_clientsecrets(clientsecret_file_path, SCOPE) + credentials = tools.run_flow(flow, store) \ No newline at end of file diff --git a/advanced_tutorials/llm_pdfs/functions/llm_chain.py b/advanced_tutorials/llm_pdfs/functions/llm_chain.py new file mode 100644 index 00000000..46e56952 --- /dev/null +++ b/advanced_tutorials/llm_pdfs/functions/llm_chain.py @@ -0,0 +1,133 @@ +import os +import getpass +import torch +import transformers +from peft import AutoPeftModelForCausalLM +from transformers import AutoTokenizer +from langchain.llms import HuggingFacePipeline +from langchain.prompts import PromptTemplate +from langchain_core.output_parsers import StrOutputParser +from langchain_core.runnables.history import RunnableWithMessageHistory +from langchain_community.chat_message_histories import ChatMessageHistory +from langchain_core.chat_history import BaseChatMessageHistory + + +def load_llm(model_dir) -> tuple: + """ + Load the LLM and its corresponding tokenizer. + + Args: + model_dir (str): Path to the pre-trained fine-tuned model. + + Returns: + tuple: A tuple containing the tokenizer and loaded model. 
+ """ + # Setup the HuggingFace API Key + os.environ["HF_API_KEY"] = os.getenv("HF_API_KEY") or getpass.getpass('🔑 Enter your HuggingFace API key: ') + + # Load a model from the saved model directory + model_llm = AutoPeftModelForCausalLM.from_pretrained( + model_dir, + device_map="auto", + torch_dtype=torch.float16, + token=os.environ["HF_API_KEY"], + ) + + # Load the tokenizer from the saved model directory + tokenizer = AutoTokenizer.from_pretrained( + model_dir, + token=os.environ["HF_API_KEY"], + ) + + # Set the pad token to the end-of-sequence token + tokenizer.pad_token = tokenizer.eos_token + + # Set the padding side to "right" to remove warnings + tokenizer.padding_side = "right" + + # Print device + print(f'⛳️ Device: {model_llm.device}') + return tokenizer, model_llm + + +def get_prompt_template(): + # Define a template for generating prompts + prompt_template = """ + [INST] + Instruction: Prioritize brevity and clarity in responses. + Avoid unnecessary repetition and keep answers concise, adhering to a maximum of 750 characters. + Eliminate redundant phrases and sentences. + If details are repeated, provide them only once for better readability. + Focus on delivering key information without unnecessary repetition. + If a concept is already conveyed, there's no need to restate it. Ensure responses remain clear and to the point. + Make sure you do not repeat any sentences in your answer. + [/INST] + + Previous conversation: + {chat_history} + + ### CONTEXT: + + {context} + + ### QUESTION: + [INST]{question}[/INST]""" + return prompt_template + + +def get_llm_chain(model_dir): + """ + Initializes and returns a language model chain for text generation using Hugging Face's transformers library. + + Parameters: + - model_dir (str): Path to the pre-trained fine-tuned model. + + Returns: + - LLMChain: A configured chain consisting of a Hugging Face pipeline for text generation and prompt handling. 
+ """ + + def get_global_history(session_id: str) -> BaseChatMessageHistory: + return global_chat_history + + # Load LLM and its corresponding tokenizer + tokenizer, model = load_llm(model_dir) + + # Create a text generation pipeline using the loaded model and tokenizer + text_generation_pipeline = transformers.pipeline( + model=model, # The pre-trained language model for text generation + tokenizer=tokenizer, # The tokenizer corresponding to the language model + task="text-generation", # Specify the task as text generation + temperature=0.2, # Controls the randomness of the generation (higher values for more randomness) + repetition_penalty=1.5, # Controls the penalty for repeating tokens in generated text + return_full_text=True, # Return the full generated text instead of just the generated tokens + max_new_tokens=750, # Limit the maximum number of newly generated tokens + pad_token_id=tokenizer.eos_token_id, # Use the end-of-sequence token as the padding token + do_sample=True, # Enable sampling during text generation + ) + + # Create a Hugging Face pipeline for Mistral LLM using the text generation pipeline + mistral_llm = HuggingFacePipeline( + pipeline=text_generation_pipeline, + ) + + # Create prompt from prompt template + prompt = PromptTemplate( + input_variables=["context", "question", "chat_history"], + template=get_prompt_template(), + ) + + # Create the runnable sequence + runnable = prompt | mistral_llm | StrOutputParser() + + # Initialize a global chat history (shared for all invocations) + global_chat_history = ChatMessageHistory() + + # Create the RunnableWithMessageHistory using the global history + llm_chain = RunnableWithMessageHistory( + runnable, + get_global_history, + input_messages_key="question", + history_messages_key="chat_history", + ) + + return llm_chain diff --git a/advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py b/advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py new file mode 100644 index 00000000..495207df --- /dev/null +++ b/advanced_tutorials/llm_pdfs/functions/pdf_preprocess.py @@ -0,0 +1,96 @@ +from pydrive.auth import GoogleAuth +from pydrive.drive import GoogleDrive +import PyPDF2 +import os +from typing import List, Dict, Union + +def download_files_to_folder(folder_id: str, download_path: str) -> List: + """ + Download files from a specified Google Drive folder to a local folder. + + Parameters: + - folder_id (str): The ID of the Google Drive folder. + - download_path (str): The local folder path where files will be downloaded. + + Returns: + - List: A list containing information about newly downloaded files. 
+ """ + # Authenticate with Google Drive + gauth = GoogleAuth() + gauth.LoadCredentialsFile("credentials/credentials.json") + + if gauth.credentials is None: + gauth.LocalWebserverAuth() + elif gauth.access_token_expired: + gauth.Refresh() + else: + # Initialize the saved creds + gauth.Authorize() + + # Save the current credentials to a file + gauth.SaveCredentialsFile("credentials/credentials.json") + + drive = GoogleDrive(gauth) + + # Create the local folder if it doesn't exist + if not os.path.exists(download_path): + os.makedirs(download_path) + + # List files in the specified Google Drive folder + file_list = drive.ListFile({'q': f"'{folder_id}' in parents and trashed=false"}).GetList() + + # Initialize a list to store information about new files + new_files = [] + print('⛳️ Loading...') + + # Iterate through each file in the list + for file in file_list: + # Check if the file already exists locally + local_file_path = os.path.join(download_path, file["title"]) + + if not os.path.isfile(local_file_path): + # Download the file content and save it to the local folder + file.GetContentFile(local_file_path) + + # Append information about the downloaded file to the list + new_files.append(file) + + # Print the list of newly downloaded files + if len(new_files) == 0: + print("⛳️ There are no new files") + return new_files + + print("⛳️ Newly downloaded files:") + for file in new_files: + print("title: %s, id: %s" % (file["title"], file["id"])) + + return new_files + + +def process_pdf_file(file_info: Dict, + document_text: List, + pdfs_path: str = 'data/') -> List: + """ + Process content of a PDF file and append information to the document_text list. + + Parameters: + - file_info (Dict): Information about the PDF file. + - document_text (List): List containing document information. + - pdfs_path (str): Path to the folder containing PDF files (default is 'data/'). + + Returns: + - List: Updated document_text list. + """ + file_title = file_info["title"] + + if file_title.split('.')[-1] == 'pdf': + print(f'⛳️ File Name: {file_title}') + + pdf_path = os.path.join(pdfs_path, file_title) + pdf_reader = PyPDF2.PdfReader(pdf_path) + pages_amount = len(pdf_reader.pages) + print(f'Amount of pages: {pages_amount}') + + for i, page in enumerate(pdf_reader.pages): + document_text.append([file_title, file_info['embedLink'], i+1, page.extract_text()]) + return document_text diff --git a/advanced_tutorials/llm_pdfs/functions/prompt_engineering.py b/advanced_tutorials/llm_pdfs/functions/prompt_engineering.py new file mode 100644 index 00000000..a4a0b979 --- /dev/null +++ b/advanced_tutorials/llm_pdfs/functions/prompt_engineering.py @@ -0,0 +1,151 @@ +from typing import List, Tuple +from sentence_transformers import SentenceTransformer + +def get_source(neighbors: List[Tuple[str, str, int, int]]) -> str: + """ + Generates a formatted string for the sources of the provided context. + + Args: + neighbors (List[Tuple[str, str, int, int]]): List of tuples representing document information. + + Returns: + str: Formatted string containing document names, links, pages, and paragraphs. + """ + return '\n\nReferences:\n' + '\n'.join( + [ + f' - {neighbor[0]}({neighbor[1]}): Page: {neighbor[2]}, Paragraph: {neighbor[3]}' + for neighbor + in neighbors + ] + ) + +def get_context(neighbors: List[Tuple[str]]) -> str: + """ + Generates a formatted string for the context based on the provided neighbors. + + Args: + neighbors (List[Tuple[str]]): List of tuples representing context information. 
+ + Returns: + str: Formatted string containing context information. + """ + return '\n\n'.join([neighbor[-1] for neighbor in neighbors]) + + +def generate_prompt(context: str, question: str) -> str: + """ + Generates a prompt for the AI assistant based on context and question. + + Args: + context (str): Formatted string containing context information. + question (str): The question to be included in the prompt. + + Returns: + str: Formatted prompt for the AI assistant. + """ + prompt_template = """ +[INST] +Instruction: You are an AI assistant specialized in regulatory documents. +Your role is to provide accurate and informative answers based on the given context. +[/INST] + +### CONTEXT: + +{context} + +### QUESTION: +[INST]{question}[/INST] + """ + + return prompt_template.format( + context=context, + question=question, + ) + + +def get_neighbors(query: str, sentence_transformer: SentenceTransformer, feature_view, k: int = 10) -> List[Tuple[str, float]]: + """ + Get the k closest neighbors for a given query using sentence embeddings. + + Parameters: + - query (str): The input query string. + - sentence_transformer (SentenceTransformer): The sentence transformer model. + - feature_view (FeatureView): The feature view for retrieving neighbors. + - k (int, optional): Number of neighbors to retrieve. Default is 10. + + Returns: + - List[Tuple[str, float]]: A list of tuples containing the neighbor context. + """ + question_embedding = sentence_transformer.encode(query) + + # Retrieve closest neighbors + neighbors = feature_view.find_neighbors( + question_embedding, + k=k, + ) + + return neighbors + + +def rerank(query: str, neighbors: List[str], reranker, k: int = 3) -> List[str]: + """ + Rerank a list of neighbors based on a reranking model. + + Parameters: + - query (str): The input query string. + - neighbors (List[str]): List of neighbor contexts. + - reranker (Reranker): The reranking model. + - k (int, optional): Number of top-ranked neighbors to return. Default is 3. + + Returns: + - List[str]: The top-ranked neighbor contexts after reranking. + """ + # Compute scores for each context using the reranker + scores = [reranker.compute_score([query, context[-1]]) for context in neighbors] + + combined_data = [*zip(scores, neighbors)] + + # Sort contexts based on the scores in descending order + sorted_data = sorted(combined_data, key=lambda x: x[0], reverse=True) + + # Return the top-k ranked contexts + return [context for score, context in sorted_data][:k] + + +def get_context_and_source(user_query: str, sentence_transformer: SentenceTransformer, + feature_view, reranker) -> Tuple[str, str]: + """ + Retrieve context and source based on user query using a combination of embedding, feature view, and reranking. + + Parameters: + - user_query (str): The user's input query string. + - sentence_transformer (SentenceTransformer): The sentence transformer model. + - feature_view (FeatureView): The feature view for retrieving neighbors. + - reranker (Reranker): The reranking model. + + Returns: + - Tuple[str, str]: A tuple containing the retrieved context and source. 
+ """ + # Retrieve closest neighbors + neighbors = get_neighbors( + user_query, + sentence_transformer, + feature_view, + k=10, + ) + + # Rerank the neighbors to get top-k + context_reranked = rerank( + user_query, + neighbors, + reranker, + k=3, + ) + + # Retrieve context + context = get_context(context_reranked) + + # Retrieve source + source = get_source(context_reranked) + + return context, source diff --git a/advanced_tutorials/llm_pdfs/functions/text_preprocess.py b/advanced_tutorials/llm_pdfs/functions/text_preprocess.py new file mode 100644 index 00000000..47cbd21d --- /dev/null +++ b/advanced_tutorials/llm_pdfs/functions/text_preprocess.py @@ -0,0 +1,73 @@ +import pandas as pd +from typing import List + +def split_page(document: str) -> List[str]: + """ + Splits a document into a list of paragraphs based on newline characters. + + Parameters: + - document (str): The input document to be split. + + Returns: + - List[str]: A list of paragraphs. + """ + return document.split('\n \n') + + +def get_paragraphs(data: pd.DataFrame) -> pd.DataFrame: + """ + Explodes the 'text' column in the DataFrame, adds a 'paragraph' column indicating the index + of the element in the list grouped by file_name and page_number. + + Parameters: + - data (pd.DataFrame): The input DataFrame containing 'file_name', 'page_number', and 'text' columns. + + Returns: + - pd.DataFrame: The modified DataFrame with an added 'paragraph' column. + """ + # Explode the list to separate rows + data_text_exploded = data.explode('text') + + # Add a 'paragraph' column indicating the index of the element in the list + data_text_exploded['paragraph'] = data_text_exploded.groupby( + ['file_name', 'page_number'] + ).cumcount() + 1 + + return data_text_exploded + + +def process_text_data(df: pd.DataFrame) -> pd.DataFrame: + """ + Processes text data by applying the split_page, get_paragraphs functions. + + Parameters: + - df (pd.DataFrame): The input DataFrame containing 'file_name' and 'text' columns. + + Returns: + - pd.DataFrame: The processed DataFrame with 'file_name', 'page_number', 'paragraph', and 'text' columns. 
+    """
+    # Apply split_page function to split text into paragraphs
+    df['text'] = df['text'].apply(split_page)
+
+    # Apply get_paragraphs function to explode the list and add paragraph numbers
+    df = get_paragraphs(df)
+
+    # Apply strip to remove leading and trailing spaces
+    df['text'] = df['text'].str.strip()
+
+    # Filter rows where the length of the 'text' column is greater than 500
+    df = df[df['text'].str.len() > 500]
+
+    # Regex pattern that identifies rows with 5 or more consecutive dots or dashes
+    # (kept group-free so pandas does not warn about match groups in str.contains)
+    pattern_to_remove = r'\.{5,}|-{5,}'
+
+    # Remove rows matching the pattern
+    df_filtered = df[~df['text'].str.contains(pattern_to_remove, regex=True)]
+
+    # Reset index
+    df_filtered.reset_index(drop=True, inplace=True)
+
+    # Reorder columns for better readability
+    df_filtered = df_filtered[['file_name', 'file_link', 'page_number', 'paragraph', 'text']]
+
+    return df_filtered
diff --git a/advanced_tutorials/llm_pdfs/requirements.txt b/advanced_tutorials/llm_pdfs/requirements.txt
new file mode 100644
index 00000000..8c00f616
--- /dev/null
+++ b/advanced_tutorials/llm_pdfs/requirements.txt
@@ -0,0 +1,22 @@
+google-api-python-client==2.114.0
+httplib2==0.22.0
+oauth2client==4.1.3
+pydrive==1.3.1
+PyPDF2==3.0.1
+pandas==2.1.4
+sentence-transformers==2.2.2
+accelerate==0.26.1
+peft==0.7.1
+bitsandbytes==0.40.2
+transformers==4.36.2
+flask-sqlalchemy==3.1.1
+trl==0.7.9
+langchain==0.1.1
+pyopenssl==23.3.0
+FlagEmbedding
+streamlit==1.30.0
+openai==1.9.0
+getpass4==0.0.14.1
+json_repair==0.6.1
+protobuf==3.20.0
+hopsworks
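
As a quick sanity check after running the backfill, the vector index can also be queried directly, without the LLM. A minimal sketch, assuming the documents feature view created in 1_feature_backfill.ipynb:

import hopsworks
from sentence_transformers import SentenceTransformer
import config

project = hopsworks.login()
fs = project.get_feature_store()

feature_view = fs.get_feature_view(name="documents", version=1)
feature_view.init_serving(1)

# Embed the question with the same model used at backfill time
model = SentenceTransformer(config.MODEL_SENTENCE_TRANSFORMER).to(config.DEVICE)
query_embedding = model.encode("What is risk management?")

# Fetch the 5 nearest chunks from the online index; each row mirrors the
# feature view selection: (file_name, file_link, page_number, paragraph, text)
neighbors = feature_view.find_neighbors(query_embedding, k=5)

for file_name, file_link, page_number, paragraph, text in neighbors:
    print(f"{file_name} p.{page_number} par.{paragraph}: {text[:80]}...")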