Merge branch 'main' into issue-7557

deepset-ai · May 5, 2024 · e57549b · e57549b
2 parents 026f910 + cd66a80
commit e57549b
Show file tree

Hide file tree

Showing 28 changed files with 825 additions and 108 deletions.
diff --git a/.github/workflows/minor_version_release.yml b/.github/workflows/minor_version_release.yml
@@ -1,7 +1,15 @@
-name: Minor Version Release (1.x)
+name: Minor Version Release
 
 on:
   workflow_dispatch:
+    inputs:
+      version:
+        description: "Version to release"
+        required: true
+        type: choice
+        options:
+          - v1.x
+          - v2.x
 
 env:
   PYTHON_VERSION: "3.8"
@@ -10,10 +18,20 @@ jobs:
   sync:
     runs-on: ubuntu-latest
     steps:
+      - name: Get branch name
+        id: branch
+        shell: python
+        run: |
+          import os
+          version = "${{ inputs.version }}"
+          branch = "v1.x" if version == "v1.x" else "main"
+          with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+            print(f"name={branch}", file=f)
+
       - name: Checkout this repo
         uses: actions/checkout@v4
         with:
-          ref: "v1.x"
+          ref: "${{ steps.branch.outputs.name }}"
 
       - name: Define all versions
         id: versions
@@ -32,20 +50,40 @@ jobs:
           git checkout -b v${{ steps.versions.outputs.current_release_minor }}.x
           git push -u origin v${{ steps.versions.outputs.current_release_minor }}.x --tags
 
-      - name: Bump version on v1.x
+      - name: Bump version on ${{ steps.branch.outputs.name }}
         shell: bash
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # We use the HAYSTACK_BOT_TOKEN here so the PR created by the step will
+          # trigger required workflows and can be merged by anyone
+          GITHUB_TOKEN: ${{ secrets.HAYSTACK_BOT_TOKEN }}
         run: |
-          git checkout v1.x
+          git checkout "${{ steps.branch.outputs.name }}"
+
+          # Tag the base with X.Y.Z-rc0.
+          # At this point VERSION.txt still contains the previous version and not
+          # the one specified by the tag.
+          # This is good though as we just need this to make reno work properly.
           NEW_VERSION=$(awk -F. '/[0-9]+\./{$2++;print}' OFS=. < VERSION.txt)
           echo "$NEW_VERSION" > VERSION.txt
+          VERSION_TAG="v$NEW_VERSION"
+          git tag "$VERSION_TAG" -m"$VERSION_TAG"
+          git push --tags
+
+          # Create the branch that bumps the version on the dev branch
           cat VERSION.txt
+          git checkout -b bump-version
           git add .
           git commit -m "Update unstable version to $NEW_VERSION"
-          VERSION_TAG="v$NEW_VERSION"
-          git tag $VERSION_TAG -m"$VERSION_TAG"
-          git push --atomic origin v1.x $VERSION_TAG
+          git push -u origin bump-version
+
+          # Create the PR
+          gh pr create -B "${{ steps.branch.outputs.name }}" \
+            -H bump-version \
+            --title "Bump unstable version" \
+            --body "This PR bumps the unstable version for ${{ inputs.version }}.\n \
+          The release branch \`v${{ steps.versions.outputs.current_release_minor }}.x\` has been correctly created.\n\
+          Verify documentation on Readme has been correctly updated before approving and merging this PR." \
+            --label "ignore-for-release-notes"
 
       - uses: actions/setup-python@v5
         with:
@@ -58,5 +96,5 @@ jobs:
         env:
           RDME_API_KEY: ${{ secrets.README_API_KEY }}
         run: |
-          git checkout v1.x
+          git checkout ${{ steps.branch.outputs.name }}
           python ./.github/utils/release_docs.py --new-version ${{ steps.versions.outputs.current_release_minor }}
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-2.1.0-rc0
+2.2.0-rc0
diff --git a/e2e/pipelines/test_evaluation_pipeline.py b/e2e/pipelines/test_evaluation_pipeline.py
@@ -80,7 +80,7 @@ def evaluation_pipeline():
     """
     eval_pipeline = Pipeline()
     eval_pipeline.add_component("doc_mrr", DocumentMRREvaluator())
-    eval_pipeline.add_component("groundness", FaithfulnessEvaluator())
+    eval_pipeline.add_component("groundedness", FaithfulnessEvaluator())
     eval_pipeline.add_component("sas", SASEvaluator(model=EMBEDDINGS_MODEL))
     eval_pipeline.add_component("doc_map", DocumentMAPEvaluator())
     eval_pipeline.add_component("doc_recall_single_hit", DocumentRecallEvaluator(mode=RecallMode.SINGLE_HIT))
@@ -94,7 +94,7 @@ def built_eval_input(questions, truth_docs, truth_answers, retrieved_docs, conte
     """Helper function to build the input for the evaluation pipeline"""
     return {
         "doc_mrr": {"ground_truth_documents": truth_docs, "retrieved_documents": retrieved_docs},
-        "groundness": {"questions": questions, "contexts": contexts, "responses": truth_answers},
+        "groundedness": {"questions": questions, "contexts": contexts, "predicted_answers": pred_answers},
         "sas": {"predicted_answers": pred_answers, "ground_truth_answers": truth_answers},
         "doc_map": {"ground_truth_documents": truth_docs, "retrieved_documents": retrieved_docs},
         "doc_recall_single_hit": {"ground_truth_documents": truth_docs, "retrieved_documents": retrieved_docs},
@@ -141,8 +141,8 @@ def built_input_for_results_eval(rag_results):
             "score": rag_results["sas"]["score"],
         },
         "Faithfulness": {
-            "individual_scores": rag_results["groundness"]["individual_scores"],
-            "score": rag_results["groundness"]["score"],
+            "individual_scores": rag_results["groundedness"]["individual_scores"],
+            "score": rag_results["groundedness"]["score"],
         },
         "Document MAP": {
             "individual_scores": rag_results["doc_map"]["individual_scores"],

diff --git a/haystack/components/converters/__init__.py b/haystack/components/converters/__init__.py
@@ -3,6 +3,7 @@
 from haystack.components.converters.markdown import MarkdownToDocument
 from haystack.components.converters.openapi_functions import OpenAPIServiceToFunctions
 from haystack.components.converters.output_adapter import OutputAdapter
+from haystack.components.converters.pdfminer import PDFMinerToDocument
 from haystack.components.converters.pypdf import PyPDFToDocument
 from haystack.components.converters.tika import TikaDocumentConverter
 from haystack.components.converters.txt import TextFileToDocument
@@ -12,6 +13,7 @@
     "TikaDocumentConverter",
     "AzureOCRDocumentConverter",
     "PyPDFToDocument",
+    "PDFMinerToDocument",
     "HTMLToDocument",
     "MarkdownToDocument",
     "OpenAPIServiceToFunctions",

diff --git a/haystack/components/converters/pdfminer.py b/haystack/components/converters/pdfminer.py
@@ -0,0 +1,160 @@
+import io
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+from haystack import Document, component, logging
+from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
+from haystack.dataclasses import ByteStream
+from haystack.lazy_imports import LazyImport
+
+with LazyImport("Run 'pip install pdfminer.six'") as pdfminer_import:
+    from pdfminer.high_level import extract_pages
+    from pdfminer.layout import LAParams, LTTextContainer
+
+logger = logging.getLogger(__name__)
+
+
+@component
+class PDFMinerToDocument:
+    """
+    Converts PDF files to Documents.
+
+    Uses `pdfminer` compatible converters to convert PDF files to Documents. https://pdfminersix.readthedocs.io/en/latest/
+
+    Usage example:
+    ```python
+    from datetime import datetime
+
+    from haystack.components.converters.pdfminer import PDFMinerToDocument
+
+    converter = PDFMinerToDocument()
+    results = converter.run(sources=["sample.pdf"], meta={"date_added": datetime.now().isoformat()})
+    documents = results["documents"]
+    print(documents[0].content)
+    # 'This is a text from the PDF file.'
+    ```
+    """
+
+    def __init__(
+        self,
+        line_overlap: float = 0.5,
+        char_margin: float = 2.0,
+        line_margin: float = 0.5,
+        word_margin: float = 0.1,
+        boxes_flow: Optional[float] = 0.5,
+        detect_vertical: bool = True,
+        all_texts: bool = False,
+    ) -> None:
+        """
+        Create a PDFMinerToDocument component.
+
+        All arguments are forwarded unchanged to pdfminer's `LAParams`.
+
+        :param line_overlap:
+            This parameter determines whether two characters are considered to be on
+            the same line based on the amount of overlap between them.
+            The overlap is calculated relative to the minimum height of both characters.
+        :param char_margin:
+            Determines whether two characters are part of the same line based on the distance between them.
+            If the distance is less than the margin specified, the characters are considered to be on the same line.
+            The margin is calculated relative to the width of the character.
+        :param line_margin:
+            This parameter determines whether two lines are part of the same paragraph based on
+            the distance between them. If the distance is less than the margin specified,
+            the lines are considered to be part of the same paragraph.
+            The margin is calculated relative to the height of a line.
+        :param word_margin:
+            Determines whether two characters on the same line are part of the same word
+            based on the distance between them. If the distance is greater than the margin specified,
+            an intermediate space will be added between them to make the text more readable.
+            The margin is calculated relative to the width of the character.
+        :param boxes_flow:
+            This parameter determines the importance of horizontal and vertical position when
+            determining the order of text boxes. A value between -1.0 and +1.0 can be set,
+            with -1.0 indicating that only horizontal position matters and +1.0 indicating
+            that only vertical position matters. Setting the value to 'None' will disable advanced
+            layout analysis, and text boxes will be ordered based on the position of their bottom left corner.
+        :param detect_vertical:
+            This parameter determines whether vertical text should be considered during layout analysis.
+        :param all_texts:
+            If layout analysis should be performed on text in figures.
+        """
+
+        # Fail early with the install hint if pdfminer.six is not available.
+        pdfminer_import.check()
+
+        self.layout_params = LAParams(
+            line_overlap=line_overlap,
+            char_margin=char_margin,
+            line_margin=line_margin,
+            word_margin=word_margin,
+            boxes_flow=boxes_flow,
+            detect_vertical=detect_vertical,
+            all_texts=all_texts,
+        )
+
+    def __converter(self, extractor) -> Document:
+        """
+        Extracts the text from PDF pages and converts it into a single Document.
+
+        :param extractor:
+            Python generator that yields PDF pages.
+
+        :returns:
+            A Haystack Document whose content is the text of all pages, joined by form feed characters.
+        """
+        pages = [
+            # Keep text only
+            "".join(container.get_text() for container in page if isinstance(container, LTTextContainer))
+            for page in extractor
+        ]
+
+        # Add a page delimiter
+        return Document(content="\f".join(pages))
+
+    # The declared output name must match the key of the dictionary returned by `run`.
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        sources: List[Union[str, Path, ByteStream]],
+        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
+    ):
+        """
+        Converts PDF files to Documents.
+
+        Sources that cannot be read or converted are skipped and a warning is logged for each of them.
+
+        :param sources:
+            List of PDF file paths or ByteStream objects.
+        :param meta:
+            Optional metadata to attach to the Documents.
+            This value can be either a list of dictionaries or a single dictionary.
+            If it's a single dictionary, its content is added to the metadata of all produced Documents.
+            If it's a list, the length of the list must match the number of sources, because the two lists will be zipped.
+            If `sources` contains ByteStream objects, their `meta` will be added to the output Documents.
+
+        :returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
+        """
+        documents = []
+
+        meta_list = normalize_metadata(meta, sources_count=len(sources))
+
+        for source, metadata in zip(sources, meta_list):
+            try:
+                bytestream = get_bytestream_from_source(source)
+            except Exception as e:
+                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
+                continue
+            try:
+                pdf_reader = extract_pages(io.BytesIO(bytestream.data), laparams=self.layout_params)
+                document = self.__converter(pdf_reader)
+            except Exception as e:
+                logger.warning(
+                    "Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
+                )
+                continue
+
+            # Values passed through `meta` take precedence over the ByteStream's own metadata.
+            document.meta = {**bytestream.meta, **metadata}
+            documents.append(document)
+
+        return {"documents": documents}
diff --git a/haystack/components/embedders/hugging_face_api_document_embedder.py b/haystack/components/embedders/hugging_face_api_document_embedder.py
@@ -10,7 +10,7 @@
 from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
 from haystack.utils.url_validation import is_valid_http_url
 
-with LazyImport(message="Run 'pip install \"huggingface_hub>=0.22.0\"'") as huggingface_hub_import:
+with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
     from huggingface_hub import InferenceClient
 
 logger = logging.getLogger(__name__)

diff --git a/haystack/components/embedders/hugging_face_api_text_embedder.py b/haystack/components/embedders/hugging_face_api_text_embedder.py
@@ -7,7 +7,7 @@
 from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
 from haystack.utils.url_validation import is_valid_http_url
 
-with LazyImport(message="Run 'pip install \"huggingface_hub>=0.22.0\"'") as huggingface_hub_import:
+with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
     from huggingface_hub import InferenceClient
 
 logger = logging.getLogger(__name__)

diff --git a/haystack/components/embedders/hugging_face_tei_document_embedder.py b/haystack/components/embedders/hugging_face_tei_document_embedder.py
@@ -11,7 +11,7 @@
 from haystack.utils import Secret, deserialize_secrets_inplace
 from haystack.utils.hf import HFModelType, check_valid_model
 
-with LazyImport(message="Run 'pip install \"huggingface_hub>=0.22.0\"'") as huggingface_hub_import:
+with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
     from huggingface_hub import InferenceClient
 
 logger = logging.getLogger(__name__)

diff --git a/haystack/components/embedders/hugging_face_tei_text_embedder.py b/haystack/components/embedders/hugging_face_tei_text_embedder.py
@@ -8,7 +8,7 @@
 from haystack.utils import Secret, deserialize_secrets_inplace
 from haystack.utils.hf import HFModelType, check_valid_model
 
-with LazyImport(message="Run 'pip install \"huggingface_hub>=0.22.0\"'") as huggingface_hub_import:
+with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\"'") as huggingface_hub_import:
     from huggingface_hub import InferenceClient
 
 logger = logging.getLogger(__name__)

diff --git a/haystack/components/generators/chat/hugging_face_api.py b/haystack/components/generators/chat/hugging_face_api.py
@@ -7,7 +7,7 @@
 from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model
 from haystack.utils.url_validation import is_valid_http_url
 
-with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.22.0\"'") as huggingface_hub_import:
+with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.23.0\"'") as huggingface_hub_import:
     from huggingface_hub import ChatCompletionOutput, ChatCompletionStreamOutput, InferenceClient
 
 

diff --git a/haystack/components/generators/chat/hugging_face_tgi.py b/haystack/components/generators/chat/hugging_face_tgi.py
@@ -9,7 +9,7 @@
 from haystack.utils import Secret, deserialize_callable, deserialize_secrets_inplace, serialize_callable
 from haystack.utils.hf import HFModelType, check_generation_params, check_valid_model, list_inference_deployed_models
 
-with LazyImport(message="Run 'pip install \"huggingface_hub>=0.22.0\" transformers'") as transformers_import:
+with LazyImport(message="Run 'pip install \"huggingface_hub>=0.23.0\" transformers'") as transformers_import:
     from huggingface_hub import (
         InferenceClient,
         TextGenerationOutput,
@@ -275,13 +275,13 @@ def _run_streaming(
         message = ChatMessage.from_assistant(chunk.generated_text)
         message.meta.update(
             {
-                "finish_reason": chunk.details.finish_reason,
+                "finish_reason": chunk.details.finish_reason if chunk.details else None,
                 "index": 0,
                 "model": self.client.model,
                 "usage": {
-                    "completion_tokens": chunk.details.generated_tokens,
+                    "completion_tokens": chunk.details.generated_tokens if chunk.details else 0,
                     "prompt_tokens": prompt_token_count,
-                    "total_tokens": prompt_token_count + chunk.details.generated_tokens,
+                    # The conditional is parenthesised so that a chunk without details still
+                    # reports the prompt tokens instead of a total of 0.
+                    "total_tokens": prompt_token_count + (chunk.details.generated_tokens if chunk.details else 0),
                 },
             }
         )
@@ -294,15 +294,22 @@ def _run_non_streaming(
         for _i in range(num_responses):
             tgr: TextGenerationOutput = self.client.text_generation(prepared_prompt, details=True, **generation_kwargs)
             message = ChatMessage.from_assistant(tgr.generated_text)
+            if tgr.details:
+                # `prompt_token_count` is deliberately left untouched: the usage dict below adds
+                # `completion_tokens` to it, so folding them in here would count them twice and
+                # inflate `prompt_tokens` on every loop iteration.
+                completion_tokens = len(tgr.details.tokens)
+                finish_reason = tgr.details.finish_reason
+            else:
+                finish_reason = None
+                completion_tokens = 0
             message.meta.update(
                 {
-                    "finish_reason": tgr.details.finish_reason,
+                    "finish_reason": finish_reason,
                     "index": _i,
                     "model": self.client.model,
                     "usage": {
-                        "completion_tokens": len(tgr.details.tokens),
+                        "completion_tokens": completion_tokens,
                         "prompt_tokens": prompt_token_count,
-                        "total_tokens": prompt_token_count + len(tgr.details.tokens),
+                        "total_tokens": prompt_token_count + completion_tokens,
                     },
                 }
             )