Skip to content

Commit

Permalink
enhancement: reduced usage of numpy and substituted built-in libraries (
Browse files Browse the repository at this point in the history
#8418)

* reduced usage of numpy and substituted built-in libraries

* added release note

* edited expit function to support both a float and a list (this case was causing a CI error)

* revert code, numpy can't be removed here

* more cleaning

* fix relnote

---------

Co-authored-by: anakin87 <[email protected]>
  • Loading branch information
ajit97singh and anakin87 authored Oct 18, 2024
1 parent ff584f1 commit 6cf13e8
Show file tree
Hide file tree
Showing 11 changed files with 31 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional, cast

import numpy as np
from typing import Any, Dict, List, Optional

from haystack.lazy_imports import LazyImport
from haystack.utils.auth import Secret
Expand Down Expand Up @@ -78,5 +76,5 @@ def __init__(
)

def embed(self, data: List[str], **kwargs) -> List[List[float]]:
embeddings = cast(np.ndarray, self.model.encode(data, **kwargs)).tolist()
embeddings = self.model.encode(data, **kwargs).tolist()
return embeddings
3 changes: 0 additions & 3 deletions haystack/testing/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
import os
import random

import numpy as np

from haystack import logging

logger = logging.getLogger(__name__)
Expand All @@ -23,7 +21,6 @@ def set_all_seeds(seed: int, deterministic_cudnn: bool = False) -> None:
:param deterministic_cudnn: Enable for full reproducibility when using CUDA. Caution: might slow down training.
"""
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

try:
Expand Down
12 changes: 8 additions & 4 deletions haystack/utils/expit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
#
# SPDX-License-Identifier: Apache-2.0

import numpy as np
from numpy import exp


def expit(x: float) -> float:
"""Compute logistic sigmoid function. Maps input values to a range between 0 and 1"""
return 1 / (1 + np.exp(-x))
def expit(x) -> float:
    """
    Compute logistic sigmoid function. Maps input values to a range between 0 and 1

    The exponential is only ever evaluated at a non-positive argument, so very negative
    inputs do not overflow (the naive ``1 / (1 + exp(-x))`` makes numpy emit an overflow
    RuntimeWarning for them). Results for ``x >= 0`` are computed exactly as before; for
    ``x < 0`` the mathematically identical form ``exp(x) / (1 + exp(x))`` is used.

    :param x: input value. Can be a scalar or a numpy array.
    :returns: the sigmoid of ``x``, computed element-wise when ``x`` is an array.
    """
    # exp(-|x|) always lies in [0, 1], so it cannot overflow whatever the magnitude of x.
    decay = exp(-abs(x))
    # The boolean masks pick the branch matching the sign of x and work for both
    # scalars and arrays; NaN inputs still propagate as NaN through `decay`.
    return (x >= 0) * (1 / (1 + decay)) + (x < 0) * (decay / (1 + decay))
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Reduced numpy usage to speed up imports.
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import os
from unittest.mock import MagicMock, patch

import random
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from numpy import array, random

from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
from haystack.dataclasses import Document
Expand All @@ -24,7 +24,7 @@ def mock_check_valid_model():


def mock_embedding_generation(json, **kwargs):
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
return response


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import os
from unittest.mock import MagicMock, patch

import random
import pytest
from huggingface_hub.utils import RepositoryNotFoundError
from numpy import array, random

from haystack.components.embedders import HuggingFaceAPITextEmbedder
from haystack.utils.auth import Secret
Expand All @@ -22,7 +22,7 @@ def mock_check_valid_model():


def mock_embedding_generation(json, **kwargs):
response = str(array([random.rand(384) for i in range(len(json["inputs"]))]).tolist()).encode()
response = str([[random.random() for _ in range(384)] for _ in range(len(json["inputs"]))]).encode()
return response


Expand Down
5 changes: 3 additions & 2 deletions test/components/embedders/test_openai_document_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import List
from haystack.utils.auth import Secret

import numpy as np
import random
import pytest

from haystack import Document
Expand All @@ -16,7 +16,8 @@ def mock_openai_response(input: List[str], model: str = "text-embedding-ada-002"
dict_response = {
"object": "list",
"data": [
{"object": "embedding", "index": i, "embedding": np.random.rand(1536).tolist()} for i in range(len(input))
{"object": "embedding", "index": i, "embedding": [random.random() for _ in range(1536)]}
for i in range(len(input))
],
"model": model,
"usage": {"prompt_tokens": 4, "total_tokens": 4},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import MagicMock, patch

import numpy as np
import random
import pytest
import torch

Expand Down Expand Up @@ -264,7 +264,9 @@ def test_warmup_doesnt_reload(self, mocked_factory):
def test_run(self):
embedder = SentenceTransformersDocumentEmbedder(model="model")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
embedder.embedding_backend.embed = lambda x, **kwargs: [
[random.random() for _ in range(16)] for _ in range(len(x))
]

documents = [Document(content=f"document number {i}") for i in range(5)]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from unittest.mock import MagicMock, patch

import torch
import numpy as np
import random
import pytest

from haystack.components.embedders.sentence_transformers_text_embedder import SentenceTransformersTextEmbedder
Expand Down Expand Up @@ -239,7 +239,9 @@ def test_warmup_doesnt_reload(self, mocked_factory):
def test_run(self):
embedder = SentenceTransformersTextEmbedder(model="model")
embedder.embedding_backend = MagicMock()
embedder.embedding_backend.embed = lambda x, **kwargs: np.random.rand(len(x), 16).tolist()
embedder.embedding_backend.embed = lambda x, **kwargs: [
[random.random() for _ in range(16)] for _ in range(len(x))
]

text = "a nice text to embed"

Expand Down
1 change: 0 additions & 1 deletion test/components/evaluators/test_faithfulness_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import math
from typing import List

import numpy as np
import pytest

from haystack import Pipeline
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from typing import Dict, Any

import pytest
import numpy as np

from haystack import Pipeline, DeserializationError
from haystack.document_stores.types import FilterPolicy
Expand Down Expand Up @@ -135,7 +134,7 @@ def test_valid_run(self):

assert "documents" in result
assert len(result["documents"]) == top_k
assert np.array_equal(result["documents"][0].embedding, [1.0, 1.0, 1.0, 1.0])
assert result["documents"][0].embedding == [1.0, 1.0, 1.0, 1.0]

def test_invalid_run_wrong_store_type(self):
SomeOtherDocumentStore = document_store_class("SomeOtherDocumentStore")
Expand Down Expand Up @@ -165,4 +164,4 @@ def test_run_with_pipeline(self):
results_docs = result["retriever"]["documents"]
assert results_docs
assert len(results_docs) == top_k
assert np.array_equal(results_docs[0].embedding, [1.0, 1.0, 1.0, 1.0])
assert results_docs[0].embedding == [1.0, 1.0, 1.0, 1.0]

0 comments on commit 6cf13e8

Please sign in to comment.