From ab28a2b695f82af641bb284597959e7e8f6c113f Mon Sep 17 00:00:00 2001 From: Vladimir Blagojevic Date: Mon, 11 Nov 2024 14:37:14 +0100 Subject: [PATCH] chore: Update transformers on 1.x branch (#8528) * Update transformers * New Anthropic tokenizer.json URL * Black it * Fix mypy failures * Fix pylint failures --------- Co-authored-by: Silvano Cerza --- haystack/document_stores/mongodb_atlas.py | 12 ++++++++---- haystack/modeling/model/language_model.py | 17 +++++++---------- haystack/modeling/model/prediction_head.py | 7 +++---- .../prompt/invocation_layer/anthropic_claude.py | 7 ++++--- haystack/utils/experiment_tracking.py | 2 +- pyproject.toml | 4 ++-- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/haystack/document_stores/mongodb_atlas.py b/haystack/document_stores/mongodb_atlas.py index 8c572f54d2..e7acd13e6a 100644 --- a/haystack/document_stores/mongodb_atlas.py +++ b/haystack/document_stores/mongodb_atlas.py @@ -1,19 +1,23 @@ import re from typing import Dict, Generator, List, Optional, Union + import numpy as np from tqdm import tqdm + +from haystack import __version__ as haystack_version from haystack.document_stores import BaseDocumentStore from haystack.errors import DocumentStoreError from haystack.nodes.retriever import DenseRetriever from haystack.schema import Document, FilterType from haystack.utils import get_batches_from_generator -from haystack import __version__ as haystack_version -from .mongodb_filters import mongo_filter_converter + from ..lazy_imports import LazyImport +from .mongodb_filters import mongo_filter_converter with LazyImport("Run 'pip install farm-haystack[mongodb]'") as mongodb_import: import pymongo from pymongo import InsertOne, ReplaceOne, UpdateOne + from pymongo.collection import Collection as MongoCollection from pymongo.driver_info import DriverInfo METRIC_TYPES = ["euclidean", "cosine", "dotProduct"] @@ -82,7 +86,7 @@ def __init__( def _create_document_field_map(self) -> Dict: return {self.embedding_field: "embedding"} - def _get_collection(self, index=None) -> "pymongo.collection.Collection": + def _get_collection(self, index=None) -> "MongoCollection": """ Returns the collection named by index or returns the collection specified when the driver was initialized. @@ -126,7 +130,7 @@ def delete_documents( elif (ids, filters) == (ids, filters): mongo_filters = {"$and": [mongo_filter_converter(filters), {"id": {"$in": ids}}]} - collection.delete_many(filter=mongo_filters) + collection.delete_many(filter=mongo_filters) # pylint: disable=possibly-used-before-assignment def delete_index(self, index=None): """ diff --git a/haystack/modeling/model/language_model.py b/haystack/modeling/model/language_model.py index e1f4028eba..abc15a5547 100644 --- a/haystack/modeling/model/language_model.py +++ b/haystack/modeling/model/language_model.py @@ -18,26 +18,24 @@ Thanks for the great work! """ -from typing import Type, Optional, Dict, Any, Union, List - -import re import json import logging import os +import re from abc import ABC, abstractmethod from pathlib import Path +from typing import Any, Dict, List, Optional, Type, Union + import numpy as np import torch -from torch import nn import transformers -from transformers import PretrainedConfig, PreTrainedModel -from transformers import AutoModel, AutoConfig +from torch import nn +from transformers import AutoConfig, AutoModel, PretrainedConfig, PreTrainedModel from transformers.modeling_utils import SequenceSummary from haystack.errors import ModelingError from haystack.modeling.utils import silence_transformers_logs - logger = logging.getLogger(__name__) @@ -213,8 +211,7 @@ def _pool_tokens( ): token_vecs = sequence_output.cpu().numpy() # we only take the aggregated value of non-padding tokens - padding_mask = padding_mask.cpu().numpy() - ignore_mask_2d = padding_mask == 0 + ignore_mask_2d = padding_mask.cpu().numpy() == 0 # sometimes we want to exclude the CLS token as well from our aggregation operation if ignore_first_token: ignore_mask_2d[:, 0] = True @@ -225,7 +222,7 @@ def _pool_tokens( if strategy == "reduce_mean": pooled_vecs = np.ma.array(data=token_vecs, mask=ignore_mask_3d).mean(axis=1).data - return pooled_vecs + return pooled_vecs # pylint: disable=possibly-used-before-assignment class HFLanguageModel(LanguageModel): diff --git a/haystack/modeling/model/prediction_head.py b/haystack/modeling/model/prediction_head.py index df025b5cdd..74136ddf23 100644 --- a/haystack/modeling/model/prediction_head.py +++ b/haystack/modeling/model/prediction_head.py @@ -502,15 +502,14 @@ def logits_to_preds( # sorted_candidates.shape : (batch_size, max_seq_len^2, 2) start_indices = torch.div(flat_sorted_indices, max_seq_len, rounding_mode="trunc") end_indices = flat_sorted_indices % max_seq_len - sorted_candidates = torch.cat((start_indices, end_indices), dim=2) # Get the n_best candidate answers for each sample - sorted_candidates = sorted_candidates.cpu().numpy() - start_end_matrix = start_end_matrix.cpu().numpy() + sorted_candidates = torch.cat((start_indices, end_indices), dim=2).cpu().numpy() + start_end_matrix_array = start_end_matrix.cpu().numpy() for sample_idx in range(batch_size): sample_top_n = self.get_top_candidates( sorted_candidates[sample_idx], - start_end_matrix[sample_idx], + start_end_matrix_array[sample_idx], sample_idx, start_matrix=start_matrix[sample_idx], end_matrix=end_matrix[sample_idx], diff --git a/haystack/nodes/prompt/invocation_layer/anthropic_claude.py b/haystack/nodes/prompt/invocation_layer/anthropic_claude.py index edd1c6c28c..539d0f9ded 100644 --- a/haystack/nodes/prompt/invocation_layer/anthropic_claude.py +++ b/haystack/nodes/prompt/invocation_layer/anthropic_claude.py @@ -22,9 +22,10 @@ # Taken from: # https://github.com/anthropics/anthropic-sdk-python/blob/main/anthropic/tokenizer.py#L7 # This is a JSON config to load the tokenizer used for Anthropic Claude. -CLAUDE_TOKENIZER_REMOTE_FILE = ( - "https://raw.githubusercontent.com/anthropics/anthropic-sdk-python/main/src/anthropic/tokenizer.json" -) +# Anthropic removed tokenizer.json from their repo (https://github.com/anthropics/anthropic-sdk-python/pull/726), +# we need to use the commit from the latest version of the SDK that still +# has it, i.e. 0.38.0 and commit hash 14afc93ffd809e60666a267763a57a328184c5e4. +CLAUDE_TOKENIZER_REMOTE_FILE = "https://raw.githubusercontent.com/anthropics/anthropic-sdk-python/14afc93ffd809e60666a267763a57a328184c5e4/src/anthropic/tokenizer.json" class AnthropicClaudeInvocationLayer(PromptModelInvocationLayer): diff --git a/haystack/utils/experiment_tracking.py b/haystack/utils/experiment_tracking.py index 2a9f8d1ef4..21195449d7 100644 --- a/haystack/utils/experiment_tracking.py +++ b/haystack/utils/experiment_tracking.py @@ -213,7 +213,7 @@ def track_params(self, params: Dict[str, Any]): def track_artifacts(self, dir_path: Union[str, Path], artifact_path: Optional[str] = None): try: - mlflow.log_artifacts(dir_path, artifact_path) + mlflow.log_artifacts(str(dir_path), artifact_path) except ConnectionError: logger.warning("ConnectionError in logging artifacts to MLflow") except Exception as e: diff --git a/pyproject.toml b/pyproject.toml index 5d99d923b4..136d9c8205 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,7 @@ dependencies = [ "requests", "httpx", "pydantic<2", - "transformers==4.39.3", + "transformers>=4.46,<5.0", "pandas", "rank_bm25", "scikit-learn>=1.3.0", # TF-IDF and metrics @@ -86,7 +86,7 @@ dependencies = [ [project.optional-dependencies] inference = [ - "transformers[torch,sentencepiece]==4.39.3", + "transformers[torch,sentencepiece]>=4.46,<5.0", "sentence-transformers<=3.0.0,>=2.3.1", # See haystack/nodes/retriever/_embedding_encoder.py, _SentenceTransformersEmbeddingEncoder "huggingface-hub>=0.5.0", ]