diff --git a/.github/workflows/_all_ci.yml b/.github/workflows/_all_ci.yml
index 49e62c48d86db..37aacd426531f 100644
--- a/.github/workflows/_all_ci.yml
+++ b/.github/workflows/_all_ci.yml
@@ -52,8 +52,8 @@ jobs:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
 
-  pydantic-compatibility:
-    uses: ./.github/workflows/_pydantic_compatibility.yml
+  dependencies:
+    uses: ./.github/workflows/_dependencies.yml
     with:
       working-directory: ${{ inputs.working-directory }}
     secrets: inherit
diff --git a/.github/workflows/_pydantic_compatibility.yml b/.github/workflows/_dependencies.yml
similarity index 91%
rename from .github/workflows/_pydantic_compatibility.yml
rename to .github/workflows/_dependencies.yml
index 5381c1b3369b0..af01a7eafa77d 100644
--- a/.github/workflows/_pydantic_compatibility.yml
+++ b/.github/workflows/_dependencies.yml
@@ -1,4 +1,4 @@
-name: pydantic v1/v2 compatibility
+name: dependencies
 
 on:
   workflow_call:
@@ -28,7 +28,7 @@ jobs:
           - "3.9"
           - "3.10"
           - "3.11"
-    name: Pydantic v1/v2 compatibility - Python ${{ matrix.python-version }}
+    name: dependencies - Python ${{ matrix.python-version }}
     steps:
       - uses: actions/checkout@v4
 
@@ -42,7 +42,15 @@ jobs:
 
       - name: Install dependencies
        shell: bash
-        run: poetry install --with test
+        run: poetry install
+
+      - name: Check imports with base dependencies
+        shell: bash
+        run: poetry run make check_imports
+
+      - name: Install test dependencies
+        shell: bash
+        run: poetry install --with test
 
       - name: Install langchain editable
         working-directory: ${{ inputs.working-directory }}
diff --git a/libs/core/Makefile b/libs/core/Makefile
index e47a52d1fb034..ac30e94ac198e 100644
--- a/libs/core/Makefile
+++ b/libs/core/Makefile
@@ -15,6 +15,10 @@ tests:
 test_watch:
 	poetry run ptw --snapshot-update --now . -- -vv -x tests/unit_tests
 
+check_imports: langchain_core/**/*.py
+	for f in $^ ; do \
+		python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \
+	done
 
 extended_tests:
 	poetry run pytest --only-extended $(TEST_FILE)
@@ -32,7 +36,7 @@ lint_tests: PYTHON_FILES=tests
 
 lint lint_diff lint_package lint_tests:
 	./scripts/check_pydantic.sh .
-	./scripts/check_imports.sh
+	./scripts/lint_imports.sh
 	poetry run ruff .
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
 	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)
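The `check_imports` target above, invoked by the new "Check imports with base dependencies" CI step, loads every `.py` file in the package as a throwaway module, so a module-level import of an optional dependency fails the build immediately. Roughly what one iteration of the loop runs, sketched below; the file path is illustrative, and `load_module()` is deprecated but still present on the Python versions in the CI matrix:

    from importlib.machinery import SourceFileLoader

    # Load one source file under a throwaway module name, as the Makefile
    # recipe does for each matched file; any import error propagates.
    SourceFileLoader("x", "langchain_core/runnables/base.py").load_module()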
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES) diff --git a/libs/core/scripts/check_imports.sh b/libs/core/scripts/lint_imports.sh similarity index 100% rename from libs/core/scripts/check_imports.sh rename to libs/core/scripts/lint_imports.sh diff --git a/libs/experimental/Makefile b/libs/experimental/Makefile index c0174a286b7e5..a00ffaf8472db 100644 --- a/libs/experimental/Makefile +++ b/libs/experimental/Makefile @@ -21,6 +21,11 @@ extended_tests: integration_tests: poetry run pytest tests/integration_tests +check_imports: langchain_experimental/**/*.py + for f in $^ ; do \ + python -c "from importlib.machinery import SourceFileLoader; SourceFileLoader('x', '$$f').load_module()" || exit 1; \ + done + ###################### # LINTING AND FORMATTING diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py index 9db586c2848c3..72d2350d9da3e 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -1,10 +1,11 @@ import re from collections import defaultdict from dataclasses import dataclass, field -from typing import Dict, List +from typing import TYPE_CHECKING, Dict, List -from presidio_analyzer import RecognizerResult -from presidio_anonymizer.entities import EngineResult +if TYPE_CHECKING: + from presidio_analyzer import RecognizerResult + from presidio_anonymizer.entities import EngineResult MappingDataType = Dict[str, Dict[str, str]] @@ -62,8 +63,8 @@ def update(self, new_mapping: MappingDataType) -> None: def create_anonymizer_mapping( original_text: str, - analyzer_results: List[RecognizerResult], - anonymizer_results: EngineResult, + analyzer_results: List["RecognizerResult"], + anonymizer_results: "EngineResult", is_reversed: bool = False, ) -> MappingDataType: """Creates or updates the mapping used to anonymize and/or deanonymize text. diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 1b20607679e66..6161d47a6e74c 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -23,28 +23,62 @@ get_pseudoanonymizer_mapping, ) -try: - from presidio_analyzer import AnalyzerEngine +if TYPE_CHECKING: + from presidio_analyzer import AnalyzerEngine, EntityRecognizer from presidio_analyzer.nlp_engine import NlpEngineProvider - -except ImportError as e: - raise ImportError( - "Could not import presidio_analyzer, please install with " - "`pip install presidio-analyzer`. You will also need to download a " - "spaCy model to use the analyzer, e.g. " - "`python -m spacy download en_core_web_lg`." - ) from e -try: from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer.entities import OperatorConfig -except ImportError as e: - raise ImportError( - "Could not import presidio_anonymizer, please install with " - "`pip install presidio-anonymizer`." 
diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
index 1b20607679e66..6161d47a6e74c 100644
--- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
@@ -23,28 +23,62 @@
     get_pseudoanonymizer_mapping,
 )
 
-try:
-    from presidio_analyzer import AnalyzerEngine
+if TYPE_CHECKING:
+    from presidio_analyzer import AnalyzerEngine, EntityRecognizer
     from presidio_analyzer.nlp_engine import NlpEngineProvider
-
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_analyzer, please install with "
-        "`pip install presidio-analyzer`. You will also need to download a "
-        "spaCy model to use the analyzer, e.g. "
-        "`python -m spacy download en_core_web_lg`."
-    ) from e
-try:
     from presidio_anonymizer import AnonymizerEngine
     from presidio_anonymizer.entities import OperatorConfig
-except ImportError as e:
-    raise ImportError(
-        "Could not import presidio_anonymizer, please install with "
-        "`pip install presidio-anonymizer`."
-    ) from e
-if TYPE_CHECKING:
-    from presidio_analyzer import EntityRecognizer
+
+
+def _import_analyzer_engine() -> "AnalyzerEngine":
+    try:
+        from presidio_analyzer import AnalyzerEngine
+
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return AnalyzerEngine
+
+
+def _import_nlp_engine_provider() -> "NlpEngineProvider":
+    try:
+        from presidio_analyzer.nlp_engine import NlpEngineProvider
+
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_analyzer, please install with "
+            "`pip install presidio-analyzer`. You will also need to download a "
+            "spaCy model to use the analyzer, e.g. "
+            "`python -m spacy download en_core_web_lg`."
+        ) from e
+    return NlpEngineProvider
+
+
+def _import_anonymizer_engine() -> "AnonymizerEngine":
+    try:
+        from presidio_anonymizer import AnonymizerEngine
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return AnonymizerEngine
+
+
+def _import_operator_config() -> "OperatorConfig":
+    try:
+        from presidio_anonymizer.entities import OperatorConfig
+    except ImportError as e:
+        raise ImportError(
+            "Could not import presidio_anonymizer, please install with "
+            "`pip install presidio-anonymizer`."
+        ) from e
+    return OperatorConfig
+
 
 # Configuring Anonymizer for multiple languages
 # Detailed description and examples can be found here:
@@ -89,6 +123,11 @@ def __init__(
         Defaults to None, in which case faker will be seeded randomly
             and provide random values.
         """
+        OperatorConfig = _import_operator_config()
+        AnalyzerEngine = _import_analyzer_engine()
+        NlpEngineProvider = _import_nlp_engine_provider()
+        AnonymizerEngine = _import_anonymizer_engine()
+
         self.analyzed_fields = (
             analyzed_fields
             if analyzed_fields is not None
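With the module-level try/except blocks replaced by per-class `_import_*` helpers, importing `presidio.py` no longer needs presidio at all; the descriptive ImportError now surfaces when an anonymizer is constructed. A sketch of the resulting behavior (actually running it requires presidio-analyzer, presidio-anonymizer, and a spaCy model):

    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    # The _import_* helpers run inside __init__, so a missing presidio
    # install raises its descriptive ImportError here, not at import time.
    anonymizer = PresidioAnonymizer()
    print(anonymizer.anonymize("My name is John Doe"))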
[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff [ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES) diff --git a/libs/langchain/langchain/utilities/github.py b/libs/langchain/langchain/utilities/github.py index 940708718332b..9983951be4d17 100644 --- a/libs/langchain/langchain/utilities/github.py +++ b/libs/langchain/langchain/utilities/github.py @@ -5,7 +5,6 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional import requests -import tiktoken from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator from langchain.utils import get_from_dict_or_env @@ -15,6 +14,18 @@ from github.PullRequest import PullRequest +def _import_tiktoken() -> Any: + """Import tiktoken.""" + try: + import tiktoken + except ImportError: + raise ImportError( + "tiktoken is not installed. " + "Please install it with `pip install tiktoken`" + ) + return tiktoken + + class GitHubAPIWrapper(BaseModel): """Wrapper for GitHub API.""" @@ -385,6 +396,7 @@ def list_pull_request_files(self, pr_number: int) -> List[Dict[str, Any]]: dict: A dictionary containing the issue's title, body, and comments as a string """ + tiktoken = _import_tiktoken() MAX_TOKENS_FOR_FILES = 3_000 pr_files = [] pr = self.github_repo_instance.get_pull(number=int(pr_number)) @@ -453,6 +465,7 @@ def get_pull_request(self, pr_number: int) -> Dict[str, Any]: total_tokens = 0 def get_tokens(text: str) -> int: + tiktoken = _import_tiktoken() return len(tiktoken.get_encoding("cl100k_base").encode(text)) def add_to_dict(data_dict: Dict[str, Any], key: str, value: str) -> None: diff --git a/libs/langchain/langchain/vectorstores/_pgvector_data_models.py b/libs/langchain/langchain/vectorstores/_pgvector_data_models.py deleted file mode 100644 index 1a4b60776537b..0000000000000 --- a/libs/langchain/langchain/vectorstores/_pgvector_data_models.py +++ /dev/null @@ -1,71 +0,0 @@ -from typing import Optional, Tuple - -import sqlalchemy -from pgvector.sqlalchemy import Vector -from sqlalchemy.dialects.postgresql import JSON, UUID -from sqlalchemy.orm import Session, relationship - -from langchain.vectorstores.pgvector import BaseModel - - -class CollectionStore(BaseModel): - """Collection store.""" - - __tablename__ = "langchain_pg_collection" - - name = sqlalchemy.Column(sqlalchemy.String) - cmetadata = sqlalchemy.Column(JSON) - - embeddings = relationship( - "EmbeddingStore", - back_populates="collection", - passive_deletes=True, - ) - - @classmethod - def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]: - return session.query(cls).filter(cls.name == name).first() # type: ignore - - @classmethod - def get_or_create( - cls, - session: Session, - name: str, - cmetadata: Optional[dict] = None, - ) -> Tuple["CollectionStore", bool]: - """ - Get or create a collection. - Returns [Collection, bool] where the bool is True if the collection was created. 
- """ - created = False - collection = cls.get_by_name(session, name) - if collection: - return collection, created - - collection = cls(name=name, cmetadata=cmetadata) - session.add(collection) - session.commit() - created = True - return collection, created - - -class EmbeddingStore(BaseModel): - """Embedding store.""" - - __tablename__ = "langchain_pg_embedding" - - collection_id = sqlalchemy.Column( - UUID(as_uuid=True), - sqlalchemy.ForeignKey( - f"{CollectionStore.__tablename__}.uuid", - ondelete="CASCADE", - ), - ) - collection = relationship(CollectionStore, back_populates="embeddings") - - embedding: Vector = sqlalchemy.Column(Vector(None)) - document = sqlalchemy.Column(sqlalchemy.String, nullable=True) - cmetadata = sqlalchemy.Column(JSON, nullable=True) - - # custom_id : any user defined id - custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True) diff --git a/libs/langchain/langchain/vectorstores/pgvector.py b/libs/langchain/langchain/vectorstores/pgvector.py index c10686bf2b4ba..4f2ccb9b899ca 100644 --- a/libs/langchain/langchain/vectorstores/pgvector.py +++ b/libs/langchain/langchain/vectorstores/pgvector.py @@ -7,7 +7,6 @@ import uuid from functools import partial from typing import ( - TYPE_CHECKING, Any, Callable, Dict, @@ -22,8 +21,8 @@ import numpy as np import sqlalchemy from sqlalchemy import delete -from sqlalchemy.dialects.postgresql import UUID -from sqlalchemy.orm import Session +from sqlalchemy.dialects.postgresql import JSON, UUID +from sqlalchemy.orm import Session, relationship try: from sqlalchemy.orm import declarative_base @@ -37,9 +36,6 @@ from langchain.utils import get_from_dict_or_env from langchain.vectorstores.utils import maximal_marginal_relevance -if TYPE_CHECKING: - from langchain.vectorstores._pgvector_data_models import CollectionStore - class DistanceStrategy(str, enum.Enum): """Enumerator of the Distance strategies.""" @@ -64,6 +60,74 @@ class BaseModel(Base): uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) +class CollectionStore(BaseModel): + """Collection store.""" + + __tablename__ = "langchain_pg_collection" + + name = sqlalchemy.Column(sqlalchemy.String) + cmetadata = sqlalchemy.Column(JSON) + + embeddings = relationship( + "EmbeddingStore", + back_populates="collection", + passive_deletes=True, + ) + + @classmethod + def get_by_name(cls, session: Session, name: str) -> Optional["CollectionStore"]: + return session.query(cls).filter(cls.name == name).first() # type: ignore + + @classmethod + def get_or_create( + cls, + session: Session, + name: str, + cmetadata: Optional[dict] = None, + ) -> Tuple["CollectionStore", bool]: + """ + Get or create a collection. + Returns [Collection, bool] where the bool is True if the collection was created. 
+ """ + created = False + collection = cls.get_by_name(session, name) + if collection: + return collection, created + + collection = cls(name=name, cmetadata=cmetadata) + session.add(collection) + session.commit() + created = True + return collection, created + + +def _get_embedding_store() -> Any: + from pgvector.sqlalchemy import Vector + + class EmbeddingStore(BaseModel): + """Embedding store.""" + + __tablename__ = "langchain_pg_embedding" + + collection_id = sqlalchemy.Column( + UUID(as_uuid=True), + sqlalchemy.ForeignKey( + f"{CollectionStore.__tablename__}.uuid", + ondelete="CASCADE", + ), + ) + collection = relationship(CollectionStore, back_populates="embeddings") + + embedding: Vector = sqlalchemy.Column(Vector(None)) + document = sqlalchemy.Column(sqlalchemy.String, nullable=True) + cmetadata = sqlalchemy.Column(JSON, nullable=True) + + # custom_id : any user defined id + custom_id = sqlalchemy.Column(sqlalchemy.String, nullable=True) + + return EmbeddingStore + + def _results_to_docs(docs_and_scores: Any) -> List[Document]: """Return docs from docs and scores.""" return [doc for doc, _ in docs_and_scores] @@ -138,13 +202,9 @@ def __post_init__( ) -> None: """Initialize the store.""" self.create_vector_extension() - from langchain.vectorstores._pgvector_data_models import ( - CollectionStore, - EmbeddingStore, - ) self.CollectionStore = CollectionStore - self.EmbeddingStore = EmbeddingStore + self.EmbeddingStore = _get_embedding_store() self.create_tables_if_not_exists() self.create_collection() diff --git a/libs/langchain/scripts/check_imports.sh b/libs/langchain/scripts/lint_imports.sh similarity index 100% rename from libs/langchain/scripts/check_imports.sh rename to libs/langchain/scripts/lint_imports.sh