Merge branch 'v0.1.1'

Knowledgator · Jun 11, 2024 · e61d1c6 · e61d1c6
2 parents a67b55f + c4b9b4a
commit e61d1c6
Show file tree

Hide file tree

Showing 25 changed files with 1,001 additions and 21 deletions.
diff --git a/programs/db/chromadb/__init__.py b/programs/db/chromadb/__init__.py
diff --git a/programs/db/chromadb/main.py b/programs/db/chromadb/main.py
@@ -0,0 +1,48 @@
+from chromadb import PersistentClient, Documents, Embeddings
+
+from utca.core import AddData, ReplacingScope
+from utca.implementation.tasks import TransformersTextEmbedding
+from utca.implementation.datasources.db import (
+    ChromaDBGetOrCreateCollection,
+    ChromaDBCollectionAdd,
+    ChromaDBEmbeddingFunctionComponent,
+    ChromaDBCollectionQuery,
+)
+
+# Sentences for dataset
+sentences = [
+    "People who test positive for Covid-19 no longer need to routinely stay away from others for at least five days, according to new guidelines from the US Centers for Disease Control and Prevention issued Friday.", 
+    "The change ends a strategy from earlier in the pandemic that experts said has been important to controlling the spread of the infection.",
+    "Whether it be the latest prized Stanley cup or that 10-year-old plastic spout bottle you don’t go anywhere without, “emotional support water bottles” seem to be stuck to our sides and not going anywhere.",
+    "Health officials in Alaska recently reported the first known human death from a virus called Alaskapox.",
+    "Blizzard conditions continued to slam Northern California over the weekend with damaging winds and heavy snow dumping on mountain ridges down to the valleys.",
+]
+
+class EmbeddingFunction(ChromaDBEmbeddingFunctionComponent[Documents]):
+    def __call__(self, documents: Documents) -> Embeddings:
+        return embedding_pipe.run({"texts": documents})["embeddings"].tolist()
+
+if __name__ == "__main__":
+    embedding_pipe = TransformersTextEmbedding()
+
+    pipe = (
+        AddData({
+            "collection_name": "test",
+        })
+        | ChromaDBGetOrCreateCollection(
+            client=PersistentClient(), embedding_function=EmbeddingFunction(embedding_pipe) # type: ignore
+        ).use(get_key="collection_name")
+        | AddData({
+            "documents": sentences,
+            "ids": [f"id_{i}" for i in range(1, len(sentences)+1)]
+        })
+        | ChromaDBCollectionAdd()
+        | AddData({
+            "query_texts": ["Bad weather"],
+            "n_results": 1,
+            "include": ["documents", "distances"]
+        })
+        | ChromaDBCollectionQuery().use(set_key="results", replace=ReplacingScope.GLOBAL)
+    )
+
+    print(pipe.run()["results"])
diff --git a/programs/db/graph/README.md b/programs/db/graph/README.md
@@ -0,0 +1,13 @@
+# Neo4j
+
+To run this example, you can run Docker locally. Execute the following command to start the container before running the program:
+
+``` console
+sh run_neo4j_docker.sh
+```
+
+Then, run the program:
+
+``` console
+python main.py
+```
diff --git a/programs/db/graph/__init__.py b/programs/db/graph/__init__.py
diff --git a/programs/db/graph/main.py b/programs/db/graph/main.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+from typing import Any, Dict, cast
+
+from neo4j import ManagedTransaction
+
+from utca.core import While, ExecuteFunction
+from utca.implementation.datasources.db import (
+    Neo4jClient, Neo4jWriteAction
+)
+
+employee_threshold=10
+
+def employ_person_tx(tx: ManagedTransaction, name: str) -> str:
+    # Create new Person node with given name, if not exists already
+    result = tx.run("""
+        MERGE (p:Person {name: $name})
+        RETURN p.name AS name
+        """, name=name
+    )
+
+    # Obtain most recent organization ID and the number of people linked to it
+    result = tx.run("""
+        MATCH (o:Organization)
+        RETURN o.id AS id, COUNT{(p:Person)-[r:WORKS_FOR]->(o)} AS employees_n
+        ORDER BY o.created_date DESC
+        LIMIT 1
+    """)
+    org = result.single()
+
+    if org is not None and org["employees_n"] == 0:
+        raise Exception("Most recent organization is empty.")
+        # Transaction will roll back -> not even Person is created!
+
+    # If org does not have too many employees, add this Person to that
+    if org is not None and org.get("employees_n") < employee_threshold:
+        result = tx.run("""
+            MATCH (o:Organization {id: $org_id})
+            MATCH (p:Person {name: $name})
+            MERGE (p)-[r:WORKS_FOR]->(o)
+            RETURN $org_id AS id
+            """, org_id=org["id"], name=name
+        )
+
+    # Otherwise, create a new Organization and link Person to it
+    else:
+        result = tx.run("""
+            MATCH (p:Person {name: $name})
+            CREATE (o:Organization {id: randomuuid(), created_date: datetime()})
+            MERGE (p)-[r:WORKS_FOR]->(o)
+            RETURN o.id AS id
+            """, name=name
+        )
+
+    # Return the Organization ID to which the new Person ends up in
+    return cast(str, result.single(strict=True)["id"])
+
+if __name__ == "__main__":
+    # See shell script for docker
+    client = Neo4jClient(
+        url="neo4j://localhost:7687",
+        user="neo4j",
+        password="password",
+    )
+
+    employee_id = 0
+    def employee_name(_: Any) -> Dict[str, Any]:
+        global employee_id
+        employee_id += 1
+        return {"kwargs": {"name": f"Thor{employee_id}"}}
+
+    def print_message(input_data: Dict[str, Any]) -> None:
+        print(f'User {input_data["kwargs"]["name"]} added to organization {input_data["org_id"]}')
+
+    p = While(
+        ExecuteFunction(employee_name)
+        | Neo4jWriteAction(
+            database="neo4j",
+            transaction_function=employ_person_tx,
+            client=client,
+        ).use(set_key="org_id")
+        | ExecuteFunction(print_message),
+        max_iterations=100,
+    )
+    p.run()
+
+    client.close()
diff --git a/programs/db/graph/run_neo4j_docker.sh b/programs/db/graph/run_neo4j_docker.sh
@@ -0,0 +1,4 @@
+docker run \
+    --publish=7474:7474 --publish=7687:7687 \
+    --env NEO4J_AUTH=neo4j/password \
+    neo4j
diff --git a/programs/db/sql/__init__.py b/programs/db/sql/__init__.py
diff --git a/programs/db/main.py → programs/db/sql/main.py b/programs/db/main.py → programs/db/sql/main.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
+from typing import List, Optional
 
-from typing import List
-from typing import Optional
 from sqlalchemy import (
     ForeignKey, 
     String,

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "utca"
-version = "0.1.0"
+version = "0.1.1"
 description = ""
 authors = ["knowledgator.com"]
 readme = "README.md"
@@ -27,6 +27,8 @@ requests-html = "^0.10.0"
 reportlab = "^4.2.0"
 requests = "^2.32.2"
 gliner = "^0.2.2"
+neo4j = "^5.20.0"
+chromadb = "^0.5.0"
 
 
 [build-system]

diff --git a/src/utca/implementation/datasources/db/__init__.py b/src/utca/implementation/datasources/db/__init__.py
@@ -1,10 +1,40 @@
 from utca.implementation.datasources.db.sqlalchemy.main import (
     SQLSessionFactory, BaseModel, SQLAction, SQLActionWithReturns
 )
+from utca.implementation.datasources.db.neo4j.main import (
+    Neo4jClient, Neo4jReadAction, Neo4jWriteAction
+)
+from utca.implementation.datasources.db.chroma.main import (
+    ChromaDBCollectionAdd, 
+    ChromaDBCollectionUpdate,
+    ChromaDBCollectionUpsert,
+    ChromaDBGetCollection,
+    ChromaDBCreateCollection,
+    ChromaDBGetOrCreateCollection,
+    ChromaDBDeleteCollection,
+    ChromaDBCollectionGet,
+    ChromaDBCollectionQuery,   
+)
+from utca.implementation.datasources.db.chroma.schema import (
+    ChromaDBEmbeddingFunctionComponent,
+)
 
 __all__ = [
     "SQLSessionFactory",
     "BaseModel",
     "SQLAction",
     "SQLActionWithReturns",
+    "Neo4jClient",
+    "Neo4jReadAction",
+    "Neo4jWriteAction",
+    "ChromaDBGetCollection",
+    "ChromaDBCreateCollection",
+    "ChromaDBGetOrCreateCollection",
+    "ChromaDBDeleteCollection",
+    "ChromaDBCollectionAdd", 
+    "ChromaDBCollectionUpdate",
+    "ChromaDBCollectionUpsert",
+    "ChromaDBCollectionGet",
+    "ChromaDBCollectionQuery",
+    "ChromaDBEmbeddingFunctionComponent",
 ]
diff --git a/src/utca/implementation/datasources/db/chroma/__init__.py b/src/utca/implementation/datasources/db/chroma/__init__.py