Merge branch 'langchain-ai:master' into meenakshiruia/16913/issue

langchain-ai · Dec 25, 2024 · 80d58b5 · 80d58b5
2 parents 3d5514a + 5991b45
commit 80d58b5
Show file tree

Hide file tree

Showing 14 changed files with 829 additions and 285 deletions.
diff --git a/docs/docs/integrations/chat/mlx.ipynb b/docs/docs/integrations/chat/mlx.ipynb
@@ -155,8 +155,48 @@
     "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
     "\n",
     "# setup ReAct style prompt\n",
-    "prompt = hub.pull(\"hwchase17/react-json\")\n",
-    "prompt = prompt.partial(\n",
+    "# Modified from the 'hwchase17/react' prompt, because MLX does not support the `System` role\n",
+    "human_prompt = \"\"\"\n",
+    "Answer the following questions as best you can. You have access to the following tools:\n",
+    "\n",
+    "{tools}\n",
+    "\n",
+    "The way you use the tools is by specifying a json blob.\n",
+    "Specifically, this json should have a `action` key (with the name of the tool to use) and a `action_input` key (with the input to the tool going here).\n",
+    "\n",
+    "The only values that should be in the \"action\" field are: {tool_names}\n",
+    "\n",
+    "The $JSON_BLOB should only contain a SINGLE action, do NOT return a list of multiple actions. Here is an example of a valid $JSON_BLOB:\n",
+    "\n",
+    "```\n",
+    "{{\n",
+    "  \"action\": $TOOL_NAME,\n",
+    "  \"action_input\": $INPUT\n",
+    "}}\n",
+    "```\n",
+    "\n",
+    "ALWAYS use the following format:\n",
+    "\n",
+    "Question: the input question you must answer\n",
+    "Thought: you should always think about what to do\n",
+    "Action:\n",
+    "```\n",
+    "$JSON_BLOB\n",
+    "```\n",
+    "Observation: the result of the action\n",
+    "... (this Thought/Action/Observation can repeat N times)\n",
+    "Thought: I now know the final answer\n",
+    "Final Answer: the final answer to the original input question\n",
+    "\n",
+    "Begin! Reminder to always use the exact characters `Final Answer` when responding.\n",
+    "\n",
+    "{input}\n",
+    "\n",
+    "{agent_scratchpad}\n",
+    "\n",
+    "\"\"\"\n",
+    "\n",
+    "prompt = human_prompt.partial(\n",
     "    tools=render_text_description(tools),\n",
     "    tool_names=\", \".join([t.name for t in tools]),\n",
     ")\n",
@@ -207,7 +247,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.18"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,

diff --git a/docs/docs/integrations/providers/cratedb.mdx b/docs/docs/integrations/providers/cratedb.mdx
@@ -0,0 +1,132 @@
+# CrateDB
+
+> [CrateDB] is a distributed and scalable SQL database for storing and
+> analyzing massive amounts of data in near real-time, even with complex
+> queries. It is PostgreSQL-compatible, based on Lucene, and inherits
+> from Elasticsearch.
+
+
+## Installation and Setup
+
+### Setup CrateDB
+There are two ways to get started with CrateDB quickly. Alternatively,
+choose other [CrateDB installation options].
+
+#### Start CrateDB on your local machine
+Example: Run a single-node CrateDB instance with security disabled,
+using Docker or Podman. This is not recommended for production use.
+
+```bash
+docker run --name=cratedb --rm \
+  --publish=4200:4200 --publish=5432:5432 --env=CRATE_HEAP_SIZE=2g \
+  crate:latest -Cdiscovery.type=single-node
+```
+
+#### Deploy cluster on CrateDB Cloud
+[CrateDB Cloud] is a managed CrateDB service. Sign up for a
+[free trial][CrateDB Cloud Console].
+
+### Install Client
+Install the most recent version of the `langchain-cratedb` package
+and a few others that are needed for this tutorial.
+```bash
+pip install --upgrade langchain-cratedb langchain-openai unstructured
+```
+
+
+## Documentation
+For a more detailed walkthrough of the CrateDB wrapper, see
+[using LangChain with CrateDB]. See also [all features of CrateDB]
+to learn about other functionality provided by CrateDB.
+
+
+## Features
+The CrateDB adapter for LangChain provides APIs to use CrateDB as a vector store,
+document loader, and storage for chat messages.
+
+### Vector Store
+Use the CrateDB vector store functionality around `FLOAT_VECTOR` and `KNN_MATCH`
+for similarity search and other purposes. See also [CrateDBVectorStore Tutorial].
+
+Make sure you've configured a valid OpenAI API key.
+```bash
+export OPENAI_API_KEY=sk-XJZ...
+```
+```python
+from langchain_community.document_loaders import UnstructuredURLLoader
+from langchain_cratedb import CrateDBVectorStore
+from langchain_openai import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+
+loader = UnstructuredURLLoader(urls=["https://github.com/langchain-ai/langchain/raw/refs/tags/langchain-core==0.3.28/docs/docs/how_to/state_of_the_union.txt"])
+documents = loader.load()
+text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+docs = text_splitter.split_documents(documents)
+
+embeddings = OpenAIEmbeddings()
+
+# Connect to a self-managed CrateDB instance on localhost.
+CONNECTION_STRING = "crate://?schema=testdrive"
+
+store = CrateDBVectorStore.from_documents(
+    documents=docs,
+    embedding=embeddings,
+    collection_name="state_of_the_union",
+    connection=CONNECTION_STRING,
+)
+
+query = "What did the president say about Ketanji Brown Jackson"
+docs_with_score = store.similarity_search_with_score(query)
+```
+
+### Document Loader
+Load documents from a CrateDB database table, using the document loader
+`CrateDBLoader`, which is based on SQLAlchemy. See also [CrateDBLoader Tutorial].
+
+To use the document loader in your applications:
+```python
+import sqlalchemy as sa
+from langchain_community.utilities import SQLDatabase
+from langchain_cratedb import CrateDBLoader
+
+# Connect to a self-managed CrateDB instance on localhost.
+CONNECTION_STRING = "crate://?schema=testdrive"
+
+db = SQLDatabase(engine=sa.create_engine(CONNECTION_STRING))
+
+loader = CrateDBLoader(
+    'SELECT * FROM sys.summits LIMIT 42',
+    db=db,
+)
+documents = loader.load()
+```
+
+### Chat Message History
+Use CrateDB as the storage for your chat messages.
+See also [CrateDBChatMessageHistory Tutorial].
+
+To use the chat message history in your applications:
+```python
+from langchain_cratedb import CrateDBChatMessageHistory
+
+# Connect to a self-managed CrateDB instance on localhost.
+CONNECTION_STRING = "crate://?schema=testdrive"
+
+message_history = CrateDBChatMessageHistory(
+    session_id="test-session",
+    connection=CONNECTION_STRING,
+)
+
+message_history.add_user_message("hi!")
+```
+
+
+[all features of CrateDB]: https://cratedb.com/docs/guide/feature/
+[CrateDB]: https://cratedb.com/database
+[CrateDB Cloud]: https://cratedb.com/database/cloud
+[CrateDB Cloud Console]: https://console.cratedb.cloud/?utm_source=langchain&utm_content=documentation
+[CrateDB installation options]: https://cratedb.com/docs/guide/install/
+[CrateDBChatMessageHistory Tutorial]: https://github.com/crate/cratedb-examples/blob/main/topic/machine-learning/llm-langchain/conversational_memory.ipynb
+[CrateDBLoader Tutorial]: https://github.com/crate/cratedb-examples/blob/main/topic/machine-learning/llm-langchain/document_loader.ipynb
+[CrateDBVectorStore Tutorial]: https://github.com/crate/cratedb-examples/blob/main/topic/machine-learning/llm-langchain/vector_search.ipynb
+[using LangChain with CrateDB]: https://cratedb.com/docs/guide/integrate/langchain/
diff --git a/docs/scripts/notebook_convert.py b/docs/scripts/notebook_convert.py
@@ -143,16 +143,22 @@ def _modify_frontmatter(
     edit_url = (
         f"https://github.com/langchain-ai/langchain/edit/master/docs/docs/{rel_path}"
     )
+    frontmatter = {
+        "custom_edit_url": edit_url,
+    }
     if re.match(r"^[\s\n]*---\n", body):
-        # if custom_edit_url already exists, leave it
-        if re.match(r"custom_edit_url: ", body):
-            return body
-        else:
-            return re.sub(
-                r"^[\s\n]*---\n", f"---\ncustom_edit_url: {edit_url}\n", body, count=1
-            )
+        # frontmatter already present
+
+        for k, v in frontmatter.items():
+            # if key already exists, leave it
+            if re.match(f"{k}: ", body):
+                continue
+            else:
+                body = re.sub(r"^[\s\n]*---\n", f"---\n{k}: {v}\n", body, count=1)
+        return body
     else:
-        return f"---\ncustom_edit_url: {edit_url}\n---\n{body}"
+        insert = "\n".join([f"{k}: {v}" for k, v in frontmatter.items()])
+        return f"---\n{insert}\n---\n{body}"
 
 
 def _convert_notebook(

diff --git a/docs/scripts/packages_yml_get_downloads.py b/docs/scripts/packages_yml_get_downloads.py
@@ -0,0 +1,71 @@
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+
+import requests
+from ruamel.yaml import YAML
+from ruamel.yaml.comments import CommentedMap
+
+# YAML handler used for every load and dump in this script.
+yaml = YAML()
+
+# libs/packages.yml, resolved three directory levels above this script file
+# (docs/scripts/<this file> -> repository root).
+PACKAGE_YML = Path(__file__).parent.parent.parent / "libs" / "packages.yml"
+
+
+def _get_downloads(p: dict) -> int:
+    """Return the last-month PyPI download count for package entry ``p``.
+
+    Queries the pypistats.org "recent" endpoint for ``p["name"]`` and returns
+    the ``data.last_month`` field of the JSON response.
+
+    Args:
+        p: Package entry; only its ``"name"`` key is read.
+
+    Raises:
+        requests.HTTPError: If the API answers with an error status.
+        requests.Timeout: If the API does not respond within the timeout.
+    """
+    url = f"https://pypistats.org/api/packages/{p['name']}/recent?period=month"
+    # requests applies no timeout by default; without one a stalled
+    # connection would hang the whole script indefinitely.
+    r = requests.get(url, timeout=30)
+    r.raise_for_status()
+    return r.json()["data"]["last_month"]
+
+
+# Single reference timestamp (UTC) for this run; entries refreshed after
+# `yesterday` are considered fresh by the update loop below.
+current_datetime = datetime.now(timezone.utc)
+yesterday = current_datetime - timedelta(hours=24)
+
+# Parse the package registry once; it is modified in memory and written back.
+with PACKAGE_YML.open() as f:
+    data = yaml.load(f)
+
+
+def _reorder_keys(p):
+    """Return a new ``CommentedMap`` with the entries of ``p`` in canonical order.
+
+    Keys missing from ``p`` are simply skipped. A key that is not part of the
+    canonical order raises ``ValueError`` rather than being dropped.
+    """
+    canonical_order = (
+        "name",
+        "name_title",
+        "path",
+        "repo",
+        "type",
+        "provider_page",
+        "js",
+        "downloads",
+        "downloads_updated_at",
+    )
+    # Compute the offending keys once and reuse them in the error message.
+    unexpected = set(p.keys()) - set(canonical_order)
+    if unexpected:
+        raise ValueError(f"Unexpected keys: {unexpected}")
+    ordered_pairs = ((field, p[field]) for field in canonical_order if field in p)
+    return CommentedMap(ordered_pairs)
+
+
+# Normalize key order for every package entry before touching download stats.
+data["packages"] = [_reorder_keys(p) for p in data["packages"]]
+
+seen = set()
+for p in data["packages"]:
+    # Package names must be unique within the registry.
+    if p["name"] in seen:
+        raise ValueError(f"Duplicate package: {p['name']}")
+    seen.add(p["name"])
+
+    downloads_updated_at_str = p.get("downloads_updated_at")
+    downloads_updated_at = (
+        datetime.fromisoformat(downloads_updated_at_str)
+        if downloads_updated_at_str
+        else None
+    )
+    # A timestamp stored without a UTC offset parses as a naive datetime, and
+    # comparing naive with aware raises TypeError; treat such values as UTC.
+    if downloads_updated_at is not None and downloads_updated_at.tzinfo is None:
+        downloads_updated_at = downloads_updated_at.replace(tzinfo=timezone.utc)
+
+    # Skip entries whose stats were refreshed within the last day.
+    if downloads_updated_at is not None and downloads_updated_at > yesterday:
+        print(f"done: {p['name']}: {p['downloads']}")
+        continue
+
+    p["downloads"] = _get_downloads(p)
+    p["downloads_updated_at"] = current_datetime.isoformat()
+    # Written after every fetch; presumably so that progress survives a
+    # failure on a later package (a rerun then skips the fresh entries).
+    with open(PACKAGE_YML, "w") as f:
+        yaml.dump(data, f)
+    print(f"{p['name']}: {p['downloads']}")
+
+
+# Final write: persists the reordered keys even when every entry was skipped.
+with open(PACKAGE_YML, "w") as f:
+    yaml.dump(data, f)