added code and examples
leo-gan committed Nov 23, 2023
1 parent 300ff01 commit 24644a7
Showing 5 changed files with 372 additions and 0 deletions.
228 changes: 228 additions & 0 deletions docs/docs/integrations/document_loaders/kaggle_dataset.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "04c9fdc5",
"metadata": {},
"source": [
"# Kaggle dataset\n",
"\n",
">[Kaggle](https://www.kaggle.com/) is a data science competition platform and online community of data scientists and machine learning practitioners under `Google LLC`. `Kaggle` enables users to find and publish datasets, explore and build models in a web-based data science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.\n",
"\n",
"\n",
"This notebook shows how to load [`Kaggle` datasets](https://www.kaggle.com/datasets) into LangChain."
]
},
{
"cell_type": "markdown",
"id": "54b7fc48-ee05-4f71-bead-ed59fd18ca40",
"metadata": {},
"source": [
"## Setting up\n",
"\n",
"Follow these steps to use this loader:\n",
"- [Register a Kaggle account and create an API token](https://www.kaggle.com/settings)\n",
"- Install `kaggle` and `pandas` python packages with `pip install kaggle pandas`\n",
"- Use `kaggle datasets list` to list all available datasets\n",
"- Use `kaggle datasets download <dataset_name>` to download the dataset\n",
"- Use `unzip <dataset_zipfile_name>` to extract all files in the dataset\n",
"- Open the dataset CSV file and choose the column name for page content\n",
"- Use the dataset CSV file name and the column name to\n",
" initialize the `KaggleDatasetLoader`\n",
"\n",
"Note: Other columns in the dataset CSV file will be treated as metadata.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5cefa09a-53b2-4f4e-b54e-8d88bb25f1a3",
"metadata": {},
"outputs": [],
"source": [
"#!pip install kaggle pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "3611e092",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/leo/.kaggle/kaggle.json'\n",
"ref title size lastUpdated downloadCount voteCount usabilityRating \n",
"-------------------------------------------------------------- ----------------------------------------------- ----- ------------------- ------------- --------- --------------- \n",
"carlmcbrideellis/llm-7-prompt-training-dataset LLM: 7 prompt training dataset 41MB 2023-11-15 07:32:56 926 82 1.0 \n",
"thedrcat/daigt-v2-train-dataset DAIGT V2 Train Dataset 29MB 2023-11-16 01:38:36 438 70 1.0 \n",
"thedrcat/daigt-proper-train-dataset DAIGT Proper Train Dataset 119MB 2023-11-05 14:03:25 1000 112 1.0 \n",
"joebeachcapital/30000-spotify-songs 30000 Spotify Songs 3MB 2023-11-01 06:06:43 5906 148 1.0 \n",
"ddosad/auto-sales-data Automobile Sales data 79KB 2023-11-18 12:36:41 1243 35 1.0 \n",
"everydaycodings/job-opportunity-dataset Job Opportunities Dataset 95KB 2023-11-20 08:33:14 783 26 1.0 \n",
"iamsouravbanerjee/customer-shopping-trends-dataset Customer Shopping Trends Dataset 146KB 2023-10-05 06:45:37 31292 583 1.0 \n",
"dillonmyrick/high-school-student-performance-and-demographics High School Student Performance & Demographics 24KB 2023-11-10 01:33:35 2421 40 1.0 \n",
"nelgiriyewithana/world-educational-data World Educational Data 9KB 2023-11-04 06:10:17 4623 97 1.0 \n",
"prasad22/healthcare-dataset 🩺Healthcare Dataset 🧪 483KB 2023-10-31 11:30:58 4769 82 1.0 \n",
"mauryansshivam/list-of-internet-products-of-top-tech-companies List of Internet Products of Top Tech Companies 9KB 2023-11-15 19:56:56 980 24 1.0 \n",
"alejopaullier/daigt-external-dataset DAIGT | External Dataset 3MB 2023-10-31 19:11:35 793 113 0.7647059 \n",
"lakshayjain611/imdb-100-lowest-ranked-movies-dataset IMDb 100 Lowest Ranked Movies Dataset 8KB 2023-11-11 09:33:18 1033 33 1.0 \n",
"jacksondivakarr/online-shopping-dataset 🛒 Online Shopping Dataset 📊📉📈 5MB 2023-11-12 12:35:58 2187 47 1.0 \n",
"bwandowando/1-5-million-netflix-google-store-reviews 🎬🎥1.5 Million Netflix Google Store Reviews 114MB 2023-11-17 01:30:48 487 24 1.0 \n",
"samyakb/student-stress-factors Student stress factors 887B 2023-11-02 12:42:11 5294 87 0.9411765 \n",
"jdaustralia/icc-cwc23-all-innings-cleaned ICC Cricket World Cup CWC23 All innings 28KB 2023-11-21 04:41:01 805 21 0.9411765 \n",
"anshtanwar/top-200-trending-books-with-reviews Top 100 Bestselling Book Reviews on Amazon 422KB 2023-11-09 06:31:02 5677 53 1.0 \n",
"miquelneck/worlds-spotify-top-50-playlist-musicality-data World's Spotify TOP-50 playlist musicality data 171KB 2023-11-16 11:14:23 1281 36 1.0 \n",
"joebeachcapital/coronavirus-covid-19-cases-daily-updates Coronavirus (COVID-19) Cases (Daily Updates) 14MB 2023-11-22 23:28:03 834 26 1.0 \n"
]
}
],
"source": [
"!kaggle datasets list"
]
},
{
"cell_type": "markdown",
"id": "020462f8-9dea-43d0-9365-7cbf3c445440",
"metadata": {},
"source": [
"## Example\n",
"\n",
"Here we download one of the datasets used for prompt training."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5e903ebc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/leo/.kaggle/kaggle.json'\n",
"Downloading llm-7-prompt-training-dataset.zip to /home/leo/PycharmProjects/GLD/langchain/docs/docs/integrations/document_loaders\n",
"100%|██████████████████████████████████████| 41.4M/41.4M [00:18<00:00, 2.43MB/s]\n",
"100%|██████████████████████████████████████| 41.4M/41.4M [00:18<00:00, 2.40MB/s]\n"
]
}
],
"source": [
"!kaggle datasets download carlmcbrideellis/llm-7-prompt-training-dataset"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e8559946",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Archive: llm-7-prompt-training-dataset.zip\n",
" inflating: train_essays_7_prompts.csv \n",
" inflating: train_essays_7_prompts_v2.csv \n",
" inflating: train_essays_RDizzl3_seven_v1.csv \n",
" inflating: train_essays_RDizzl3_seven_v2.csv \n"
]
}
],
"source": [
"!unzip llm-7-prompt-training-dataset.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "109dcac9-25e3-46b8-b1cf-e6a4e2fd562a",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import KaggleDatasetLoader"
]
},
{
"cell_type": "markdown",
"id": "021bc377",
"metadata": {},
"source": [
"`KaggleDatasetLoader` has these arguments:\n",
"- **dataset_path**: Path to the dataset CSV file.\n",
"- **page_content_column**: Column name of the page content."
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d924885c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"14877"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"loader = KaggleDatasetLoader(\n",
" dataset_path=\"train_essays_7_prompts.csv\", \n",
" page_content_column=\"text\"\n",
")\n",
"docs = loader.load()\n",
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f94ce6a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major role in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do.\\n\\nIn like matter of this, article, \"In German Suburb, Life Goes On Without Cars,\" by Elizabeth Rosenthal states, how automobiles are the linchpin of suburbs, where middle class families from either Shanghai or Chicago tend to make their homes. Experts say how this is a huge impediment to current efforts to reduce greenhouse gas emissions from tailpipe. Passenger cars are responsible for 12 percent of greenhouse gas emissions in Europe...and up to 50 percent in some carintensive areas in the United States. Cars are the main reason for the greenhouse gas emissions because of a lot of people driving them around all the time getting where they need to go. Article, \"Paris bans driving due to smog,\" by Robert Duffer says, how Paris, after days of nearrecord pollution, enforced a partial driving ban to clear the air of the global city. It also says, how on Monday, motorist with evennumbered license plates were ordered to leave their cars at home or be fined a 22euro fine 31. The same order would be applied to oddnumbered plates the following day. Cars are the reason for polluting entire cities like Paris. This shows how bad cars can be because, of all the pollution that they can cause to an entire city.\\n\\nLikewise, in the article, \"Carfree day is spinning into a big hit in Bogota,\" by Andrew Selsky says, how programs that\\'s set to spread to other countries, millions of Columbians hiked, biked, skated, or took the bus to work during a carfree day, leaving streets of this capital city eerily devoid of traffic jams. It was the third straight year cars have been banned with only buses and taxis permitted for the Day Without Cars in the capital city of 7 million. People like the idea of having carfree days because, it allows them to lesson the pollution that cars put out of their exhaust from people driving all the time. The article also tells how parks and sports centers have bustled throughout the city uneven, pitted sidewalks have been replaced by broad, smooth sidewalks rushhour restrictions have dramatically cut traffic and new restaurants and upscale shopping districts have cropped up. Having no cars has been good for the country of Columbia because, it has aloud them to repair things that have needed repairs for a long time, traffic jams have gone down, and restaurants and shopping districts have popped up, all due to the fact of having less cars around.\\n\\nIn conclusion, the use of less cars and having carfree days, have had a big impact on the environment of cities because, it is cutting down the air pollution that the cars have majorly polluted, it has aloud countries like Columbia to repair sidewalks, and cut down traffic jams. Limiting the use of cars would be a good thing for America. So we should limit the use of cars by maybe riding a bike, or maybe walking somewhere that isn\\'t that far from you and doesn\\'t need the use of a car to get you there. To me, limiting the use of cars might be a good thing to do.', metadata={'label': 0})"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"docs[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
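The notebook above converts each CSV row into a document: one chosen column becomes the page content and every other column becomes metadata. That core transformation can be sketched without LangChain at all, using only the standard library; the `Document` dataclass below is a hypothetical stand-in for `langchain.docstore.document.Document`, and `load_csv_as_documents` is our own helper name, not part of any library.

```python
import csv
import io
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class Document:
    """Stand-in for langchain.docstore.document.Document."""

    page_content: str
    metadata: Dict = field(default_factory=dict)


def load_csv_as_documents(csv_text: str, page_content_column: str) -> List[Document]:
    """Turn each CSV row into a Document: the chosen column becomes the
    content, every other column lands in the metadata dict."""
    reader = csv.DictReader(io.StringIO(csv_text))
    docs = []
    for row in reader:
        content = row.get(page_content_column) or ""
        metadata = {k: v for k, v in row.items() if k != page_content_column}
        docs.append(Document(page_content=content, metadata=metadata))
    return docs


sample = "text,label\nessay one,0\nessay two,1\n"
docs = load_csv_as_documents(sample, "text")
print(len(docs))             # 2
print(docs[0].page_content)  # essay one
print(docs[0].metadata)      # {'label': '0'}
```

Note that `csv.DictReader` yields every value as a string; the real loader goes through `pandas.read_csv`, which infers numeric types, so metadata values there can be ints or floats.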
17 changes: 17 additions & 0 deletions docs/docs/integrations/providers/kaggle.mdx
@@ -0,0 +1,17 @@
# Kaggle

>[Kaggle](https://www.kaggle.com/) is a data science competition platform and online community of data scientists and machine learning practitioners under `Google LLC`. `Kaggle` enables users to find and publish datasets, explore and build models in a web-based data science environment, work with other data scientists and machine learning engineers, and enter competitions to solve data science challenges.
## Installation and Setup

You need to install the `kaggle` and `pandas` Python packages.

```bash
pip install kaggle pandas
```

## Document Loader

Kaggle hosts many [datasets](https://www.kaggle.com/datasets) that can be used to train and evaluate your LLM chains.

See a [usage example and detailed installation instructions](/docs/integrations/document_loaders/kaggle_dataset).
3 changes: 3 additions & 0 deletions libs/langchain/langchain/document_loaders/__init__.py
@@ -198,6 +198,8 @@
YoutubeLoader,
)

from langchain.document_loaders.kaggle_dataset import KaggleDatasetLoader

# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader

@@ -282,6 +284,7 @@
"IuguLoader",
"JSONLoader",
"JoplinLoader",
"KaggleDatasetLoader",
"LarkSuiteDocLoader",
"LakeFSLoader",
"MHTMLLoader",
74 changes: 74 additions & 0 deletions libs/langchain/langchain/document_loaders/kaggle_dataset.py
@@ -0,0 +1,74 @@
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class KaggleDatasetLoader(BaseLoader):
    """Load documents from a `Kaggle` dataset CSV file.

    Follow these steps to use this loader:
    - Register a Kaggle account and create an API token.
      See https://www.kaggle.com/settings
    - Install `kaggle` and `pandas` python packages
      with `pip install kaggle pandas`
    - Use `kaggle datasets list` to list all available datasets
    - Use `kaggle datasets download <dataset_name>` to download the dataset
    - Use `unzip <dataset_zipfile_name>` to extract all files in the dataset
    - Open the dataset CSV file and choose the column name for page content
    - Use the dataset CSV file name and the column name to
      initialize the KaggleDatasetLoader

    Note: All other columns in the dataset CSV file are treated as metadata.
    """

    def __init__(self, dataset_path: str, page_content_column: str):
        """Initialize the KaggleDatasetLoader.

        Args:
            dataset_path: Path to the dataset CSV file.
            page_content_column: Name of the column that holds the page content.
        """
        self.dataset_path = dataset_path
        self.page_content_column = page_content_column

    def lazy_load(self) -> Iterator[Document]:
        """Load documents lazily."""
        for doc in self.load():
            yield doc

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            import pandas as pd
        except ImportError:
            raise ImportError(
                "Could not import pandas python package. "
                "Please install it with `pip install pandas`."
            )

        df = pd.read_csv(self.dataset_path)
        return [self._sample2document(row) for _, row in df.iterrows()]

    def _sample2document(self, sample) -> Document:
        """Convert a pandas dataframe row into a Document.

        The `page_content_column` value becomes the document content;
        all other fields go into the metadata.
        """
        assert self.page_content_column in sample.index, (
            f"content field {self.page_content_column} "
            f"should be in the sample columns: {sample}"
        )
        return Document(
            page_content=str(sample[self.page_content_column])
            if sample[self.page_content_column] is not None
            else "",
            metadata={
                k: v
                for k, v in sample.to_dict().items()
                if k != self.page_content_column
            },
        )
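One design note on the loader above: its `lazy_load` simply iterates over the eager `load()`, so the whole CSV is still read into memory first. A genuinely streaming variant can be sketched with only the standard-library `csv` module; the helper name `iter_rows` is ours, not part of the loader's API.

```python
import csv
import os
import tempfile
from typing import Dict, Iterator, Tuple


def iter_rows(dataset_path: str, page_content_column: str) -> Iterator[Tuple[str, Dict]]:
    """Yield (page_content, metadata) pairs one row at a time, so
    arbitrarily large CSV files never have to fit in memory."""
    with open(dataset_path, newline="") as f:
        for row in csv.DictReader(f):
            content = row.get(page_content_column) or ""
            metadata = {k: v for k, v in row.items() if k != page_content_column}
            yield content, metadata


# Tiny round-trip on a temporary file mirroring the unit-test fixture.
with tempfile.NamedTemporaryFile(
    "w", suffix=".csv", delete=False, newline=""
) as f:
    f.write("col1,col2,col3\n1,3,5\n2,4,6\n")
    path = f.name
try:
    rows = list(iter_rows(path, "col3"))
finally:
    os.remove(path)

print(rows[0])  # ('5', {'col1': '1', 'col2': '3'})
```

Unlike `pandas.read_csv`, `csv.DictReader` does not infer types, so all values here come back as strings.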
@@ -0,0 +1,50 @@
import os

import pandas as pd
import pytest
from pytest import fixture

from langchain.document_loaders.kaggle_dataset import KaggleDatasetLoader

TEST_DATASET_PATH = "./kaggle_dataset.csv"


@fixture(autouse=True)
def setup_and_tear_down():
    data = {"col1": [1, 2], "col2": [3, 4], "col3": ["5", "6"]}
    pd.DataFrame(data=data).to_csv(TEST_DATASET_PATH, index=False)
    yield
    if os.path.isfile(TEST_DATASET_PATH):
        os.remove(TEST_DATASET_PATH)


def test_raise_error_if_path_not_exist() -> None:
    assert os.path.isfile(TEST_DATASET_PATH)
    os.remove(TEST_DATASET_PATH)
    with pytest.raises(FileNotFoundError):
        loader = KaggleDatasetLoader(
            dataset_path=TEST_DATASET_PATH, page_content_column="col3"
        )
        loader.load()


def test_raise_error_if_wrong_column() -> None:
    with pytest.raises(AssertionError):
        loader = KaggleDatasetLoader(
            dataset_path=TEST_DATASET_PATH, page_content_column="wrong_column"
        )
        loader.load()


def test_success() -> None:
    loader = KaggleDatasetLoader(
        dataset_path=TEST_DATASET_PATH, page_content_column="col3"
    )
    docs = loader.load()
    assert len(docs) == 2
    assert docs[0].page_content == "5"
    assert docs[1].page_content == "6"
    assert docs[0].metadata == {"col1": 1, "col2": 3}
    assert docs[1].metadata == {"col1": 2, "col2": 4}
