georgia-tech-db · xzdandy · Aug 13, 2023 · Aug 13, 2023 · Aug 13, 2023 · Aug 13, 2023
diff --git a/benchmark/text_summarization/README.md b/benchmark/text_summarization/README.md
@@ -0,0 +1 @@
+Check the [documentation](https://evadb.readthedocs.io/en/latest/source/benchmarks/text_summarization.html) for instructions.
diff --git a/benchmark/text_summarization/download_dataset.sh b/benchmark/text_summarization/download_dataset.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# Fetch the CSV; the URL is quoted so '&' is not run as a shell operator, and dl=1 requests the raw file.
+wget -qO cnn_news_test.csv "https://www.dropbox.com/scl/fi/xwzu1rvsrdhnbt38wslnn/test.csv?rlkey=ijfia0xnyyyn5jls2epwq0w1k&dl=1"
diff --git a/benchmark/text_summarization/text_summarization_with_evadb.py b/benchmark/text_summarization/text_summarization_with_evadb.py
@@ -0,0 +1,39 @@
+import evadb
+import time
+
+cursor = evadb.connect().cursor()
+
+
+cursor.query("DROP TABLE IF EXISTS cnn_news_test;").df()
+cursor.query("""
+    CREATE TABLE IF NOT EXISTS cnn_news_test(
+        id TEXT(128),
+        article TEXT(4096),
+        highlights TEXT(1024)
+    );""").df()
+cursor.load('./cnn_news_test.csv', 'cnn_news_test', format="CSV").df()
+
+cursor.query("DROP UDF IF EXISTS TextSummarizer;").df()
+cursor.query("""CREATE UDF IF NOT EXISTS TextSummarizer
+                TYPE HuggingFace
+                'task' 'summarization'
+                'model' 'sshleifer/distilbart-cnn-12-6'
+                'min_length' 5
+                'max_length' 100;""").df()
+
+
+cursor.query("DROP TABLE IF EXISTS cnn_news_summary;").df()
+
+cursor._evadb.config.update_value("executor", "batch_mem_size", 300000)
+cursor._evadb.config.update_value("executor", "gpu_ids", [0,1])
+cursor._evadb.config.update_value("experimental", "ray", True)
+
+start_time = time.perf_counter()
+cursor.query("""
+    CREATE TABLE IF NOT EXISTS cnn_news_summary AS
+    SELECT TextSummarizer(article) FROM cnn_news_test;""").df()
+end_time = time.perf_counter()
+print(f"{end_time-start_time:.2f} seconds")
+
+
+
diff --git a/docs/_toc.yml b/docs/_toc.yml
@@ -61,6 +61,11 @@ parts:
       - file: source/overview/docker
         title: Docker
 
+  - caption: Benchmarks
+    chapters:
+      - file: source/benchmarks/text_summarization.rst
+        title: Text Summarization
+
   - caption: Developer Guide
     chapters:
       - file: source/contribute/index

diff --git a/docs/source/benchmarks/text_summarization.rst b/docs/source/benchmarks/text_summarization.rst
@@ -0,0 +1,111 @@
+Text summarization benchmark
+============================
+In this benchmark, we compare the performance of text summarization between EvaDB and MindsDB on `CNN-DailyMail News <https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail>`_.
+
+1. Prepare dataset
+------------------
+
+.. code-block:: bash
+
+   cd benchmark/text_summarization
+   bash download_dataset.sh
+
+2. Using EvaDB to summarize the CNN DailyMail News
+--------------------------------------------------
+
+.. note::
+
+   Install ray in your EvaDB virtual environment. ``pip install "ray>=1.13.0,<2.5.0"``
+
+.. code-block:: bash
+
+   cd benchmark/text_summarization
+   python text_summarization_with_evadb.py
+
+
+3. Using MindsDB to summarize the CNN DailyMail News
+----------------------------------------------------
+
+.. _sqlite database:
+
+Prepare sqlite database for MindsDB
+***********************************
+
+.. code-block:: bash
+
+   sqlite3 cnn_news_test.db
+   > .mode csv
+   > .import cnn_news_test.csv cnn_news_test
+   > .exit
+
+
+Install MindsDB
+***************
+Follow the `Setup for Source Code via pip <https://docs.mindsdb.com/setup/self-hosted/pip/source>`_ to install MindsDB.
+
+.. note::
+
+   At the time of this documentation, we need to manually ``pip install evaluate`` for the HuggingFace model to work in MindsDB.
+
+After the installation, we use mysql cli to connect to MindsDB. Replace the port number as needed.
+
+.. code-block:: bash
+
+   mysql -h 127.0.0.1 --port 47335 -u mindsdb -p
+
+Run Experiment
+**************
+
+Connect the sqlite database we created before: :ref:`sqlite database`.
+
+.. code-block:: sql
+
+   CREATE DATABASE sqlite_datasource
+   WITH ENGINE = 'sqlite',
+   PARAMETERS = {
+     "db_file": "cnn_news_test.db"
+   };
+
+Create text summarization model and wait for its readiness.
+
+.. code-block:: sql
+
+   CREATE MODEL mindsdb.hf_bart_sum_20
+   PREDICT PRED
+   USING
+   engine = 'huggingface',
+   task = 'summarization',
+   model_name = 'sshleifer/distilbart-cnn-12-6',
+   input_column = 'article',
+   min_output_length = 5,
+   max_output_length = 100;
+
+   DESCRIBE mindsdb.hf_bart_sum_20;
+
+Use the model to summarize the CNN DailyMail news.
+
+.. code-block:: sql
+
+   CREATE OR REPLACE TABLE sqlite_datasource.cnn_news_summary (
+     SELECT PRED
+     FROM mindsdb.hf_bart_sum_20
+     JOIN sqlite_datasource.cnn_news_test
+   );
+
+
+4. Experiment results
+---------------------
+Below are numbers from a server with 56 Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz and two Quadro P6000 GPUs.
+
+.. list-table:: Text summarization with ``sshleifer/distilbart-cnn-12-6`` on CNN-DailyMail News
+
+   * -
+     - MindsDB
+     - EvaDB (off-the-shelf)
+     - EvaDB (full GPU utilization)
+   * - Time
+     - 4 hours 45 mins 47.56 secs
+     - 1 hour 9 mins 39.8 secs
+     - 42 mins 50.22 secs
+
+