
Product Quantization #79

Merged (26 commits, Feb 21, 2024)
Commits
59f17e3 Implement product quantization in lantern cli (var77, Feb 10, 2024)
3a6fd65 Process all splits in parallel (var77, Feb 11, 2024)
3a32b9f Add subvector-id argument and ability to horizontally scale the runni… (var77, Feb 11, 2024)
dfb41c7 Fix progress tracking for pq (var77, Feb 11, 2024)
6ccc001 Fix indexing bug (var77, Feb 11, 2024)
ffa9a49 Parallelize vector compression (var77, Feb 12, 2024)
41c12ec Parallelize data fetching and export (var77, Feb 12, 2024)
16011ba Refactor and separate code parts (var77, Feb 14, 2024)
d2fbdf8 Refactor code, pack arguments in a struct (var77, Feb 15, 2024)
606ac54 Add gcp batch job flow (var77, Feb 15, 2024)
dc6ef0f Add tests for lantern_pq (var77, Feb 16, 2024)
cfa1dab Add action to push cli image to GCR (var77, Feb 16, 2024)
4cc5977 Remove unnecessary arguments (var77, Feb 16, 2024)
1155b57 Rename codebook table and params to match lantern pq (var77, Feb 19, 2024)
50fdb62 Fix naming issues, add --dataset-limit argument (var77, Feb 20, 2024)
d3e4575 Conditionally publish latest tag for cli docker image (var77, Feb 20, 2024)
000e628 Use renamed lantern access method (Ngalstyan4, Feb 17, 2024)
92aeef0 Release v0.2.0 (Ngalstyan4, Feb 17, 2024)
2a342e6 Temporarily change lantern tag for testing before lantern is released (Ngalstyan4, Feb 17, 2024)
70f284c Implement pq-quantization in external index construction (Ngalstyan4, Feb 20, 2024)
19b7ba8 Fix codebook offset bug (Ngalstyan4, Feb 20, 2024)
5adcf10 Set pq parameter in index construction when importing (Ngalstyan4, Feb 20, 2024)
a467cb0 Fix codebook lifetime bug in rust<->C interface (Ngalstyan4, Feb 21, 2024)
d107f81 Prepare for release (Ngalstyan4, Feb 21, 2024)
dcdfcd9 Fix naming for uppercase table names, check if codebook table exists … (var77, Feb 21, 2024)
2dc901f Add pq argument for external index reindexing (var77, Feb 21, 2024)
37 changes: 34 additions & 3 deletions .github/workflows/publish-cli-docker.yaml
@@ -2,11 +2,16 @@ name: publish-cli-docker
on:
workflow_dispatch:
inputs:
LATEST:
type: boolean
description: "Publish as latest release"
required: false
default: false
VERSION:
type: string
description: "CLI version"
required: true
default: "0.0.38"
default: "0.0.39"
IMAGE_NAME:
type: string
description: "Container image name to tag"
@@ -34,12 +39,38 @@ jobs:
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and push
- name: Login to GCR Container Registry
uses: docker/login-action@v3
with:
registry: ${{ secrets.GCP_REGION }}-docker.pkg.dev
username: _json_key_base64
password: ${{ secrets.GCP_CREDENTIALS_JSON_B64 }}
- name: Build and push without latest tags
uses: docker/build-push-action@v5
id: build_image
if: ${{ inputs.LATEST == false || inputs.LATEST == 'false' }}
with:
context: .
platforms: linux/amd64
file: Dockerfile.cli${{ (matrix.device == 'gpu' && '.cuda' || '') }}
push: true
tags: |
${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
- name: Build and push with latest tags
uses: docker/build-push-action@v5
id: build_image_latest
if: ${{ inputs.LATEST == true || inputs.LATEST == 'true' }}
with:
context: .
platforms: linux/amd64
file: Dockerfile.cli${{ (matrix.device == 'gpu' && '.cuda' || '') }}
push: true
# the :latest tag will refer to cpu version
tags: ${{ (matrix.device == 'cpu' && format('{0}:latest', inputs.IMAGE_NAME) || format('{0}:gpu', inputs.IMAGE_NAME)) }},${{ inputs.IMAGE_NAME }}:latest-${{ matrix.device }},${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
tags: |
${{ (matrix.device == 'cpu' && format('{0}:latest', inputs.IMAGE_NAME) || format('{0}:gpu', inputs.IMAGE_NAME)) }}
${{ inputs.IMAGE_NAME }}:latest-${{ matrix.device }}
${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
${{ (matrix.device == 'cpu' && format('{0}-docker.pkg.dev/{1}/{2}:latest', secrets.GCP_REGION, secrets.GCP_PROJECT_ID, inputs.IMAGE_NAME) || format('{0}-docker.pkg.dev/{1}/{2}:gpu', secrets.GCP_REGION, secrets.GCP_PROJECT_ID, inputs.IMAGE_NAME)) }}
${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/${{ inputs.IMAGE_NAME }}:latest-${{ matrix.device }}
${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
1 change: 1 addition & 0 deletions Cargo.toml
@@ -13,6 +13,7 @@ members = [
"lantern_cli",
"lantern_daemon",
"lantern_index_autotune",
"lantern_pq",
]

[profile.release]
53 changes: 53 additions & 0 deletions README.md
@@ -347,3 +347,56 @@ CREATE TABLE "public"."index_parameter_experiment_results" (
build_time DOUBLE PRECISION NULL
);
```

## Lantern PQ

### Description

Use external product quantization to compress table vectors using k-means clustering.
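
For example, with `--splits 32` and `--clusters 256`, a 128-dimensional vector is divided into 32 subvectors of 4 dimensions each; k-means finds 256 centroids for each subvector position, and each subvector is then stored as the id of its nearest centroid, so a vector is represented by roughly 32 one-byte codes instead of 128 floats.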

### Usage

Run `lantern-cli pq-table --help` to see the available CLI options.

The job can be run either on a local machine or using GCP Batch jobs, which parallelize the workload over hundreds of VMs to speed up clustering.

To run locally, use:

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32
```

The job will run on the current machine, utilizing all available cores.

For large datasets (over 1M rows) it is more convenient to run the job using GCP Batch jobs.
Make sure you have GCP credentials set up before running this command:

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --run-on-gcp
```

If you prefer to orchestrate the tasks yourself on your own on-premise servers, follow these 3 steps:

1. Run the setup job. This will create the necessary tables and add a `pqvec` column to the target table.

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --skip-codebook-creation --skip-vector-compression
```

2. Run the clustering job. This will create a codebook for the table and export it to a Postgres table.

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --skip-table-setup --skip-vector-compression --parallel-task-count 10 --subvector-id 0
```

In this case the command should be run 32 times, once for each subvector in the range [0-31]; `--parallel-task-count` means that at most 10 tasks will run in parallel, which keeps the job under the Postgres max connection limit (see the orchestration sketch after these steps).

3. Run the compression job. This will compress the vectors using the generated codebook and export the results into the `pqvec` column.

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --skip-table-setup --skip-codebook-creation --parallel-task-count 10 --total-task-count 10 --compression-task-id 0
```

In this case the command should be run 10 times, once for each compression task id in the range [0-9]; `--parallel-task-count` again caps the number of tasks running in parallel at 10 to avoid exceeding the Postgres max connection limit.

The table should have a primary key for this job to work. If the primary key is not named `id`, provide it using the `--pk` argument.
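
As an illustration, below is a minimal orchestration sketch for steps 2 and 3, assuming all jobs are launched sequentially from a single machine with the same example flags as above (in practice they can be distributed across separate servers):

```bash
# Illustrative sketch only: one clustering job per subvector (step 2),
# then one compression job per task id (step 3).
URI='postgres://[email protected]:5432/postgres'

for i in $(seq 0 31); do
  lantern-cli pq-table --uri "$URI" --table sift10k --column v \
    --clusters 256 --splits 32 \
    --skip-table-setup --skip-vector-compression \
    --parallel-task-count 10 --subvector-id "$i"
done

for i in $(seq 0 9); do
  lantern-cli pq-table --uri "$URI" --table sift10k --column v \
    --clusters 256 --splits 32 \
    --skip-table-setup --skip-codebook-creation \
    --parallel-task-count 10 --total-task-count 10 --compression-task-id "$i"
done
```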
2 changes: 1 addition & 1 deletion ci/scripts/build.sh
@@ -48,7 +48,7 @@ function setup_postgres() {
}

function setup_lantern() {
LANTERN_VERSION=v0.1.1
LANTERN_VERSION=main
git clone --recursive https://github.com/lanterndata/lantern.git /tmp/lantern
pushd /tmp/lantern
git checkout ${LANTERN_VERSION} && \
3 changes: 2 additions & 1 deletion lantern_cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "lantern_cli"
version = "0.0.38"
version = "0.0.39"
edition = "2021"

[[bin]]
@@ -16,3 +16,4 @@ lantern_embeddings = { path = "../lantern_embeddings" }
lantern_daemon = { path = "../lantern_daemon" }
lantern_logger = { path = "../lantern_logger" }
lantern_index_autotune = { path = "../lantern_index_autotune" }
lantern_pq = { path = "../lantern_pq" }
3 changes: 3 additions & 0 deletions lantern_cli/src/cli.rs
@@ -3,6 +3,7 @@ use lantern_daemon::cli::DaemonArgs;
use lantern_embeddings::cli::{EmbeddingArgs, MeasureModelSpeedArgs, ShowModelsArgs};
use lantern_external_index::cli::CreateIndexArgs;
use lantern_index_autotune::cli::IndexAutotuneArgs;
use lantern_pq::cli::PQArgs;

#[derive(Subcommand, Debug)]
pub enum Commands {
@@ -18,6 +19,8 @@ pub enum Commands {
MeasureModelSpeed(MeasureModelSpeedArgs),
/// Autotune index
AutotuneIndex(IndexAutotuneArgs),
/// Quantize table
PQTable(PQArgs),
/// Start in daemon mode
StartDaemon(DaemonArgs),
}
9 changes: 9 additions & 0 deletions lantern_cli/src/main.rs
@@ -1,8 +1,11 @@
use std::process;

use clap::Parser;
use lantern_daemon;
use lantern_embeddings;
use lantern_external_index;
use lantern_logger::{LogLevel, Logger};
use lantern_pq;
mod cli;

fn main() {
@@ -46,6 +49,11 @@ fn main() {
_main_logger = Some(logger.clone());
lantern_index_autotune::autotune_index(&args, None, None, Some(logger))
}
cli::Commands::PQTable(args) => {
let logger = Logger::new("Lantern PQ", LogLevel::Debug);
_main_logger = Some(logger.clone());
lantern_pq::quantize_table(args, None, None, Some(logger))
}
cli::Commands::StartDaemon(args) => {
let logger = Logger::new("Lantern Daemon", args.log_level.value());
_main_logger = Some(logger.clone());
@@ -56,5 +64,6 @@
let logger = _main_logger.unwrap();
if let Err(e) = res {
logger.error(&e.to_string());
process::exit(1);
}
}
27 changes: 27 additions & 0 deletions lantern_pq/Cargo.toml
@@ -0,0 +1,27 @@
[package]
name = "lantern_pq"
version = "0.0.1"
edition = "2021"

[lib]
crate-type = ["lib"]

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.4.0", features = ["derive"] }
anyhow = "1.0.75"
postgres = "0.19.7"
lantern_logger = { path = "../lantern_logger" }
lantern_utils = { path = "../lantern_utils" }
rand = "0.8.5"
linfa-clustering = { version = "0.7.0", features = ["ndarray-linalg"] }
linfa = "0.7.0"
ndarray = { version = "0.15.6", features = ["rayon"] }
rayon = "1.8.1"
md5 = "0.7.0"
isahc = "1.7.2"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0.111"
gcp_auth = "0.10.0"
tokio = { version = "1.36.0", features = ["rt", "rt-multi-thread"] }
125 changes: 125 additions & 0 deletions lantern_pq/src/cli.rs
@@ -0,0 +1,125 @@
use clap::Parser;

#[derive(Parser, Debug)]
#[command(version, about, long_about = None)]
pub struct PQArgs {
/// Fully associated database connection string including db name
#[arg(short, long)]
pub uri: String,

/// Table name
#[arg(short, long)]
pub table: String,

/// Schema name
#[arg(short, long, default_value = "public")]
pub schema: String,

/// Column name to quantize
#[arg(short, long)]
pub column: String,

/// Name for codebook table
#[arg(long)]
pub codebook_table_name: Option<String>,

/// Dataset limit. Limit should be greater or equal to cluster count
#[arg(long)]
pub dataset_limit: Option<usize>,

/// Cluster count for kmeans
#[arg(long, default_value_t = 256)]
pub clusters: usize,

/// Subvector count to split vector
#[arg(long, default_value_t = 1)]
pub splits: usize,

/// Subvector part to process
#[arg(long)]
pub subvector_id: Option<usize>,

/// If true, codebook table will not be created and pq column will not be added to table. So
/// they should be set up externally
#[arg(long, default_value_t = false)]
pub skip_table_setup: bool,

/// If true vectors will not be quantized and exported to the table
#[arg(long, default_value_t = false)]
pub skip_vector_quantization: bool,

/// If true codebook will not be created
#[arg(long, default_value_t = false)]
pub skip_codebook_creation: bool,

/// Primary key of the table, needed for quantization job
#[arg(long, default_value = "id")]
pub pk: String,

/// Number of total tasks running (used in gcp batch jobs)
#[arg(long)]
pub total_task_count: Option<usize>,

/// Number of tasks running in parallel (used in gcp batch jobs)
#[arg(long)]
pub parallel_task_count: Option<usize>,

/// Task id of currently running quantization job (used in gcp batch jobs)
#[arg(long)]
pub quantization_task_id: Option<usize>,

// GCP ARGS
/// If true job will be submitted to gcp
#[arg(long, default_value_t = false)]
pub run_on_gcp: bool,

/// Image tag to use for GCR. example: 0.0.38-cpu
#[arg(long)]
pub gcp_cli_image_tag: Option<String>,

/// GCP project ID
#[arg(long)]
pub gcp_project: Option<String>,

/// GCP region. Default: us-central1
#[arg(long)]
pub gcp_region: Option<String>,

/// Full GCR image name. default: {gcp_region}-docker.pkg.dev/{gcp_project_id}/lanterndata/lantern-cli:{gcp_cli_image_tag}
#[arg(long)]
pub gcp_image: Option<String>,

/// Task count for quantization. default: calculated automatically based on dataset size
#[arg(long)]
pub gcp_quantization_task_count: Option<usize>,

/// Parallel tasks for quantization. default: calculated automatically based on
/// max connections
#[arg(long)]
pub gcp_quantization_task_parallelism: Option<usize>,

/// Parallel tasks for quantization. default: calculated automatically based on
/// max connections and dataset size
#[arg(long)]
pub gcp_clustering_task_parallelism: Option<usize>,

/// If image is hosted on GCR this will speed up the VM startup time
#[arg(long, default_value_t = true)]
pub gcp_enable_image_streaming: bool,

/// CPU count for one VM in clustering task. default: calculated based on dataset size
#[arg(long)]
pub gcp_clustering_cpu: Option<usize>,

/// Memory GB for one VM in clustering task. default: calculated based on CPU count
#[arg(long)]
pub gcp_clustering_memory_gb: Option<usize>,

/// CPU count for one VM in quantization task. default: calculated based on dataset size
#[arg(long)]
pub gcp_quantization_cpu: Option<usize>,

/// Memory GB for one VM in quantization task. default: calculated based on CPU count
#[arg(long)]
pub gcp_quantization_memory_gb: Option<usize>,
}