Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Product Quantization #79

Merged
merged 26 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
59f17e3
Implement product quantization in lantern cli
var77 Feb 10, 2024
3a6fd65
Process all splits in parallel
var77 Feb 11, 2024
3a32b9f
Add subvector-id argument and ability to horizontally scale the runni…
var77 Feb 11, 2024
dfb41c7
Fix progress tracking for pq
var77 Feb 11, 2024
6ccc001
Fix indexing bug
var77 Feb 11, 2024
ffa9a49
Parallelize vector compression
var77 Feb 12, 2024
41c12ec
Parallelize data fetching and export
var77 Feb 12, 2024
16011ba
Refactor and separate code parts
var77 Feb 14, 2024
d2fbdf8
Refactor code, pack arguments in a struct
var77 Feb 15, 2024
606ac54
Add gcp batch job flow
var77 Feb 15, 2024
dc6ef0f
Add tests for lantern_pq
var77 Feb 16, 2024
cfa1dab
Add action to push cli image to GCR
var77 Feb 16, 2024
4cc5977
Remove unnecessary arguments
var77 Feb 16, 2024
1155b57
Rename codebook table and params to match lantern pq
var77 Feb 19, 2024
50fdb62
Fix naming issues, add --dataset-limit argument
var77 Feb 20, 2024
d3e4575
Conditionaliy publish latest tag for cli docker image
var77 Feb 20, 2024
000e628
Use renamed lantern access method
Ngalstyan4 Feb 17, 2024
92aeef0
Release v0.2.0
Ngalstyan4 Feb 17, 2024
2a342e6
Temporarily change lantern tag for testing before lantern is released
Ngalstyan4 Feb 17, 2024
70f284c
Implement pq-quantization in external index construction
Ngalstyan4 Feb 20, 2024
19b7ba8
Fix codebook offset bug
Ngalstyan4 Feb 20, 2024
5adcf10
set pq parameter in index construction when importing
Ngalstyan4 Feb 20, 2024
a467cb0
Fix codebook lifetime bug in rust<->C interface
Ngalstyan4 Feb 21, 2024
d107f81
Prepare for release
Ngalstyan4 Feb 21, 2024
dcdfcd9
Fix naming for uppercase table names, check if codebook table exists …
var77 Feb 21, 2024
2dc901f
Add pq argument for external index reindexing
var77 Feb 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions .github/workflows/publish-cli-docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@ name: publish-cli-docker
on:
workflow_dispatch:
inputs:
LATEST:
type: boolean
description: "Publish as latest release"
required: false
default: false
VERSION:
type: string
description: "CLI version"
required: true
default: "0.0.38"
default: "0.0.39"
IMAGE_NAME:
type: string
description: "Container image name to tag"
Expand Down Expand Up @@ -34,12 +39,38 @@ jobs:
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and push
- name: Login to GCR Container Registry
uses: docker/login-action@v3
with:
registry: ${{ secrets.GCP_REGION }}-docker.pkg.dev
username: _json_key_base64
password: ${{ secrets.GCP_CREDENTIALS_JSON_B64 }}
- name: Build and push without latest tags
uses: docker/build-push-action@v5
id: build_image
if: ${{ inputs.LATEST == false || inputs.LATEST == 'false' }}
with:
context: .
platforms: linux/amd64
file: Dockerfile.cli${{ (matrix.device == 'gpu' && '.cuda' || '') }}
push: true
tags: |
${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
- name: Build and push with latest tags
uses: docker/build-push-action@v5
id: build_image_latest
if: ${{ inputs.LATEST == true || inputs.LATEST == 'true' }}
with:
context: .
platforms: linux/amd64
file: Dockerfile.cli${{ (matrix.device == 'gpu' && '.cuda' || '') }}
push: true
# the :latest tag will refer to cpu version
tags: ${{ (matrix.device == 'cpu' && format('{0}:latest', inputs.IMAGE_NAME) || format('{0}:gpu', inputs.IMAGE_NAME)) }},${{ inputs.IMAGE_NAME }}:latest-${{ matrix.device }},${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
tags: |
${{ (matrix.device == 'cpu' && format('{0}:latest', inputs.IMAGE_NAME) || format('{0}:gpu', inputs.IMAGE_NAME)) }}
${{ inputs.IMAGE_NAME }}:latest-${{ matrix.device }}
${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
${{ (matrix.device == 'cpu' && format('{0}-docker.pkg.dev/{1}/{2}:latest', secrets.GCP_REGION, secrets.GCP_PROJECT_ID, inputs.IMAGE_NAME) || format('{0}-docker.pkg.dev/{1}/{2}:gpu', secrets.GCP_REGION, secrets.GCP_PROJECT_ID, inputs.IMAGE_NAME)) }}
${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/${{ inputs.IMAGE_NAME }}:latest-${{ matrix.device }}
${{ secrets.GCP_REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT_ID }}/${{ inputs.IMAGE_NAME }}:${{ inputs.VERSION }}-${{ matrix.device }}
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ members = [
"lantern_cli",
"lantern_daemon",
"lantern_index_autotune",
"lantern_pq",
]

[profile.release]
Expand Down
53 changes: 53 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -347,3 +347,56 @@ CREATE TABLE "public"."index_parameter_experiment_results" (
build_time DOUBLE PRECISION NULL
);
```

## Lantern PQ

### Description

Use external product quantization to compress table vectors using kmeans clustering.

### Usage

Run `lantern-cli pq-table --help` to show the cli options.

The job can be run both on a local instance and via GCP batch jobs, parallelizing the workload over hundreds of VMs to speed up clustering.

To run locally use:

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32
```

The job will be run on current machine utilizing all available cores.

For big datasets (over 1M rows) it is convenient to run the job using GCP batch jobs.
Make sure to have GCP credentials set-up before running this command:

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --run-on-gcp
```

If you prefer to orchestrate the task yourself on your own on-premise servers, you need to do the following 3 steps:

1. Run the setup job. This will create the necessary tables and add a `pqvec` column to the target table

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --skip-codebook-creation --skip-vector-compression
```

2. Run the clustering job. This will create a codebook for the table and export it to a postgres table

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --skip-table-setup --skip-vector-compression --parallel-task-count 10 --subvector-id 0
```

In this case the command should be run 32 times, once for each subvector in the range [0-31]. `--parallel-task-count` means that at most 10 tasks will run in parallel; this is used to avoid exceeding the max connection limit on postgres.

3. Run the compression job. This will compress the vectors using the generated codebook and export the results under the `pqvec` column

```bash
lantern-cli pq-table --uri 'postgres://[email protected]:5432/postgres' --table sift10k --column v --clusters 256 --splits 32 --skip-table-setup --skip-codebook-creation --parallel-task-count 10 --total-task-count 10 --compression-task-id 0
```

In this case the command should be run 10 times, once for each part of the codebook in the range [0-9]. `--parallel-task-count` means that at most 10 tasks will run in parallel; this is used to avoid exceeding the max connection limit on postgres.

The table must have a primary key in order for this job to work. If the primary key is different from `id`, provide it using the `--pk` argument.
5 changes: 3 additions & 2 deletions ci/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,14 +48,15 @@ function setup_postgres() {
}

function setup_lantern() {
LANTERN_VERSION=v0.1.1
LANTERN_VERSION=main
LANTERN_VERSION=narek/pgvector-compat
git clone --recursive https://github.com/lanterndata/lantern.git /tmp/lantern
pushd /tmp/lantern
git checkout ${LANTERN_VERSION} && \
git submodule update --recursive && \
mkdir build
pushd build
cmake -DUSEARCH_NO_MARCH_NATIVE=ON .. && \
cmake -DMARCH_NATIVE=OFF -DBUILD_FOR_DSTRIBUTING=1 .. && \
make install
popd
popd
Expand Down
3 changes: 2 additions & 1 deletion lantern_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lantern_cli"
version = "0.0.38"
version = "0.0.39"
edition = "2021"

[[bin]]
Expand All @@ -16,3 +16,4 @@ lantern_embeddings = { path = "../lantern_embeddings" }
lantern_daemon = { path = "../lantern_daemon" }
lantern_logger = { path = "../lantern_logger" }
lantern_index_autotune = { path = "../lantern_index_autotune" }
lantern_pq = { path = "../lantern_pq" }
3 changes: 3 additions & 0 deletions lantern_cli/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use lantern_daemon::cli::DaemonArgs;
use lantern_embeddings::cli::{EmbeddingArgs, MeasureModelSpeedArgs, ShowModelsArgs};
use lantern_external_index::cli::CreateIndexArgs;
use lantern_index_autotune::cli::IndexAutotuneArgs;
use lantern_pq::cli::PQArgs;

#[derive(Subcommand, Debug)]
pub enum Commands {
Expand All @@ -18,6 +19,8 @@ pub enum Commands {
MeasureModelSpeed(MeasureModelSpeedArgs),
/// Autotune index
AutotuneIndex(IndexAutotuneArgs),
/// Quantize table
PQTable(PQArgs),
/// Start in daemon mode
StartDaemon(DaemonArgs),
}
Expand Down
9 changes: 9 additions & 0 deletions lantern_cli/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use std::process;

use clap::Parser;
use lantern_daemon;
use lantern_embeddings;
use lantern_external_index;
use lantern_logger::{LogLevel, Logger};
use lantern_pq;
mod cli;

fn main() {
Expand Down Expand Up @@ -46,6 +49,11 @@ fn main() {
_main_logger = Some(logger.clone());
lantern_index_autotune::autotune_index(&args, None, None, Some(logger))
}
cli::Commands::PQTable(args) => {
let logger = Logger::new("Lantern PQ", LogLevel::Debug);
_main_logger = Some(logger.clone());
lantern_pq::quantize_table(args, None, None, Some(logger))
}
cli::Commands::StartDaemon(args) => {
let logger = Logger::new("Lantern Daemon", args.log_level.value());
_main_logger = Some(logger.clone());
Expand All @@ -56,5 +64,6 @@ fn main() {
let logger = _main_logger.unwrap();
if let Err(e) = res {
logger.error(&e.to_string());
process::exit(1);
}
}
1 change: 1 addition & 0 deletions lantern_daemon/src/external_index_jobs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ async fn external_index_worker(
dims: 0,
out: index_path,
remote_database: true,
pq: false,
}, progress_callback, Some(is_canceled_clone), Some(task_logger));
futures::executor::block_on(cancel_tx_clone.send(false))?;
result
Expand Down
2 changes: 1 addition & 1 deletion lantern_external_index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ clap = { version = "4.4.0", features = ["derive"] }
cxx = "1.0.106"
postgres = "0.19.7"
postgres-types = { version = "0.2.6", features = ["derive"] }
usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" }
usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch="main-lantern" }
lantern_logger = { path = "../lantern_logger" }
lantern_utils = { path = "../lantern_utils" }
rand = "0.8.5"
4 changes: 4 additions & 0 deletions lantern_external_index/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ pub struct CreateIndexArgs {
#[arg(short, long)]
pub column: String,

/// Use already created codebook to create product-quantized binary index
#[arg(short, long, default_value_t = false)]
pub pq: bool,

/// Number of neighbours for each vector
#[arg(short, default_value_t = 16)]
pub m: usize,
Expand Down
83 changes: 82 additions & 1 deletion lantern_external_index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,71 @@ pub fn create_usearch_index(
dimensions, args.m, args.ef, args.efc
));

let mut pq_codebook: *const f32 = std::ptr::null();
let mut v: Vec<f32> = vec![];
let mut num_centroids: usize = 0;
let mut num_subvectors: usize = 0;

if args.pq {
let codebook_table_name = format!(
"_codebook_{table_name}_{column_name}",
table_name = &args.table,
column_name = &args.column
);
let full_codebook_table_name =
get_full_table_name("_lantern_internal", &codebook_table_name);

let rows_codebook_exists = transaction.query("SELECT true FROM information_schema.tables WHERE table_schema='_lantern_internal' AND table_name=$1;", &[&codebook_table_name])?;

if rows_codebook_exists.len() == 0 {
anyhow::bail!("Codebook table {full_codebook_table_name} does not exist");
}

let rows_c = transaction.query(
&format!("SELECT COUNT(*) FROM {full_codebook_table_name} WHERE subvector_id = 0;"),
&[],
)?;
let rows_sv = transaction.query(
&format!("SELECT COUNT(*) FROM {full_codebook_table_name} WHERE centroid_id = 0;"),
&[],
)?;

if rows_c.len() == 0 || rows_sv.len() == 0 {
anyhow::bail!("Invalid codebook table");
}

num_centroids = rows_c.first().unwrap().get::<usize, i64>(0) as usize;
num_subvectors = rows_sv.first().unwrap().get::<usize, i64>(0) as usize;

v.resize(num_centroids * dimensions, 0.);

let rows = transaction.query(
&format!(
"SELECT subvector_id, centroid_id, c FROM _lantern_internal._codebook_{table_name}_{column_name};",
table_name = args.table,
column_name = args.column,
),
&[],
)?;

logger.info(&format!(
"Codebook has {} rows - {num_centroids} centroids and {num_subvectors} subvectors",
rows.len()
));

for r in rows {
let subvector_id: i32 = r.get(0);
let centroid_id: i32 = r.get(1);
let subvector: Vec<f32> = r.get(2);
for i in 0..subvector.len() {
v[centroid_id as usize * dimensions
+ subvector_id as usize * subvector.len()
+ i] = subvector[i];
}
}
pq_codebook = v.as_ptr();
}

let options = IndexOptions {
dimensions,
metric: args.metric_kind.value(),
Expand All @@ -150,6 +215,20 @@ pub fn create_usearch_index(
connectivity: args.m,
expansion_add: args.efc,
expansion_search: args.ef,

num_threads: 0, // automatic

// note: pq_construction and pq_output distinction is not yet implemented in usearch
// in the future, if pq_construction is false, we will use full vectors in memory (and
// require large memory for construction) but will output pq-quantized graph
//
// currently, regardless of pq_construction value, as long as pq_output is true,
// we construct a pq_quantized index using quantized values during construction
pq_construction: args.pq,
pq_output: args.pq,
num_centroids,
num_subvectors,
codebook: pq_codebook,
};
let index = Index::new(&options)?;

Expand Down Expand Up @@ -317,6 +396,7 @@ pub fn create_usearch_index(
args.efc,
dimensions,
args.m,
args.pq,
)?;
} else {
// If job is run on the same server as database we can skip copying part
Expand All @@ -331,7 +411,8 @@ pub fn create_usearch_index(
}

transaction.execute(
&format!("CREATE INDEX {idx_name} ON {table_name} USING hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table),column_name=&quote_ident(&args.column), m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions),
&format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', pq={pq}, ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table),
column_name=&quote_ident(&args.column), pq=args.pq, m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions),
&[],
)?;

Expand Down
4 changes: 3 additions & 1 deletion lantern_external_index/src/postgres_large_objects.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ impl<'a> LargeObject<'a> {
ef_construction: usize,
dim: usize,
m: usize,
pq: bool,
) -> crate::AnyhowVoidResult {
let mut transaction = self.transaction.unwrap();
transaction.execute(
Expand All @@ -57,7 +58,8 @@ impl<'a> LargeObject<'a> {
}

transaction.execute(
&format!("CREATE INDEX {idx_name} ON {table_name} USING hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=self.index_path),
&format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', pq={pq}, ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});",
index_path=self.index_path),
&[],
)?;

Expand Down
4 changes: 4 additions & 0 deletions lantern_extras/src/external_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ fn lantern_create_external_index<'a>(
m: default!(i32, 16),
ef_construction: default!(i32, 16),
ef: default!(i32, 16),
pq: default!(bool, false),
index_name: default!(&'a str, "''"),
) -> Result<(), anyhow::Error> {
validate_index_param("ef", ef, 1, 400);
Expand Down Expand Up @@ -84,6 +85,7 @@ fn lantern_create_external_index<'a>(
dims: dim as usize,
index_name,
remote_database: false,
pq,
},
None,
None,
Expand Down Expand Up @@ -111,6 +113,7 @@ mod lantern_extras {
m: i32,
ef_construction: i32,
ef: i32,
pq: default!(bool, false),
) -> Result<(), anyhow::Error> {
let index_name = index.name().to_owned();
let schema = index.namespace().to_owned();
Expand Down Expand Up @@ -152,6 +155,7 @@ mod lantern_extras {
m,
ef_construction,
ef,
pq,
&index_name,
)
}
Expand Down
Loading
Loading