From 0a64ce5baefa0a6fcd2372236ee6f501b224bc3e Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Sat, 17 Feb 2024 21:04:43 +0000 Subject: [PATCH 1/9] Use renamed lantern access method --- lantern_external_index/src/lib.rs | 2 +- lantern_external_index/src/postgres_large_objects.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lantern_external_index/src/lib.rs b/lantern_external_index/src/lib.rs index 6206918..ad6393c 100644 --- a/lantern_external_index/src/lib.rs +++ b/lantern_external_index/src/lib.rs @@ -331,7 +331,7 @@ pub fn create_usearch_index( } transaction.execute( - &format!("CREATE INDEX {idx_name} ON {table_name} USING hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table),column_name="e_ident(&args.column), m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions), + &format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table),column_name="e_ident(&args.column), m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions), &[], )?; diff --git a/lantern_external_index/src/postgres_large_objects.rs b/lantern_external_index/src/postgres_large_objects.rs index 93992b2..3a9a0f1 100644 --- a/lantern_external_index/src/postgres_large_objects.rs +++ b/lantern_external_index/src/postgres_large_objects.rs @@ -57,7 +57,7 @@ impl<'a> LargeObject<'a> { } transaction.execute( - &format!("CREATE INDEX {idx_name} ON {table_name} USING hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=self.index_path), + &format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=self.index_path), &[], )?; From 9683724b41cb136875dd7473177335fe20de1298 Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Sat, 17 Feb 2024 21:08:57 +0000 Subject: [PATCH 2/9] Release v0.2.0 --- lantern_extras/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lantern_extras/Cargo.toml b/lantern_extras/Cargo.toml index e59e5d6..df07d20 100644 --- a/lantern_extras/Cargo.toml +++ b/lantern_extras/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lantern_extras" -version = "0.1.2" +version = "0.2.0" edition = "2021" [lib] From bf947ad40ba687d5b03845229a644932f8b000eb Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Sat, 17 Feb 2024 21:09:16 +0000 Subject: [PATCH 3/9] Temporarily change lantern tag for testing before lantern is released --- ci/scripts/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/scripts/build.sh b/ci/scripts/build.sh index 04970d7..246c9d1 100755 --- a/ci/scripts/build.sh +++ b/ci/scripts/build.sh @@ -49,6 +49,7 @@ function setup_postgres() { function setup_lantern() { LANTERN_VERSION=main + LANTERN_VERSION=narek/pgvector-compat git clone --recursive https://github.com/lanterndata/lantern.git /tmp/lantern pushd /tmp/lantern git checkout ${LANTERN_VERSION} && \ From 4b540b3a45a56d48f0c6b072d32a3b74b7997d78 Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Tue, 20 Feb 2024 08:35:32 +0000 Subject: [PATCH 4/9] Implement pq-quantization in external index construction --- lantern_daemon/src/external_index_jobs.rs | 1 + lantern_external_index/Cargo.toml | 3 +- lantern_external_index/src/cli.rs | 4 ++ lantern_external_index/src/lib.rs | 69 +++++++++++++++++++++++ lantern_index_autotune/src/lib.rs | 2 + 5 files changed, 78 insertions(+), 1 deletion(-) diff --git a/lantern_daemon/src/external_index_jobs.rs b/lantern_daemon/src/external_index_jobs.rs index 1ed5669..d327880 100644 --- a/lantern_daemon/src/external_index_jobs.rs +++ b/lantern_daemon/src/external_index_jobs.rs @@ -135,6 +135,7 @@ async fn external_index_worker( dims: 0, out: index_path, remote_database: true, + pq: false, }, progress_callback, Some(is_canceled_clone), Some(task_logger)); futures::executor::block_on(cancel_tx_clone.send(false))?; result diff --git a/lantern_external_index/Cargo.toml b/lantern_external_index/Cargo.toml index 2908ac3..bd9855a 100644 --- a/lantern_external_index/Cargo.toml +++ b/lantern_external_index/Cargo.toml @@ -13,7 +13,8 @@ clap = { version = "4.4.0", features = ["derive"] } cxx = "1.0.106" postgres = "0.19.7" postgres-types = { version = "0.2.6", features = ["derive"] } -usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" } +# usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" } +usearch = {path = "../../third_party/usearch/"} lantern_logger = { path = "../lantern_logger" } lantern_utils = { path = "../lantern_utils" } rand = "0.8.5" diff --git a/lantern_external_index/src/cli.rs b/lantern_external_index/src/cli.rs index 1e9a8b2..ee08046 100644 --- a/lantern_external_index/src/cli.rs +++ b/lantern_external_index/src/cli.rs @@ -113,6 +113,10 @@ pub struct CreateIndexArgs { #[arg(short, long)] pub column: String, + /// Use already created codebook to create product-quantized binary index + #[arg(short, long, default_value_t = false)] + pub pq: bool, + /// Number of neighbours for each vector #[arg(short, default_value_t = 16)] pub m: usize, diff --git a/lantern_external_index/src/lib.rs b/lantern_external_index/src/lib.rs index ad6393c..d8d7402 100644 --- a/lantern_external_index/src/lib.rs +++ b/lantern_external_index/src/lib.rs @@ -114,6 +114,7 @@ pub fn create_usearch_index( let full_table_name = get_full_table_name(&args.schema, &args.table); transaction.execute("SET lock_timeout='5s'", &[])?; + //todo:: ask-Varik: why is this necessary? transaction.execute( &format!("LOCK TABLE ONLY {full_table_name} IN SHARE MODE"), &[], @@ -142,6 +143,60 @@ pub fn create_usearch_index( dimensions, args.m, args.ef, args.efc )); + let mut pq_codebook: *const f32 = std::ptr::null(); + let mut num_centroids: usize = 0; + let mut num_subvectors: usize = 0; + + if args.pq { + let rows_c = transaction.query( + &format!( + "SELECT count(*) FROM _lantern_internal._codebook_{table_name}_{column_name} WHERE subvector_id = 0;", + table_name = args.table, + column_name = args.column, + ), + &[], + )?; + let rows_sv = transaction.query( + &format!( + "SELECT count(*) FROM _lantern_internal._codebook_{table_name}_{column_name} WHERE centroid_id = 0;", + table_name = args.table, + column_name = args.column, + ), + &[], + )?; + + if rows_c.len() == 0 || rows_sv.len() == 0 { + anyhow::bail!("Invalid codebook table"); + } + + num_centroids = rows_c.first().unwrap().get::(0) as usize; + num_subvectors = rows_sv.first().unwrap().get::(0) as usize; + + let rows = transaction.query( + &format!( + "SELECT subvector_id, centroid_id, c FROM _lantern_internal._codebook_{table_name}_{column_name};", + table_name = args.table, + column_name = args.column, + ), + &[], + )?; + let mut v = vec![0.; num_centroids * dimensions]; + pq_codebook = v.as_ptr(); + logger.info(&format!( + "codebook has {} rows - {num_centroids} centroids and {num_subvectors} subvectors", + rows.len() + )); + + for r in rows { + let subvector_id: i32 = r.get(0); + let centroid_id: i32 = r.get(1); + let subvector: Vec = r.get(2); + for i in 0..subvector.len() { + v[centroid_id as usize * dimensions + subvector_id as usize + i] = subvector[i]; + } + } + } + let options = IndexOptions { dimensions, metric: args.metric_kind.value(), @@ -150,6 +205,20 @@ pub fn create_usearch_index( connectivity: args.m, expansion_add: args.efc, expansion_search: args.ef, + + num_threads: 0, // automatic + + // note: pq_construction and pq_output distinction is not yet implemented in usearch + // in the future, if pq_construction is false, we will use full vectors in memory (and + // require large memory for construction) but will output pq-quantized graph + // + // currently, regardless of pq_construction value, as long as pq_output is true, + // we construct a pq_quantized index using quantized values during construction + pq_construction: args.pq, + pq_output: args.pq, + num_centroids, + num_subvectors, + codebook: pq_codebook, }; let index = Index::new(&options)?; diff --git a/lantern_index_autotune/src/lib.rs b/lantern_index_autotune/src/lib.rs index 9194de8..ee6e4a1 100644 --- a/lantern_index_autotune/src/lib.rs +++ b/lantern_index_autotune/src/lib.rs @@ -446,6 +446,7 @@ pub fn autotune_index( dims: column_dims as usize, index_name: Some(index_name.clone()), remote_database: true, + pq: false, }, None, Some(is_canceled.clone()), @@ -531,6 +532,7 @@ pub fn autotune_index( dims: column_dims as usize, index_name: None, remote_database: true, + pq: false, }, None, Some(is_canceled.clone()), From 786c6cb7a2051f57de0b79b71498f3202554eed8 Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Tue, 20 Feb 2024 10:14:55 +0000 Subject: [PATCH 5/9] Fix codebook offset bug --- lantern_external_index/src/lib.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lantern_external_index/src/lib.rs b/lantern_external_index/src/lib.rs index d8d7402..3908511 100644 --- a/lantern_external_index/src/lib.rs +++ b/lantern_external_index/src/lib.rs @@ -180,7 +180,7 @@ pub fn create_usearch_index( ), &[], )?; - let mut v = vec![0.; num_centroids * dimensions]; + let mut v: Vec = vec![0.; num_centroids * dimensions]; pq_codebook = v.as_ptr(); logger.info(&format!( "codebook has {} rows - {num_centroids} centroids and {num_subvectors} subvectors", @@ -192,7 +192,9 @@ pub fn create_usearch_index( let centroid_id: i32 = r.get(1); let subvector: Vec = r.get(2); for i in 0..subvector.len() { - v[centroid_id as usize * dimensions + subvector_id as usize + i] = subvector[i]; + v[centroid_id as usize * dimensions + + subvector_id as usize * subvector.len() + + i] = subvector[i]; } } } From 5b4d024a90bfa9b8bfc0b938599e5e08feaf15eb Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Tue, 20 Feb 2024 10:15:18 +0000 Subject: [PATCH 6/9] set pq parameter in index construction when importing --- lantern_external_index/src/lib.rs | 4 +++- lantern_external_index/src/postgres_large_objects.rs | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lantern_external_index/src/lib.rs b/lantern_external_index/src/lib.rs index 3908511..73376b3 100644 --- a/lantern_external_index/src/lib.rs +++ b/lantern_external_index/src/lib.rs @@ -388,6 +388,7 @@ pub fn create_usearch_index( args.efc, dimensions, args.m, + args.pq, )?; } else { // If job is run on the same server as database we can skip copying part @@ -402,7 +403,8 @@ pub fn create_usearch_index( } transaction.execute( - &format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table),column_name="e_ident(&args.column), m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions), + &format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', pq={pq}, ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table), + column_name="e_ident(&args.column), pq=args.pq, m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions), &[], )?; diff --git a/lantern_external_index/src/postgres_large_objects.rs b/lantern_external_index/src/postgres_large_objects.rs index 3a9a0f1..4c26dd2 100644 --- a/lantern_external_index/src/postgres_large_objects.rs +++ b/lantern_external_index/src/postgres_large_objects.rs @@ -41,6 +41,7 @@ impl<'a> LargeObject<'a> { ef_construction: usize, dim: usize, m: usize, + pq: bool, ) -> crate::AnyhowVoidResult { let mut transaction = self.transaction.unwrap(); transaction.execute( @@ -57,7 +58,8 @@ impl<'a> LargeObject<'a> { } transaction.execute( - &format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=self.index_path), + &format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', pq={pq}, ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", + index_path=self.index_path), &[], )?; From ccbe33c996fa57b61f948242fa92436559eeb9bc Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Wed, 21 Feb 2024 05:59:23 +0000 Subject: [PATCH 7/9] Fix codebook lifetime bug in rust<->C interface --- lantern_external_index/src/lib.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lantern_external_index/src/lib.rs b/lantern_external_index/src/lib.rs index 73376b3..bc3a542 100644 --- a/lantern_external_index/src/lib.rs +++ b/lantern_external_index/src/lib.rs @@ -144,6 +144,7 @@ pub fn create_usearch_index( )); let mut pq_codebook: *const f32 = std::ptr::null(); + let mut v: Vec = vec![]; let mut num_centroids: usize = 0; let mut num_subvectors: usize = 0; @@ -172,6 +173,8 @@ pub fn create_usearch_index( num_centroids = rows_c.first().unwrap().get::(0) as usize; num_subvectors = rows_sv.first().unwrap().get::(0) as usize; + v.resize(num_centroids * dimensions, 0.); + let rows = transaction.query( &format!( "SELECT subvector_id, centroid_id, c FROM _lantern_internal._codebook_{table_name}_{column_name};", @@ -180,8 +183,6 @@ pub fn create_usearch_index( ), &[], )?; - let mut v: Vec = vec![0.; num_centroids * dimensions]; - pq_codebook = v.as_ptr(); logger.info(&format!( "codebook has {} rows - {num_centroids} centroids and {num_subvectors} subvectors", rows.len() @@ -197,6 +198,7 @@ pub fn create_usearch_index( + i] = subvector[i]; } } + pq_codebook = v.as_ptr(); } let options = IndexOptions { From 93126c8733335db0a6c224027550b9ff27ab7975 Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Wed, 21 Feb 2024 07:35:43 +0000 Subject: [PATCH 8/9] Prepare for release --- lantern_external_index/Cargo.toml | 3 +-- lantern_external_index/src/lib.rs | 1 - lantern_extras/Cargo.toml | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lantern_external_index/Cargo.toml b/lantern_external_index/Cargo.toml index bd9855a..2908ac3 100644 --- a/lantern_external_index/Cargo.toml +++ b/lantern_external_index/Cargo.toml @@ -13,8 +13,7 @@ clap = { version = "4.4.0", features = ["derive"] } cxx = "1.0.106" postgres = "0.19.7" postgres-types = { version = "0.2.6", features = ["derive"] } -# usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" } -usearch = {path = "../../third_party/usearch/"} +usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" } lantern_logger = { path = "../lantern_logger" } lantern_utils = { path = "../lantern_utils" } rand = "0.8.5" diff --git a/lantern_external_index/src/lib.rs b/lantern_external_index/src/lib.rs index bc3a542..926fad0 100644 --- a/lantern_external_index/src/lib.rs +++ b/lantern_external_index/src/lib.rs @@ -114,7 +114,6 @@ pub fn create_usearch_index( let full_table_name = get_full_table_name(&args.schema, &args.table); transaction.execute("SET lock_timeout='5s'", &[])?; - //todo:: ask-Varik: why is this necessary? transaction.execute( &format!("LOCK TABLE ONLY {full_table_name} IN SHARE MODE"), &[], diff --git a/lantern_extras/Cargo.toml b/lantern_extras/Cargo.toml index df07d20..f70ea44 100644 --- a/lantern_extras/Cargo.toml +++ b/lantern_extras/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lantern_extras" -version = "0.2.0" +version = "0.1.3" edition = "2021" [lib] From 09be32c1e4f7765e847f54db06692b268e697a7f Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Wed, 21 Feb 2024 13:08:31 +0400 Subject: [PATCH 9/9] Fix naming for uppercase table names, check if codebook table exists before processing --- ci/scripts/build.sh | 2 +- lantern_external_index/Cargo.toml | 2 +- lantern_external_index/src/lib.rs | 29 ++++++++++++++++++----------- lantern_extras/Cargo.toml | 2 +- 4 files changed, 21 insertions(+), 14 deletions(-) diff --git a/ci/scripts/build.sh b/ci/scripts/build.sh index 246c9d1..9283879 100755 --- a/ci/scripts/build.sh +++ b/ci/scripts/build.sh @@ -56,7 +56,7 @@ function setup_lantern() { git submodule update --recursive && \ mkdir build pushd build - cmake -DUSEARCH_NO_MARCH_NATIVE=ON .. && \ + cmake -DMARCH_NATIVE=OFF -DBUILD_FOR_DSTRIBUTING=1 .. && \ make install popd popd diff --git a/lantern_external_index/Cargo.toml b/lantern_external_index/Cargo.toml index 2908ac3..5a65aed 100644 --- a/lantern_external_index/Cargo.toml +++ b/lantern_external_index/Cargo.toml @@ -13,7 +13,7 @@ clap = { version = "4.4.0", features = ["derive"] } cxx = "1.0.106" postgres = "0.19.7" postgres-types = { version = "0.2.6", features = ["derive"] } -usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" } +usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch="main-lantern" } lantern_logger = { path = "../lantern_logger" } lantern_utils = { path = "../lantern_utils" } rand = "0.8.5" diff --git a/lantern_external_index/src/lib.rs b/lantern_external_index/src/lib.rs index 926fad0..e18c901 100644 --- a/lantern_external_index/src/lib.rs +++ b/lantern_external_index/src/lib.rs @@ -148,20 +148,26 @@ pub fn create_usearch_index( let mut num_subvectors: usize = 0; if args.pq { + let codebook_table_name = format!( + "_codebook_{table_name}_{column_name}", + table_name = &args.table, + column_name = &args.column + ); + let full_codebook_table_name = + get_full_table_name("_lantern_internal", &codebook_table_name); + + let rows_codebook_exists = transaction.query("SELECT true FROM information_schema.tables WHERE table_schema='_lantern_internal' AND table_name=$1;", &[&codebook_table_name])?; + + if rows_codebook_exists.len() == 0 { + anyhow::bail!("Codebook table {full_codebook_table_name} does not exist"); + } + let rows_c = transaction.query( - &format!( - "SELECT count(*) FROM _lantern_internal._codebook_{table_name}_{column_name} WHERE subvector_id = 0;", - table_name = args.table, - column_name = args.column, - ), + &format!("SELECT COUNT(*) FROM {full_codebook_table_name} WHERE subvector_id = 0;"), &[], )?; let rows_sv = transaction.query( - &format!( - "SELECT count(*) FROM _lantern_internal._codebook_{table_name}_{column_name} WHERE centroid_id = 0;", - table_name = args.table, - column_name = args.column, - ), + &format!("SELECT COUNT(*) FROM {full_codebook_table_name} WHERE centroid_id = 0;"), &[], )?; @@ -182,8 +188,9 @@ pub fn create_usearch_index( ), &[], )?; + logger.info(&format!( - "codebook has {} rows - {num_centroids} centroids and {num_subvectors} subvectors", + "Codebook has {} rows - {num_centroids} centroids and {num_subvectors} subvectors", rows.len() )); diff --git a/lantern_extras/Cargo.toml b/lantern_extras/Cargo.toml index f70ea44..e59e5d6 100644 --- a/lantern_extras/Cargo.toml +++ b/lantern_extras/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lantern_extras" -version = "0.1.3" +version = "0.1.2" edition = "2021" [lib]