Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use renamed lantern access method #84

Merged
merged 9 commits into from
Feb 21, 2024
1 change: 1 addition & 0 deletions ci/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ function setup_postgres() {

function setup_lantern() {
LANTERN_VERSION=main
LANTERN_VERSION=narek/pgvector-compat
git clone --recursive https://github.com/lanterndata/lantern.git /tmp/lantern
pushd /tmp/lantern
git checkout ${LANTERN_VERSION} && \
Expand Down
1 change: 1 addition & 0 deletions lantern_daemon/src/external_index_jobs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ async fn external_index_worker(
dims: 0,
out: index_path,
remote_database: true,
pq: false,
}, progress_callback, Some(is_canceled_clone), Some(task_logger));
futures::executor::block_on(cancel_tx_clone.send(false))?;
result
Expand Down
3 changes: 2 additions & 1 deletion lantern_external_index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ clap = { version = "4.4.0", features = ["derive"] }
cxx = "1.0.106"
postgres = "0.19.7"
postgres-types = { version = "0.2.6", features = ["derive"] }
usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" }
# usearch = { git = "https://github.com/Ngalstyan4/usearch.git", branch = "main-lantern" }
usearch = {path = "../../third_party/usearch/"}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The version I used is now upstream on the narek/pq-index branch: https://github.com/Ngalstyan4/usearch/tree/narek/pq-index

lantern_logger = { path = "../lantern_logger" }
lantern_utils = { path = "../lantern_utils" }
rand = "0.8.5"
4 changes: 4 additions & 0 deletions lantern_external_index/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ pub struct CreateIndexArgs {
#[arg(short, long)]
pub column: String,

/// Use already created codebook to create product-quantized binary index
#[arg(short, long, default_value_t = false)]
pub pq: bool,

/// Number of neighbours for each vector
#[arg(short, default_value_t = 16)]
pub m: usize,
Expand Down
71 changes: 70 additions & 1 deletion lantern_external_index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ pub fn create_usearch_index(
let full_table_name = get_full_table_name(&args.schema, &args.table);

transaction.execute("SET lock_timeout='5s'", &[])?;
//todo:: ask-Varik: why is this necessary?
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We lock the table to make sure no TIDs change while the index is being created.

transaction.execute(
&format!("LOCK TABLE ONLY {full_table_name} IN SHARE MODE"),
&[],
Expand Down Expand Up @@ -142,6 +143,60 @@ pub fn create_usearch_index(
dimensions, args.m, args.ef, args.efc
));

let mut pq_codebook: *const f32 = std::ptr::null();
let mut num_centroids: usize = 0;
let mut num_subvectors: usize = 0;

if args.pq {
let rows_c = transaction.query(
&format!(
"SELECT count(*) FROM _lantern_internal._codebook_{table_name}_{column_name} WHERE subvector_id = 0;",
table_name = args.table,
column_name = args.column,
),
&[],
)?;
let rows_sv = transaction.query(
&format!(
"SELECT count(*) FROM _lantern_internal._codebook_{table_name}_{column_name} WHERE centroid_id = 0;",
table_name = args.table,
column_name = args.column,
),
&[],
)?;

if rows_c.len() == 0 || rows_sv.len() == 0 {
anyhow::bail!("Invalid codebook table");
}

num_centroids = rows_c.first().unwrap().get::<usize, i64>(0) as usize;
num_subvectors = rows_sv.first().unwrap().get::<usize, i64>(0) as usize;

let rows = transaction.query(
&format!(
"SELECT subvector_id, centroid_id, c FROM _lantern_internal._codebook_{table_name}_{column_name};",
table_name = args.table,
column_name = args.column,
),
&[],
)?;
let mut v = vec![0.; num_centroids * dimensions];
pq_codebook = v.as_ptr();
logger.info(&format!(
"codebook has {} rows - {num_centroids} centroids and {num_subvectors} subvectors",
rows.len()
));

for r in rows {
let subvector_id: i32 = r.get(0);
let centroid_id: i32 = r.get(1);
let subvector: Vec<f32> = r.get(2);
for i in 0..subvector.len() {
v[centroid_id as usize * dimensions + subvector_id as usize + i] = subvector[i];
}
}
}

let options = IndexOptions {
dimensions,
metric: args.metric_kind.value(),
Expand All @@ -150,6 +205,20 @@ pub fn create_usearch_index(
connectivity: args.m,
expansion_add: args.efc,
expansion_search: args.ef,

num_threads: 0, // automatic

// note: pq_construction and pq_output distinction is not yet implemented in usearch
// in the future, if pq_construction is false, we will use full vectors in memory (and
// require large memory for construction) but will output pq-quantized graph
//
// currently, regardless of pq_construction value, as long as pq_output is true,
// we construct a pq_quantized index using quantized values during construction
pq_construction: args.pq,
pq_output: args.pq,
num_centroids,
num_subvectors,
codebook: pq_codebook,
};
let index = Index::new(&options)?;

Expand Down Expand Up @@ -331,7 +400,7 @@ pub fn create_usearch_index(
}

transaction.execute(
&format!("CREATE INDEX {idx_name} ON {table_name} USING hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table),column_name=&quote_ident(&args.column), m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions),
&format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=args.out, table_name=&get_full_table_name(&args.schema, &args.table),column_name=&quote_ident(&args.column), m=args.m, ef=args.ef, ef_construction=args.efc, dim=dimensions),
&[],
)?;

Expand Down
2 changes: 1 addition & 1 deletion lantern_external_index/src/postgres_large_objects.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ impl<'a> LargeObject<'a> {
}

transaction.execute(
&format!("CREATE INDEX {idx_name} ON {table_name} USING hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=self.index_path),
&format!("CREATE INDEX {idx_name} ON {table_name} USING lantern_hnsw({column_name} {op_class}) WITH (_experimental_index_path='{index_path}', ef={ef}, dim={dim}, m={m}, ef_construction={ef_construction});", index_path=self.index_path),
&[],
)?;

Expand Down
2 changes: 1 addition & 1 deletion lantern_extras/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "lantern_extras"
version = "0.1.2"
version = "0.2.0"
edition = "2021"

[lib]
Expand Down
2 changes: 2 additions & 0 deletions lantern_index_autotune/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ pub fn autotune_index(
dims: column_dims as usize,
index_name: Some(index_name.clone()),
remote_database: true,
pq: false,
},
None,
Some(is_canceled.clone()),
Expand Down Expand Up @@ -531,6 +532,7 @@ pub fn autotune_index(
dims: column_dims as usize,
index_name: None,
remote_database: true,
pq: false,
},
None,
Some(is_canceled.clone()),
Expand Down