diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 97295f863..f18839c48 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -1,5 +1,5 @@ ARG VERSION=15 -ARG PGVECTOR_VERSION=0.5.1 +ARG PGVECTOR_VERSION=0.7.4 #fix pg_cron at the latest commit of the time ARG PG_CRON_COMMIT_SHA=7e91e72b1bebc5869bb900d9253cc9e92518b33f @@ -31,7 +31,7 @@ RUN gem install pg -- --with-pg-include=/usr/local/pgsql/include/ --with-pg-lib= # hack to make sure postgres user has write access to externally mounted volumes RUN mkdir /lantern_shared && chown postgres:postgres /lantern_shared -RUN cd /root/postgresql-15.5/contrib && make install -j +RUN cd /root/postgresql-15.5/contrib && make install # allow non-root users to install in the container to make it easier to run update-tests RUN chmod -R 777 /usr/local/pgsql/lib/ /usr/local/pgsql/share/extension/ /usr/local/pgsql/include/server/ @@ -55,7 +55,7 @@ COPY . . RUN sudo rm -rf build \ && mkdir build \ && cd build \ - && cmake -DCMAKE_BUILD_TYPE=Debug .. \ + && cmake .. \ && make -j install # Install benchmarking tools in build folder @@ -65,7 +65,7 @@ RUN git clone https://github.com/lanterndata/benchmark build/benchmark \ && pip install -r external/requirements.txt # Install perf -RUN sudo apt update && sudo apt install -y linux-tools-common linux-tools-generic linux-tools-`uname -r` +RUN sudo apt update && sudo apt install -y linux-tools-common linux-tools-generic # in host, enable perf_event paranoid via # echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid diff --git a/sql/lantern.sql b/sql/lantern.sql index ab115bb57..2a9aad6a6 100644 --- a/sql/lantern.sql +++ b/sql/lantern.sql @@ -140,7 +140,7 @@ BEGIN COMMUTATOR = '' ); - -- pgvecor's vector type requires floats and we cannot define hamming distance for floats + -- pgvector's vector type requires floats and we cannot define hamming distance for floats CREATE OPERATOR CLASS dist_vec_l2sq_ops DEFAULT FOR TYPE vector USING lantern_hnsw AS OPERATOR 1 (vector, vector) FOR ORDER BY float_ops, @@ -667,6 +667,7 @@ CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_weighted_vector_search( $weighted_vector_search$ DECLARE pgvector_exists boolean; + pgvector_sparsevec_exists boolean; BEGIN -- Check if the vector type from pgvector exists SELECT EXISTS ( @@ -680,33 +681,32 @@ BEGIN RETURN; END IF; - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, + -- Check if the sparsevec type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'sparsevec' + ) INTO pgvector_sparsevec_exists; + + CREATE OR REPLACE FUNCTION _lantern_internal.weighted_vector_search_helper( + table_name regtype, w1 numeric, col1 text, - vec1 vector, - w2 numeric= 0, + vec1 text, + w2 numeric = 0, col2 text = NULL, - vec2 vector = NULL, + vec2 text = NULL, w3 numeric = 0, col3 text = NULL, - vec3 vector = NULL, + vec3 text = NULL, ef integer = 100, max_dist numeric = NULL, - -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. distance_operator text = '<->', id_col text = 'id', exact boolean = false, debug_output boolean = false, analyze_output boolean = false - ) - -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement - -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) - -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place - -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS - $$ + ) RETURNS TEXT AS $$ DECLARE joint_condition text; query_base text; @@ -714,10 +714,15 @@ BEGIN query1 text; query2 text; query3 text; - -- variables for weighted columns + parsed_schema_name text; + parsed_table_name text; wc1 text = NULL; wc2 text = NULL; wc3 text = NULL; + is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+'; + vec1_string text = NULL; + vec2_string text = NULL; + vec3_string text = NULL; cte_query text; maybe_unions_query text; final_query text; @@ -727,32 +732,80 @@ BEGIN debug_count integer; maybe_analyze text = ''; BEGIN - -- TODO:: better validate inputs to throw nicer errors in case of wrong input: - -- 1. only allow valid distance_operator stirngs (<->, <=>, but not abracadabra) - -- 2. only allow valid column names - -- 3. throw an error on negative weights - -- 4. check that id_col column exists before proceeding + + -- Start: Validate inputs + + -- 1. only allow valid distance_operator strings (<->, <=>, but not abracadabra) + IF distance_operator NOT IN ('<->', '<=>', '<#>', '<+>') THEN + RAISE EXCEPTION 'Invalid distance operator: %', distance_operator; + END IF; + + -- 2. only allow valid column names, i.e., column names that exist in the table + SELECT n.nspname, c.relname INTO parsed_schema_name, parsed_table_name FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.reltype = table_name::oid; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = id_col) THEN + RAISE EXCEPTION 'Invalid column name: %', id_col; + END IF; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col1) THEN + RAISE EXCEPTION 'Invalid column name: %', col1; + END IF; + IF col2 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col2) THEN + RAISE EXCEPTION 'Invalid column name: %', col2; + END IF; + IF col3 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col3) THEN + RAISE EXCEPTION 'Invalid column name: %', col3; + END IF; + + -- 3. throw an error on negative weights + IF w1 < 0 OR w2 < 0 OR w3 < 0 THEN + RAISE EXCEPTION 'Invalid weight: %', w1; + END IF; + + -- End: Validate inputs IF analyze_output THEN - maybe_analyze := 'ANALYZE, BUFFERS,'; + maybe_analyze := 'ANALYZE, BUFFERS,'; END IF; - -- Joint similarity metric condition - -- the cast ::vector is necessary for cases when the column is not of type vector + + -- Generate vector strings + -- the cast is necessary for cases when the column is not of type vector -- and for some reason in those cases cast does not happen automatically - wc1 := format('(%s * (%I %s %L::vector))', w1, col1, distance_operator, vec1); + IF vec1 IS NOT NULL THEN + IF vec1 ~ is_sparsevec_regex THEN + vec1_string := vec1 || '::sparsevec'; + ELSE + vec1_string := vec1 || '::vector'; + END IF; + END IF; + IF vec2 IS NOT NULL THEN + IF vec2 ~ is_sparsevec_regex THEN + vec2_string := vec2 || '::sparsevec'; + ELSE + vec2_string := vec2 || '::vector'; + END IF; + END IF; + IF vec3 IS NOT NULL THEN + IF vec3 ~ is_sparsevec_regex THEN + vec3_string := vec3 || '::sparsevec'; + ELSE + vec3_string := vec3 || '::vector'; + END IF; + END IF; + + -- Joint similarity metric condition + wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - wc2 := format(' (%s * (%I %s %L::vector))', w2, col2, distance_operator, vec2); + wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); END IF; IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - wc3 := format(' (%s * (%I %s %L::vector))', w3, col3, distance_operator, vec3); + wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); END IF; joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); -- Base query with joint similarity metric - query_base := format('SELECT * FROM %s ', pg_typeof(relation_type)); + query_base := format('SELECT * FROM %s ', table_name); IF max_dist IS NOT NULL THEN - query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); + query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); END IF; IF exact THEN @@ -766,9 +819,8 @@ BEGIN explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); RAISE WARNING 'weighted vector search explain(exact=true): %', jsonb_pretty(explain_output); END IF; - RETURN QUERY EXECUTE final_query; - -- the empty return below is crucial, to make sure the rest of the function is not executed after the return query above - RETURN; + + RETURN final_query; END IF; EXECUTE format('SET LOCAL hnsw.ef_search TO %L', ef); @@ -776,7 +828,7 @@ BEGIN maybe_unions_query := ''; -- Query 1: Order by first condition's weighted similarity - query1 := format('%s ORDER BY %I %s %L::vector LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1, ef); + query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); IF debug_output THEN EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; @@ -787,7 +839,7 @@ BEGIN -- Query 2: Order by other conditions' weighted similarity, if applicable IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - query2 := format('%s ORDER BY %I %s %L::vector LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2, ef); + query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); cte_query := cte_query || format(', query2 AS (%s)', query2); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); IF debug_output THEN @@ -796,8 +848,9 @@ BEGIN END IF; END IF; + -- Query 3: Order by third condition's weighted similarity, if applicable IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - query3 := format('%s ORDER BY %I %s %L::vector LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3, ef); + query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); cte_query := cte_query || format(', query3 AS (%s)', query3); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); IF debug_output THEN @@ -815,26 +868,28 @@ BEGIN tt %s ORDER BY %s$final_cte_query$, id_col, maybe_unions_query, query_final_where, joint_condition); - IF debug_output THEN - explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); - EXECUTE explain_query INTO explain_output; + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; - RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); - explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); - RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); - END IF; - RETURN QUERY EXECUTE final_query; - END + explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + + END $$ LANGUAGE plpgsql; --- setup API shortcuts - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( relation_type anyelement, w1 numeric, col1 text, vec1 vector, - w2 numeric= 0, + w2 numeric = 0, col2 text = NULL, vec2 vector = NULL, w3 numeric = 0, @@ -842,6 +897,8 @@ BEGIN vec3 vector = NULL, ef integer = 100, max_dist numeric = NULL, + -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. + distance_operator text = '<->', id_col text = 'id', exact boolean = false, debug_output boolean = false, @@ -850,20 +907,242 @@ BEGIN -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place + -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS $$ + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; -BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); -END $$ LANGUAGE plpgsql; + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + RETURN; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + END IF; + + -- setup Cosine API shortcuts - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( relation_type anyelement, w1 numeric, col1 text, vec1 vector, - w2 numeric= 0, + w2 numeric = 0, col2 text = NULL, vec2 vector = NULL, w3 numeric = 0, @@ -875,18 +1154,367 @@ END $$ LANGUAGE plpgsql; exact boolean = false, debug_output boolean = false, analyze_output boolean = false - ) - -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement - -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) - -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place - -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS $$ + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; -BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); -END $$ LANGUAGE plpgsql; + -- s v v + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + RETURN; + ELSE + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + + -- setup L2SQ API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + RETURN; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; END $weighted_vector_search$ LANGUAGE plpgsql; diff --git a/sql/updates/0.3.1--0.3.2.sql b/sql/updates/0.3.1--0.3.2.sql index e69de29bb..fa5f43ab0 100644 --- a/sql/updates/0.3.1--0.3.2.sql +++ b/sql/updates/0.3.1--0.3.2.sql @@ -0,0 +1,858 @@ +CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_weighted_vector_search() RETURNS VOID AS +$weighted_vector_search$ +DECLARE + pgvector_exists boolean; + pgvector_sparsevec_exists boolean; +BEGIN + -- Check if the vector type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'vector' + ) INTO pgvector_exists; + + IF NOT pgvector_exists THEN + RAISE NOTICE 'pgvector extension not found. Skipping lantern weighted vector search setup'; + RETURN; + END IF; + + -- Check if the sparsevec type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'sparsevec' + ) INTO pgvector_sparsevec_exists; + + CREATE OR REPLACE FUNCTION _lantern_internal.weighted_vector_search_helper( + table_name regtype, + w1 numeric, + col1 text, + vec1 text, + w2 numeric = 0, + col2 text = NULL, + vec2 text = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 text = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TEXT AS $$ + DECLARE + joint_condition text; + query_base text; + query_final_where text = ''; + query1 text; + query2 text; + query3 text; + parsed_schema_name text; + parsed_table_name text; + wc1 text = NULL; + wc2 text = NULL; + wc3 text = NULL; + is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+'; + vec1_string text = NULL; + vec2_string text = NULL; + vec3_string text = NULL; + cte_query text; + maybe_unions_query text; + final_query text; + explain_query text; + explain_output jsonb; + old_hnsw_ef_search numeric; + debug_count integer; + maybe_analyze text = ''; + BEGIN + + -- Start: Validate inputs + + -- 1. only allow valid distance_operator strings (<->, <=>, but not abracadabra) + IF distance_operator NOT IN ('<->', '<=>', '<#>', '<+>') THEN + RAISE EXCEPTION 'Invalid distance operator: %', distance_operator; + END IF; + + -- 2. only allow valid column names, i.e., column names that exist in the table + SELECT n.nspname, c.relname INTO parsed_schema_name, parsed_table_name FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.reltype = table_name::oid; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = id_col) THEN + RAISE EXCEPTION 'Invalid column name: %', id_col; + END IF; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col1) THEN + RAISE EXCEPTION 'Invalid column name: %', col1; + END IF; + IF col2 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col2) THEN + RAISE EXCEPTION 'Invalid column name: %', col2; + END IF; + IF col3 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col3) THEN + RAISE EXCEPTION 'Invalid column name: %', col3; + END IF; + + -- 3. throw an error on negative weights + IF w1 < 0 OR w2 < 0 OR w3 < 0 THEN + RAISE EXCEPTION 'Invalid weight: %', w1; + END IF; + + -- End: Validate inputs + + IF analyze_output THEN + maybe_analyze := 'ANALYZE, BUFFERS,'; + END IF; + + -- Generate vector strings + -- the cast is necessary for cases when the column is not of type vector + -- and for some reason in those cases cast does not happen automatically + IF vec1 IS NOT NULL THEN + IF vec1 ~ is_sparsevec_regex THEN + vec1_string := vec1 || '::sparsevec'; + ELSE + vec1_string := vec1 || '::vector'; + END IF; + END IF; + IF vec2 IS NOT NULL THEN + IF vec2 ~ is_sparsevec_regex THEN + vec2_string := vec2 || '::sparsevec'; + ELSE + vec2_string := vec2 || '::vector'; + END IF; + END IF; + IF vec3 IS NOT NULL THEN + IF vec3 ~ is_sparsevec_regex THEN + vec3_string := vec3 || '::sparsevec'; + ELSE + vec3_string := vec3 || '::vector'; + END IF; + END IF; + + -- Joint similarity metric condition + wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); + IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN + wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); + END IF; + IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN + wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); + END IF; + + joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); + + -- Base query with joint similarity metric + query_base := format('SELECT * FROM %s ', table_name); + IF max_dist IS NOT NULL THEN + query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); + END IF; + + IF exact THEN + final_query := query_base || query_final_where || format(' ORDER BY %s', joint_condition); + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; + + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + + explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING 'weighted vector search explain(exact=true): %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + END IF; + + EXECUTE format('SET LOCAL hnsw.ef_search TO %L', ef); + -- UNION ALL.. part of the final query that aggregates results from individual vector search queries + maybe_unions_query := ''; + + -- Query 1: Order by first condition's weighted similarity + query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); + + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; + RAISE WARNING 'col1 yielded % rows', debug_count; + END IF; + + cte_query = format('WITH query1 AS (%s) ', query1); + + -- Query 2: Order by other conditions' weighted similarity, if applicable + IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN + query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); + cte_query := cte_query || format(', query2 AS (%s)', query2); + maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query2) INTO debug_count; + RAISE WARNING 'col2 yielded % rows', debug_count; + END IF; + END IF; + + -- Query 3: Order by third condition's weighted similarity, if applicable + IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN + query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); + cte_query := cte_query || format(', query3 AS (%s)', query3); + maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query3) INTO debug_count; + RAISE WARNING 'col3 yielded % rows', debug_count; + END IF; + END IF; + + final_query := cte_query || format($final_cte_query$SELECT * FROM ( + SELECT DISTINCT ON (%I) * FROM ( + (SELECT * FROM query1) + %s + ) t + ) + tt %s ORDER BY %s$final_cte_query$, + id_col, maybe_unions_query, query_final_where, joint_condition); + + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; + + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + + explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + + END + $$ LANGUAGE plpgsql; + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement + -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) + -- then that single "column" is properly spread with source table's column names + -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place + -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + RETURN; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + END IF; + + -- setup Cosine API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v v + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + RETURN; + ELSE + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + + -- setup L2SQ API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + RETURN; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + +END +$weighted_vector_search$ LANGUAGE plpgsql; + +SELECT _lantern_internal.maybe_setup_weighted_vector_search(); +DROP FUNCTION _lantern_internal.maybe_setup_weighted_vector_search; \ No newline at end of file diff --git a/test/expected/weighted_search.out b/test/expected/weighted_search.out index a58c7142d..7397ef8d0 100644 --- a/test/expected/weighted_search.out +++ b/test/expected/weighted_search.out @@ -522,6 +522,42 @@ SELECT count(*) 15 (1 row) +-- test sparsevec support +\ir utils/small_world_sparsevec.sql +CREATE TABLE small_world ( + id VARCHAR(3), + b BOOLEAN, + v VECTOR(3), + s SPARSEVEC(3) +); +INSERT INTO small_world (id, b, v, s) VALUES + ('000', TRUE, '[0,0,0]', '{}/3'), + ('001', TRUE, '[0,0,1]', '{3:1}/3'), + ('010', FALSE, '[0,1,0]' , '{2:1}/3'), + ('011', TRUE, '[0,1,1]', '{2:1,3:1}/3'), + ('100', FALSE, '[1,0,0]', '{1:1}/3'), + ('101', FALSE, '[1,0,1]', '{1:1,3:1}/3'), + ('110', FALSE, '[1,1,0]', '{1:1,2:1}/3'), + ('111', TRUE, '[1,1,1]', '{1:1,2:1,3:1}/3'); +SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset +SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset +SELECT + id, + 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist +FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, + w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, + w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector +); + id | dist +-----+-------------------- + 000 | 0.5438256229963183 + 100 | 0.7832971001877798 + 010 | 0.8720951452234501 + 001 | 0.9887007171012123 + 110 | 1.0374727590416999 + 011 | 1.1996858765930867 +(6 rows) + -- create non superuser and test the function SET client_min_messages = WARNING; -- suppress NOTICE: role "test_user" does not exist, skipping diff --git a/test/sql/utils/small_world_sparsevec.sql b/test/sql/utils/small_world_sparsevec.sql new file mode 100644 index 000000000..e9ea24b82 --- /dev/null +++ b/test/sql/utils/small_world_sparsevec.sql @@ -0,0 +1,16 @@ +CREATE TABLE small_world ( + id VARCHAR(3), + b BOOLEAN, + v VECTOR(3), + s SPARSEVEC(3) +); + +INSERT INTO small_world (id, b, v, s) VALUES + ('000', TRUE, '[0,0,0]', '{}/3'), + ('001', TRUE, '[0,0,1]', '{3:1}/3'), + ('010', FALSE, '[0,1,0]' , '{2:1}/3'), + ('011', TRUE, '[0,1,1]', '{2:1,3:1}/3'), + ('100', FALSE, '[1,0,0]', '{1:1}/3'), + ('101', FALSE, '[1,0,1]', '{1:1,3:1}/3'), + ('110', FALSE, '[1,1,0]', '{1:1,2:1}/3'), + ('111', TRUE, '[1,1,1]', '{1:1,2:1,3:1}/3'); \ No newline at end of file diff --git a/test/sql/weighted_search.sql b/test/sql/weighted_search.sql index 9902219a1..5eb187d27 100644 --- a/test/sql/weighted_search.sql +++ b/test/sql/weighted_search.sql @@ -53,7 +53,6 @@ LIMIT 100) t; -- when max_dist is not specified, number of returned values dicreases with smaller ef - SELECT count(*) FROM lantern.weighted_vector_search(CAST(NULL as "sift_base1k"), debug_output => true, exact => true, w1=> 0.03, col1=>'v'::text, vec1=>:'v4'::vector, @@ -121,6 +120,18 @@ SELECT count(*) w3=> 0.52, col3=>'v_real'::text, vec3=>:'v444'::vector ); +-- test sparsevec support +\ir utils/small_world_sparsevec.sql +SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset +SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset +SELECT + id, + 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist +FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, + w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, + w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector +); + -- create non superuser and test the function SET client_min_messages = WARNING; -- suppress NOTICE: role "test_user" does not exist, skipping @@ -138,4 +149,3 @@ SELECT count(*) w2=> 0.45, col2=>'v_real'::text, vec2=>:'v44'::vector, w3=> 0.52, col3=>'v_real'::text, vec3=>:'v444'::vector ); -