From f532c7e5d47e21072f35d572db6bf2020b680e8f Mon Sep 17 00:00:00 2001 From: Tor Egge Date: Tue, 2 Jul 2024 14:50:45 +0200 Subject: [PATCH 1/2] Add document frequency to query items. --- container-search/abi-spec.json | 31 +++++++++++++++++-- .../prelude/query/CompositeTaggableItem.java | 8 +++++ .../prelude/query/DocumentFrequency.java | 10 ++++++ .../java/com/yahoo/prelude/query/Item.java | 4 +++ .../prelude/query/SimpleTaggableItem.java | 8 +++++ .../com/yahoo/prelude/query/TaggableItem.java | 4 +++ .../prelude/query/TaggableSegmentItem.java | 8 +++++ .../prelude/query/TaggableItemsTestCase.java | 11 ++++++- 8 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index e4d64c83b497..08f069348045 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -480,10 +480,30 @@ "public void setExplicitSignificance(boolean)", "public boolean hasExplicitSignificance()", "public double getSignificance()", + "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public java.util.Optional getDocumentFrequency()", "public boolean hasUniqueID()" ], "fields" : [ ] }, + "com.yahoo.prelude.query.DocumentFrequency" : { + "superClass" : "java.lang.Record", + "interfaces" : [ ], + "attributes" : [ + "public", + "final", + "record" + ], + "methods" : [ + "public void <init>(long, long)", + "public final java.lang.String toString()", + "public final int hashCode()", + "public final boolean equals(java.lang.Object)", + "public long frequency()", + "public long corpusSize()" + ], + "fields" : [ ] + }, "com.yahoo.prelude.query.DotProductItem" : { "superClass" : "com.yahoo.prelude.query.WeightedSetItem", "interfaces" : [ ], @@ -874,7 +894,8 @@ "protected com.yahoo.prelude.query.Item connectedBacklink", "protected double connectivity", "protected double significance", - 
"protected boolean explicitSignificance" + "protected boolean explicitSignificance", + "protected com.yahoo.prelude.query.DocumentFrequency documentFrequency" ] }, "com.yahoo.prelude.query.ItemHelper" : { @@ -1578,6 +1599,8 @@ "public void setExplicitSignificance(boolean)", "public boolean hasExplicitSignificance()", "public double getSignificance()", + "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public java.util.Optional getDocumentFrequency()", "public boolean hasUniqueID()" ], "fields" : [ ] @@ -1679,7 +1702,9 @@ "public abstract void setSignificance(double)", "public abstract boolean hasExplicitSignificance()", "public abstract void setExplicitSignificance(boolean)", - "public abstract double getSignificance()" + "public abstract double getSignificance()", + "public abstract void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public abstract java.util.Optional getDocumentFrequency()" ], "fields" : [ ] }, @@ -1703,6 +1728,8 @@ "public void setExplicitSignificance(boolean)", "public boolean hasExplicitSignificance()", "public double getSignificance()", + "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public java.util.Optional getDocumentFrequency()", "public boolean hasUniqueID()" ], "fields" : [ ] diff --git a/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java index 10dc817b2b03..9cad27d72094 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import java.util.Optional; + /** * Common implementation for Item classes implementing the TaggableItem interface. 
 * Note that this file exists in 3 copies that should be kept in sync: @@ -68,6 +70,12 @@ public double getSignificance() { return significance; } + @Override + public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; } + + @Override + public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); } + //Change access privilege from protected to public. public boolean hasUniqueID() { return super.hasUniqueID(); diff --git a/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java new file mode 100644 index 000000000000..ef9966c64dec --- /dev/null +++ b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java @@ -0,0 +1,10 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.prelude.query; + +/* + * The expected number of documents matching the item given a corpus of + * multiple documents. This is the raw data used to calculate variants + * of idf, used as significance. 
+ */ +public record DocumentFrequency(long frequency, long corpusSize) { +} diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java index 099c546e3f0c..6c82a2bea0e5 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java @@ -124,6 +124,8 @@ public enum ItemCreator { protected double significance = 0; protected boolean explicitSignificance = false; + protected DocumentFrequency documentFrequency = null; + /** Whether this item is eligible for change by query rewriters (false) or should be kept as-is (true) */ private boolean isProtected; @@ -495,6 +497,8 @@ public void disclose(Discloser discloser) { discloser.addProperty("usePositionData", usePositionData); if (explicitSignificance) discloser.addProperty("significance", significance); + if (documentFrequency != null) + discloser.addProperty("documentFrequency", documentFrequency); if (weight != 100) discloser.addProperty("weight", weight); if (label != null) diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java index 1718a4e77089..8d3eecd9f29a 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import java.util.Optional; + /** * Common implementation for Item classes implementing the TaggableItem interface. 
 * Note that this file exist in 3 copies that should be kept in sync: @@ -68,6 +70,12 @@ public double getSignificance() { return significance; } + @Override + public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; } + + @Override + public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); } + //Change access privilege from protected to public. public boolean hasUniqueID() { return super.hasUniqueID(); diff --git a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java index 1bfd75f8d27d..0ae232925b2b 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import java.util.Optional; + /** * An interface used for anything which may be addressed using an external, * unique ID in the query tree in the backend. @@ -44,4 +46,6 @@ public interface TaggableItem { void setExplicitSignificance(boolean significance); double getSignificance(); + void setDocumentFrequency(DocumentFrequency documentFrequency); + Optional<DocumentFrequency> getDocumentFrequency(); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java index 1cba588ce405..b3549e6868b7 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.prelude.query; +import java.util.Optional; + /** * Common implementation for Item classes implementing the TaggableItem interface. * Note that this file exist in 3 copies that should be kept in sync: @@ -81,6 +83,12 @@ public double getSignificance() { return significance; } + @Override + public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; } + + @Override + public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); } + //Change access privilege from protected to public. @Override public boolean hasUniqueID() { diff --git a/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java index de77d6721f06..10d4523041ce 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java @@ -99,7 +99,7 @@ void requireSimilarAPIs() { .getDeclaredMethods(); final Method[] simple = SimpleTaggableItem.class.getDeclaredMethods(); final Method[] segment = TaggableSegmentItem.class.getDeclaredMethods(); - final int numberOfMethods = 10; + final int numberOfMethods = 12; assertEquals(numberOfMethods, composite.length); assertEquals(numberOfMethods, simple.length); assertEquals(numberOfMethods, segment.length); @@ -152,4 +152,13 @@ final void testSetSignificance() { assertTrue(p.hasExplicitSignificance()); } + @Test + final void testSetDocumentFrequency() { + final PhraseSegmentItem p = new PhraseSegmentItem("farmyards", false, false); + assertFalse(p.getDocumentFrequency().isPresent()); + p.setDocumentFrequency(new DocumentFrequency(13, 100)); + assertTrue(p.getDocumentFrequency().isPresent()); + assertEquals(new DocumentFrequency(13, 100), p.getDocumentFrequency().get()); + } + } From c700babbe23608e359e4ee660cf76e9282c7f775 Mon Sep 17 00:00:00 2001 
From: Tor Egge Date: Tue, 2 Jul 2024 16:02:29 +0200 Subject: [PATCH 2/2] Add Beta annotation for DocumentFrequency record. --- container-search/abi-spec.json | 2 +- .../com/yahoo/prelude/query/DocumentFrequency.java | 12 +++++++++--- .../java/com/yahoo/prelude/query/TaggableItem.java | 4 ++++ 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 08f069348045..a36a56e65e82 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -500,7 +500,7 @@ "public final int hashCode()", "public final boolean equals(java.lang.Object)", "public long frequency()", - "public long corpusSize()" + "public long count()" ], "fields" : [ ] }, diff --git a/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java index ef9966c64dec..da35914eaa4f 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java @@ -1,10 +1,16 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; -/* - * The expected number of documents matching the item given a corpus of +import com.yahoo.api.annotations.Beta; + +/** + * The expected number of documents matching an item given a corpus of * multiple documents. This is the raw data used to calculate variants * of idf, used as significance. 
+ * + * @param frequency The number of documents in which an item occurs + * @param count The total number of documents in the corpus */ -public record DocumentFrequency(long frequency, long corpusSize) { +@Beta +public record DocumentFrequency(long frequency, long count) { } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java index 0ae232925b2b..a91fe29590ef 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java @@ -1,5 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import com.yahoo.api.annotations.Beta; import java.util.Optional; @@ -46,6 +47,9 @@ public interface TaggableItem { void setExplicitSignificance(boolean significance); double getSignificance(); + @Beta void setDocumentFrequency(DocumentFrequency documentFrequency); + + @Beta Optional<DocumentFrequency> getDocumentFrequency(); }