Skip to content

Commit

Permalink
storage: Improve testing SampleIndexAggregation of intergenic queries…
Browse files Browse the repository at this point in the history
…. #TASK-5663
  • Loading branch information
j-coll committed Feb 27, 2024
1 parent 9ccb5d9 commit e0845b5
Show file tree
Hide file tree
Showing 15 changed files with 183 additions and 56 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ private static String majorMinor(String version) {
public String getVersionFromServer() throws IOException {
if (serverVersion == null) {
synchronized (this) {
ObjectMap result = cellBaseClient.getMetaClient().about().firstResult();
ObjectMap result = retryMetaAbout(3);
if (result == null) {
throw new IOException("Unable to get version from server for cellbase " + toString());
}
Expand All @@ -322,6 +322,16 @@ public String getVersionFromServer() throws IOException {
return serverVersion;
}

private ObjectMap retryMetaAbout(int retries) throws IOException {
ObjectMap result = cellBaseClient.getMetaClient().about().firstResult();
if (result == null && retries > 0) {
// Retry
logger.warn("Unable to get version from server for cellbase " + toString() + ". Retrying...");
result = retryMetaAbout(retries - 1);
}
return result;
}

public boolean isMinVersion(String minVersion) throws IOException {
String serverVersion = getVersionFromServer();
return VersionUtils.isMinVersion(minVersion, serverVersion);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,9 @@ protected List<VariantAnnotation> getVariantAnnotationList(List<Variant> variant
variantAnnotation.getAdditionalAttributes().put(GROUP_NAME.key(), additionalAttribute);
}
}
if (variantAnnotation.getConsequenceTypes() == null || variantAnnotation.getConsequenceTypes().isEmpty()) {
logger.warn("No consequence type found for variant " + variant);
}
variantAnnotationList.add(variantAnnotation);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public boolean open() {
jsonObjectMapper.addMixIn(ConsequenceType.class, ConsequenceTypeMixin.class);
jsonObjectMapper.configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true);
try {
parser = factory.createParser(inputStream);
parser = jsonObjectMapper.createParser(inputStream);
} catch (IOException e) {
throw new RuntimeException(e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,13 @@ public void evaluate(FacetField field) {
* Accumulate T in the given field.
* @param field Field
* @param t element
* @return true if the count was increased, false otherwise
*/
public final void accumulate(FacetField field, T t) {
public final boolean accumulate(FacetField field, T t) {
List<FacetField.Bucket> buckets = getBuckets(field, t);
if (buckets == null || buckets.isEmpty()) {
return;
// Do not increase count if the element does not belong to any bucket
return false;
}
field.addCount(1);
for (FacetField.Bucket bucket : buckets) {
Expand All @@ -83,7 +85,17 @@ public final void accumulate(FacetField field, T t) {
nestedFieldAccumulator.accumulate(bucket.getFacetFields().get(0), t);
}
}
return true;
}

protected abstract List<FacetField.Bucket> getBuckets(FacetField field, T t);

@Override
public String toString() {
final StringBuilder sb = new StringBuilder("FacetFieldAccumulator{name:'");
sb.append(getName()).append('\'');
sb.append(", nestedFieldAccumulator:").append(nestedFieldAccumulator);
sb.append('}');
return sb.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -124,17 +124,15 @@ protected VariantQueryResult<FacetField> aggregation(Query query, QueryOptions o

// Loop
long numMatches = 0;
int count = 0;
while (sampleVariantIndexEntryIterator.hasNext()) {
count++;
numMatches++;
SampleVariantIndexEntry entry = sampleVariantIndexEntryIterator.next();
for (int i = 0; i < accumulators.size(); i++) {
FacetFieldAccumulator<SampleVariantIndexEntry> accumulator = accumulators.get(i);
FacetField field = fields.get(i);
accumulator.accumulate(field, entry);
}
}
numMatches += count;

// Tear down and clean up results.
for (int i = 0; i < accumulators.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ public Variant next() {
return variant;
}

@Override
public Variant nextVariant() {
fetchNextIfNeeded();
return next;
}

@Override
public SampleVariantIndexEntry nextSampleVariantIndexEntry() {
AnnotationIndexEntry annotationIndexEntry = nextAnnotationIndexEntry();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

public class SampleAnnotationIndexQuery {
private final byte[] annotationIndexMask; // byte[] = {mask , index}
private final Boolean intergenic;
private final IndexFieldFilter consequenceTypeFilter;
private final IndexFieldFilter biotypeFilter;
private final IndexFieldFilter transcriptFlagFilter;
Expand All @@ -17,6 +18,7 @@ public class SampleAnnotationIndexQuery {

public SampleAnnotationIndexQuery(SampleIndexSchema schema) {
this.annotationIndexMask = new byte[]{0, 0};
this.intergenic = null;
this.consequenceTypeFilter = schema.getCtIndex().getField().noOpFilter();
this.biotypeFilter = schema.getBiotypeIndex().getField().noOpFilter();
this.transcriptFlagFilter = schema.getTranscriptFlagIndexSchema().getField().noOpFilter();
Expand All @@ -25,12 +27,16 @@ public SampleAnnotationIndexQuery(SampleIndexSchema schema) {
this.populationFrequencyFilter = schema.getPopFreqIndex().noOpFilter();
}

public SampleAnnotationIndexQuery(byte[] annotationIndexMask, IndexFieldFilter consequenceTypeFilter, IndexFieldFilter biotypeFilter,
public SampleAnnotationIndexQuery(byte[] annotationIndexMask,
Boolean intergenic,
IndexFieldFilter consequenceTypeFilter,
IndexFieldFilter biotypeFilter,
IndexFieldFilter transcriptFlagFilter,
CombinationTripleIndexSchema.Filter ctBtTfFilter,
IndexFilter clinicalFilter,
IndexFilter populationFrequencyFilter) {
this.annotationIndexMask = annotationIndexMask;
this.intergenic = intergenic;
this.consequenceTypeFilter = consequenceTypeFilter;
this.biotypeFilter = biotypeFilter;
this.transcriptFlagFilter = transcriptFlagFilter;
Expand All @@ -47,6 +53,10 @@ public byte getAnnotationIndex() {
return annotationIndexMask[1];
}

public Boolean getIntergenic() {
return intergenic;
}

public IndexFieldFilter getConsequenceTypeFilter() {
return consequenceTypeFilter;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,7 @@ private boolean filterClinicalFields(AnnotationIndexEntry annotationIndexEntry)

private boolean filterBtCtTfFields(AnnotationIndexEntry annotationIndexEntry) {
if (annotationIndexEntry == null || !annotationIndexEntry.hasSummaryIndex()) {
// Missing annotation. Unable to filter
return true;
}
if (annotationIndexEntry.isIntergenic()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,7 @@ private Scan parse(SingleSampleIndexQuery query, LocusQuery locusQuery, boolean

logger.info("---------");
logger.info("Sample = \"" + query.getSample() + "\" , schema version = " + query.getSchema().getVersion());
logger.info("Table = " + getSampleIndexTableName(query));
printScan(scan);
printQuery(locusQuery);
printQuery(query);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ public interface SampleIndexEntryIterator extends Iterator<Variant> {
*/
Variant next();

/**
* Get next variant without moving the cursor.
* @return next variant
*/
Variant nextVariant();

default SampleVariantIndexEntry nextSampleVariantIndexEntry() {
AnnotationIndexEntry annotationIndexEntry = nextAnnotationIndexEntry();
if (annotationIndexEntry != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1216,19 +1216,7 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
CtBtFtCombinationIndexSchema.Filter ctBtTfFilter = schema.getCtBtTfIndex().getField().noOpFilter();
IndexFilter clinicalFilter = schema.getClinicalIndexSchema().noOpFilter();

Boolean intergenic = null;

ParsedVariantQuery.VariantQueryXref variantQueryXref = VariantQueryParser.parseXrefs(query);
if (!isValidParam(query, REGION)) {
if (!variantQueryXref.getGenes().isEmpty()
&& variantQueryXref.getIds().isEmpty()
&& variantQueryXref.getOtherXrefs().isEmpty()
&& variantQueryXref.getVariants().isEmpty()) {
// If only filtering by genes, is not intergenic.
intergenic = false;
}
}

final Boolean intergenic = isIntergenicQuery(query);
// BiotypeConsquenceTypeFlagCombination combination = BiotypeConsquenceTypeFlagCombination
// .fromQuery(query, Arrays.asList(schema.getTranscriptFlagIndexSchema().getField().getConfiguration().getValues()));
BiotypeConsquenceTypeFlagCombination combination = BiotypeConsquenceTypeFlagCombination.fromQuery(query, null);
Expand All @@ -1237,18 +1225,10 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
boolean tfCovered = false;

if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE)) {
List<String> soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key());
soNames = soNames.stream()
List<String> soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key())
.stream()
.map(ct -> ConsequenceTypeMappings.accessionToTerm.get(VariantQueryUtils.parseConsequenceType(ct)))
.collect(Collectors.toList());
if (!soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.REGULATORY_REGION_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.TF_BINDING_SITE_VARIANT)) {
// All ct values but "intergenic_variant" and "regulatory_region_variant" are in genes (i.e. non-intergenic)
intergenic = false;
} else if (soNames.size() == 1 && soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)) {
intergenic = true;
} // else, leave undefined : intergenic = null
boolean ctFilterCoveredBySummary = false;
boolean ctBtCombinationCoveredBySummary = false;
if (SampleIndexSchema.CUSTOM_LOF.containsAll(soNames)) {
Expand Down Expand Up @@ -1295,14 +1275,17 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
}
}

// Do not use ctIndex if the CT filter is covered by the summary
// Use the ctIndex if:
// Do not use ctIndex for intergenic queries (intergenic == true)
// or queries that might return intergenic variants (intergenic == null)
//
// Use the ctIndex if any of:
// - The CtFilter is not covered by the summary
// - The query has the combination CT+BT , and it is not covered by the summary
// - The query has the combination CT+TF
boolean useCtIndexFilter = !ctFilterCoveredBySummary
|| (!ctBtCombinationCoveredBySummary && combination.isBiotype())
|| combination.isFlag();
boolean useCtIndexFilter =
intergenic == Boolean.FALSE && (!ctFilterCoveredBySummary
|| (!ctBtCombinationCoveredBySummary && combination.isBiotype())
|| combination.isFlag());
if (useCtIndexFilter) {
ctCovered = completeIndex;
consequenceTypeFilter = schema.getCtIndex().getField().buildFilter(new OpValue<>("=", soNames));
Expand All @@ -1317,8 +1300,6 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
}

if (isValidParam(query, ANNOT_BIOTYPE)) {
// All biotype values are in genes (i.e. non-intergenic)
intergenic = false;
boolean biotypeFilterCoveredBySummary = false;
List<String> biotypes = query.getAsStringList(VariantQueryParam.ANNOT_BIOTYPE.key());
if (BIOTYPE_SET.containsAll(biotypes)) {
Expand Down Expand Up @@ -1350,8 +1331,6 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema
List<String> transcriptFlags = query.getAsStringList(ANNOT_TRANSCRIPT_FLAG.key());
tfFilter = schema.getTranscriptFlagIndexSchema().getField().buildFilter(new OpValue<>("=", transcriptFlags));
tfCovered = completeIndex & tfFilter.isExactFilter();
// Transcript flags are in transcripts/genes. (i.e. non-intergenic)
intergenic = false;
// TranscriptFlag filter is covered by index
if (tfCovered) {
if (!isValidParam(query, GENE) && simpleCombination(combination)) {
Expand Down Expand Up @@ -1536,14 +1515,60 @@ protected SampleAnnotationIndexQuery parseAnnotationIndexQuery(SampleIndexSchema

if (intergenic == null || intergenic) {
// If intergenic is undefined, or true, CT and BT filters can not be used.
biotypeFilter = schema.getBiotypeIndex().getField().noOpFilter();
consequenceTypeFilter = schema.getCtIndex().getField().noOpFilter();
if (!biotypeFilter.isNoOp()) {
throw new IllegalStateException("Unexpected BT filter for intergenic=" + intergenic);
}
if (!consequenceTypeFilter.isNoOp()) {
throw new IllegalStateException("Unexpected CT filter for intergenic=" + intergenic);
}
}

return new SampleAnnotationIndexQuery(new byte[]{annotationIndexMask, annotationIndex},
return new SampleAnnotationIndexQuery(new byte[]{annotationIndexMask, annotationIndex}, intergenic,
consequenceTypeFilter, biotypeFilter, tfFilter, ctBtTfFilter, clinicalFilter, populationFrequencyFilter);
}

private Boolean isIntergenicQuery(Query query) {
ParsedVariantQuery.VariantQueryXref variantQueryXref = VariantQueryParser.parseXrefs(query);
if (!isValidParam(query, REGION)) {
if (!variantQueryXref.getGenes().isEmpty()
&& variantQueryXref.getIds().isEmpty()
&& variantQueryXref.getOtherXrefs().isEmpty()
&& variantQueryXref.getVariants().isEmpty()) {
// If only filtering by genes, is not intergenic.
return false;
}
}

if (isValidParam(query, ANNOT_BIOTYPE)) {
// All biotype values are in genes (i.e. non-intergenic)
return false;
}
if (isValidParam(query, ANNOT_BIOTYPE)) {
// All biotype values are in genes (i.e. non-intergenic)
return false;
}
if (isValidParam(query, ANNOT_TRANSCRIPT_FLAG)) {
// Transcript flags are in transcripts/genes. (i.e. non-intergenic)
return false;
}
if (isValidParam(query, ANNOT_CONSEQUENCE_TYPE)) {
List<String> soNames = query.getAsStringList(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key());
soNames = soNames.stream()
.map(ct -> ConsequenceTypeMappings.accessionToTerm.get(VariantQueryUtils.parseConsequenceType(ct)))
.collect(Collectors.toList());
if (!soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.REGULATORY_REGION_VARIANT)
&& !soNames.contains(VariantAnnotationConstants.TF_BINDING_SITE_VARIANT)) {
// All ct values but "intergenic_variant" and "regulatory_region_variant" are in genes (i.e. non-intergenic)
return false;
} else if (soNames.size() == 1 && soNames.contains(VariantAnnotationConstants.INTERGENIC_VARIANT)) {
return true;
} // else, leave undefined : intergenic = null
}
// Unable to determine if the query is intergenic or not. Return null for uncertain.
return null;
}

private boolean simpleCombination(BiotypeConsquenceTypeFlagCombination combination) {
return combination.numParams() == 1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,11 @@ public AnnotationIndexEntry nextAnnotationIndexEntry() {
public Variant next() {
throw new NoSuchElementException("Empty iterator");
}

@Override
public Variant nextVariant() {
throw new NoSuchElementException("Empty iterator");
}
}

private static final class CountSampleIndexGtEntryIterator extends SampleIndexGtEntryIterator {
Expand Down Expand Up @@ -525,6 +530,11 @@ public Variant next() {
return DUMMY_VARIANT;
}

@Override
public Variant nextVariant() {
return DUMMY_VARIANT;
}

@Override
public int getApproxSize() {
return count;
Expand Down Expand Up @@ -575,13 +585,19 @@ public boolean hasNext() {
public Variant next() {
nextAnnotationIndexEntry(); // ensure read annotation
increaseCounters();
Variant variant = nextVariant();
movePointer();
return variant;
}

@Override
public Variant nextVariant() {
Variant variant;
if (encodedRefAlt) {
variant = toVariantEncodedAlleles(chromosome, batchStart, bytes, currentOffset);
} else {
variant = toVariant(chromosome, batchStart, bytes, currentOffset, referenceLength, alternateLength);
}
movePointer();
return variant;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ private SingleSampleIndexQuery getSingleSampleIndexQuery(Query query) {
private SingleSampleIndexQuery getSingleSampleIndexQuery(VariantQueryUtils.QueryOperation op, RangeIndexFieldFilter... frequencyQuery) {
SampleAnnotationIndexQuery annotationIndexQuery = new SampleAnnotationIndexQuery(
new byte[2],
null,
schema.getCtIndex().getField().noOpFilter(),
schema.getBiotypeIndex().getField().noOpFilter(),
schema.getTranscriptFlagIndexSchema().getField().noOpFilter(),
Expand Down
Loading

0 comments on commit e0845b5

Please sign in to comment.