Skip to content

Commit

Permalink
Merge pull request #224 from conveyal/dev
Browse files Browse the repository at this point in the history
Bug fix release
  • Loading branch information
landonreed authored Apr 24, 2019
2 parents 1d11e03 + e6d9082 commit a264d3b
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 100 deletions.
3 changes: 2 additions & 1 deletion src/main/java/com/conveyal/gtfs/loader/JdbcGtfsLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,8 @@ private int loadInternal (Table table) throws Exception {
// CSV reader get on an empty field will be an empty string literal.
String string = csvReader.get(f);
// Use spec table to check that references are valid and IDs are unique.
Set<NewGTFSError> errors = table.checkReferencesAndUniqueness(keyValue, lineNumber, field, string, referenceTracker);
Set<NewGTFSError> errors = referenceTracker
.checkReferencesAndUniqueness(keyValue, lineNumber, field, string, table);
// Check for special case with calendar_dates where added service should not trigger ref. integrity
// error.
if (
Expand Down
121 changes: 118 additions & 3 deletions src/main/java/com/conveyal/gtfs/loader/ReferenceTracker.java
Original file line number Diff line number Diff line change
@@ -1,14 +1,129 @@
package com.conveyal.gtfs.loader;

import com.conveyal.gtfs.error.NewGTFSError;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

import static com.conveyal.gtfs.error.NewGTFSErrorType.DUPLICATE_ID;
import static com.conveyal.gtfs.error.NewGTFSErrorType.REFERENTIAL_INTEGRITY;

/**
 * Tracks the unique keys encountered while a GTFS feed is being loaded. Two sets of strings are
 * maintained: one for single-field keys (e.g., route_id or stop_id) and one for compound keys,
 * usually a string ID combined with a sequence field (e.g., trip_id + stop_sequence, which
 * identifies a unique stop time).
 * <p>
 * NOTE: The methods here should stay public. External processes that validate or otherwise iterate
 * over each line of a GTFS file rely on them to check reference validity (e.g., ID conflicts are
 * detected this way while merging GTFS feeds).
 */
public class ReferenceTracker {
    /** Field-scoped IDs seen so far, stored as "keyField:keyValue" (e.g., stop_id:12345). */
    public final Set<String> transitIds = new HashSet<>();
    /** Compound IDs seen so far, stored as "orderField:keyValue:orderValue". */
    public final Set<String> transitIdsWithSequence = new HashSet<>();

    /**
     * Checks, during a table load, that the entity ID is unique and that references are valid.
     * This overload takes the key field and order field names from the supplied table.
     *
     * @param keyValue   key value for the record being checked
     * @param lineNumber line number of the record being checked
     * @param field      field currently being checked
     * @param value      value that corresponds to field
     * @param table      table currently being checked
     * @return any duplicate or bad reference errors.
     */
    public Set<NewGTFSError> checkReferencesAndUniqueness(String keyValue, int lineNumber,
        Field field, String value, Table table) {
        String keyField = table.getKeyFieldName();
        String orderField = table.getOrderFieldName();
        return checkReferencesAndUniqueness(
            keyValue, lineNumber, field, value, table, keyField, orderField);
    }

    /**
     * Checks, during a table load, that the entity ID is unique and that references are valid.
     * IDs are recorded in this tracker's sets; every non-unique ID or invalid reference yields
     * an error in the returned set. This overload lets the caller choose the keyField and
     * orderField freely, which helps when checking uniqueness of fields that are not the
     * standard primary key (e.g., route_short_name).
     *
     * @param keyValue   key value for the record being checked
     * @param lineNumber line number of the record being checked
     * @param field      field currently being checked
     * @param value      value that corresponds to field
     * @param table      table currently being checked
     * @param keyField   name of the field to treat as the table's key
     * @param orderField name of the field to treat as the table's order field, or null if none
     * @return any duplicate or bad reference errors; an empty immutable set if the field is
     *     optional and its value is the empty string.
     */
    public Set<NewGTFSError> checkReferencesAndUniqueness(String keyValue, int lineNumber,
        Field field, String value, Table table, String keyField, String orderField) {
        // Decide which field must be unique. An order field takes precedence over the key field
        // because the real unique key is then compound (keyField + orderField). A table without
        // a unique key field (e.g., calendar_dates or transfers) gets no duplicate check at all.
        final String uniqueKeyField;
        if (orderField != null) {
            uniqueKeyField = orderField;
        } else if (table.hasUniqueKeyField) {
            uniqueKeyField = keyField;
        } else {
            uniqueKeyField = null;
        }

        // Nothing to check for an optional field that carries no value.
        if (!field.isRequired() && "".equals(value)) {
            return Collections.emptySet();
        }

        final Set<NewGTFSError> problems = new HashSet<>();
        final boolean fieldIsOrderField = field.name.equals(orderField);

        // Step 1: referential integrity. The referenced table must already have been loaded
        // so that its key values are present in transitIds.
        if (field.isForeignReference()) {
            String referencedKeyField = field.referenceTable.getKeyFieldName();
            String referencedId = String.join(":", referencedKeyField, value);
            boolean referenceKnown = this.transitIds.contains(referencedId);
            if (!referenceKnown) {
                // The referenced ID has not been seen, so report a bad reference.
                NewGTFSError badReference = NewGTFSError
                    .forLine(table, lineNumber, REFERENTIAL_INTEGRITY, referencedId)
                    .setEntityId(keyValue);
                // Attach the sequence to the error when the field is the order field.
                if (fieldIsOrderField) {
                    badReference.setSequence(value);
                }
                problems.add(badReference);
            }
        }

        // Step 2: duplicate ID check. Foreign references usually need no duplicate check, but
        // transfers#to_stop_id is both a foreign reference and an order field, so it still has
        // to be unique in conjunction with the key field.
        // IDs are field-scoped rather than entity-scoped because several entity types must be
        // cross-checked against each other (e.g., stop times and trips both share trip_id).
        final String transitId = String.join(":", keyField, keyValue);
        if (field.name.equals(uniqueKeyField)) {
            // The field is table-unique: trip_id in trips.txt, or stop_sequence (joined with
            // trip_id) in stop_times.txt. Order fields use the separate compound-ID set with an
            // ID of the form field:key:sequence (e.g., stop_sequence:12345:2); a plain key-scoped
            // ID could collide with, e.g., trip_id="12345:2".
            final Set<String> seenIds =
                fieldIsOrderField ? this.transitIdsWithSequence : this.transitIds;
            final String candidateId = fieldIsOrderField
                ? String.join(":", field.name, keyValue, value)
                : transitId;
            // Set#add returns false when the ID was already present, i.e., it is a duplicate.
            if (!seenIds.add(candidateId)) {
                NewGTFSError duplicate = NewGTFSError
                    .forLine(table, lineNumber, DUPLICATE_ID, candidateId)
                    .setEntityId(keyValue);
                if (fieldIsOrderField) {
                    duplicate.setSequence(value);
                }
                problems.add(duplicate);
            }
        } else if (field.name.equals(keyField) && !field.isForeignReference()) {
            // The field is a key on the table but neither a foreign reference nor the unique
            // key field (which would be, e.g., shape_pt_sequence). This is where shape_id from
            // the shapes table gets recorded so that trips#shape_id can later be verified. It
            // also tracks calendar_dates#service_id values.
            this.transitIds.add(transitId);
        }
        return problems;
    }
}
97 changes: 1 addition & 96 deletions src/main/java/com/conveyal/gtfs/loader/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ public class Table {
/** When snapshotting a table for editor use, this indicates whether a primary key constraint should be added to ID. */
private boolean usePrimaryKey = false;
/** Indicates whether the table has unique key field. */
private boolean hasUniqueKeyField = true;
public boolean hasUniqueKeyField = true;
/**
* Indicates whether the table has a compound key that must be used in conjunction with the key field to determine
* table uniqueness (e.g., transfers#to_stop_id).
Expand Down Expand Up @@ -945,101 +945,6 @@ public Table getParentTable() {
return parentTable;
}

/**
 * Checks, during a table load, that the entity ID is unique and that references are valid.
 * NOTE: This overload uses this table's own key field and order field names.
 *
 * @param keyValue key value for the record being checked
 * @param lineNumber line number of the record being checked
 * @param field field currently being checked
 * @param value value that corresponds to field
 * @param referenceTracker reference tracker in which to store references
 * @return any duplicate or bad reference errors.
 */
public Set<NewGTFSError> checkReferencesAndUniqueness(String keyValue, int lineNumber, Field field, String value, ReferenceTracker referenceTracker) {
    String tableKeyField = getKeyFieldName();
    String tableOrderField = getOrderFieldName();
    return checkReferencesAndUniqueness(
        keyValue, lineNumber, field, value, referenceTracker, tableKeyField, tableOrderField);
}

/**
 * During table load, checks the uniqueness of the entity ID and that references are valid. These references are
 * stored in the provided reference tracker. Any non-unique IDs or invalid references will store an error. NOTE:
 * this instance of checkReferencesAndUniqueness allows for arbitrarily setting the keyField and orderField, which
 * is helpful for checking uniqueness of fields that are not the standard primary key (e.g., route_short_name).
 *
 * @param keyValue key value for the record being checked
 * @param lineNumber line number of the record being checked
 * @param field field currently being checked
 * @param value value that corresponds to field
 * @param referenceTracker reference tracker in which to store references
 * @param keyField name of the field to treat as the key for this check
 * @param orderField name of the field to treat as the order field for this check, or null if there is none
 * @return any duplicate or bad reference errors; an empty immutable set if the field is optional and its value
 *     is the empty string.
 */
public Set<NewGTFSError> checkReferencesAndUniqueness(String keyValue, int lineNumber, Field field, String value, ReferenceTracker referenceTracker, String keyField, String orderField) {
    Set<NewGTFSError> errors = new HashSet<>();
    // Store field-scoped transit ID for referential integrity check. (Note, entity scoping doesn't work here because
    // we need to cross-check multiple entity types for valid references, e.g., stop times and trips both share trip
    // id.)
    // If table has an order field, that order field should supersede the key field as the "unique" field. In other
    // words, if it has an order field, the unique key is actually compound -- made up of the keyField + orderField.
    String uniqueKeyField = orderField != null
        ? orderField
        // If table has no unique key field (e.g., calendar_dates or transfers), there is no need to check for
        // duplicates.
        : !hasUniqueKeyField
            ? null
            : keyField;
    String transitId = String.join(":", keyField, keyValue);

    // If the field is optional and there is no value present, skip check. Use the generic factory method rather
    // than the raw-typed Collections.EMPTY_SET constant so that no unchecked conversion is needed.
    if (!field.isRequired() && "".equals(value)) return Collections.emptySet();

    // First, handle referential integrity check.
    boolean isOrderField = field.name.equals(orderField);
    if (field.isForeignReference()) {
        // Check referential integrity if the field is a foreign reference. Note: the reference table must be loaded
        // before the table/value being currently checked.
        String referenceField = field.referenceTable.getKeyFieldName();
        String referenceTransitId = String.join(":", referenceField, value);

        if (!referenceTracker.transitIds.contains(referenceTransitId)) {
            // If the reference tracker does not contain the referenced ID, record a referential integrity error.
            NewGTFSError referentialIntegrityError = NewGTFSError.forLine(
                this, lineNumber, REFERENTIAL_INTEGRITY, referenceTransitId)
                .setEntityId(keyValue);
            // If the field is an order field, set the sequence for the new error.
            if (isOrderField) referentialIntegrityError.setSequence(value);
            errors.add(referentialIntegrityError);
        }
    }
    // Next, handle duplicate ID check.
    // In most cases there is no need to check for duplicate IDs if the field is a foreign reference. However,
    // transfers#to_stop_id is defined as an order field, so we need to check that this field (which is both a
    // foreign ref and order field) is dataset unique in conjunction with the key field.
    // These hold references to the set of IDs to check for duplicates and the ID to check. These depend on
    // whether an order field is part of the "unique ID."
    Set<String> listOfUniqueIds = referenceTracker.transitIds;
    String uniqueId = transitId;

    // Next, check that the ID is table-unique. For example, the trip_id field is table unique in trips.txt and
    // the stop_sequence field (joined with trip_id) is table unique in stop_times.txt.
    if (field.name.equals(uniqueKeyField)) {
        // Check for duplicate IDs and store entity-scoped IDs for referential integrity check
        if (isOrderField) {
            // Check duplicate reference in set of field-scoped id:sequence (e.g., stop_sequence:12345:2)
            // This should not be scoped by key field because there may be conflicts (e.g., with trip_id="12345:2")
            listOfUniqueIds = referenceTracker.transitIdsWithSequence;
            uniqueId = String.join(":", field.name, keyValue, value);
        }
        // Add ID and check duplicate reference in entity-scoped IDs (e.g., stop_id:12345). Set#add returns false
        // when the ID was already present.
        boolean valueAlreadyExists = !listOfUniqueIds.add(uniqueId);
        if (valueAlreadyExists) {
            // If the value is a duplicate, add an error.
            NewGTFSError duplicateIdError = NewGTFSError.forLine(this, lineNumber, DUPLICATE_ID, uniqueId)
                .setEntityId(keyValue);
            if (isOrderField) duplicateIdError.setSequence(value);
            errors.add(duplicateIdError);
        }
    } else if (field.name.equals(keyField) && !field.isForeignReference()) {
        // We arrive here if the field is not a foreign reference and not the unique key field on the table (e.g.,
        // shape_pt_sequence), but is still a key on the table. For example, this is where we add shape_id from
        // the shapes table, so that when we check the referential integrity of trips#shape_id, we know that the
        // shape_id exists in the shapes table. It also handles tracking calendar_dates#service_id values.
        referenceTracker.transitIds.add(uniqueId);
    }
    return errors;
}

/**
* For an array of field headers, returns the matching set of {@link Field}s for a {@link Table}. If errorStorage is
* not null, errors related to unexpected or duplicate header names will be stored.
Expand Down

0 comments on commit a264d3b

Please sign in to comment.