-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #224 from conveyal/dev
Bug fix release
- Loading branch information
Showing
3 changed files
with
121 additions
and
100 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
121 changes: 118 additions & 3 deletions
121
src/main/java/com/conveyal/gtfs/loader/ReferenceTracker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,129 @@ | ||
package com.conveyal.gtfs.loader; | ||
|
||
import com.conveyal.gtfs.error.NewGTFSError; | ||
|
||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
import static com.conveyal.gtfs.error.NewGTFSErrorType.DUPLICATE_ID; | ||
import static com.conveyal.gtfs.error.NewGTFSErrorType.REFERENTIAL_INTEGRITY; | ||
|
||
/** | ||
* This class is used during feed loads to track the unique keys that are encountered in a GTFS feed. It has two sets of | ||
* strings that it tracks, one for single field keys (e.g., route_id or stop_id) and one for keys that are compound, | ||
* usually made up of a string ID with a sequence field (e.g., trip_id + stop_sequence for tracking unique stop times). | ||
* This class is used during feed loads to track the unique keys that are encountered in a GTFS | ||
* feed. It has two sets of strings that it tracks, one for single field keys (e.g., route_id or | ||
* stop_id) and one for keys that are compound, usually made up of a string ID with a sequence field | ||
* (e.g., trip_id + stop_sequence for tracking unique stop times). | ||
* <p> | ||
* NOTE: Its methods should remain public because they are used during external processes that | ||
* validate or otherwise iterate over each line of a GTFS file and need to check for reference | ||
* validity (e.g., while merging GTFS feeds this is used to determine ID conflicts). | ||
*/ | ||
public class ReferenceTracker { | ||
public final Set<String> transitIds = new HashSet<>(); | ||
public final Set<String> transitIdsWithSequence = new HashSet<>(); | ||
|
||
/** | ||
* During table load, checks the uniqueness of the entity ID and that references are valid. | ||
* NOTE: This method defaults the key field and order field names to this table's values. | ||
* | ||
* @param keyValue key value for the record being checked | ||
* @param lineNumber line number of the record being checked | ||
* @param field field currently being checked | ||
* @param value value that corresponds to field | ||
* @param table table currently being checked | ||
* @return any duplicate or bad reference errors. | ||
*/ | ||
public Set<NewGTFSError> checkReferencesAndUniqueness(String keyValue, int lineNumber, | ||
Field field, String value, Table table) { | ||
return checkReferencesAndUniqueness(keyValue, lineNumber, field, value, table, | ||
table.getKeyFieldName(), table.getOrderFieldName()); | ||
} | ||
|
||
/** | ||
* During table load, checks the uniqueness of the entity ID and that references are valid. | ||
* These references are stored in the provided reference tracker. Any non-unique IDs or invalid | ||
* references will store an error. NOTE: this instance of checkReferencesAndUniqueness allows | ||
* for arbitrarily setting the keyField and orderField, which is helpful for checking uniqueness | ||
* of fields that are not the standard primary key (e.g., route_short_name). | ||
*/ | ||
public Set<NewGTFSError> checkReferencesAndUniqueness(String keyValue, int lineNumber, | ||
Field field, String value, Table table, String keyField, String orderField) { | ||
Set<NewGTFSError> errors = new HashSet<>(); | ||
// Store field-scoped transit ID for referential integrity check. (Note, entity scoping | ||
// doesn't work here because we need to cross-check multiple entity types for valid | ||
// references, e.g., stop times and trips both share trip id.) | ||
// If table has an order field, that order field should supersede the key field as the | ||
// "unique" field. In other words, if it has an order field, the unique key is actually | ||
// compound -- made up of the keyField + orderField. | ||
String uniqueKeyField = orderField != null ? orderField | ||
// If table has no unique key field (e.g., calendar_dates or transfers), there is no | ||
// need to check for duplicates. | ||
: !table.hasUniqueKeyField ? null : keyField; | ||
String transitId = String.join(":", keyField, keyValue); | ||
|
||
// If the field is optional and there is no value present, skip check. | ||
if (!field.isRequired() && "".equals(value)) return Collections.emptySet(); | ||
|
||
// First, handle referential integrity check. | ||
boolean isOrderField = field.name.equals(orderField); | ||
if (field.isForeignReference()) { | ||
// Check referential integrity if the field is a foreign reference. Note: the | ||
// reference table must be loaded before the table/value being currently checked. | ||
String referenceField = field.referenceTable.getKeyFieldName(); | ||
String referenceTransitId = String.join(":", referenceField, value); | ||
|
||
if (!this.transitIds.contains(referenceTransitId)) { | ||
// If the reference tracker does not contain | ||
NewGTFSError referentialIntegrityError = NewGTFSError | ||
.forLine(table, lineNumber, REFERENTIAL_INTEGRITY, referenceTransitId) | ||
.setEntityId(keyValue); | ||
// If the field is an order field, set the sequence for the new error. | ||
if (isOrderField) referentialIntegrityError.setSequence(value); | ||
errors.add(referentialIntegrityError); | ||
} | ||
} | ||
// Next, handle duplicate ID check. | ||
// In most cases there is no need to check for duplicate IDs if the field is a foreign | ||
// reference. However, transfers#to_stop_id is defined as an order field, so we need to | ||
// check that this field (which is both a foreign ref and order field) is dataset unique | ||
// in conjunction with the key field. | ||
// These hold references to the set of IDs to check for duplicates and the ID to check. | ||
// These depend on whether an order field is part of the "unique ID." | ||
Set<String> listOfUniqueIds = this.transitIds; | ||
String uniqueId = transitId; | ||
|
||
// Next, check that the ID is table-unique. For example, the trip_id field is table unique | ||
// in trips.txt and the the stop_sequence field (joined with trip_id) is table unique in | ||
// stop_times.txt. | ||
if (field.name.equals(uniqueKeyField)) { | ||
// Check for duplicate IDs and store entity-scoped IDs for referential integrity check | ||
if (isOrderField) { | ||
// Check duplicate reference in set of field-scoped id:sequence (e.g., | ||
// stop_sequence:12345:2) | ||
// This should not be scoped by key field because there may be conflicts (e.g., | ||
// with trip_id="12345:2") | ||
listOfUniqueIds = this.transitIdsWithSequence; | ||
uniqueId = String.join(":", field.name, keyValue, value); | ||
} | ||
// Add ID and check duplicate reference in entity-scoped IDs (e.g., stop_id:12345) | ||
boolean valueAlreadyExists = !listOfUniqueIds.add(uniqueId); | ||
if (valueAlreadyExists) { | ||
// If the value is a duplicate, add an error. | ||
NewGTFSError duplicateIdError = | ||
NewGTFSError.forLine(table, lineNumber, DUPLICATE_ID, uniqueId) | ||
.setEntityId(keyValue); | ||
if (isOrderField) { duplicateIdError.setSequence(value); } | ||
errors.add(duplicateIdError); | ||
} | ||
} else if (field.name.equals(keyField) && !field.isForeignReference()) { | ||
// We arrive here if the field is not a foreign reference and not the unique key field | ||
// on the table (e.g., shape_pt_sequence), but is still a key on the table. For | ||
// example, this is where we add shape_id from the shapes table, so that when we | ||
// check the referential integrity of trips#shape_id, we know that the shape_id | ||
// exists in the shapes table. It also handles tracking calendar_dates#service_id values. | ||
this.transitIds.add(uniqueId); | ||
} | ||
return errors; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters