Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tweaks for MTC enhancements #200

Merged
merged 5 commits into from
Feb 19, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions src/main/java/com/conveyal/gtfs/loader/Field.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
public abstract class Field {

public final String name;
final Requirement requirement;
public final Requirement requirement;
/**
* Indicates that this field acts as a foreign key to this referenced table. This is used when checking referential
* integrity when loading a feed.
* */
Table referenceTable = null;
public Table referenceTable = null;
private boolean shouldBeIndexed;
private boolean emptyValuePermitted;

Expand All @@ -51,6 +51,16 @@ public void setNull(PreparedStatement preparedStatement, int oneBasedIndex) thro
preparedStatement.setNull(oneBasedIndex, getSqlType().getVendorTypeNumber());
}

/**
 * Finds the index of the field with the given name.
 *
 * Null entries in the array are skipped rather than dereferenced: field arrays built from CSV headers may
 * contain null placeholders (e.g. for duplicate headers), and those must not abort the search.
 *
 * @param fields the fields to search; individual entries may be null
 * @param name the field name to look for
 * @return the index of the first field whose name equals the given name, or -1 if no match is found
 */
public static int getFieldIndex (Field[] fields, String name) {
    // Linear search, assuming a small number of fields per table.
    for (int i = 0; i < fields.length; i++) {
        Field field = fields[i];
        if (field != null && field.name.equals(name)) {
            return i;
        }
    }
    return -1;
}

public abstract SQLType getSqlType ();

// Overridden to create exception for "double precision", since its enum value is just called DOUBLE, and ARRAY types,
Expand Down
98 changes: 16 additions & 82 deletions src/main/java/com/conveyal/gtfs/loader/JdbcGtfsLoader.java
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public class JdbcGtfsLoader {
private SQLErrorStorage errorStorage;

// Contains references to unique entity IDs during load stage used for referential integrity check.
private ReferenceTracker referenceTracker;
private ReferenceTracker referenceTracker = new ReferenceTracker();

public JdbcGtfsLoader(String gtfsFilePath, DataSource dataSource) {
this.gtfsFilePath = gtfsFilePath;
Expand Down Expand Up @@ -130,9 +130,9 @@ public FeedLoadResult loadTables () {
result.filename = gtfsFilePath;
result.uniqueIdentifier = tablePrefix;

//The order of the following four lines should not be changed because the schema needs to be in place
//before the error storage can be constructed, which in turn needs to exist in case any errors are
//encountered during the loading process.
// The order of the following four lines should not be changed because the schema needs to be in place
// before the error storage can be constructed, which in turn needs to exist in case any errors are
// encountered during the loading process.
{
createSchema(connection, tablePrefix);
//the SQLErrorStorage constructor expects the tablePrefix to contain the dot separator.
Expand All @@ -143,7 +143,6 @@ public FeedLoadResult loadTables () {
// This allows everything to work even when there's no prefix.
this.tablePrefix += ".";
}
this.referenceTracker = new ReferenceTracker(errorStorage);
// Load each table in turn, saving some summary information about what happened during each table load
result.agency = load(Table.AGENCY);
result.calendar = load(Table.CALENDAR);
Expand Down Expand Up @@ -210,7 +209,7 @@ private void registerFeed (File gtfsFile) {
// FIXME is this extra CSV reader used anymore? Check comment below.
// First, inspect feed_info.txt to extract the ID and version.
// We could get this with SQL after loading, but feed_info, feed_id and feed_version are all optional.
CsvReader csvReader = getCsvReader(Table.FEED_INFO);
CsvReader csvReader = Table.FEED_INFO.getCsvReader(zip, errorStorage);
String feedId = "", feedVersion = "";
if (csvReader != null) {
// feed_info.txt has been found and opened.
Expand Down Expand Up @@ -256,43 +255,6 @@ private void registerFeed (File gtfsFile) {
}
}

/**
 * In GTFS feeds, all files are supposed to be in the root of the zip file, but feed producers often put them
 * in a subdirectory. This function will search subdirectories if the entry is not found in the root.
 * It records an error if the entry is in a subdirectory.
 * It then creates a CSV reader for that table if it's found.
 *
 * @param table the table whose "{name}.txt" zip entry should be opened
 * @return a CSV reader whose headers have already been read, or null if the entry does not exist or could not
 *         be opened
 */
private CsvReader getCsvReader (Table table) {
    final String tableFileName = table.name + ".txt";
    ZipEntry entry = zip.getEntry(tableFileName);
    if (entry == null) {
        // Table was not found in the root, check if it is in a subdirectory. Require a path separator directly
        // before the file name so that unrelated entries which merely end with the same characters
        // (e.g. "._stops.txt" or "old_stops.txt") are not mistaken for the table.
        final String subdirectorySuffix = "/" + tableFileName;
        Enumeration<? extends ZipEntry> entries = zip.entries();
        while (entries.hasMoreElements()) {
            ZipEntry candidate = entries.nextElement();
            if (candidate.getName().endsWith(subdirectorySuffix)) {
                entry = candidate;
                errorStorage.storeError(NewGTFSError.forTable(table, TABLE_IN_SUBDIRECTORY));
                break;
            }
        }
    }
    if (entry == null) return null;
    try {
        InputStream zipInputStream = zip.getInputStream(entry);
        // Skip any byte order mark that may be present. Files must be UTF-8,
        // but the GTFS spec says that "files that include the UTF byte order mark are acceptable".
        InputStream bomInputStream = new BOMInputStream(zipInputStream);
        CsvReader csvReader = new CsvReader(bomInputStream, ',', Charset.forName("UTF8"));
        csvReader.readHeaders();
        return csvReader;
    } catch (IOException e) {
        // Pass the exception as the final argument so the logger records the stack trace, and fill the
        // placeholder with the entry name instead of leaving it unresolved.
        LOG.error("Exception while opening zip entry: {}", entry.getName(), e);
        return null;
    }
}

/**
* This wraps the main internal table loader method to catch exceptions and figure out how many errors happened.
*/
Expand Down Expand Up @@ -339,7 +301,7 @@ private int getTableSize(Table table) {
* @return number of rows that were loaded.
*/
private int loadInternal (Table table) throws Exception {
CsvReader csvReader = getCsvReader(table);
CsvReader csvReader = table.getCsvReader(zip, errorStorage);
if (csvReader == null) {
LOG.info(String.format("file %s.txt not found in gtfs zipfile", table.name));
// This GTFS table could not be opened in the zip, even in a subdirectory.
Expand All @@ -353,25 +315,8 @@ private int loadInternal (Table table) throws Exception {
// TODO Strip out line returns, tabs in field contents.
// By default the CSV reader trims leading and trailing whitespace in fields.
// Build up a list of fields in the same order they appear in this GTFS CSV file.
int headerCount = csvReader.getHeaderCount();
Field[] fields = new Field[headerCount];
Set<String> fieldsSeen = new HashSet<>();
String keyField = table.getKeyFieldName();
int keyFieldIndex = -1;
for (int h = 0; h < headerCount; h++) {
String header = sanitize(csvReader.getHeader(h));
if (fieldsSeen.contains(header) || "id".equals(header)) {
// FIXME: add separate error for tables containing ID field.
errorStorage.storeError(NewGTFSError.forTable(table, DUPLICATE_HEADER).setBadValue(header));
fields[h] = null;
} else {
fields[h] = table.getFieldForName(header);
fieldsSeen.add(header);
if (keyField.equals(header)) {
keyFieldIndex = h;
}
}
}
Field[] fields = table.getFieldsFromFieldHeaders(csvReader.getHeaders(), errorStorage);
int keyFieldIndex = table.getKeyFieldIndex(fields);
// Create separate fields array with filtered list that does not include null values (for duplicate headers or
// ID field). This is solely used to construct the table and array of values to load.
Field[] cleanFields = Arrays.stream(fields).filter(field -> field != null).toArray(Field[]::new);
Expand Down Expand Up @@ -439,7 +384,8 @@ private int loadInternal (Table table) throws Exception {
// CSV reader get on an empty field will be an empty string literal.
String string = csvReader.get(f);
// Use spec table to check that references are valid and IDs are unique.
table.checkReferencesAndUniqueness(keyValue, lineNumber, field, string, referenceTracker);
Set<NewGTFSError> errors = table.checkReferencesAndUniqueness(keyValue, lineNumber, field, string, referenceTracker);
errorStorage.storeErrors(errors);
// Add value for entry into table
setValueForField(table, columnIndex, lineNumber, field, string, postgresText, transformedStrings);
// Increment column index.
Expand Down Expand Up @@ -521,10 +467,10 @@ public void setValueForField(Table table, int fieldIndex, int lineNumber, Field
// The Field objects throw exceptions to avoid passing the line number, table name etc. into them.
try {
// FIXME we need to set the transformed string element even when an error occurs.
// This means the validation and insertion step need to happen separately.
// or the errors should not be signaled with exceptions.
// Also, we should probably not be converting any GTFS field values.
// We should be saving it as-is in the database and converting upon load into our model objects.
// This means the validation and insertion step need to happen separately.
// or the errors should not be signaled with exceptions.
// Also, we should probably not be converting any GTFS field values.
// We should be saving it as-is in the database and converting upon load into our model objects.
if (postgresText) transformedStrings[fieldIndex + 1] = field.validateAndConvert(string);
else field.setParameter(insertStatement, fieldIndex + 2, string);
} catch (StorageException ex) {
Expand Down Expand Up @@ -562,24 +508,12 @@ private void setFieldToNull(boolean postgresText, String[] transformedStrings, i
*
* TODO add a test including SQL injection text (quote and semicolon)
*/
public String sanitize (String string) throws SQLException {
public static String sanitize (String string, SQLErrorStorage errorStorage) {
String clean = string.replaceAll("[^\\p{Alnum}_]", "");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't quite understand this regex. Can you make a comment for it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@abyrd, can you provide a description of this? (I can add your comments to code).

Copy link
Member

@abyrd abyrd Feb 20, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the undocumented regex, obviously these things should always have a comment. This removes all characters that are not alphanumeric or an underscore. The reasoning being that all GTFS field names are composed of only alphanumeric characters and underscores, and limiting all supplied strings to those characters should completely preclude SQL injection.

Since this is already merged, I think it would be OK to make a commit directly to dev to add this information in a comment.

if (!clean.equals(string)) {
LOG.warn("SQL identifier '{}' was sanitized to '{}'", string, clean);
if (errorStorage != null) {
errorStorage.storeError(NewGTFSError.forFeed(COLUMN_NAME_UNSAFE, string));
}
if (errorStorage != null) errorStorage.storeError(NewGTFSError.forFeed(COLUMN_NAME_UNSAFE, string));
}
return clean;
}

/**
 * Holds the sets of entity IDs seen so far during a load, together with the error storage
 * supplied at construction time.
 */
public class ReferenceTracker {
    public final SQLErrorStorage errorStorage;
    public final Set<String> transitIds;
    public final Set<String> transitIdsWithSequence;

    /**
     * @param errorStorage the error storage to keep alongside the ID sets
     */
    public ReferenceTracker(SQLErrorStorage errorStorage) {
        // Both ID sets start out empty.
        this.transitIds = new HashSet<>();
        this.transitIdsWithSequence = new HashSet<>();
        this.errorStorage = errorStorage;
    }
}
}
14 changes: 14 additions & 0 deletions src/main/java/com/conveyal/gtfs/loader/ReferenceTracker.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.conveyal.gtfs.loader;

import java.util.HashSet;
import java.util.Set;

/**
 * This class is used during feed loads to track the unique keys that are encountered in a GTFS feed. It has two sets of
 * strings that it tracks, one for single field keys (e.g., route_id or stop_id) and one for keys that are compound,
 * usually made up of a string ID with a sequence field (e.g., trip_id + stop_sequence for tracking unique stop times).
 */
public class ReferenceTracker {
    /** Keys made up of a single field (e.g., route_id or stop_id). */
    public final Set<String> transitIds = new HashSet<>();
    /** Compound keys, usually a string ID combined with a sequence field (e.g., trip_id + stop_sequence). */
    public final Set<String> transitIdsWithSequence = new HashSet<>();
}
Loading