Skip to content

Commit

Permalink
[#228] HTML and XLS files are now being processed directly
Browse files Browse the repository at this point in the history
  • Loading branch information
Evgenii Grigorev authored and blcham committed Jan 6, 2025
1 parent 047fec9 commit a4077a9
Show file tree
Hide file tree
Showing 5 changed files with 444 additions and 106 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import cz.cvut.spipes.exception.ResourceNotUniqueException;
import cz.cvut.spipes.exception.SPipesException;
import cz.cvut.spipes.modules.annotations.SPipesModule;
import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
import cz.cvut.spipes.modules.handlers.ModeHandler;
Expand All @@ -31,6 +30,7 @@
import org.supercsv.io.ICsvListReader;
import org.supercsv.prefs.CsvPreference;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
Expand Down Expand Up @@ -113,12 +113,12 @@ public class TabularModule extends AnnotatedAbstractModule {
private final Property P_PROCESS_TABLE_AT_INDEX = getSpecificParameter("process-table-at-index");

@Parameter(iri = SML.replace, comment = "Specifies whether a module should overwrite triples" +
" from its predecessors. When set to true (default is false), it prevents" +
" passing through triples from the predecessors.")
" from its predecessors. When set to true (default is false), it prevents" +
" passing through triples from the predecessors.")
private boolean isReplace = false;

@Parameter(iri = PARAM_URL_PREFIX + "source-resource-uri", comment = "URI of resource" +
" that represent tabular data (e.g. resource representing CSV file).")
" that represent tabular data (e.g. resource representing CSV file).")
private StreamResource sourceResource;

@Parameter(iri = PARAM_URL_PREFIX + "delimiter", comment = "Column delimiter. Default value is comma ','.")
Expand Down Expand Up @@ -189,6 +189,8 @@ ExecutionContext executeSelf() {

StreamResource originalSourceResource = sourceResource;
TSVConvertor tsvConvertor = null;
StreamReaderAdapter streamReaderAdapter = new XLSStreamReaderAdapter();
CsvPreference csvPreference = null;

switch (sourceResourceFormat) {
case HTML:
Expand All @@ -198,30 +200,22 @@ ExecutionContext executeSelf() {
if (processTableAtIndex != 1) {
throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
}
tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
table.setLabel(tsvConvertor.getTableName(sourceResource));
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
streamReaderAdapter = new HTMLStreamReaderAdapter();
break;
case XLS:
case XLSM:
case XLSX:
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
}
tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
table.setLabel(tsvConvertor.getTableName(sourceResource));
LOG.debug("Number of sheets: {}", numberOfSheets);
if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
numberOfSheets,
processTableAtIndex
);
throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
}
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
streamReaderAdapter = new XLSStreamReaderAdapter();
break;
default:
csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();
streamReaderAdapter = new CSVStreamReaderAdapter(csvPreference);
break;
}

Expand All @@ -236,33 +230,26 @@ ExecutionContext executeSelf() {
List<Column> outputColumns = new ArrayList<>();
List<Statement> rowStatements = new ArrayList<>();

CsvPreference csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();

try {
ICsvListReader listReader = getCsvListReader(csvPreference);

if (listReader == null) {
logMissingQuoteError();
return getExecutionContext(inputModel, outputModel);
}
streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()),
sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource);
String[] header = streamReaderAdapter.getHeader(skipHeader);
Set<String> columnNames = new HashSet<>();

String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader)
if (streamReaderAdapter.getSheetLabel() != null)
table.setLabel(streamReaderAdapter.getSheetLabel());

if (header == null) {
LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri());
return getExecutionContext(inputModel, outputModel);
}
Set<String> columnNames = new HashSet<>();

TableSchema inputTableSchema = getTableSchema(em);
hasInputSchema = hasInputSchema(inputTableSchema);

if (skipHeader) {
header = getHeaderFromSchema(inputModel, header, hasInputSchema);
listReader = new CsvListReader(getReader(), csvPreference);
//streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
} else if (hasInputSchema) {
header = getHeaderFromSchema(inputModel, header, true);
}
Expand All @@ -281,17 +268,16 @@ ExecutionContext executeSelf() {

tableSchema.setAboutUrl(schemaColumn, sourceResource.getUri());
schemaColumn.setProperty(
dataPrefix,
sourceResource.getUri(),
hasInputSchema ? tableSchema.getColumn(columnName) : null);
dataPrefix,
sourceResource.getUri(),
hasInputSchema ? tableSchema.getColumn(columnName) : null);
schemaColumn.setTitle(columnTitle);
if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName);
}

List<String> row;
int rowNumber = 0;
//for each row
while ((row = listReader.read()) != null) {
List<String> row;
while ((row = streamReaderAdapter.getNextRow()) != null) {
rowNumber++;
// 4.6.1 and 4.6.3
Row r = new Row();
Expand Down Expand Up @@ -329,37 +315,35 @@ ExecutionContext executeSelf() {
// 4.6.8.7 - else, if cellValue is not null
}
}
listReader.close();
} catch (IOException | MissingArgumentException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));
tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));

em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
em.getTransaction().begin();
em.persist(tableGroup);
em.merge(tableSchema);
em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
em.getTransaction().begin();
em.persist(tableGroup);
em.merge(tableSchema);

if (tsvConvertor != null) {
List<Region> regions = tsvConvertor.getMergedRegions(originalSourceResource);
List<Region> regions = streamReaderAdapter.getMergedRegions();

int cellsNum = 1;
for (Region region : regions) {
int firstCellInRegionNum = cellsNum;
for(int i = region.getFirstRow();i <= region.getLastRow();i++){
for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) {
Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum));
for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
cell.setRow(tableSchema.createAboutUrl(i));
cell.setColumn(outputColumns.get(j).getUri().toString());
if(cellsNum != firstCellInRegionNum)
cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum));
if (cellsNum != firstCellInRegionNum) {
cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
}
em.merge(cell);
cellsNum++;
}
}
}
streamReaderAdapter.close();
} catch (IOException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

em.getTransaction().commit();
Expand All @@ -381,41 +365,31 @@ private String getValueFromRow(List<String> row, int index, int expectedRowLengt
StringBuilder record = new StringBuilder(recordDelimiter);
for (int i = 0; i < row.size(); i++) {
record
.append(i)
.append(":")
.append(row.get(i))
.append(recordDelimiter);
.append(i)
.append(":")
.append(row.get(i))
.append(recordDelimiter);
}
LOG.error("Reading input file failed when reading record #{} (may not reflect the line #).\n" +
" It was expected that the current record contains {} values" +
", but {}. element was not retrieved before whole record was processed.\n" +
"The problematic record: {}",
currentRecordNumber,
expectedRowLength,
index+1,
record
" It was expected that the current record contains {} values" +
", but {}. element was not retrieved before whole record was processed.\n" +
"The problematic record: {}",
currentRecordNumber,
expectedRowLength,
index+1,
record
);
throw new SPipesException("Reading input file failed.", e);
}
}

/**
 * Builds the CSV list reader over the current source content.
 *
 * <p>When {@code acceptInvalidQuoting} is off, a plain {@link CsvListReader} is returned.
 * When it is on, the reader is backed by an {@link InvalidQuotingTokenizer}, unless
 * {@code getQuote()} yields the NUL character, in which case no reader is created.
 *
 * @param csvPreference preferences handed to the reader (and to the tokenizer, if used)
 * @return the reader, or {@code null} when invalid quoting is accepted but the quote
 *         character is NUL
 */
private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
    if (!acceptInvalidQuoting) {
        return new CsvListReader(getReader(), csvPreference);
    }
    if (getQuote() == '\0') {
        return null;
    }
    InvalidQuotingTokenizer tolerantTokenizer = new InvalidQuotingTokenizer(getReader(), csvPreference);
    return new CsvListReader(tolerantTokenizer, csvPreference);
}

private Statement createRowResource(String cellValue, int rowNumber, Column column) {
Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));

return ResourceFactory.createStatement(
rowResource,
ResourceFactory.createProperty(column.getPropertyUrl()),
ResourceFactory.createPlainLiteral(cellValue));
rowResource,
ResourceFactory.createProperty(column.getPropertyUrl()),
ResourceFactory.createPlainLiteral(cellValue));
}

private boolean hasInputSchema(TableSchema inputTableSchema) {
Expand All @@ -429,11 +403,11 @@ private boolean hasInputSchema(TableSchema inputTableSchema) {

private TableSchema getTableSchema(EntityManager em) {
TypedQuery<TableSchema> query = em.createNativeQuery(
"PREFIX csvw: <http://www.w3.org/ns/csvw#>\n" +
"SELECT ?t WHERE { \n" +
"?t a csvw:TableSchema. \n" +
"}",
TableSchema.class
"PREFIX csvw: <http://www.w3.org/ns/csvw#>\n" +
"SELECT ?t WHERE { \n" +
"?t a csvw:TableSchema. \n" +
"}",
TableSchema.class
);

int tableSchemaCount = query.getResultList().size();
Expand All @@ -452,14 +426,14 @@ private TableSchema getTableSchema(EntityManager em) {

private void throwNotUniqueException(Column column, String columnTitle, String columnName) {
throw new ResourceNotUniqueException(
String.format("Unable to create value of property %s due to collision. " +
"Both column titles '%s' and '%s' are normalized to '%s' " +
"and thus would refer to the same property url <%s>.",
CSVW.propertyUrl,
columnTitle,
column.getTitle(),
columnName,
column.getPropertyUrl()));
String.format("Unable to create value of property %s due to collision. " +
"Both column titles '%s' and '%s' are normalized to '%s' " +
"and thus would refer to the same property url <%s>.",
CSVW.propertyUrl,
columnTitle,
column.getTitle(),
columnName,
column.getPropertyUrl()));
}

private ExecutionContext getExecutionContext(Model inputModel, Model outputModel) {
Expand All @@ -473,12 +447,12 @@ private ExecutionContext getExecutionContext(Model inputModel, Model outputModel
@Override
public void loadManualConfiguration() {
sourceResourceFormat = ResourceFormat.fromString(
getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue())
getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue())
);
delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier(sourceResourceFormat));
quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(sourceResourceFormat));
outputMode = Mode.fromResource(
getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource())
getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource())
);
setInputCharset(delimiter);
}
Expand Down Expand Up @@ -514,7 +488,7 @@ private Supplier<Character> getDefaultQuoteCharacterSupplier(ResourceFormat sour
if (sourceResourceFormat == ResourceFormat.CSV) {
return () -> {
LOG.debug("Quote character not specified, using double-quote as default value" +
" to be compliant with RFC 4180 (CSV)");
" to be compliant with RFC 4180 (CSV)");
return '"';
};
}
Expand All @@ -524,8 +498,8 @@ private Supplier<Character> getDefaultQuoteCharacterSupplier(ResourceFormat sour
private char getPropertyValue(Property property,
Supplier<Character> defaultValueSupplier) {
return Optional.ofNullable(getPropertyValue(property))
.map(n -> n.asLiteral().getChar())
.orElseGet(defaultValueSupplier);
.map(n -> n.asLiteral().getChar())
.orElseGet(defaultValueSupplier);
}

@Override
Expand Down Expand Up @@ -587,10 +561,6 @@ private String normalize(String label) {
return label.trim().replaceAll("[^\\w]", "_");
}

/**
 * Decodes the content of {@code sourceResource} with {@code inputCharset} and exposes
 * the resulting text as a reader.
 *
 * @return a new reader over the decoded source content
 */
private Reader getReader() {
    String decodedContent = new String(sourceResource.getContent(), inputCharset);
    return new StringReader(decodedContent);
}

@NotNull
private StreamResource getResourceByUri(@NotNull String resourceUri) {

Expand Down Expand Up @@ -624,7 +594,7 @@ public int getDelimiter() {

public void setDelimiter(int delimiter) {
if ((sourceResourceFormat == ResourceFormat.CSV && delimiter != ',') ||
(sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) {
(sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) {
throw new SpecificationNonComplianceException(sourceResourceFormat, delimiter);
}
this.delimiter = delimiter;
Expand Down
Loading

0 comments on commit a4077a9

Please sign in to comment.