Skip to content

Commit

Permalink
[#228] Tabular Module now uses adapters
Browse files Browse the repository at this point in the history
  • Loading branch information
Evgenii Grigorev committed Jan 7, 2025
1 parent b07df0f commit ea2e584
Showing 1 changed file with 26 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import cz.cvut.spipes.modules.annotations.SPipesModule;
import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
import cz.cvut.spipes.modules.handlers.ModeHandler;
import cz.cvut.spipes.modules.model.*;
import cz.cvut.spipes.modules.util.*;
import cz.cvut.spipes.registry.StreamResource;
Expand Down Expand Up @@ -141,7 +140,7 @@ public class TabularModule extends AnnotatedAbstractModule {
private int processTableAtIndex = 0;

// TODO - revise comment
@Parameter(iri = PARAM_URL_PREFIX + "output-mode", comment = "Output mode. Default is standard-mode('http://onto.fel.cvut.cz/ontologies/lib/module/tabular/standard-mode)", handler = ModeHandler.class)
@Parameter(iri = PARAM_URL_PREFIX + "output-mode", comment = "Output mode. Default is standard-mode('http://onto.fel.cvut.cz/ontologies/lib/module/tabular/standard-mode)")
private Mode outputMode;

//:source-resource-format
Expand Down Expand Up @@ -189,6 +188,8 @@ ExecutionContext executeSelf() {

StreamResource originalSourceResource = sourceResource;
TSVConvertor tsvConvertor = null;
StreamReaderAdapter streamReaderAdapter;
CsvPreference csvPreference = null;

switch (sourceResourceFormat) {
case HTML:
Expand All @@ -198,30 +199,22 @@ ExecutionContext executeSelf() {
if (processTableAtIndex != 1) {
throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
}
tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
table.setLabel(tsvConvertor.getTableName(sourceResource));
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
streamReaderAdapter = new HTMLStreamReaderAdapter();
break;
case XLS:
case XLSM:
case XLSX:
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
}
tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
table.setLabel(tsvConvertor.getTableName(sourceResource));
LOG.debug("Number of sheets: {}", numberOfSheets);
if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
numberOfSheets,
processTableAtIndex
);
throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
}
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');
streamReaderAdapter = new XLSStreamReaderAdapter();
break;
default:
csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();
streamReaderAdapter = new CSVStreamReaderAdapter(csvPreference);
break;
}

Expand All @@ -236,33 +229,25 @@ ExecutionContext executeSelf() {
List<Column> outputColumns = new ArrayList<>();
List<Statement> rowStatements = new ArrayList<>();

CsvPreference csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();

try {
ICsvListReader listReader = getCsvListReader(csvPreference);

if (listReader == null) {
logMissingQuoteError();
return getExecutionContext(inputModel, outputModel);
}
streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()),
sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource);
String[] header = streamReaderAdapter.getHeader(skipHeader);
Set<String> columnNames = new HashSet<>();

String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader)
if (streamReaderAdapter.getSheetLabel() != null)
table.setLabel(streamReaderAdapter.getSheetLabel());

if (header == null) {
LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri());
return getExecutionContext(inputModel, outputModel);
}
Set<String> columnNames = new HashSet<>();

TableSchema inputTableSchema = getTableSchema(em);
hasInputSchema = hasInputSchema(inputTableSchema);

if (skipHeader) {
header = getHeaderFromSchema(inputModel, header, hasInputSchema);
listReader = new CsvListReader(getReader(), csvPreference);
} else if (hasInputSchema) {
header = getHeaderFromSchema(inputModel, header, true);
}
Expand All @@ -288,10 +273,9 @@ ExecutionContext executeSelf() {
if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName);
}

List<String> row;
int rowNumber = 0;
//for each row
while ((row = listReader.read()) != null) {
List<String> row;
while ((row = streamReaderAdapter.getNextRow()) != null) {
rowNumber++;
// 4.6.1 and 4.6.3
Row r = new Row();
Expand Down Expand Up @@ -329,21 +313,15 @@ ExecutionContext executeSelf() {
// 4.6.8.7 - else, if cellValue is not null
}
}
listReader.close();
} catch (IOException | MissingArgumentException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));
tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));

em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel);
em.getTransaction().begin();
em.persist(tableGroup);
em.merge(tableSchema);

if (tsvConvertor != null) {
List<Region> regions = tsvConvertor.getMergedRegions(originalSourceResource);
List<Region> regions = streamReaderAdapter.getMergedRegions();

int cellsNum = 1;
for (Region region : regions) {
Expand All @@ -361,6 +339,9 @@ ExecutionContext executeSelf() {
}
}
}
streamReaderAdapter.close();
} catch (IOException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

em.getTransaction().commit();
Expand Down Expand Up @@ -400,16 +381,6 @@ private String getValueFromRow(List<String> row, int index, int expectedRowLengt
}
}

/**
 * Builds the CSV list reader used to parse the source resource.
 *
 * <p>When {@code acceptInvalidQuoting} is not set, a plain {@link CsvListReader}
 * over {@link #getReader()} is returned. When it is set, the reader is backed by
 * an {@link InvalidQuotingTokenizer}, unless {@code getQuote()} yields the NUL
 * character, in which case no reader is created.
 *
 * @param csvPreference CSV preferences handed to the reader (and to the tokenizer, if one is used)
 * @return the configured reader, or {@code null} when invalid quoting is accepted
 *         but {@code getQuote()} returns {@code '\0'}
 */
private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
    // Common case first: strict quoting needs no custom tokenizer.
    if (!acceptInvalidQuoting) {
        return new CsvListReader(getReader(), csvPreference);
    }

    // Callers are expected to check for null and report the missing quote themselves.
    if (getQuote() == '\0') {
        return null;
    }

    InvalidQuotingTokenizer lenientTokenizer = new InvalidQuotingTokenizer(getReader(), csvPreference);
    return new CsvListReader(lenientTokenizer, csvPreference);
}

private Statement createRowResource(String cellValue, int rowNumber, Column column) {
Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber));

Expand Down Expand Up @@ -588,10 +559,6 @@ private String normalize(String label) {
return label.trim().replaceAll("[^\\w]", "_");
}

/**
 * Decodes the bytes of {@code sourceResource} with {@code inputCharset} and
 * exposes the resulting text through a fresh {@link Reader}.
 *
 * @return a new reader over the decoded content of the source resource
 */
private Reader getReader() {
    final byte[] rawContent = sourceResource.getContent();
    final String decodedContent = new String(rawContent, inputCharset);
    return new StringReader(decodedContent);
}

@NotNull
private StreamResource getResourceByUri(@NotNull String resourceUri) {

Expand Down

0 comments on commit ea2e584

Please sign in to comment.