Skip to content

Commit

Permalink
[#228] Tabular Module now uses adapters
Browse files Browse the repository at this point in the history
  • Loading branch information
blcham committed Jan 7, 2025
1 parent a1b4b52 commit 03a9272
Show file tree
Hide file tree
Showing 3 changed files with 268 additions and 42 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import cz.cvut.spipes.exception.SPipesException;
import cz.cvut.spipes.modules.annotations.SPipesModule;
import cz.cvut.spipes.modules.exception.MissingArgumentException;
import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
import cz.cvut.spipes.modules.handlers.ModeHandler;
Expand Down Expand Up @@ -197,30 +196,15 @@ ExecutionContext executeSelf() {
if (processTableAtIndex != 1) {
throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
}
tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
table.setLabel(tsvConvertor.getTableName(sourceResource));
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
streamReaderAdapter = new HTMLStreamReaderAdapter();
break;
case XLS:
case XLSM:
case XLSX:
if (processTableAtIndex == 0) {
throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
}
tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
table.setLabel(tsvConvertor.getTableName(sourceResource));
LOG.debug("Number of sheets: {}", numberOfSheets);
if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
numberOfSheets,
processTableAtIndex
);
throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
}
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
streamReaderAdapter = new XLSStreamReaderAdapter();
break;
default:
streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, delimiter);
Expand Down Expand Up @@ -320,14 +304,6 @@ ExecutionContext executeSelf() {
// 4.6.8.7 - else, if cellValue is not null
}
}
streamReaderAdapter.close();
} catch (MissingArgumentException e) {
if (ExecutionConfig.isExitOnError()) {
return getExecutionContext(inputModel, outputModel);
}
} catch (IOException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
tableSchema.setColumnsSet(new HashSet<>(outputColumns));
Expand All @@ -337,26 +313,32 @@ ExecutionContext executeSelf() {
em.persist(tableGroup);
em.merge(tableSchema);

if (tsvConvertor != null) {
List<Region> regions = tsvConvertor.getMergedRegions(originalSourceResource);

int cellsNum = 1;
for (Region region : regions) {
int firstCellInRegionNum = cellsNum;
for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
cell.setRow(tableSchema.createAboutUrl(i));
cell.setColumn(outputColumns.get(j).getUri().toString());
if (cellsNum != firstCellInRegionNum) {
cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
}
em.merge(cell);
cellsNum++;
List<Region> regions = streamReaderAdapter.getMergedRegions();

int cellsNum = 1;
for (Region region : regions) {
int firstCellInRegionNum = cellsNum;
for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
cell.setRow(tableSchema.createAboutUrl(i));
cell.setColumn(outputColumns.get(j).getUri().toString());
if (cellsNum != firstCellInRegionNum) {
cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
}
em.merge(cell);
cellsNum++;
}
}
}
streamReaderAdapter.close();
} catch (MissingArgumentException e) {
if (ExecutionConfig.isExitOnError()) {
return getExecutionContext(inputModel, outputModel);
}
} catch (IOException e) {
LOG.error("Error while reading file from resource uri {}", sourceResource, e);
}

em.getTransaction().commit();
Model persistedModel = JopaPersistenceUtils.getDataset(em).getDefaultModel();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.nio.charset.Charset;
import java.util.*;

/**
 * {@link StreamReaderAdapter} that reads rows from the first {@code <table>} element
 * of an HTML document.
 *
 * <p>The first {@code <tr>} is treated as the header; {@link #getNextRow()} returns the
 * remaining rows one by one. Cells carrying {@code colspan}/{@code rowspan} are expanded
 * so that every returned value sits at its grid column: the value is placed in the
 * top-left position of the merged area and all other covered positions are {@code null}.
 */
public class HTMLStreamReaderAdapter implements StreamReaderAdapter {

    /** Number written with an optional sign and an optional comma as decimal separator, e.g. {@code -12,5}. */
    private static final java.util.regex.Pattern COMMA_DECIMAL_NUMBER =
            java.util.regex.Pattern.compile("[-+]?[0-9]*\\,?[0-9]+");

    private Elements rows;
    private int currentIndex;
    private Element table;
    private String label;

    private List<Region> mergedRegions;

    /**
     * Row index -> (grid column -> value) of positions that are covered by a
     * {@code rowspan} which started in an earlier row. Only the keys are consulted
     * when a row is assembled.
     */
    private Map<Integer, Map<Integer, String>> mergedCells;

    /**
     * Parses the stream and selects the first {@code <table>} of the document.
     *
     * <p>NOTE(review): the document is always decoded as UTF-8 and {@code tableIndex},
     * {@code sourceResourceFormat}, {@code acceptInvalidQuoting}, {@code inputCharset}
     * and {@code sourceResource} are not used by this adapter - confirm this is intended.
     *
     * @throws IOException if the stream cannot be read or the document contains no table
     */
    @Override
    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat,
                           int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset,
                           StreamResource sourceResource) throws IOException {
        Document doc = Jsoup.parse(inputStream, "UTF-8", "");
        Element firstTable = doc.select("table").first();
        if (firstTable == null) {
            // Previously this surfaced as a NullPointerException on the next line.
            throw new IOException("HTML source does not contain any <table> element.");
        }
        this.table = firstTable;
        rows = firstTable.select("tr");
        currentIndex = 0;
        mergedRegions = extractMergedRegions(firstTable);
        mergedCells = new HashMap<>();
        label = firstTable.attr("data-name");
    }

    /**
     * Returns the text of every {@code th}/{@code td} cell of the first table row.
     *
     * @param skipHeader not used by this adapter
     */
    @Override
    public String[] getHeader(boolean skipHeader) throws IOException {
        Elements headerCells = rows.get(0).select("th, td");
        return headerCells.stream()
                .map(Element::text)
                .toArray(String[]::new);
    }

    private boolean hasNextRow() {
        return currentIndex < rows.size() - 1; // Skip header row
    }

    /**
     * Returns the next data row with merged cells expanded to grid positions.
     *
     * @return the cell values of the next row ({@code null} entries mark positions covered
     *         by a merged cell or left without a cell), or {@code null} when no row is left
     */
    @Override
    public List<String> getNextRow() {
        if (!hasNextRow()) {
            return null;
        }

        currentIndex++;
        Elements cells = rows.get(currentIndex).select("td, th");

        // Columns of this row that are occupied by a rowspan started in an earlier row.
        Map<Integer, String> coveredFromAbove = mergedCells.remove(currentIndex);
        if (coveredFromAbove == null) {
            coveredFromAbove = Collections.emptyMap();
        }

        List<String> row = new ArrayList<>();
        for (Element cell : cells) {
            // A cell is placed in the first grid column that is not covered from above.
            while (coveredFromAbove.containsKey(row.size())) {
                row.add(null);
            }
            int column = row.size();
            int colspan = parseSpan(cell, "colspan");
            int rowspan = parseSpan(cell, "rowspan");
            String cellValue = normaliseNumber(cell.text());

            row.add(cellValue);
            for (int i = 1; i < colspan; i++) {
                row.add(null);
            }

            // Remember every grid position this cell occupies in the following rows.
            for (int r = 1; r < rowspan; r++) {
                Map<Integer, String> covered =
                        mergedCells.computeIfAbsent(currentIndex + r, k -> new HashMap<>());
                for (int c = column; c < column + colspan; c++) {
                    covered.put(c, cellValue);
                }
            }
        }

        // Covered columns located after the last cell of the row still belong to the row.
        if (!coveredFromAbove.isEmpty()) {
            int lastCoveredColumn = Collections.max(coveredFromAbove.keySet());
            while (row.size() <= lastCoveredColumn) {
                row.add(null);
            }
        }

        return row;
    }

    /** Returns the merged regions computed when the adapter was initialised. */
    @Override
    public List<Region> getMergedRegions() {
        return mergedRegions;
    }

    /**
     * Collects one {@link Region} per cell that spans more than one row or column.
     * Row indices count every {@code <tr>} of the table starting at 0; column indices are
     * grid columns, i.e. they account for the width of preceding cells and for positions
     * occupied by rowspans of earlier rows.
     */
    private List<Region> extractMergedRegions(Element table) {
        List<Region> regions = new ArrayList<>();
        Elements tableRows = table.select("tr");
        Map<Integer, Set<Integer>> coveredByRow = new HashMap<>();

        for (int rowIndex = 0; rowIndex < tableRows.size(); rowIndex++) {
            Set<Integer> coveredFromAbove = coveredByRow.remove(rowIndex);
            if (coveredFromAbove == null) {
                coveredFromAbove = Collections.emptySet();
            }

            int column = 0;
            for (Element cell : tableRows.get(rowIndex).select("td, th")) {
                while (coveredFromAbove.contains(column)) {
                    column++;
                }
                int colspan = parseSpan(cell, "colspan");
                int rowspan = parseSpan(cell, "rowspan");
                if (colspan > 1 || rowspan > 1) {
                    regions.add(new Region(rowIndex, column, rowIndex + rowspan - 1, column + colspan - 1));
                }
                for (int r = 1; r < rowspan; r++) {
                    Set<Integer> covered = coveredByRow.computeIfAbsent(rowIndex + r, k -> new HashSet<>());
                    for (int c = column; c < column + colspan; c++) {
                        covered.add(c);
                    }
                }
                column += colspan;
            }
        }
        return regions;
    }

    /**
     * Reads a {@code colspan}/{@code rowspan} attribute. A missing, non-numeric or
     * non-positive value is treated as 1 so that malformed markup cannot abort the read.
     */
    private static int parseSpan(Element cell, String attribute) {
        String raw = cell.attr(attribute).trim();
        if (raw.isEmpty()) {
            return 1;
        }
        try {
            return Math.max(1, Integer.parseInt(raw));
        } catch (NumberFormatException ignored) {
            // Deliberately ignored: an unparsable span means the cell is not merged.
            return 1;
        }
    }

    /** Replaces the decimal comma by a dot when the whole value is a number such as {@code 1,5}. */
    private static String normaliseNumber(String cellValue) {
        if (cellValue != null && COMMA_DECIMAL_NUMBER.matcher(cellValue).matches()) {
            return cellValue.replace(",", ".");
        }
        return cellValue;
    }

    /** Returns the value of the table's {@code data-name} attribute (empty when absent). */
    @Override
    public String getSheetLabel() {
        return label;
    }

    /** Nothing to release: the document is fully parsed during initialisation. */
    @Override
    public void close() {
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.CellRangeAddress;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

/**
 * {@link StreamReaderAdapter} that reads rows from one sheet of an Excel workbook.
 *
 * <p>{@link ResourceFormat#XLS} sources are opened with {@link HSSFWorkbook}, every other
 * format with {@link XSSFWorkbook}. Cell values are rendered with a {@link DataFormatter};
 * values consisting of a number with a decimal comma are rewritten to use a dot.
 * The workbook is kept open until {@link #close()} is called.
 */
public class XLSStreamReaderAdapter implements StreamReaderAdapter {
    private static final Logger LOG = LoggerFactory.getLogger(XLSStreamReaderAdapter.class);

    /** Number written with an optional sign and an optional comma as decimal separator, e.g. {@code -12,5}. */
    private static final java.util.regex.Pattern COMMA_DECIMAL_NUMBER =
            java.util.regex.Pattern.compile("[-+]?[0-9]*\\,?[0-9]+");

    /** Shared formatter; creating one per row is unnecessary work. */
    private final DataFormatter formatter = new DataFormatter();

    private Workbook workbook;
    private Sheet sheet;
    private Iterator<org.apache.poi.ss.usermodel.Row> rowIterator;
    // NOTE(review): never read or written inside this class; kept because it is package-visible.
    Boolean skipHeader;

    /**
     * Opens the workbook and positions the adapter on the requested sheet.
     *
     * @param tableIndex 1-based index of the sheet to read
     * @throws SheetDoesntExistsException if {@code tableIndex} is not within {@code 1..numberOfSheets}
     * @throws IOException if the workbook cannot be read
     */
    @Override
    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
                           boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
        close(); // release a workbook left over from an earlier initialisation

        Workbook opened;
        if (sourceResourceFormat == ResourceFormat.XLS) {
            opened = new HSSFWorkbook(inputStream);
        } else {
            opened = new XSSFWorkbook(inputStream);
        }

        int numberOfSheets = opened.getNumberOfSheets();
        if ((tableIndex > numberOfSheets) || (tableIndex < 1)) {
            LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
                    numberOfSheets,
                    tableIndex
            );
            closeQuietly(opened); // do not leak the workbook on the failure path
            throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
        }

        workbook = opened;
        sheet = opened.getSheetAt(tableIndex - 1);
        rowIterator = sheet.iterator();
    }

    /**
     * Reads the header from the first row delivered by the sheet iterator and advances
     * the iterator past it.
     *
     * @param skipHeader when {@code true} nothing is consumed and {@code null} is returned
     * @return the header texts by column position (empty string for a missing cell), or
     *         {@code null} when {@code skipHeader} is set or the sheet has no rows
     *         (NOTE(review): {@code null} for an empty sheet replaces an exception - confirm
     *         callers treat it as "no data")
     */
    @Override
    public String[] getHeader(boolean skipHeader) throws IOException {
        if (skipHeader) {
            return null;
        }
        if (!rowIterator.hasNext()) {
            return null;
        }
        Row headerRow = rowIterator.next(); // move iterator to 2nd row
        int cellCount = Math.max(headerRow.getLastCellNum(), 0);
        String[] header = new String[cellCount];
        for (int i = 0; i < cellCount; i++) {
            header[i] = headerText(headerRow.getCell(i));
        }
        return header;
    }

    /**
     * Returns the next row of the sheet.
     *
     * <p>Cells are read by column position rather than through the row's cell iterator,
     * because the iterator skips cells that are not physically stored and would shift
     * the following values into the wrong columns.
     *
     * @return the formatted cell values ({@code null} for empty or missing cells), or
     *         {@code null} when the sheet has no more rows
     */
    @Override
    public List<String> getNextRow() {
        if (!rowIterator.hasNext()) {
            return null;
        }
        Row currentRow = rowIterator.next();
        int cellCount = Math.max(currentRow.getLastCellNum(), 0); // -1 when the row has no cells
        List<String> row = new ArrayList<>(cellCount);
        for (int i = 0; i < cellCount; i++) {
            // formatCellValue returns "" for a null (missing) cell
            String cellValue = fixNumberFormat(formatter.formatCellValue(currentRow.getCell(i)));
            row.add(cellValue.isEmpty() ? null : cellValue);
        }
        return row;
    }

    /** Converts the merged regions of the sheet to {@link Region} objects (0-based POI indices). */
    @Override
    public List<Region> getMergedRegions() {
        int regionCount = sheet.getNumMergedRegions();
        List<Region> regions = new ArrayList<>(regionCount);
        for (int i = 0; i < regionCount; i++) {
            CellRangeAddress region = sheet.getMergedRegion(i);
            regions.add(new Region(
                    region.getFirstRow(),
                    region.getFirstColumn(),
                    region.getLastRow(),
                    region.getLastColumn()
            ));
        }
        return regions;
    }

    /** Returns the name of the sheet being read. */
    @Override
    public String getSheetLabel() {
        return sheet.getSheetName();
    }

    /**
     * Replaces the decimal comma by a dot when the whole value is a number such as {@code 1,5}.
     *
     * @param cellValue formatted cell value, may be {@code null}
     * @return the normalised value, or the input unchanged when it is not such a number
     */
    public String fixNumberFormat(String cellValue) {
        //xls uses ',' as decimal separator, so we should convert it to '.'
        if (cellValue != null && COMMA_DECIMAL_NUMBER.matcher(cellValue).matches()) {
            cellValue = cellValue.replace(",", ".");
        }
        return cellValue;
    }

    /** Closes the workbook opened by {@code initialise}; a failure to close is logged, not thrown. */
    @Override
    public void close() {
        Workbook toClose = workbook;
        workbook = null;
        closeQuietly(toClose);
    }

    /**
     * Text of a header cell. Textual cells keep their string value; any other cell
     * (e.g. numeric) is rendered by the formatter instead of failing in
     * {@code getStringCellValue()}.
     */
    private String headerText(org.apache.poi.ss.usermodel.Cell cell) {
        if (cell == null) {
            return "";
        }
        org.apache.poi.ss.usermodel.CellType type = cell.getCellType();
        if (type == org.apache.poi.ss.usermodel.CellType.FORMULA) {
            type = cell.getCachedFormulaResultType();
        }
        if (type == org.apache.poi.ss.usermodel.CellType.STRING
                || type == org.apache.poi.ss.usermodel.CellType.BLANK) {
            return cell.getStringCellValue();
        }
        return formatter.formatCellValue(cell);
    }

    private static void closeQuietly(Workbook toClose) {
        if (toClose == null) {
            return;
        }
        try {
            toClose.close();
        } catch (IOException e) {
            LOG.warn("Failed to close workbook.", e);
        }
    }
}

0 comments on commit 03a9272

Please sign in to comment.