[#228] Tabular Module now uses adapters

kbss-cvut · Jan 7, 2025 · 03a9272 · 03a9272
1 parent a1b4b52
commit 03a9272
Show file tree

Hide file tree

Showing 3 changed files with 268 additions and 42 deletions.
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -14,7 +14,6 @@
 import cz.cvut.spipes.exception.SPipesException;
 import cz.cvut.spipes.modules.annotations.SPipesModule;
 import cz.cvut.spipes.modules.exception.MissingArgumentException;
-import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
 import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException;
 import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException;
 import cz.cvut.spipes.modules.handlers.ModeHandler;
@@ -197,30 +196,15 @@ ExecutionContext executeSelf() {
                 if (processTableAtIndex != 1) {
                     throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
                 }
-                tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
-                table.setLabel(tsvConvertor.getTableName(sourceResource));
-                setSourceResource(tsvConvertor.convertToTSV(sourceResource));
-                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
+                streamReaderAdapter = new HTMLStreamReaderAdapter();
                 break;
             case XLS:
             case XLSM:
             case XLSX:
                 if (processTableAtIndex == 0) {
                     throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing.");
                 }
-                tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat);
-                int numberOfSheets = tsvConvertor.getTablesCount(sourceResource);
-                table.setLabel(tsvConvertor.getTableName(sourceResource));
-                LOG.debug("Number of sheets: {}", numberOfSheets);
-                if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) {
-                    LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
-                        numberOfSheets,
-                            processTableAtIndex
-                    );
-                    throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
-                }
-                setSourceResource(tsvConvertor.convertToTSV(sourceResource));
-                streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, '\t');
+                streamReaderAdapter = new XLSStreamReaderAdapter();
                 break;
             default:
                 streamReaderAdapter = new CSVStreamReaderAdapter(quoteCharacter, delimiter);
@@ -320,14 +304,6 @@ ExecutionContext executeSelf() {
                     // 4.6.8.7 - else, if cellValue is not null
                 }
             }
-            streamReaderAdapter.close();
-        } catch (MissingArgumentException e) {
-            if (ExecutionConfig.isExitOnError()) {
-                return getExecutionContext(inputModel, outputModel);
-            }
-        } catch (IOException e) {
-            LOG.error("Error while reading file from resource uri {}", sourceResource, e);
-        }
 
         tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri());
         tableSchema.setColumnsSet(new HashSet<>(outputColumns));
@@ -337,26 +313,32 @@ ExecutionContext executeSelf() {
         em.persist(tableGroup);
         em.merge(tableSchema);
 
-        if (tsvConvertor != null) {
-            List<Region> regions =  tsvConvertor.getMergedRegions(originalSourceResource);
-
-            int cellsNum = 1;
-            for (Region region : regions) {
-                int firstCellInRegionNum = cellsNum;
-                for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
-                    for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
-                        Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
-                        cell.setRow(tableSchema.createAboutUrl(i));
-                        cell.setColumn(outputColumns.get(j).getUri().toString());
-                        if (cellsNum != firstCellInRegionNum) {
-                            cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
-                        }
-                        em.merge(cell);
-                        cellsNum++;
+        List<Region> regions = streamReaderAdapter.getMergedRegions();
+
+        int cellsNum = 1;
+        for (Region region : regions) {
+            int firstCellInRegionNum = cellsNum;
+            for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) {
+                for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) {
+                    Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum);
+                    cell.setRow(tableSchema.createAboutUrl(i));
+                    cell.setColumn(outputColumns.get(j).getUri().toString());
+                    if (cellsNum != firstCellInRegionNum) {
+                        cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum);
                     }
+                    em.merge(cell);
+                    cellsNum++;
                 }
             }
         }
+        streamReaderAdapter.close();
+        } catch (MissingArgumentException e) {
+                if (ExecutionConfig.isExitOnError()) {
+                    return getExecutionContext(inputModel, outputModel);
+                }
+        } catch (IOException e) {
+            LOG.error("Error while reading file from resource uri {}", sourceResource, e);
+        }
 
         em.getTransaction().commit();
         Model persistedModel = JopaPersistenceUtils.getDataset(em).getDefaultModel();

diff --git a/...les/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java b/...les/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
@@ -0,0 +1,130 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.model.Region;
+import cz.cvut.spipes.registry.StreamResource;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.util.*;
+
+/**
+ * {@link StreamReaderAdapter} that serves the rows of the first {@code <table>} element found in an
+ * HTML document.
+ *
+ * <p>Row 0 of the table is treated as the header; {@link #getNextRow()} serves the rows after it.
+ * Cells carrying {@code colspan}/{@code rowspan} attributes are reported through
+ * {@link #getMergedRegions()}, and every grid position covered by a span (other than its top-left
+ * cell) is returned as {@code null} so that values stay aligned with their real column.
+ */
+public class HTMLStreamReaderAdapter implements StreamReaderAdapter {
+    /** Integer or decimal number written with a decimal comma, e.g. {@code 12,5}. */
+    private static final java.util.regex.Pattern DECIMAL_COMMA_NUMBER =
+            java.util.regex.Pattern.compile("[-+]?[0-9]*\\,?[0-9]+");
+
+    /** Upper bound for {@code colspan}; the HTML Standard clamps the attribute to this value. */
+    private static final int MAX_COLSPAN = 1000;
+
+    private Elements rows;
+    private int currentIndex;
+    private Element table;
+    private String label;
+
+    private List<Region> mergedRegions;
+    /** Row index -> columns of that row which are covered by a rowspan started in an earlier row. */
+    private Map<Integer, SortedSet<Integer>> coveredColumns;
+
+    /**
+     * Parses the document and prepares iteration over the first table in it.
+     *
+     * <p>NOTE(review): {@code tableIndex}, {@code inputCharset}, {@code sourceResourceFormat},
+     * {@code acceptInvalidQuoting} and {@code sourceResource} are not used here; the document is
+     * always decoded as UTF-8 and always the first table is read.
+     *
+     * @param inputStream stream with the HTML document
+     * @throws IOException if the document cannot be read or contains no {@code <table>} element
+     */
+    @Override
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat,
+                           int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
+        Document doc = Jsoup.parse(inputStream, "UTF-8", "");
+        Element firstTable = doc.select("table").first();
+        if (firstTable == null) {
+            // Fail with a descriptive error instead of a NullPointerException further down.
+            throw new IOException("No <table> element found in the HTML source.");
+        }
+        table = firstTable;
+        rows = firstTable.select("tr");
+        currentIndex = 0;
+        mergedRegions = new ArrayList<>();
+        coveredColumns = new HashMap<>();
+        analyseSpans();
+        label = firstTable.attr("data-name");
+    }
+
+
+    /**
+     * Returns the text of every {@code th}/{@code td} cell of the first table row.
+     *
+     * @param skipHeader not used by this adapter
+     * @return header cell texts in document order
+     * @throws IOException if the table has no rows
+     */
+    @Override
+    public String[] getHeader(boolean skipHeader) throws IOException {
+        if (rows.isEmpty()) {
+            throw new IOException("The HTML table contains no rows, so no header can be read.");
+        }
+        Elements headerCells = rows.get(0).select("th, td");
+        return headerCells.stream()
+                .map(Element::text)
+                .toArray(String[]::new);
+    }
+
+    private boolean hasNextRow() {
+        return currentIndex < rows.size() - 1; // Skip header row
+    }
+
+    /**
+     * Returns the next data row, or {@code null} when all rows have been served.
+     *
+     * <p>The list is indexed by real table column: the top-left cell of a span carries the value,
+     * every other position covered by a colspan or by a rowspan from an earlier row is {@code null}.
+     * Numbers written with a decimal comma are rewritten to use a decimal point.
+     */
+    @Override
+    public List<String> getNextRow() {
+        if (!hasNextRow()) {
+            return null;
+        }
+
+        currentIndex++;
+        SortedSet<Integer> covered = coveredColumns.getOrDefault(currentIndex, Collections.emptySortedSet());
+        List<String> row = new ArrayList<>();
+
+        for (Element cell : rows.get(currentIndex).select("td, th")) {
+            // row.size() is the next free column; step over columns owned by a rowspan from above.
+            while (covered.contains(row.size())) {
+                row.add(null);
+            }
+            row.add(normaliseNumber(cell.text()));
+
+            int colspan = spanOf(cell, "colspan", MAX_COLSPAN);
+            for (int i = 1; i < colspan; i++) {
+                row.add(null);
+            }
+        }
+
+        // Columns covered from above that lie to the right of the last cell of this row.
+        for (int column : covered.tailSet(row.size())) {
+            while (row.size() <= column) {
+                row.add(null);
+            }
+        }
+
+        return row;
+    }
+
+    /**
+     * Returns the merged regions detected in {@code initialise}, one per cell whose colspan or
+     * rowspan is greater than 1. Row indices count every {@code tr} of the table, header included.
+     */
+    @Override
+    public List<Region> getMergedRegions() {
+        return mergedRegions;
+    }
+
+    /**
+     * Walks the table once, resolving the real column of every cell, and fills both
+     * {@link #mergedRegions} and {@link #coveredColumns}.
+     *
+     * <p>A cell's column is not its position inside the {@code tr}: earlier colspans in the same row
+     * and rowspans reaching down from previous rows shift it to the right.
+     */
+    private void analyseSpans() {
+        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
+            SortedSet<Integer> covered = coveredColumns.getOrDefault(rowIndex, Collections.emptySortedSet());
+            int column = 0;
+            for (Element cell : rows.get(rowIndex).select("td, th")) {
+                while (covered.contains(column)) {
+                    column++;
+                }
+                int colspan = spanOf(cell, "colspan", MAX_COLSPAN);
+                // A rowspan cannot reach below the last row of the table.
+                int rowspan = spanOf(cell, "rowspan", rows.size() - rowIndex);
+
+                if (colspan > 1 || rowspan > 1) {
+                    mergedRegions.add(new Region(rowIndex, column, rowIndex + rowspan - 1, column + colspan - 1));
+                }
+                for (int below = rowIndex + 1; below < rowIndex + rowspan; below++) {
+                    SortedSet<Integer> columnsBelow = coveredColumns.computeIfAbsent(below, k -> new TreeSet<>());
+                    for (int c = column; c < column + colspan; c++) {
+                        columnsBelow.add(c);
+                    }
+                }
+                column += colspan;
+            }
+        }
+    }
+
+    /**
+     * Reads a span attribute, falling back to 1 when it is missing, malformed or below 1, and
+     * limiting it to {@code max}.
+     */
+    private static int spanOf(Element cell, String attribute, int max) {
+        String raw = cell.attr(attribute).trim();
+        if (raw.isEmpty()) {
+            return 1;
+        }
+        try {
+            return Math.max(1, Math.min(Integer.parseInt(raw), max));
+        } catch (NumberFormatException e) {
+            // Malformed markup: treat the cell as not spanning.
+            return 1;
+        }
+    }
+
+    /** Replaces the decimal comma with a decimal point when the whole value is a number. */
+    private static String normaliseNumber(String cellValue) {
+        if (cellValue != null && DECIMAL_COMMA_NUMBER.matcher(cellValue).matches()) {
+            return cellValue.replace(",", ".");
+        }
+        return cellValue;
+    }
+
+    /** Returns the value of the table's {@code data-name} attribute (empty string when absent). */
+    @Override
+    public String getSheetLabel(){
+        return label;
+    }
+
+    /** Nothing to release: the document is fully parsed into memory by {@code initialise}. */
+    @Override
+    public void close() {
+    }
+}
diff --git a/...ules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java b/...ules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java
@@ -0,0 +1,114 @@
+package cz.cvut.spipes.modules.util;
+
+import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
+import cz.cvut.spipes.modules.model.Region;
+import cz.cvut.spipes.registry.StreamResource;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.ss.usermodel.DataFormatter;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.ss.util.CellRangeAddress;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+/**
+ * {@link StreamReaderAdapter} that serves the rows of one sheet of an Excel workbook
+ * (HSSF for {@code XLS}, XSSF for every other format passed in).
+ *
+ * <p>Rows are returned indexed by real sheet column: cells that are missing or blank are
+ * {@code null}. The workbook opened by {@code initialise} is released by {@link #close()}.
+ */
+public class XLSStreamReaderAdapter implements StreamReaderAdapter {
+    private static final Logger LOG = LoggerFactory.getLogger(XLSStreamReaderAdapter.class);
+
+    /** Integer or decimal number written with a decimal comma, e.g. {@code 12,5}. */
+    private static final java.util.regex.Pattern DECIMAL_COMMA_NUMBER =
+            java.util.regex.Pattern.compile("[-+]?[0-9]*\\,?[0-9]+");
+
+    /** Reused for every cell instead of being re-created per row. */
+    private final DataFormatter formatter = new DataFormatter();
+
+    private Workbook workbook;
+    private Sheet sheet;
+    private Iterator<org.apache.poi.ss.usermodel.Row> rowIterator;
+    // Not read or written inside this class; kept because it is package-visible.
+    Boolean skipHeader;
+
+    /**
+     * Opens the workbook and positions the adapter at the start of the requested sheet.
+     *
+     * @param inputStream          stream with the workbook content
+     * @param sourceResourceFormat {@code XLS} selects HSSF, anything else XSSF
+     * @param tableIndex           1-based index of the sheet to read
+     * @throws IOException                if the workbook cannot be read
+     * @throws SheetDoesntExistsException if {@code tableIndex} is outside {@code 1..numberOfSheets}
+     */
+    @Override
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex,
+                           boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException {
+        // Release a workbook left over from a previous initialise call.
+        closeWorkbookQuietly();
+
+        if (sourceResourceFormat == ResourceFormat.XLS) {
+            workbook = new HSSFWorkbook(inputStream);
+        } else {
+            workbook = new XSSFWorkbook(inputStream);
+        }
+
+        int numberOfSheets = workbook.getNumberOfSheets();
+        if ((tableIndex > numberOfSheets) || (tableIndex < 1)) {
+            LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
+                    numberOfSheets,
+                    tableIndex
+            );
+            // Do not leak the workbook when the request cannot be served.
+            closeWorkbookQuietly();
+            throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
+        }
+        sheet = workbook.getSheetAt(tableIndex - 1);
+        rowIterator = sheet.iterator();
+    }
+
+    /**
+     * Returns the header taken from sheet row 0 and advances the row iterator past it.
+     *
+     * @param skipHeader when {@code true} nothing is consumed and {@code null} is returned
+     * @return header texts indexed by sheet column (missing cells yield an empty string),
+     *         or {@code null} when {@code skipHeader} is set
+     * @throws IOException if the sheet has no row 0
+     */
+    @Override
+    public String[] getHeader(boolean skipHeader) throws IOException {
+        if (skipHeader) {
+            return null;
+        }
+        Row headerRow = sheet.getRow(0);
+        if (headerRow == null) {
+            throw new IOException("Sheet '" + sheet.getSheetName() + "' has no header row.");
+        }
+        if (rowIterator.hasNext()) {
+            rowIterator.next(); // move iterator to 2nd row
+        }
+        // Index-based walk: iterating the Row itself skips undefined cells and would shift columns.
+        int cellCount = Math.max(headerRow.getLastCellNum(), 0);
+        String[] header = new String[cellCount];
+        for (int i = 0; i < cellCount; i++) {
+            header[i] = headerText(headerRow.getCell(i));
+        }
+        return header;
+    }
+
+    /**
+     * Returns the next row of the sheet, or {@code null} when the sheet is exhausted.
+     *
+     * <p>Values are formatted with {@link DataFormatter}, passed through
+     * {@link #fixNumberFormat(String)}, and empty values become {@code null}. The list is indexed by
+     * sheet column, so gaps between defined cells are preserved as {@code null}.
+     */
+    @Override
+    public List<String> getNextRow() {
+        if (!rowIterator.hasNext()) {
+            return null;
+        }
+        Row currentRow = rowIterator.next();
+        // getLastCellNum() is -1 for a row without cells.
+        int cellCount = Math.max(currentRow.getLastCellNum(), 0);
+        List<String> row = new ArrayList<>(cellCount);
+        for (int i = 0; i < cellCount; i++) {
+            // formatCellValue returns "" for a missing (null) or blank cell.
+            String cellValue = fixNumberFormat(formatter.formatCellValue(currentRow.getCell(i)));
+            row.add(cellValue.isEmpty() ? null : cellValue);
+        }
+        return row;
+    }
+
+    /** Returns the merged regions of the sheet, converted to {@link Region} with the sheet's indices. */
+    @Override
+    public List<Region> getMergedRegions() {
+        List<Region> regions = new ArrayList<>();
+        for (int i = 0; i < sheet.getNumMergedRegions(); i++) {
+            CellRangeAddress region = sheet.getMergedRegion(i);
+            regions.add(new Region(
+                    region.getFirstRow(),
+                    region.getFirstColumn(),
+                    region.getLastRow(),
+                    region.getLastColumn()
+            ));
+        }
+        return regions;
+    }
+
+    /** Returns the name of the sheet selected in {@code initialise}. */
+    @Override
+    public String getSheetLabel(){
+        return sheet.getSheetName();
+    }
+
+    /**
+     * Rewrites a number that uses a decimal comma so that it uses a decimal point.
+     *
+     * @param cellValue formatted cell text, may be {@code null}
+     * @return the converted text, or {@code cellValue} unchanged when it is not such a number
+     */
+    public String fixNumberFormat (String cellValue){
+        //xls uses ',' as decimal separator, so we should convert it to '.'
+        if (cellValue != null && DECIMAL_COMMA_NUMBER.matcher(cellValue).matches()) {
+            cellValue = cellValue.replace(",", ".");
+        }
+        return cellValue;
+    }
+
+    /** Releases the workbook opened by {@code initialise}; safe to call repeatedly. */
+    @Override
+    public void close() {
+        closeWorkbookQuietly();
+    }
+
+    /**
+     * Text of a header cell. String and blank cells go through {@code getStringCellValue()} as
+     * before; cell types for which that call is illegal (e.g. numeric) fall back to the formatter
+     * instead of aborting the whole import.
+     */
+    private String headerText(org.apache.poi.ss.usermodel.Cell cell) {
+        if (cell == null) {
+            return "";
+        }
+        try {
+            return cell.getStringCellValue();
+        } catch (IllegalStateException e) {
+            return formatter.formatCellValue(cell);
+        }
+    }
+
+    /** Closes the current workbook, logging (not propagating) a failure to do so. */
+    private void closeWorkbookQuietly() {
+        if (workbook == null) {
+            return;
+        }
+        try {
+            workbook.close();
+        } catch (IOException e) {
+            LOG.warn("Failed to close workbook", e);
+        } finally {
+            workbook = null;
+        }
+    }
+}
+