-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[#228] Tabular Module now uses adapters
- Loading branch information
Showing
3 changed files
with
268 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
130 changes: 130 additions & 0 deletions
130
...les/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
package cz.cvut.spipes.modules.util; | ||
|
||
import cz.cvut.spipes.modules.ResourceFormat; | ||
import cz.cvut.spipes.modules.model.Region; | ||
import cz.cvut.spipes.registry.StreamResource; | ||
import org.jsoup.Jsoup; | ||
import org.jsoup.nodes.Document; | ||
import org.jsoup.nodes.Element; | ||
import org.jsoup.select.Elements; | ||
|
||
import java.io.*; | ||
import java.nio.charset.Charset; | ||
import java.util.*; | ||
|
||
public class HTMLStreamReaderAdapter implements StreamReaderAdapter { | ||
private Elements rows; | ||
private int currentIndex; | ||
private Element table; | ||
private String label; | ||
|
||
private List<Region> mergedRegions; | ||
private Map<Integer, Map<Integer, String>> mergedCells; | ||
|
||
@Override | ||
public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, | ||
int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { | ||
Document doc = Jsoup.parse(inputStream, "UTF-8", ""); | ||
Element table = doc.select("table").first(); | ||
rows = table.select("tr"); | ||
currentIndex = 0; | ||
this.table = table; | ||
mergedRegions = extractMergedRegions(table); | ||
mergedCells = new HashMap<>(); | ||
label = table.attr("data-name"); | ||
} | ||
|
||
|
||
@Override | ||
public String[] getHeader(boolean skipHeader) throws IOException { | ||
Elements headerCells = rows.get(0).select("th, td"); | ||
return headerCells.stream() | ||
.map(Element::text) | ||
.toArray(String[]::new); | ||
} | ||
|
||
private boolean hasNextRow() { | ||
return currentIndex < rows.size() - 1; // Skip header row | ||
} | ||
|
||
@Override | ||
public List<String> getNextRow() { | ||
if (!hasNextRow()) { | ||
return null; | ||
} | ||
|
||
currentIndex++; | ||
Elements cells = rows.get(currentIndex).select("td, th"); | ||
List<String> row = new ArrayList<>(); | ||
int cellIndex = 0; | ||
|
||
for (Element cell : cells) { | ||
int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan")); | ||
int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan")); | ||
String cellValue = cell.text(); | ||
|
||
if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) { | ||
cellValue = cellValue.replace(",", "."); | ||
} | ||
|
||
while (row.size() < cellIndex) { | ||
row.add(null); | ||
} | ||
|
||
row.add(cellValue); | ||
|
||
for (int i = 1; i < colspan; i++) { | ||
row.add(null); | ||
} | ||
|
||
if (rowspan > 1) { | ||
for (int i = 1; i < rowspan; i++) { | ||
mergedCells.computeIfAbsent(currentIndex + i, k -> new HashMap<>()).put(cellIndex, cellValue); | ||
} | ||
} | ||
|
||
cellIndex += colspan; | ||
} | ||
|
||
if (mergedCells.containsKey(currentIndex)) { | ||
Map<Integer, String> rowMergedCells = mergedCells.get(currentIndex); | ||
for (Map.Entry<Integer, String> entry : rowMergedCells.entrySet()) { | ||
row.add(entry.getKey(), null); | ||
} | ||
mergedCells.remove(currentIndex); | ||
} | ||
|
||
return row; | ||
} | ||
|
||
@Override | ||
public List<Region> getMergedRegions() { | ||
return mergedRegions; | ||
} | ||
|
||
private List<Region> extractMergedRegions(Element table) { | ||
List<Region> regions = new ArrayList<>(); | ||
Elements rows = table.select("tr"); | ||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { | ||
Elements cells = rows.get(rowIndex).select("td, th"); | ||
for (int colIndex = 0; colIndex < cells.size(); colIndex++) { | ||
Element cell = cells.get(colIndex); | ||
int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan")); | ||
int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan")); | ||
if (colspan > 1 || rowspan > 1) { | ||
regions.add(new Region(rowIndex, colIndex, rowIndex + rowspan - 1, colIndex + colspan - 1)); | ||
} | ||
} | ||
} | ||
return regions; | ||
} | ||
|
||
@Override | ||
public String getSheetLabel(){ | ||
return label; | ||
} | ||
|
||
@Override | ||
public void close() { | ||
} | ||
} |
114 changes: 114 additions & 0 deletions
114
...ules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,114 @@ | ||
package cz.cvut.spipes.modules.util; | ||
|
||
import cz.cvut.spipes.modules.ResourceFormat; | ||
import cz.cvut.spipes.modules.exception.SheetDoesntExistsException; | ||
import cz.cvut.spipes.modules.model.Region; | ||
import cz.cvut.spipes.registry.StreamResource; | ||
import org.apache.poi.hssf.usermodel.HSSFWorkbook; | ||
import org.apache.poi.ss.usermodel.DataFormatter; | ||
import org.apache.poi.ss.usermodel.Row; | ||
import org.apache.poi.ss.usermodel.Sheet; | ||
import org.apache.poi.ss.usermodel.Workbook; | ||
import org.apache.poi.ss.util.CellRangeAddress; | ||
import org.apache.poi.xssf.usermodel.XSSFWorkbook; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.nio.charset.Charset; | ||
import java.util.ArrayList; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.StreamSupport; | ||
|
||
public class XLSStreamReaderAdapter implements StreamReaderAdapter { | ||
private Sheet sheet; | ||
private Iterator<org.apache.poi.ss.usermodel.Row> rowIterator; | ||
Boolean skipHeader; | ||
private static final Logger LOG = LoggerFactory.getLogger(XLSStreamReaderAdapter.class); | ||
|
||
@Override | ||
public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, | ||
boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { | ||
Workbook workbook; | ||
if (sourceResourceFormat == ResourceFormat.XLS) { | ||
workbook = new HSSFWorkbook(inputStream); | ||
} else { | ||
workbook = new XSSFWorkbook(inputStream); | ||
} | ||
if ((tableIndex > workbook.getNumberOfSheets()) || (tableIndex < 1)) { | ||
LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}", | ||
workbook.getNumberOfSheets(), | ||
tableIndex | ||
); | ||
throw new SheetDoesntExistsException("Requested sheet doesn't exists."); | ||
} | ||
sheet = workbook.getSheetAt(tableIndex - 1); | ||
rowIterator = sheet.iterator(); | ||
} | ||
|
||
@Override | ||
public String[] getHeader(boolean skipHeader) throws IOException { | ||
Row headerRow = sheet.getRow(0); | ||
if (skipHeader) { | ||
return null; | ||
} | ||
else { | ||
rowIterator.next(); // move iterator to 2nd row | ||
return StreamSupport.stream(headerRow.spliterator(), false) | ||
.map(cell -> cell.getStringCellValue()) | ||
.toArray(String[]::new); | ||
} | ||
} | ||
|
||
@Override | ||
public List<String> getNextRow() { | ||
if (!rowIterator.hasNext()) | ||
return null; | ||
Row currentRow = rowIterator.next(); | ||
DataFormatter formatter = new DataFormatter(); | ||
List<String> row = StreamSupport.stream(currentRow.spliterator(), false) | ||
.map(cell -> { | ||
String cellValue = formatter.formatCellValue(cell); | ||
cellValue = fixNumberFormat(cellValue); | ||
return cellValue.isEmpty() ? null : cellValue; | ||
}) | ||
.collect(Collectors.toList()); | ||
return row; | ||
} | ||
|
||
@Override | ||
public List<Region> getMergedRegions() { | ||
List<Region> regions = new ArrayList<>(); | ||
for (int i = 0; i < sheet.getNumMergedRegions(); i++) { | ||
CellRangeAddress region = sheet.getMergedRegion(i); | ||
regions.add(new Region( | ||
region.getFirstRow(), | ||
region.getFirstColumn(), | ||
region.getLastRow(), | ||
region.getLastColumn() | ||
)); | ||
} | ||
return regions; | ||
} | ||
|
||
@Override | ||
public String getSheetLabel(){ | ||
return sheet.getSheetName(); | ||
} | ||
|
||
public String fixNumberFormat (String cellValue){ | ||
//xls uses ',' as decimal separator, so we should convert it to '.' | ||
if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) { | ||
cellValue = cellValue.replace(",", "."); | ||
} | ||
return cellValue; | ||
} | ||
|
||
@Override | ||
public void close() { | ||
} | ||
} | ||
|