diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java index 0316249a..aaca88cb 100644 --- a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java @@ -13,7 +13,6 @@ import cz.cvut.spipes.exception.ResourceNotUniqueException; import cz.cvut.spipes.exception.SPipesException; import cz.cvut.spipes.modules.annotations.SPipesModule; -import cz.cvut.spipes.modules.exception.SheetDoesntExistsException; import cz.cvut.spipes.modules.exception.SheetIsNotSpecifiedException; import cz.cvut.spipes.modules.exception.SpecificationNonComplianceException; import cz.cvut.spipes.modules.handlers.ModeHandler; @@ -31,6 +30,7 @@ import org.supercsv.io.ICsvListReader; import org.supercsv.prefs.CsvPreference; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.Reader; import java.io.StringReader; @@ -113,12 +113,12 @@ public class TabularModule extends AnnotatedAbstractModule { private final Property P_PROCESS_TABLE_AT_INDEX = getSpecificParameter("process-table-at-index"); @Parameter(iri = SML.replace, comment = "Specifies whether a module should overwrite triples" + - " from its predecessors. When set to true (default is false), it prevents" + - " passing through triples from the predecessors.") + " from its predecessors. When set to true (default is false), it prevents" + + " passing through triples from the predecessors.") private boolean isReplace = false; @Parameter(iri = PARAM_URL_PREFIX + "source-resource-uri", comment = "URI of resource" + - " that represent tabular data (e.g. resource representing CSV file).") + " that represent tabular data (e.g. resource representing CSV file).") private StreamResource sourceResource; @Parameter(iri = PARAM_URL_PREFIX + "delimiter", comment = "Column delimiter. Default value is comma ','.") @@ -189,6 +189,8 @@ ExecutionContext executeSelf() { StreamResource originalSourceResource = sourceResource; TSVConvertor tsvConvertor = null; + StreamReaderAdapter streamReaderAdapter = new XLSStreamReaderAdapter(); + CsvPreference csvPreference = null; switch (sourceResourceFormat) { case HTML: @@ -198,10 +200,7 @@ ExecutionContext executeSelf() { if (processTableAtIndex != 1) { throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet."); } - tsvConvertor = new HTML2TSVConvertor(processTableAtIndex); - table.setLabel(tsvConvertor.getTableName(sourceResource)); - setSourceResource(tsvConvertor.convertToTSV(sourceResource)); - setDelimiter('\t'); + streamReaderAdapter = new HTMLStreamReaderAdapter(); break; case XLS: case XLSM: @@ -209,19 +208,14 @@ ExecutionContext executeSelf() { if (processTableAtIndex == 0) { throw new SheetIsNotSpecifiedException("Source resource format is set to XLS(X,M) file but no specific table is set for processing."); } - tsvConvertor = new XLS2TSVConvertor(processTableAtIndex, sourceResourceFormat); - int numberOfSheets = tsvConvertor.getTablesCount(sourceResource); - table.setLabel(tsvConvertor.getTableName(sourceResource)); - LOG.debug("Number of sheets: {}", numberOfSheets); - if ((processTableAtIndex > numberOfSheets) || (processTableAtIndex < 1)) { - LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}", - numberOfSheets, - processTableAtIndex - ); - throw new SheetDoesntExistsException("Requested sheet doesn't exists."); - } - setSourceResource(tsvConvertor.convertToTSV(sourceResource)); - setDelimiter('\t'); + streamReaderAdapter = new XLSStreamReaderAdapter(); + break; + default: + csvPreference = new CsvPreference.Builder( + quoteCharacter, + delimiter, + System.lineSeparator()).build(); + streamReaderAdapter = new CSVStreamReaderAdapter(csvPreference); break; } @@ -236,33 +230,26 @@ ExecutionContext executeSelf() { List outputColumns = new ArrayList<>(); List rowStatements = new ArrayList<>(); - CsvPreference csvPreference = new CsvPreference.Builder( - quoteCharacter, - delimiter, - System.lineSeparator()).build(); - try { - ICsvListReader listReader = getCsvListReader(csvPreference); - - if (listReader == null) { - logMissingQuoteError(); - return getExecutionContext(inputModel, outputModel); - } + streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), + sourceResourceFormat, processTableAtIndex, acceptInvalidQuoting, inputCharset, sourceResource); + String[] header = streamReaderAdapter.getHeader(skipHeader); + Set columnNames = new HashSet<>(); - String[] header = listReader.getHeader(true); // skip the header (can't be used with CsvListReader) + if (streamReaderAdapter.getSheetLabel() != null) + table.setLabel(streamReaderAdapter.getSheetLabel()); if (header == null) { LOG.warn("Input stream resource {} to provide tabular data is empty.", this.sourceResource.getUri()); return getExecutionContext(inputModel, outputModel); } - Set columnNames = new HashSet<>(); TableSchema inputTableSchema = getTableSchema(em); hasInputSchema = hasInputSchema(inputTableSchema); if (skipHeader) { header = getHeaderFromSchema(inputModel, header, hasInputSchema); - listReader = new CsvListReader(getReader(), csvPreference); + //streamReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex); } else if (hasInputSchema) { header = getHeaderFromSchema(inputModel, header, true); } @@ -281,17 +268,16 @@ ExecutionContext executeSelf() { tableSchema.setAboutUrl(schemaColumn, sourceResource.getUri()); schemaColumn.setProperty( - dataPrefix, - sourceResource.getUri(), - hasInputSchema ? tableSchema.getColumn(columnName) : null); + dataPrefix, + sourceResource.getUri(), + hasInputSchema ? tableSchema.getColumn(columnName) : null); schemaColumn.setTitle(columnTitle); if (isDuplicate) throwNotUniqueException(schemaColumn, columnTitle, columnName); } - List row; int rowNumber = 0; - //for each row - while ((row = listReader.read()) != null) { + List row; + while ((row = streamReaderAdapter.getNextRow()) != null) { rowNumber++; // 4.6.1 and 4.6.3 Row r = new Row(); @@ -329,37 +315,35 @@ ExecutionContext executeSelf() { // 4.6.8.7 - else, if cellValue is not null } } - listReader.close(); - } catch (IOException | MissingArgumentException e) { - LOG.error("Error while reading file from resource uri {}", sourceResource, e); - } - - tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri()); - tableSchema.setColumnsSet(new HashSet<>(outputColumns)); + tableSchema.adjustProperties(hasInputSchema, outputColumns, sourceResource.getUri()); + tableSchema.setColumnsSet(new HashSet<>(outputColumns)); - em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel); - em.getTransaction().begin(); - em.persist(tableGroup); - em.merge(tableSchema); + em = JopaPersistenceUtils.getEntityManager("cz.cvut.spipes.modules.model", outputModel); + em.getTransaction().begin(); + em.persist(tableGroup); + em.merge(tableSchema); - if (tsvConvertor != null) { - List regions = tsvConvertor.getMergedRegions(originalSourceResource); + List regions = streamReaderAdapter.getMergedRegions(); int cellsNum = 1; for (Region region : regions) { int firstCellInRegionNum = cellsNum; - for(int i = region.getFirstRow();i <= region.getLastRow();i++){ - for(int j = region.getFirstColumn();j <= region.getLastColumn();j++) { - Cell cell = new Cell(sourceResource.getUri()+"#cell"+(cellsNum)); + for (int i = region.getFirstRow(); i <= region.getLastRow(); i++) { + for (int j = region.getFirstColumn(); j <= region.getLastColumn(); j++) { + Cell cell = new Cell(sourceResource.getUri() + "#cell" + cellsNum); cell.setRow(tableSchema.createAboutUrl(i)); cell.setColumn(outputColumns.get(j).getUri().toString()); - if(cellsNum != firstCellInRegionNum) - cell.setSameValueAsCell(sourceResource.getUri()+"#cell"+(firstCellInRegionNum)); + if (cellsNum != firstCellInRegionNum) { + cell.setSameValueAsCell(sourceResource.getUri() + "#cell" + firstCellInRegionNum); + } em.merge(cell); cellsNum++; } } } + streamReaderAdapter.close(); + } catch (IOException e) { + LOG.error("Error while reading file from resource uri {}", sourceResource, e); } em.getTransaction().commit(); @@ -381,41 +365,31 @@ private String getValueFromRow(List row, int index, int expectedRowLengt StringBuilder record = new StringBuilder(recordDelimiter); for (int i = 0; i < row.size(); i++) { record - .append(i) - .append(":") - .append(row.get(i)) - .append(recordDelimiter); + .append(i) + .append(":") + .append(row.get(i)) + .append(recordDelimiter); } LOG.error("Reading input file failed when reading record #{} (may not reflect the line #).\n" + - " It was expected that the current record contains {} values" + - ", but {}. element was not retrieved before whole record was processed.\n" + - "The problematic record: {}", - currentRecordNumber, - expectedRowLength, - index+1, - record + " It was expected that the current record contains {} values" + + ", but {}. element was not retrieved before whole record was processed.\n" + + "The problematic record: {}", + currentRecordNumber, + expectedRowLength, + index+1, + record ); throw new SPipesException("Reading input file failed.", e); } } - private ICsvListReader getCsvListReader(CsvPreference csvPreference) { - if (acceptInvalidQuoting) { - if (getQuote() == '\0') { - return null; - } else - return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference); - } - return new CsvListReader(getReader(), csvPreference); - } - private Statement createRowResource(String cellValue, int rowNumber, Column column) { Resource rowResource = ResourceFactory.createResource(tableSchema.createAboutUrl(rowNumber)); return ResourceFactory.createStatement( - rowResource, - ResourceFactory.createProperty(column.getPropertyUrl()), - ResourceFactory.createPlainLiteral(cellValue)); + rowResource, + ResourceFactory.createProperty(column.getPropertyUrl()), + ResourceFactory.createPlainLiteral(cellValue)); } private boolean hasInputSchema(TableSchema inputTableSchema) { @@ -429,11 +403,11 @@ private boolean hasInputSchema(TableSchema inputTableSchema) { private TableSchema getTableSchema(EntityManager em) { TypedQuery query = em.createNativeQuery( - "PREFIX csvw: \n" + - "SELECT ?t WHERE { \n" + - "?t a csvw:TableSchema. \n" + - "}", - TableSchema.class + "PREFIX csvw: \n" + + "SELECT ?t WHERE { \n" + + "?t a csvw:TableSchema. \n" + + "}", + TableSchema.class ); int tableSchemaCount = query.getResultList().size(); @@ -452,14 +426,14 @@ private TableSchema getTableSchema(EntityManager em) { private void throwNotUniqueException(Column column, String columnTitle, String columnName) { throw new ResourceNotUniqueException( - String.format("Unable to create value of property %s due to collision. " + - "Both column titles '%s' and '%s' are normalized to '%s' " + - "and thus would refer to the same property url <%s>.", - CSVW.propertyUrl, - columnTitle, - column.getTitle(), - columnName, - column.getPropertyUrl())); + String.format("Unable to create value of property %s due to collision. " + + "Both column titles '%s' and '%s' are normalized to '%s' " + + "and thus would refer to the same property url <%s>.", + CSVW.propertyUrl, + columnTitle, + column.getTitle(), + columnName, + column.getPropertyUrl())); } private ExecutionContext getExecutionContext(Model inputModel, Model outputModel) { @@ -473,12 +447,12 @@ private ExecutionContext getExecutionContext(Model inputModel, Model outputModel @Override public void loadManualConfiguration() { sourceResourceFormat = ResourceFormat.fromString( - getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue()) + getPropertyValue(P_SOURCE_RESOURCE_FORMAT, ResourceFormat.PLAIN.getValue()) ); delimiter = getPropertyValue(P_DELIMITER, getDefaultDelimiterSupplier(sourceResourceFormat)); quoteCharacter = getPropertyValue(P_QUOTE_CHARACTER, getDefaultQuoteCharacterSupplier(sourceResourceFormat)); outputMode = Mode.fromResource( - getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource()) + getPropertyValue(P_OUTPUT_MODE, Mode.STANDARD.getResource()) ); setInputCharset(delimiter); } @@ -514,7 +488,7 @@ private Supplier getDefaultQuoteCharacterSupplier(ResourceFormat sour if (sourceResourceFormat == ResourceFormat.CSV) { return () -> { LOG.debug("Quote character not specified, using double-quote as default value" + - " to be compliant with RFC 4180 (CSV)"); + " to be compliant with RFC 4180 (CSV)"); return '"'; }; } @@ -524,8 +498,8 @@ private Supplier getDefaultQuoteCharacterSupplier(ResourceFormat sour private char getPropertyValue(Property property, Supplier defaultValueSupplier) { return Optional.ofNullable(getPropertyValue(property)) - .map(n -> n.asLiteral().getChar()) - .orElseGet(defaultValueSupplier); + .map(n -> n.asLiteral().getChar()) + .orElseGet(defaultValueSupplier); } @Override @@ -587,10 +561,6 @@ private String normalize(String label) { return label.trim().replaceAll("[^\\w]", "_"); } - private Reader getReader() { - return new StringReader(new String(sourceResource.getContent(), inputCharset)); - } - @NotNull private StreamResource getResourceByUri(@NotNull String resourceUri) { @@ -624,7 +594,7 @@ public int getDelimiter() { public void setDelimiter(int delimiter) { if ((sourceResourceFormat == ResourceFormat.CSV && delimiter != ',') || - (sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) { + (sourceResourceFormat == ResourceFormat.TSV && delimiter != '\t')) { throw new SpecificationNonComplianceException(sourceResourceFormat, delimiter); } this.delimiter = delimiter; diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java new file mode 100644 index 00000000..6935ffcf --- /dev/null +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVStreamReaderAdapter.java @@ -0,0 +1,97 @@ +package cz.cvut.spipes.modules.util; + +import cz.cvut.spipes.InvalidQuotingTokenizer; +import cz.cvut.spipes.modules.ResourceFormat; +import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; +import org.supercsv.io.CsvListReader; +import org.supercsv.io.ICsvListReader; +import org.supercsv.prefs.CsvPreference; + +import java.io.*; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.io.StringReader; + +public class CSVStreamReaderAdapter implements StreamReaderAdapter { + private ICsvListReader listReader; + private CsvPreference csvPreference; + String [] header = null; + String [] firstRow = null; + boolean acceptInvalidQuoting; + Charset inputCharset; + StreamResource sourceResource; + + public CSVStreamReaderAdapter(CsvPreference csvPreference) { + this.csvPreference = csvPreference; + } + + @Override + public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, + boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { + //listReader = new CsvListReader(new InputStreamReader(inputStream), csvPreference); + this.acceptInvalidQuoting = acceptInvalidQuoting; + this.inputCharset = inputCharset; + this.sourceResource = sourceResource; + listReader = getCsvListReader(csvPreference); + } + + @Override + public String[] getHeader(Boolean skipHeader) throws IOException { + header = listReader.getHeader(true); + if (skipHeader) { + firstRow = header; + } + return header; + } + + @Override + public boolean hasNextRow() throws IOException { + return ((listReader.read() != null) || (firstRow != null)); + } + + @Override + public List getNextRow() throws IOException { + if (firstRow != null) { + List row = Arrays.asList(firstRow); + firstRow = null; + return row; + } + return listReader.read(); + } + + @Override + public List getMergedRegions() { + return new ArrayList<>(); + } + + @Override + public String getSheetLabel(){ + return null; + } + + @Override + public void close() throws IOException{ + listReader.close(); + } + + private ICsvListReader getCsvListReader(CsvPreference csvPreference) { + if (acceptInvalidQuoting) { + if (getQuote() == '\0') { + return null; + } else + return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference); + } + return new CsvListReader(getReader(), csvPreference); + } + + private Reader getReader() { + return new StringReader(new String(sourceResource.getContent(), inputCharset)); + } + + public char getQuote() { + return csvPreference.getQuoteChar(); + } +} diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java new file mode 100644 index 00000000..857731b1 --- /dev/null +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLStreamReaderAdapter.java @@ -0,0 +1,131 @@ +package cz.cvut.spipes.modules.util; + +import cz.cvut.spipes.modules.ResourceFormat; +import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +import java.io.*; +import java.nio.charset.Charset; +import java.util.*; + +public class HTMLStreamReaderAdapter implements StreamReaderAdapter { + private Elements rows; + private int currentIndex; + private Element table; + private String label; + + private List mergedRegions; + private Map> mergedCells; + + @Override + public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, + int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { + Document doc = Jsoup.parse(inputStream, "UTF-8", ""); + Element table = doc.select("table").first(); + rows = table.select("tr"); + currentIndex = 0; + this.table = table; + mergedRegions = extractMergedRegions(table); + mergedCells = new HashMap<>(); + label = table.attr("data-name"); + } + + + @Override + public String[] getHeader(Boolean skipHeader) throws IOException { + Elements headerCells = rows.get(0).select("th, td"); + return headerCells.stream() + .map(Element::text) + .toArray(String[]::new); + } + + @Override + public boolean hasNextRow() { + return currentIndex < rows.size() - 1; // Skip header row + } + + @Override + public List getNextRow() { + if (!hasNextRow()) { + return null; + } + + currentIndex++; + Elements cells = rows.get(currentIndex).select("td, th"); + List row = new ArrayList<>(); + int cellIndex = 0; + + for (Element cell : cells) { + int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan")); + int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan")); + String cellValue = cell.text(); + + if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) { + cellValue = cellValue.replace(",", "."); + } + + while (row.size() < cellIndex) { + row.add(null); + } + + row.add(cellValue); + + for (int i = 1; i < colspan; i++) { + row.add(null); + } + + if (rowspan > 1) { + for (int i = 1; i < rowspan; i++) { + mergedCells.computeIfAbsent(currentIndex + i, k -> new HashMap<>()).put(cellIndex, cellValue); + } + } + + cellIndex += colspan; + } + + if (mergedCells.containsKey(currentIndex)) { + Map rowMergedCells = mergedCells.get(currentIndex); + for (Map.Entry entry : rowMergedCells.entrySet()) { + row.add(entry.getKey(), null); + } + mergedCells.remove(currentIndex); + } + + return row; + } + + @Override + public List getMergedRegions() { + return mergedRegions; + } + + private List extractMergedRegions(Element table) { + List regions = new ArrayList<>(); + Elements rows = table.select("tr"); + for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) { + Elements cells = rows.get(rowIndex).select("td, th"); + for (int colIndex = 0; colIndex < cells.size(); colIndex++) { + Element cell = cells.get(colIndex); + int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan")); + int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan")); + if (colspan > 1 || rowspan > 1) { + regions.add(new Region(rowIndex, colIndex, rowIndex + rowspan - 1, colIndex + colspan - 1)); + } + } + } + return regions; + } + + @Override + public String getSheetLabel(){ + return label; + } + + @Override + public void close() { + } +} diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java new file mode 100644 index 00000000..a8148354 --- /dev/null +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/StreamReaderAdapter.java @@ -0,0 +1,24 @@ +package cz.cvut.spipes.modules.util; + +import cz.cvut.spipes.modules.ResourceFormat; +import cz.cvut.spipes.modules.TabularModule; +import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.List; + +public interface StreamReaderAdapter { + static final Logger LOG = LoggerFactory.getLogger(TabularModule.class); + void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException; + String[] getHeader(Boolean skipHeader) throws IOException; + boolean hasNextRow() throws IOException; + List getNextRow() throws IOException; + List getMergedRegions(); + String getSheetLabel() throws IOException; + void close() throws IOException; +} diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java new file mode 100644 index 00000000..0dc84cb4 --- /dev/null +++ b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSStreamReaderAdapter.java @@ -0,0 +1,116 @@ +package cz.cvut.spipes.modules.util; + +import cz.cvut.spipes.modules.ResourceFormat; +import cz.cvut.spipes.modules.exception.SheetDoesntExistsException; +import cz.cvut.spipes.modules.model.Region; +import cz.cvut.spipes.registry.StreamResource; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.ss.usermodel.DataFormatter; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.ss.usermodel.Workbook; +import org.apache.poi.ss.util.CellRangeAddress; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +public class XLSStreamReaderAdapter implements StreamReaderAdapter { + private Sheet sheet; + private Iterator rowIterator; + Boolean skipHeader; + + @Override + public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex, + boolean acceptInvalidQuoting, Charset inputCharset, StreamResource sourceResource) throws IOException { + Workbook workbook; + if (sourceResourceFormat == ResourceFormat.XLS) { + workbook = new HSSFWorkbook(inputStream); + } else { + workbook = new XSSFWorkbook(inputStream); + } + if ((tableIndex > workbook.getNumberOfSheets()) || (tableIndex < 1)) { + LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}", + workbook.getNumberOfSheets(), + tableIndex + ); + throw new SheetDoesntExistsException("Requested sheet doesn't exists."); + } + sheet = workbook.getSheetAt(tableIndex - 1); + rowIterator = sheet.iterator(); + } + + @Override + public String[] getHeader(Boolean skipHeader) throws IOException { + Row headerRow = sheet.getRow(0); + if (skipHeader) { + return null; + } + else { + rowIterator.next(); // move iterator to 2nd row + return StreamSupport.stream(headerRow.spliterator(), false) + .map(cell -> cell.getStringCellValue()) + .toArray(String[]::new); + } + } + + @Override + public boolean hasNextRow() { + return rowIterator.hasNext(); + } + + @Override + public List getNextRow() { + if (!rowIterator.hasNext()) + return null; + Row currentRow = rowIterator.next(); + DataFormatter formatter = new DataFormatter(); + List row = StreamSupport.stream(currentRow.spliterator(), false) + .map(cell -> { + String cellValue = formatter.formatCellValue(cell); + cellValue = fixNumberFormat(cellValue); + return cellValue.isEmpty() ? null : cellValue; + }) + .collect(Collectors.toList()); + return row; + } + + @Override + public List getMergedRegions() { + List regions = new ArrayList<>(); + for (int i = 0; i < sheet.getNumMergedRegions(); i++) { + CellRangeAddress region = sheet.getMergedRegion(i); + regions.add(new Region( + region.getFirstRow(), + region.getFirstColumn(), + region.getLastRow(), + region.getLastColumn() + )); + } + return regions; + } + + @Override + public String getSheetLabel(){ + return sheet.getSheetName(); + } + + public String fixNumberFormat (String cellValue){ + //xls uses ',' as decimal separator, so we should convert it to '.' + if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) { + cellValue = cellValue.replace(",", "."); + } + return cellValue; + } + + @Override + public void close() { + } +} +