From c7a111d6252eafefd5b4efc51c5fff8de275c7e0 Mon Sep 17 00:00:00 2001 From: Sivan Ratson <89018301+Sivan22@users.noreply.github.com> Date: Sun, 24 Nov 2024 22:00:33 +0200 Subject: [PATCH] fix: upgrade docx support added support for: * italic * underscore * footenotes (after each paragraph) * font size - big or small * font family * font color --- lib/utils/docx_to_otzaria.dart | 127 +++++++++++++++++++++++++++++---- 1 file changed, 112 insertions(+), 15 deletions(-) diff --git a/lib/utils/docx_to_otzaria.dart b/lib/utils/docx_to_otzaria.dart index dc6daf3e..10c0aee6 100644 --- a/lib/utils/docx_to_otzaria.dart +++ b/lib/utils/docx_to_otzaria.dart @@ -5,13 +5,91 @@ import 'dart:convert'; ZipDecoder? _zipDecoder; +/// Processes a run element and returns HTML-formatted text with styling +String _processRun(xml.XmlElement node, {double defaultFontSize = 12}) { + final rPr = node.getElement('w:rPr'); + final text = node.getElement('w:t')?.innerText ?? ''; + if (text.isEmpty) return ''; + + var result = text; + + if (rPr != null) { + // Font size + final sz = rPr.getElement('w:sz')?.getAttribute('w:val'); + if (sz != null) { + final fontSize = double.parse(sz) / 2; // Word uses half-points + if (fontSize > defaultFontSize) { + result = '$result'; + } else if (fontSize < defaultFontSize) { + result = '$result'; + } + } + + // Font color + final color = rPr.getElement('w:color')?.getAttribute('w:val'); + if (color != null) { + result = '$result'; + } + + // Font family + final fontFamily = rPr.getElement('w:rFonts')?.getAttribute('w:ascii') ?? + rPr.getElement('w:rFonts')?.getAttribute('w:eastAsia'); + if (fontFamily != null) { + result = '$result'; + } + + // Underline + if (rPr.getElement('w:u') != null) { + result = '$result'; + } + + // Italic + if (rPr.getElement('w:i') != null) { + result = '$result'; + } + + // Bold + if (rPr.getElement('w:b') != null) { + result = '$result'; + } + } + + return result; +} + +/// Extracts footnotes from the document +Map _extractFootnotes(Archive archive) { + final footnotes = {}; + + for (final file in archive) { + if (file.isFile && file.name == 'word/footnotes.xml') { + final content = utf8.decode(file.content); + final document = xml.XmlDocument.parse(content); + + final footnoteNodes = document.findAllElements('w:footnote'); + for (final footnote in footnoteNodes) { + final id = footnote.getAttribute('w:id'); + if (id != null && id != '-1' && id != '0') { + // Skip automatic footnotes + final text = + footnote.findAllElements('w:t').map((e) => e.innerText).join(''); + footnotes[id] = text; + } + } + break; + } + } + + return footnotes; +} + /// Converts a docx file to text. -/// marks up headings and lists +/// Marks up headings, lists, text styling, and includes footnotes after their respective paragraphs String docxToText(Uint8List bytes, String title) { _zipDecoder ??= ZipDecoder(); final archive = _zipDecoder!.decodeBytes(bytes); - + final footnotes = _extractFootnotes(archive); final List list = ['

$title

']; for (final file in archive) { @@ -20,43 +98,62 @@ String docxToText(Uint8List bytes, String title) { final document = xml.XmlDocument.parse(fileContent); final paragraphNodes = document.findAllElements('w:p'); + var footnoteCounter = 1; + var paragraphFootnotes = []; for (final paragraph in paragraphNodes) { final textNodes = paragraph.findAllElements('w:r'); - var text = textNodes.map((node) { - final innerText = node.getElement('w:t')?.innerText ?? ''; - //mark bold text - if (node.getElement('w:rPr')?.getElement('w:b') != null) { - return '$innerText'; + var text = ''; + paragraphFootnotes.clear(); + + for (final node in textNodes) { + // Check for footnote reference + final footnoteRef = node.getElement('w:footnoteReference'); + if (footnoteRef != null) { + final footnoteId = footnoteRef.getAttribute('w:id'); + if (footnoteId != null && footnotes.containsKey(footnoteId)) { + text += '$footnoteCounter'; + paragraphFootnotes + .add('$footnoteCounter) ${footnotes[footnoteId]}'); + footnoteCounter++; + } + } else { + text += _processRun(node); } - return innerText; - }).join(); + } - //mark up headings + // Process paragraph style var style = paragraph .getElement('w:pPr') ?.getElement('w:pStyle') ?.getAttribute('w:val'); - //if val is a number, that means it is a heading + + // Handle headings if (style != null && double.tryParse(style) != null) { int styleNum = int.parse(style) + 1; text = '$text'; } - //mark up lists - //get the numbering level + // Handle lists var numbering = paragraph.getElement('w:pPr')?.getElement('w:numPr'); if (numbering != null) { String? level = numbering.getElement('w:ilvl')?.getAttribute('w:val'); if (level != null) { - // indent the text with the correct amount of spaces (0 for first level) int levelInt = int.parse(level); for (int i = 0; i <= levelInt; i++) { text = '
  • $text
'; } } } - list.add(text); + + // Add paragraph with its footnotes + if (!text.trim().isEmpty) { + list.add('$text'); + if (paragraphFootnotes.isNotEmpty) { + list.add( + '
${paragraphFootnotes.join('
')}
'); + } + } } } }