From c7a111d6252eafefd5b4efc51c5fff8de275c7e0 Mon Sep 17 00:00:00 2001
From: Sivan Ratson <89018301+Sivan22@users.noreply.github.com>
Date: Sun, 24 Nov 2024 22:00:33 +0200
Subject: [PATCH] fix: upgrade docx support; added support for: * italic *
underline * footnotes (after each paragraph) * font size - big or small *
font family * font color
---
lib/utils/docx_to_otzaria.dart | 127 +++++++++++++++++++++++++++++----
1 file changed, 112 insertions(+), 15 deletions(-)
diff --git a/lib/utils/docx_to_otzaria.dart b/lib/utils/docx_to_otzaria.dart
index dc6daf3e..10c0aee6 100644
--- a/lib/utils/docx_to_otzaria.dart
+++ b/lib/utils/docx_to_otzaria.dart
@@ -5,13 +5,91 @@ import 'dart:convert';
ZipDecoder? _zipDecoder;
+/// Converts a single `w:r` run [node] into text wrapped in HTML styling tags.
+///
+/// Reads the text of the run's first `w:t` child and, when the run has a
+/// `w:rPr` properties element, wraps that text (innermost first) in:
+///
+/// * `<big>` / `<small>` when `w:sz` is above / below [defaultFontSize]
+///   (in points),
+/// * `<font color="#RRGGBB">` when `w:color` holds a six-digit hex value,
+/// * `<font face="...">` for the `w:ascii` (else `w:eastAsia`) font of
+///   `w:rFonts`,
+/// * `<u>`, `<i>` and `<b>` for `w:u`, `w:i` and `w:b`.
+///
+/// Returns an empty string when the run carries no text.
+//
+// NOTE(review): in the patch as received every styling branch was the no-op
+// `result = '$result';`. The tags below follow this function's documented
+// purpose and the commit description - confirm them against the renderer.
+// NOTE(review): the run text itself is not HTML-escaped, matching the
+// previous behaviour of this converter.
+String _processRun(xml.XmlElement node, {double defaultFontSize = 12}) {
+  final text = node.getElement('w:t')?.innerText ?? '';
+  if (text.isEmpty) return '';
+
+  final rPr = node.getElement('w:rPr');
+  if (rPr == null) return text;
+
+  // Toggle properties such as <w:b/> are on when present unless their w:val
+  // attribute explicitly switches them off.
+  bool isOn(String name) {
+    final element = rPr.getElement(name);
+    if (element == null) return false;
+    final val = element.getAttribute('w:val');
+    return val != '0' && val != 'false' && val != 'off';
+  }
+
+  var result = text;
+
+  // Font size: w:sz is expressed in half-points. A malformed value is
+  // ignored instead of aborting the whole conversion.
+  final sz = rPr.getElement('w:sz')?.getAttribute('w:val');
+  final halfPoints = sz == null ? null : double.tryParse(sz);
+  if (halfPoints != null) {
+    final fontSize = halfPoints / 2;
+    if (fontSize > defaultFontSize) {
+      result = '<big>$result</big>';
+    } else if (fontSize < defaultFontSize) {
+      result = '<small>$result</small>';
+    }
+  }
+
+  // Font color: only explicit RGB values are usable; `auto` is skipped.
+  final color = rPr.getElement('w:color')?.getAttribute('w:val');
+  if (color != null && RegExp(r'^[0-9A-Fa-f]{6}$').hasMatch(color)) {
+    result = '<font color="#$color">$result</font>';
+  }
+
+  // Font family: prefer the ASCII font, fall back to the East Asian one.
+  final rFonts = rPr.getElement('w:rFonts');
+  final fontFamily =
+      rFonts?.getAttribute('w:ascii') ?? rFonts?.getAttribute('w:eastAsia');
+  if (fontFamily != null && fontFamily.isNotEmpty) {
+    // Escape the name so it cannot break out of the attribute value.
+    final face =
+        const HtmlEscape(HtmlEscapeMode.attribute).convert(fontFamily);
+    result = '<font face="$face">$result</font>';
+  }
+
+  // Underline: w:val="none" explicitly removes underlining.
+  final underline = rPr.getElement('w:u');
+  if (underline != null && underline.getAttribute('w:val') != 'none') {
+    result = '<u>$result</u>';
+  }
+
+  // Italic
+  if (isOn('w:i')) {
+    result = '<i>$result</i>';
+  }
+
+  // Bold
+  if (isOn('w:b')) {
+    result = '<b>$result</b>';
+  }
+
+  return result;
+}
+
+/// Extracts the text of the user-authored footnotes stored in [archive].
+///
+/// Looks for the `word/footnotes.xml` entry and returns a map from each
+/// footnote's `w:id` to the concatenated text of its `w:t` elements.
+/// Returns an empty map when the archive has no such entry.
+Map<String, String> _extractFootnotes(Archive archive) {
+  final footnotes = <String, String>{};
+
+  for (final file in archive) {
+    if (!file.isFile || file.name != 'word/footnotes.xml') continue;
+
+    final document = xml.XmlDocument.parse(utf8.decode(file.content));
+
+    for (final footnote in document.findAllElements('w:footnote')) {
+      final id = footnote.getAttribute('w:id');
+      if (id == null) continue;
+
+      // Skip automatic footnotes (separators and continuation notices):
+      // they use the ids -1 and 0 and/or a w:type other than "normal".
+      final type = footnote.getAttribute('w:type');
+      final isAutomatic =
+          id == '-1' || id == '0' || (type != null && type != 'normal');
+      if (isAutomatic) continue;
+
+      footnotes[id] =
+          footnote.findAllElements('w:t').map((e) => e.innerText).join();
+    }
+
+    // The first matching entry is the only one processed.
+    break;
+  }
+
+  return footnotes;
+}
+
/// Converts a docx file to text.
-/// marks up headings and lists
+/// Marks up headings, lists, text styling, and includes footnotes after their respective paragraphs
String docxToText(Uint8List bytes, String title) {
_zipDecoder ??= ZipDecoder();
final archive = _zipDecoder!.decodeBytes(bytes);
-
+ final footnotes = _extractFootnotes(archive);
final List list = ['$title
'];
for (final file in archive) {
@@ -20,43 +98,62 @@ String docxToText(Uint8List bytes, String title) {
final document = xml.XmlDocument.parse(fileContent);
final paragraphNodes = document.findAllElements('w:p');
+ var footnoteCounter = 1;
+ var paragraphFootnotes = [];
for (final paragraph in paragraphNodes) {
final textNodes = paragraph.findAllElements('w:r');
- var text = textNodes.map((node) {
- final innerText = node.getElement('w:t')?.innerText ?? '';
- //mark bold text
- if (node.getElement('w:rPr')?.getElement('w:b') != null) {
- return '$innerText';
+ var text = '';
+ paragraphFootnotes.clear();
+
+ for (final node in textNodes) {
+ // Check for footnote reference
+ final footnoteRef = node.getElement('w:footnoteReference');
+ if (footnoteRef != null) {
+ final footnoteId = footnoteRef.getAttribute('w:id');
+ if (footnoteId != null && footnotes.containsKey(footnoteId)) {
+ text += '$footnoteCounter';
+ paragraphFootnotes
+ .add('$footnoteCounter) ${footnotes[footnoteId]}');
+ footnoteCounter++;
+ }
+ } else {
+ text += _processRun(node);
}
- return innerText;
- }).join();
+ }
- //mark up headings
+ // Process paragraph style
var style = paragraph
.getElement('w:pPr')
?.getElement('w:pStyle')
?.getAttribute('w:val');
- //if val is a number, that means it is a heading
+
+ // Handle headings
if (style != null && double.tryParse(style) != null) {
int styleNum = int.parse(style) + 1;
text = '$text';
}
- //mark up lists
- //get the numbering level
+ // Handle lists
var numbering = paragraph.getElement('w:pPr')?.getElement('w:numPr');
if (numbering != null) {
String? level = numbering.getElement('w:ilvl')?.getAttribute('w:val');
if (level != null) {
- // indent the text with the correct amount of spaces (0 for first level)
int levelInt = int.parse(level);
for (int i = 0; i <= levelInt; i++) {
text = '';
}
}
}
- list.add(text);
+
+ // Add paragraph with its footnotes
+ if (!text.trim().isEmpty) {
+ list.add('$text');
+ if (paragraphFootnotes.isNotEmpty) {
+ list.add(
+ '');
+ }
+ }
}
}
}