From 28194a4012ea15e49960a5a18b95bfe37eeb0770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mence=20Lesn=C3=A9?= Date: Sat, 15 Jun 2024 18:45:47 +0200 Subject: [PATCH] quality: Unescape HTML entities in Document Intelligence --- persistence/document_intelligence.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/persistence/document_intelligence.py b/persistence/document_intelligence.py index f4a49f1..62a64fe 100644 --- a/persistence/document_intelligence.py +++ b/persistence/document_intelligence.py @@ -13,6 +13,7 @@ ParagraphRole, ) import math +import html class DocumentIntelligenceAnalyze(IAnalyze): @@ -62,8 +63,8 @@ async def analyze( None, # Third, nothing ), ) - content = res.content - title = title_paragraph.content if title_paragraph else None + content = html.unescape(res.content) + title = html.unescape(title_paragraph.content) if title_paragraph else None langs = {lang.locale for lang in res.languages or [] if lang.confidence >= self._config.extract_lang_threshold} # Return title, content and langs