Skip to content

Commit

Permalink
quality: Unescape HTML entities in Document Intelligence
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Jun 15, 2024
1 parent fabe09c commit 28194a4
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions persistence/document_intelligence.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,6 +13,7 @@
ParagraphRole,
)
import math
+import html


class DocumentIntelligenceAnalyze(IAnalyze):
Expand Down Expand Up @@ -62,8 +63,8 @@ async def analyze(
None, # Third, nothing
),
)
-content = res.content
-title = title_paragraph.content if title_paragraph else None
+content = html.unescape(res.content)
+title = html.unescape(title_paragraph.content) if title_paragraph else None
langs = {lang.locale for lang in res.languages or [] if lang.confidence >= self._config.extract_lang_threshold}

# Return title, content and langs
Expand Down

0 comments on commit 28194a4

Please sign in to comment.