diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index ad91f2d8..e4e30201 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -1,5 +1,6 @@ from typing import Dict, Iterator, List, Optional, Set +from dedoc.api.schema import LineMetadata, ParsedDocument, Table, TreeNode from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation @@ -10,10 +11,6 @@ from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation from dedoc.data_structures.concrete_annotations.underlined_annotation import UnderlinedAnnotation from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata -from dedoc.data_structures.parsed_document import ParsedDocument -from dedoc.data_structures.table import Table -from dedoc.data_structures.tree_node import TreeNode from dedoc.extensions import converted_mimes, recognized_mimes @@ -39,7 +36,7 @@ def _node2tree(paragraph: TreeNode, depth: int, depths: Set[int] = None) -> str: space = "".join(space) node_result = [] - node_result.append(f" {space} {paragraph.metadata.hierarchy_level.line_type} {paragraph.node_id} ") + node_result.append(f" {space} {paragraph.metadata.paragraph_type} {paragraph.node_id} ") for text in __prettify_text(paragraph.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = "".join(space) @@ -98,7 +95,7 @@ def json2tree(paragraph: TreeNode) -> str: depths = {d for d in depths if d <= depth} space = [space_symbol] * 4 * (depth - 1) + 4 * ["-"] space = __add_vertical_line(depths, space) - node_result.append(f"

{space} {node.metadata.hierarchy_level.line_type} {node.node_id}

") + node_result.append(f"

{space} {node.metadata.paragraph_type} {node.node_id}

") for text in __prettify_text(node.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = __add_vertical_line(depths, space) @@ -136,14 +133,14 @@ def json2html(text: str, ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs) - if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]: + if paragraph.metadata.paragraph_type in [HierarchyLevel.header, HierarchyLevel.root]: ptext = f"{ptext.strip()}" - elif paragraph.metadata.hierarchy_level.line_type == HierarchyLevel.list_item: + elif paragraph.metadata.paragraph_type == HierarchyLevel.list_item: ptext = f"{ptext.strip()}" else: ptext = ptext.strip() - ptext = f'

{" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type}

' + ptext = f'

{" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.paragraph_type}

' if hasattr(paragraph.metadata, "uid"): ptext = f'
{ptext}
' text += ptext @@ -259,11 +256,10 @@ def table2html(table: Table, table2id: Dict[str, int]) -> str: text += ' style="display: none" ' cell_node = TreeNode( node_id="0", - text=cell.get_text(), - annotations=cell.get_annotations(), - metadata=LineMetadata(page_id=table.metadata.page_id, line_id=0), - subparagraphs=[], - parent=None + text="\n".join([line.text for line in cell.lines]), + annotations=cell.lines[0].annotations if cell.lines else [], + metadata=LineMetadata(page_id=0, line_id=0, paragraph_type=HierarchyLevel.raw_text), + subparagraphs=[] ) text += f' colspan="{cell.colspan}" rowspan="{cell.rowspan}">{__annotations2html(cell_node, {}, {})}\n' diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py index 024da69f..545b8efc 100644 --- a/dedoc/api/dedoc_api.py +++ b/dedoc/api/dedoc_api.py @@ -2,7 +2,7 @@ import importlib import json import os -import traceback +import tempfile from typing import Optional from fastapi import Depends, FastAPI, File, Request, Response, UploadFile @@ -18,6 +18,7 @@ from dedoc.common.exceptions.dedoc_error import DedocError from dedoc.common.exceptions.missing_file_error import MissingFileError from dedoc.config import get_config +from dedoc.utils.utils import save_upload_file config = get_config() logger = config["logger"] @@ -64,7 +65,10 @@ async def upload(request: Request, file: UploadFile = File(...), query_params: Q if not file or file.filename == "": raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__) - document_tree = await process_handler.handle(request=request, parameters=parameters, file=file) + with tempfile.TemporaryDirectory() as tmpdir: + file_path = save_upload_file(file, tmpdir) + document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir) + if document_tree is None: return JSONResponse(status_code=499, content={}) @@ -88,24 +92,25 @@ async def upload(request: Request, file: UploadFile = File(...), query_params: Q return HTMLResponse(content=html_content) if return_format == "ujson": - return UJSONResponse(content=document_tree.to_api_schema().model_dump()) + return UJSONResponse(content=document_tree.model_dump()) if return_format == "collapsed_tree": html_content = json2collapsed_tree(paragraph=document_tree.content.structure) return HTMLResponse(content=html_content) if return_format == "pretty_json": - return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2)) + return PlainTextResponse(content=json.dumps(document_tree.model_dump(), ensure_ascii=False, indent=2)) logger.info(f"Send result. File {file.filename} with parameters {parameters}") - return ORJSONResponse(content=document_tree.to_api_schema().model_dump()) + return ORJSONResponse(content=document_tree.model_dump()) @app.get("/upload_example") async def upload_example(request: Request, file_name: str, return_format: Optional[str] = None) -> Response: file_path = os.path.join(static_path, "examples", file_name) parameters = {} if return_format is None else {"return_format": return_format} - document_tree = await process_handler.handle(request=request, parameters=parameters, file=file_path) + with tempfile.TemporaryDirectory() as tmpdir: + document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir) if return_format == "html": html_page = json2html( @@ -116,7 +121,7 @@ async def upload_example(request: Request, file_name: str, return_format: Option tabs=0 ) return HTMLResponse(content=html_page) - return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200) + return ORJSONResponse(content=document_tree.model_dump(), status_code=200) @app.exception_handler(DedocError) diff --git a/dedoc/api/process_handler.py b/dedoc/api/process_handler.py index 8083abf8..c29a8e7d 100644 --- a/dedoc/api/process_handler.py +++ b/dedoc/api/process_handler.py @@ -4,21 +4,18 @@ import os import pickle import signal -import tempfile import traceback from multiprocessing import Process, Queue -from typing import Optional, Union +from typing import Optional from urllib.request import Request from anyio import get_cancelled_exc_class -from fastapi import UploadFile -from dedoc import DedocManager from dedoc.api.cancellation import cancel_on_disconnect +from dedoc.api.schema import ParsedDocument from dedoc.common.exceptions.dedoc_error import DedocError from dedoc.config import get_config -from dedoc.data_structures import ParsedDocument -from dedoc.utils.utils import save_upload_file +from dedoc.dedoc_manager import DedocManager class ProcessHandler: @@ -41,7 +38,7 @@ def __init__(self, logger: logging.Logger) -> None: self.process = Process(target=self.__parse_file, args=[self.input_queue, self.output_queue]) self.process.start() - async def handle(self, request: Request, parameters: dict, file: Union[UploadFile, str]) -> Optional[ParsedDocument]: + async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]: """ Handle request in a separate process. Checks for client disconnection and terminate the child process if client disconnected. @@ -50,7 +47,7 @@ async def handle(self, request: Request, parameters: dict, file: Union[UploadFil self.__init__(logger=self.logger) self.logger.info("Putting file to the input queue") - self.input_queue.put(pickle.dumps((parameters, file)), block=True) + self.input_queue.put(pickle.dumps((parameters, file_path, tmpdir)), block=True) loop = asyncio.get_running_loop() async with cancel_on_disconnect(request, self.logger): @@ -88,17 +85,15 @@ def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None: while True: try: - parameters, file = pickle.loads(input_queue.get(block=True)) + parameters, file_path, tmp_dir = pickle.loads(input_queue.get(block=True)) manager.logger.info("Parsing process got task from the input queue") return_format = str(parameters.get("return_format", "json")).lower() - with tempfile.TemporaryDirectory() as tmpdir: - file_path = file if isinstance(file, str) else save_upload_file(file, tmpdir) - document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir}) + document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmp_dir}) - if return_format == "html": - self.__add_base64_info_to_attachments(document_tree, tmpdir) + if return_format == "html": + self.__add_base64_info_to_attachments(document_tree, tmp_dir) - output_queue.put(pickle.dumps(document_tree), block=True) + output_queue.put(pickle.dumps(document_tree.to_api_schema()), block=True) manager.logger.info("Parsing process put task to the output queue") except Exception as e: tb = traceback.format_exc() diff --git a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py index 5c8e81b9..eb44051b 100644 --- a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py @@ -34,7 +34,7 @@ docx_document = structure_extractor.extract(document=docx_document, parameters={"patterns": patterns}) docx_document.metadata = docx_metadata_extractor.extract(file_path=docx_file_path) -docx_parsed_document = structure_constructor.construct(document=docx_document) +docx_parsed_document = structure_constructor.construct(document=docx_document).to_api_schema() html = json2html( paragraph=docx_parsed_document.content.structure, attachments=docx_parsed_document.attachments, @@ -46,7 +46,7 @@ def print_document_tree(document: UnstructuredDocument, patterns: List[AbstractPattern]) -> None: document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) - parsed_document = structure_constructor.construct(document=document) + parsed_document = structure_constructor.construct(document=document).to_api_schema() html = json2html(paragraph=parsed_document.content.structure, attachments=parsed_document.attachments, tables=parsed_document.content.tables, text="") print(f"\n\nDocument tree\n{html2text.html2text(html)}") diff --git a/docs/source/tutorials/using_patterns.rst b/docs/source/tutorials/using_patterns.rst index e2ea2d71..dc229423 100644 --- a/docs/source/tutorials/using_patterns.rst +++ b/docs/source/tutorials/using_patterns.rst @@ -91,7 +91,7 @@ which applies patterns if lines match them, else line becomes simple raw text li :language: python :lines: 30-37 -Let's see the resulting tree. In the code below we use an auxiliary function to convert :class:`~dedoc.data_structures.ParsedDocument` +Let's see the resulting tree. In the code below we use an auxiliary function to convert :class:`~dedoc.api.schema.ParsedDocument` to the HTML representation and print it: .. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py