diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py
index ad91f2d8..e4e30201 100644
--- a/dedoc/api/api_utils.py
+++ b/dedoc/api/api_utils.py
@@ -1,5 +1,6 @@
from typing import Dict, Iterator, List, Optional, Set
+from dedoc.api.schema import LineMetadata, ParsedDocument, Table, TreeNode
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
@@ -10,10 +11,6 @@
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
from dedoc.data_structures.concrete_annotations.underlined_annotation import UnderlinedAnnotation
from dedoc.data_structures.hierarchy_level import HierarchyLevel
-from dedoc.data_structures.line_metadata import LineMetadata
-from dedoc.data_structures.parsed_document import ParsedDocument
-from dedoc.data_structures.table import Table
-from dedoc.data_structures.tree_node import TreeNode
from dedoc.extensions import converted_mimes, recognized_mimes
@@ -39,7 +36,7 @@ def _node2tree(paragraph: TreeNode, depth: int, depths: Set[int] = None) -> str:
space = "".join(space)
node_result = []
- node_result.append(f" {space} {paragraph.metadata.hierarchy_level.line_type} {paragraph.node_id} ")
+ node_result.append(f" {space} {paragraph.metadata.paragraph_type} {paragraph.node_id} ")
for text in __prettify_text(paragraph.text):
space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol]
space = "".join(space)
@@ -98,7 +95,7 @@ def json2tree(paragraph: TreeNode) -> str:
depths = {d for d in depths if d <= depth}
space = [space_symbol] * 4 * (depth - 1) + 4 * ["-"]
space = __add_vertical_line(depths, space)
- node_result.append(f"
{space} {node.metadata.hierarchy_level.line_type} {node.node_id}
")
+ node_result.append(f" {space} {node.metadata.paragraph_type} {node.node_id}
")
for text in __prettify_text(node.text):
space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol]
space = __add_vertical_line(depths, space)
@@ -136,14 +133,14 @@ def json2html(text: str,
ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs)
- if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]:
+ if paragraph.metadata.paragraph_type in [HierarchyLevel.header, HierarchyLevel.root]:
ptext = f"{ptext.strip()}"
- elif paragraph.metadata.hierarchy_level.line_type == HierarchyLevel.list_item:
+ elif paragraph.metadata.paragraph_type == HierarchyLevel.list_item:
ptext = f"{ptext.strip()}"
else:
ptext = ptext.strip()
- ptext = f' {" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type}
'
+ ptext = f' {" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.paragraph_type}
'
if hasattr(paragraph.metadata, "uid"):
ptext = f'{ptext}
'
text += ptext
@@ -259,11 +256,10 @@ def table2html(table: Table, table2id: Dict[str, int]) -> str:
text += ' style="display: none" '
cell_node = TreeNode(
node_id="0",
- text=cell.get_text(),
- annotations=cell.get_annotations(),
- metadata=LineMetadata(page_id=table.metadata.page_id, line_id=0),
- subparagraphs=[],
- parent=None
+ text="\n".join([line.text for line in cell.lines]),
+ annotations=cell.lines[0].annotations if cell.lines else [],
+ metadata=LineMetadata(page_id=0, line_id=0, paragraph_type=HierarchyLevel.raw_text),
+ subparagraphs=[]
)
text += f' colspan="{cell.colspan}" rowspan="{cell.rowspan}">{__annotations2html(cell_node, {}, {})}\n'
diff --git a/dedoc/api/cancellation.py b/dedoc/api/cancellation.py
new file mode 100644
index 00000000..e9a6ddbb
--- /dev/null
+++ b/dedoc/api/cancellation.py
@@ -0,0 +1,40 @@
+import logging
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+
+from anyio import create_task_group
+from fastapi import Request
+
+
+@asynccontextmanager
+async def cancel_on_disconnect(request: Request, logger: logging.Logger) -> AsyncIterator[None]:
+    """
+    Async context manager for async code that needs to be cancelled if the client disconnects prematurely.
+    The client disconnect is monitored through the Request object: a background task reads the incoming messages
+    and cancels the task group, and thereby the code inside the `async with` block, when `http.disconnect` is received.
+
+    Source: https://github.com/dorinclisu/runner-with-api
+    See discussion: https://github.com/fastapi/fastapi/discussions/8805
+
+    :param request: request whose incoming messages are monitored
+    :param logger: logger for reporting the disconnection
+    """
+    async with create_task_group() as task_group:
+        async def watch_disconnect() -> None:
+            while True:
+                message = await request.receive()
+
+                if message["type"] == "http.disconnect":
+                    client = f"{request.client.host}:{request.client.port}" if request.client else "-:-"
+                    logger.warning(f"{client} - `{request.method} {request.url.path}` 499 DISCONNECTED")
+
+                    task_group.cancel_scope.cancel()
+                    break
+
+        task_group.start_soon(watch_disconnect)
+
+        try:
+            yield
+        finally:
+            # stop the watcher when the wrapped code is finished: the task group waits for its tasks on exit
+            task_group.cancel_scope.cancel()
diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py
index 1458ffd3..545b8efc 100644
--- a/dedoc/api/dedoc_api.py
+++ b/dedoc/api/dedoc_api.py
@@ -1,10 +1,8 @@
-import base64
import dataclasses
import importlib
import json
import os
import tempfile
-import traceback
from typing import Optional
from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
@@ -15,24 +13,23 @@
import dedoc.version
from dedoc.api.api_args import QueryParameters
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
+from dedoc.api.process_handler import ProcessHandler
from dedoc.api.schema.parsed_document import ParsedDocument
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.common.exceptions.missing_file_error import MissingFileError
from dedoc.config import get_config
-from dedoc.dedoc_manager import DedocManager
from dedoc.utils.utils import save_upload_file
config = get_config()
+logger = config["logger"]
PORT = config["api_port"]
static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web")
static_files_dirs = config.get("static_files_dirs")
app = FastAPI()
app.mount("/web", StaticFiles(directory=config.get("static_path", static_path)), name="web")
-
module_api_args = importlib.import_module(config["import_path_init_api_args"])
-logger = config["logger"]
-manager = DedocManager(config=config)
+process_handler = ProcessHandler(logger=logger)
@app.get("/")
@@ -62,27 +59,20 @@ def _get_static_file_path(request: Request) -> str:
return os.path.abspath(os.path.join(directory, file))
-def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None:
- for attachment in document_tree.attachments:
- with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
- attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
-
-
@app.post("/upload", response_model=ParsedDocument)
-async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
+async def upload(request: Request, file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
parameters = dataclasses.asdict(query_params)
if not file or file.filename == "":
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__)
- return_format = str(parameters.get("return_format", "json")).lower()
-
with tempfile.TemporaryDirectory() as tmpdir:
file_path = save_upload_file(file, tmpdir)
- document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
+ document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir)
- if return_format == "html":
- __add_base64_info_to_attachments(document_tree, tmpdir)
+ if document_tree is None:
+ return JSONResponse(status_code=499, content={})
+ return_format = str(parameters.get("return_format", "json")).lower()
if return_format == "html":
html_content = json2html(
text="",
@@ -102,24 +92,25 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
return HTMLResponse(content=html_content)
if return_format == "ujson":
- return UJSONResponse(content=document_tree.to_api_schema().model_dump())
+ return UJSONResponse(content=document_tree.model_dump())
if return_format == "collapsed_tree":
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
return HTMLResponse(content=html_content)
if return_format == "pretty_json":
- return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
+ return PlainTextResponse(content=json.dumps(document_tree.model_dump(), ensure_ascii=False, indent=2))
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
- return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
+ return ORJSONResponse(content=document_tree.model_dump())
@app.get("/upload_example")
-async def upload_example(file_name: str, return_format: Optional[str] = None) -> Response:
+async def upload_example(request: Request, file_name: str, return_format: Optional[str] = None) -> Response:
file_path = os.path.join(static_path, "examples", file_name)
parameters = {} if return_format is None else {"return_format": return_format}
- document_tree = manager.parse(file_path, parameters=parameters)
+ with tempfile.TemporaryDirectory() as tmpdir:
+ document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir)
if return_format == "html":
html_page = json2html(
@@ -130,12 +121,11 @@ async def upload_example(file_name: str, return_format: Optional[str] = None) ->
tabs=0
)
return HTMLResponse(content=html_page)
- return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200)
+ return ORJSONResponse(content=document_tree.model_dump(), status_code=200)
@app.exception_handler(DedocError)
async def exception_handler(request: Request, exc: DedocError) -> Response:
- logger.error(f"Exception {exc}\n{traceback.format_exc()}")
result = {"message": exc.msg}
if exc.filename:
result["file_name"] = exc.filename
diff --git a/dedoc/api/process_handler.py b/dedoc/api/process_handler.py
new file mode 100644
index 00000000..2868357d
--- /dev/null
+++ b/dedoc/api/process_handler.py
@@ -0,0 +1,163 @@
+import asyncio
+import base64
+import logging
+import os
+import pickle
+import signal
+import traceback
+from multiprocessing import Process, Queue
+from queue import Empty
+from typing import Optional
+
+from anyio import get_cancelled_exc_class
+from fastapi import Request
+
+from dedoc.api.cancellation import cancel_on_disconnect
+from dedoc.api.schema import ParsedDocument
+from dedoc.common.exceptions.dedoc_error import DedocError
+from dedoc.config import get_config
+from dedoc.dedoc_manager import DedocManager
+
+
+class ProcessHandler:
+    """
+    Class for file parsing by DedocManager with support for client disconnection.
+    If the client disconnects during file parsing, the parsing process is fully terminated and the API is available to receive new connections.
+
+    Handler uses the following algorithm:
+    1. Master process is used for checking the current connection (client disconnect)
+    2. Child process works in the background and waits for the input file in the input_queue
+    3. Master process calls the child process for parsing and transfers data through the input_queue
+    4. Child process parses the file using DedocManager
+    5. The result of parsing is transferred to the master process through the output_queue
+    6. If the client disconnects, the child process is terminated. A new child process with new queues is started on the next request
+    7. If the child process dies without returning a result, an error is raised and a new child process is started on the next request
+
+    NOTE(review): results in the output_queue are not matched to requests, so the handler presumably relies on
+    requests being handled one at a time - confirm before serving requests concurrently.
+    """
+    def __init__(self, logger: logging.Logger) -> None:
+        self.logger = logger
+        self.__start_process()
+
+    def __start_process(self) -> None:
+        """
+        Create new queues and start a new child process that uses them.
+        """
+        self.input_queue = Queue()
+        self.output_queue = Queue()
+        self.process = Process(target=self.__parse_file, args=[self.input_queue, self.output_queue])
+        self.process.start()
+
+    async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
+        """
+        Handle request in a separate process.
+        Checks for client disconnection and terminates the child process if the client disconnected.
+
+        :param request: request of the client, it is used for detecting the disconnection
+        :param parameters: parsing parameters that are passed to the child process
+        :param file_path: path to the file that should be parsed
+        :param tmpdir: directory that is passed to the parser as `attachments_dir`
+        :return: parsed document or None if the client disconnected before the result was received
+        :raises DedocError: if parsing failed or the child process died without returning a result
+        """
+        if self.process is None or not self.process.is_alive():
+            self.logger.info("Initialization of a new parsing process")
+            self.__start_process()
+
+        self.logger.info("Putting file to the input queue")
+        self.input_queue.put(pickle.dumps((parameters, file_path, tmpdir)), block=True)
+
+        # the executor thread works with the process and the queue of this request even if the attributes are replaced later
+        process, output_queue = self.process, self.output_queue
+        loop = asyncio.get_running_loop()
+        async with cancel_on_disconnect(request, self.logger):
+            try:
+                result = await loop.run_in_executor(None, self.__wait_for_result, process, output_queue)
+            except get_cancelled_exc_class():
+                self.logger.warning("Terminating the parsing process")
+                # the awaited future is already cancelled here; the executor thread stops itself when it sees the dead process
+                process.terminate()
+                if self.process is process:
+                    self.process = None
+                return None
+
+        if result is None:
+            if self.process is process:
+                self.process = None
+            raise DedocError(msg="Parsing process terminated unexpectedly", filename=os.path.basename(file_path), code=500)
+
+        result = pickle.loads(result)
+        if isinstance(result, ParsedDocument):
+            self.logger.info("Got the result from the output queue")
+            return result
+
+        raise DedocError.from_dict(result)
+
+    @staticmethod
+    def __wait_for_result(process: Process, output_queue: Queue, poll_interval: float = 1.0) -> Optional[bytes]:
+        """
+        Wait for the pickled result of the child process. The method blocks, so it should be run in an executor thread.
+        The queue is polled with a timeout instead of one endless `get`: otherwise the thread would stay blocked forever
+        after the child process is terminated or dies, and every such request would occupy one more executor thread.
+
+        :param process: child process that handles the request
+        :param output_queue: queue where the child process puts the result
+        :param poll_interval: number of seconds between checks that the child process is alive
+        :return: pickled result or None if the child process is not alive and the queue is empty
+        """
+        while True:
+            try:
+                return output_queue.get(timeout=poll_interval)
+            except Empty:
+                if not process.is_alive():
+                    return None
+
+    def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
+        """
+        Function for file parsing in a separate (child) process.
+        It's a background process, i.e. it is waiting for a task in the input queue.
+        The result of parsing is returned in the output queue:
+        a pickled ParsedDocument (API schema) on success or a pickled dictionary with the error description on failure.
+
+        Operations with `signal` restore the default handlers, so killing the child process does not affect the master process.
+        See the issue for more details: https://github.com/fastapi/fastapi/issues/1487
+        """
+        signal.set_wakeup_fd(-1)
+        signal.signal(signal.SIGTERM, signal.SIG_DFL)
+        signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+        manager = DedocManager(config=get_config())
+        manager.logger.info("Parsing process is waiting for the task in the input queue")
+
+        while True:
+            file_path = None
+            try:
+                parameters, file_path, tmp_dir = pickle.loads(input_queue.get(block=True))
+                manager.logger.info("Parsing process got task from the input queue")
+                return_format = str(parameters.get("return_format", "json")).lower()
+                document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmp_dir})
+
+                if return_format == "html":
+                    self.__add_base64_info_to_attachments(document_tree, tmp_dir)
+
+                output_queue.put(pickle.dumps(document_tree.to_api_schema()), block=True)
+                manager.logger.info("Parsing process put task to the output queue")
+            except DedocError as e:
+                tb = traceback.format_exc()
+                manager.logger.error(f"Exception {e}: {e.msg_api}\n{tb}")
+                output_queue.put(pickle.dumps(e.__dict__), block=True)
+            except Exception as e:
+                # the worker must survive any failure of parsing, the error is reported to the master process instead
+                exc_message = f"Exception {e}\n{traceback.format_exc()}"
+                filename = "" if file_path is None else os.path.basename(file_path)
+                manager.logger.error(exc_message)
+                output_queue.put(pickle.dumps({"msg": exc_message, "filename": filename}), block=True)
+
+    def __add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
+        """
+        Add the base64-encoded content of each attachment file to the metadata of the attachment.
+        """
+        for attachment in document_tree.attachments:
+            with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
+                attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
diff --git a/dedoc/common/exceptions/bad_file_error.py b/dedoc/common/exceptions/bad_file_error.py
index 4b800c9d..2c1176bc 100644
--- a/dedoc/common/exceptions/bad_file_error.py
+++ b/dedoc/common/exceptions/bad_file_error.py
@@ -9,11 +9,7 @@ class BadFileFormatError(DedocError):
"""
def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
- super(BadFileFormatError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+ super(BadFileFormatError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=415)
def __str__(self) -> str:
return f"BadFileFormatError({self.msg})"
-
- @property
- def code(self) -> int:
- return 415
diff --git a/dedoc/common/exceptions/bad_parameters_error.py b/dedoc/common/exceptions/bad_parameters_error.py
index dc8c0aa9..98e81d29 100644
--- a/dedoc/common/exceptions/bad_parameters_error.py
+++ b/dedoc/common/exceptions/bad_parameters_error.py
@@ -14,7 +14,3 @@ def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[s
def __str__(self) -> str:
return f"BadParametersError({self.msg})"
-
- @property
- def code(self) -> int:
- return 400
diff --git a/dedoc/common/exceptions/conversion_error.py b/dedoc/common/exceptions/conversion_error.py
index f95207b3..70551230 100644
--- a/dedoc/common/exceptions/conversion_error.py
+++ b/dedoc/common/exceptions/conversion_error.py
@@ -9,11 +9,7 @@ class ConversionError(DedocError):
"""
def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
- super(ConversionError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+ super(ConversionError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=415)
def __str__(self) -> str:
return f"ConversionError({self.msg})"
-
- @property
- def code(self) -> int:
- return 415
diff --git a/dedoc/common/exceptions/dedoc_error.py b/dedoc/common/exceptions/dedoc_error.py
index f91c8bd0..9c793256 100644
--- a/dedoc/common/exceptions/dedoc_error.py
+++ b/dedoc/common/exceptions/dedoc_error.py
@@ -9,17 +9,26 @@ def __init__(self,
msg_api: Optional[str] = None,
filename: Optional[str] = None,
version: Optional[str] = None,
- metadata: Optional[dict] = None) -> None:
+ metadata: Optional[dict] = None,
+ code: Optional[int] = None) -> None:
super(DedocError, self).__init__()
self.msg = msg
self.msg_api = msg if msg_api is None else msg_api
self.filename = filename
self.version = version if version is not None else dedoc.version.__version__
self.metadata = metadata
+ self.code = 400 if code is None else code
def __str__(self) -> str:
return f"DedocError({self.msg})"
- @property
- def code(self) -> int:
- return 400
+    @staticmethod
+    def from_dict(error_dict: dict) -> "DedocError":
+        """Restore an error from the dictionary of its attributes, e.g. from `__dict__` of another error."""
+        return DedocError(
+            msg=error_dict.get("msg", ""),
+            msg_api=error_dict.get("msg_api"),  # None lets the constructor fall back to `msg`
+            filename=error_dict.get("filename", ""),
+            version=error_dict.get("version", dedoc.version.__version__),
+            metadata=error_dict.get("metadata", {}), code=error_dict.get("code", 500)  # no code -> unexpected failure
+        )
diff --git a/dedoc/common/exceptions/java_not_found_error.py b/dedoc/common/exceptions/java_not_found_error.py
index c6d96384..105556ba 100644
--- a/dedoc/common/exceptions/java_not_found_error.py
+++ b/dedoc/common/exceptions/java_not_found_error.py
@@ -9,11 +9,7 @@ class JavaNotFoundError(DedocError):
"""
def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
- super(JavaNotFoundError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+ super(JavaNotFoundError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=500)
def __str__(self) -> str:
return f"JavaNotFoundError({self.msg})"
-
- @property
- def code(self) -> int:
- return 500
diff --git a/dedoc/common/exceptions/minio_error.py b/dedoc/common/exceptions/minio_error.py
deleted file mode 100644
index 6d43c64f..00000000
--- a/dedoc/common/exceptions/minio_error.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from typing import Optional
-
-from dedoc.common.exceptions.dedoc_error import DedocError
-
-
-class MinioError(DedocError):
- """
- Raise if there is no file in minio
- """
-
- def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
- super(MinioError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
-
- def __str__(self) -> str:
- return f"MinioError({self.msg})"
-
- @property
- def code(self) -> int:
- return 404
diff --git a/dedoc/common/exceptions/missing_file_error.py b/dedoc/common/exceptions/missing_file_error.py
index 7bc861e9..1272376f 100644
--- a/dedoc/common/exceptions/missing_file_error.py
+++ b/dedoc/common/exceptions/missing_file_error.py
@@ -13,7 +13,3 @@ def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[s
def __str__(self) -> str:
return f"MissingFileError({self.msg})"
-
- @property
- def code(self) -> int:
- return 400
diff --git a/dedoc/common/exceptions/recognize_error.py b/dedoc/common/exceptions/recognize_error.py
index 05c388ce..767cba6a 100644
--- a/dedoc/common/exceptions/recognize_error.py
+++ b/dedoc/common/exceptions/recognize_error.py
@@ -6,11 +6,7 @@
class RecognizeError(DedocError):
def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
- super(RecognizeError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+ super(RecognizeError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=500)
def __str__(self) -> str:
return f"RecognizeError({self.msg})"
-
- @property
- def code(self) -> int:
- return 500
diff --git a/dedoc/common/exceptions/structure_extractor_error.py b/dedoc/common/exceptions/structure_extractor_error.py
index 1bb9bd00..803d4f1c 100644
--- a/dedoc/common/exceptions/structure_extractor_error.py
+++ b/dedoc/common/exceptions/structure_extractor_error.py
@@ -13,7 +13,3 @@ def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[s
def __str__(self) -> str:
return f"StructureExtractorError({self.msg})"
-
- @property
- def code(self) -> int:
- return 400
diff --git a/dedoc/common/exceptions/tabby_pdf_error.py b/dedoc/common/exceptions/tabby_pdf_error.py
index eff2ec8d..c3380be1 100644
--- a/dedoc/common/exceptions/tabby_pdf_error.py
+++ b/dedoc/common/exceptions/tabby_pdf_error.py
@@ -9,11 +9,7 @@ class TabbyPdfError(DedocError):
"""
def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
- super(TabbyPdfError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+ super(TabbyPdfError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=500)
def __str__(self) -> str:
return f"TabbyPdfError({self.msg})"
-
- @property
- def code(self) -> int:
- return 500
diff --git a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py
index 5c8e81b9..eb44051b 100644
--- a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py
+++ b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py
@@ -34,7 +34,7 @@
docx_document = structure_extractor.extract(document=docx_document, parameters={"patterns": patterns})
docx_document.metadata = docx_metadata_extractor.extract(file_path=docx_file_path)
-docx_parsed_document = structure_constructor.construct(document=docx_document)
+docx_parsed_document = structure_constructor.construct(document=docx_document).to_api_schema()
html = json2html(
paragraph=docx_parsed_document.content.structure,
attachments=docx_parsed_document.attachments,
@@ -46,7 +46,7 @@
def print_document_tree(document: UnstructuredDocument, patterns: List[AbstractPattern]) -> None:
document = structure_extractor.extract(document=document, parameters={"patterns": patterns})
- parsed_document = structure_constructor.construct(document=document)
+ parsed_document = structure_constructor.construct(document=document).to_api_schema()
html = json2html(paragraph=parsed_document.content.structure, attachments=parsed_document.attachments, tables=parsed_document.content.tables, text="")
print(f"\n\nDocument tree\n{html2text.html2text(html)}")
diff --git a/docs/source/tutorials/using_patterns.rst b/docs/source/tutorials/using_patterns.rst
index e2ea2d71..dc229423 100644
--- a/docs/source/tutorials/using_patterns.rst
+++ b/docs/source/tutorials/using_patterns.rst
@@ -91,7 +91,7 @@ which applies patterns if lines match them, else line becomes simple raw text li
:language: python
:lines: 30-37
-Let's see the resulting tree. In the code below we use an auxiliary function to convert :class:`~dedoc.data_structures.ParsedDocument`
+Let's see the resulting tree. In the code below we use an auxiliary function to convert :class:`~dedoc.api.schema.ParsedDocument`
to the HTML representation and print it:
.. literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py
diff --git a/resources/benchmarks/time_benchmark.csv b/resources/benchmarks/time_benchmark.csv
index 9be87ffa..1e9467a9 100644
--- a/resources/benchmarks/time_benchmark.csv
+++ b/resources/benchmarks/time_benchmark.csv
@@ -1,13 +1,13 @@
,Dataset,total_file_size,total_files,total_pages,total_time_raw,throughput_raw,mean_time_on_file_raw,mean_time_cpu_on_page_raw,total_time_indp_cpu,throughput_indp_cpu,mean_time_on_file_indp_cpu,mean_time_cpu_on_page_indp_cpu,cpu_performance,version
-0,images,105240044,259,259,819.3893718719482,128437.16017401138,3.1636655284631208,3.1636655284631208,845.0002507880153,124544.39380563155,3.2625492308417576,3.262549230841758,1.0312560545636043,2.1
-0,law_html,215921201,1526,1526,227.0532796382904,950971.5135759128,0.14878982938289018,0.14878982938289018,234.1500693355101,922148.7809623912,0.15344041240859116,0.15344041240859116,1.0312560545636043,2.1
-0,other_html,215921201,1526,1526,156.9773073196411,1375493.0867831479,0.1028684844820715,0.1028684844820715,161.8837986024715,1333803.6472088536,0.10608374744591842,0.1060837474459184,1.0312560545636043,2.1
-0,txt,2483851,999,999,13.047960042953491,190363.16725551253,0.013061021064017509,0.013061021064017509,13.455787793999773,184593.5026641549,0.013469257051050823,0.013469257051050825,1.0312560545636043,2.1
-0,pdf_text_layer_true,109643533,33,445,417.5641739368439,262578.8797115134,12.653459816267997,1.0459708427522103,430.6155825412202,254620.4490626033,13.048957046703642,1.0786637644852126,1.0312560545636043,2.1
-0,pdf_text_layer_auto,109643533,33,445,744.6476347446442,147242.16916045017,22.565079840746794,1.9358688088909384,767.9223818468816,142779.44697523108,23.270375207481262,1.9963764300096132,1.0312560545636043,2.1
-0,pdf_text_layer_auto_tabby,109643533,33,445,861.5465660095215,127263.61792357055,26.107471697258227,2.408536994270351,888.475112485801,123406.42012271588,26.923488257145486,2.4838183579817246,1.0312560545636043,2.1
-0,pdf_text_layer_false,109643533,33,445,1923.4744081497192,57002.85511231277,58.28710327726422,4.837624405643553,1983.5946292025433,55275.171340869965,60.10892815765283,4.988829458024572,1.0312560545636043,2.1
-0,pdf_text_layer_tabby,109643533,33,445,459.48560762405396,238622.3445973723,13.923806291637998,1.2937336014756313,473.84731484714223,231390.00594604985,14.359009540822491,1.334170609514122,1.0312560545636043,2.1
-0,docx,417727,22,22,16.942837953567505,24655.078514284138,0.770128997889432,0.770128997889432,17.472404221106515,23907.814557963888,0.794200191868478,0.794200191868478,1.0312560545636043,2.1
-0,pdf,6086144,18,117,375.61194705963135,16203.275874592393,20.86733039220174,3.0367271868588284,387.35209457166883,15712.175267130062,21.519560809537158,3.131643297506068,1.0312560545636043,2.1
-0,pdf_tables,16536264,2,267,1197.7023212909698,13806.656049706928,598.8511606454849,4.039958413717207,1235.137770396196,13388.193929731136,617.568885198098,4.166231574331044,1.0312560545636043,2.1
+0,images,105240044,259,259,780.3763222694397,134858.06910946214,3.0130359933183,3.0130359933183,1066.0429167915163,98720.2694585152,4.115995817727862,4.115995817727862,1.366062611550437,2.3.1
+0,law_html,215921201,1526,1526,204.2208013534546,1057292.8887214332,0.13382752382270943,0.13382752382270943,278.9784012298232,773971.0316216326,0.18281677669057877,0.18281677669057877,1.366062611550437,2.3.1
+0,other_html,215921201,1526,1526,152.16186046600342,1419023.139824463,0.0997128836605527,0.0997128836605527,207.86262848656185,1038768.7415102572,0.13621404225855954,0.13621404225855951,1.366062611550437,2.3.1
+0,txt,2483851,999,999,12.656875133514404,196245.20063589464,0.012669544678192597,0.012669544678192597,17.290083898956475,143657.5446663917,0.01730739129024672,0.01730739129024672,1.366062611550437,2.3.1
+0,pdf_text_layer_true,109643533,33,445,294.70041608810425,372050.825225916,8.930315639033463,0.7666412830448923,402.57922002631614,272352.6887275323,12.199370303827761,1.0472799932386834,1.366062611550437,2.3.1
+0,pdf_text_layer_auto,109643533,33,445,715.7886617183685,153178.6389809286,21.69056550661723,1.9423069545744724,977.8121285451869,112131.49213349436,29.63067056197536,2.65331291079858,1.366062611550437,2.3.1
+0,pdf_text_layer_auto_tabby,109643533,33,445,844.7789170742035,129789.61806923167,25.59936112346071,2.380779043078811,1154.0208936411366,95010.00684143213,34.97033011033747,3.2522932371127915,1.366062611550437,2.3.1
+0,pdf_text_layer_false,109643533,33,445,1591.9220836162567,68874.93686307219,48.240063139886566,3.9471288925826884,2174.665238929637,50418.57985184248,65.89894663423142,5.392025203127692,1.366062611550437,2.3.1
+0,pdf_text_layer_tabby,109643533,33,445,421.8361530303955,259919.71577670728,12.782913728193803,1.1935813540785523,576.2545968550919,190269.25528816486,17.46226051076036,1.630506861650454,1.366062611550437,2.3.1
+0,docx,417727,22,22,17.311132431030273,24130.541526631885,0.7868696559559215,0.7868696559559215,23.64809077762868,17664.30127184617,1.07491321716494,1.07491321716494,1.366062611550437,2.3.1
+0,pdf,6086144,18,117,310.7921574115753,19582.68204284271,17.26623096730974,2.519494602322346,424.5615462030511,14335.127743974337,23.58675256683617,3.441787376235694,1.366062611550437,2.3.1
+0,pdf_tables,16536264,2,267,1083.6798040866852,15259.363455551895,541.8399020433426,3.6198095974726074,1480.3744632551231,11170.325083586768,740.1872316275616,4.944886552038766,1.366062611550437,2.3.1
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index a82b2131..39cfbce4 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -100,7 +100,10 @@ def get_times(spend_page_times: List, total_size: int, total_time: int, total_fi
file_size = os.path.getsize(file_path)
total_size += file_size
time_start = time.time()
- send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
+ try:
+ send_file(host=host, file_name=file, file_path=file_path, parameters=parameters)
+ except AssertionError as e:
+ print(f"Error on file {file_path}: {e}")
time_finish = time.time()
spend_file_time = time_finish - time_start
pages = page_func(file_path)
diff --git a/tests/api_tests/test_api_misc_main.py b/tests/api_tests/test_api_misc_main.py
index cbc47976..550ebc59 100644
--- a/tests/api_tests/test_api_misc_main.py
+++ b/tests/api_tests/test_api_misc_main.py
@@ -1,7 +1,9 @@
import json
import os
+import time
import requests
+from requests import ReadTimeout
from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
@@ -14,6 +16,26 @@ def __get_version(self) -> str:
version = file.read().strip()
return version
+    def test_cancellation(self) -> None:
+        """
+        The first request is dropped by the client after 1 second,
+        the API must still answer the second request successfully and in time.
+        """
+        started_at = time.time()
+        upload_url = f"http://{self._get_host()}:{self._get_port()}/upload"
+        with open(self._get_abs_path(os.path.join("pdf_with_text_layer", "article.pdf")), "rb") as pdf_file:
+            try:
+                requests.post(upload_url, files={"file": ("article.pdf", pdf_file)}, data=dict(pdf_with_text_layer=False), timeout=1)
+            except ReadTimeout:
+                pass
+
+        with open(self._get_abs_path(os.path.join("txt", "example.txt")), "rb") as txt_file:
+            response = requests.post(upload_url, files={"file": ("example.txt", txt_file)}, data={}, timeout=60)
+
+        elapsed = time.time() - started_at
+        self.assertLess(elapsed, 60)
+        self.assertEqual(200, response.status_code)
+
def test_bin_file(self) -> None:
file_name = "file.bin"
result = self._send_request(file_name, expected_code=415)