diff --git a/dedoc/api/api_utils.py b/dedoc/api/api_utils.py index ad91f2d8..e4e30201 100644 --- a/dedoc/api/api_utils.py +++ b/dedoc/api/api_utils.py @@ -1,5 +1,6 @@ from typing import Dict, Iterator, List, Optional, Set +from dedoc.api.schema import LineMetadata, ParsedDocument, Table, TreeNode from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation @@ -10,10 +11,6 @@ from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation from dedoc.data_structures.concrete_annotations.underlined_annotation import UnderlinedAnnotation from dedoc.data_structures.hierarchy_level import HierarchyLevel -from dedoc.data_structures.line_metadata import LineMetadata -from dedoc.data_structures.parsed_document import ParsedDocument -from dedoc.data_structures.table import Table -from dedoc.data_structures.tree_node import TreeNode from dedoc.extensions import converted_mimes, recognized_mimes @@ -39,7 +36,7 @@ def _node2tree(paragraph: TreeNode, depth: int, depths: Set[int] = None) -> str: space = "".join(space) node_result = [] - node_result.append(f" {space} {paragraph.metadata.hierarchy_level.line_type} {paragraph.node_id} ") + node_result.append(f" {space} {paragraph.metadata.paragraph_type} {paragraph.node_id} ") for text in __prettify_text(paragraph.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = "".join(space) @@ -98,7 +95,7 @@ def json2tree(paragraph: TreeNode) -> str: depths = {d for d in depths if d <= depth} space = [space_symbol] * 4 * (depth - 1) + 4 * ["-"] space = __add_vertical_line(depths, space) - node_result.append(f"

{space} {node.metadata.hierarchy_level.line_type} {node.node_id}

") + node_result.append(f"

{space} {node.metadata.paragraph_type} {node.node_id}

") for text in __prettify_text(node.text): space = [space_symbol] * 4 * (depth - 1) + 4 * [space_symbol] space = __add_vertical_line(depths, space) @@ -136,14 +133,14 @@ def json2html(text: str, ptext = __annotations2html(paragraph=paragraph, table2id=table2id, attach2id=attach2id, tabs=tabs) - if paragraph.metadata.hierarchy_level.line_type in [HierarchyLevel.header, HierarchyLevel.root]: + if paragraph.metadata.paragraph_type in [HierarchyLevel.header, HierarchyLevel.root]: ptext = f"{ptext.strip()}" - elif paragraph.metadata.hierarchy_level.line_type == HierarchyLevel.list_item: + elif paragraph.metadata.paragraph_type == HierarchyLevel.list_item: ptext = f"{ptext.strip()}" else: ptext = ptext.strip() - ptext = f'

{" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.hierarchy_level.line_type}

' + ptext = f'

{" " * tabs} {ptext} id = {paragraph.node_id} ; type = {paragraph.metadata.paragraph_type}

' if hasattr(paragraph.metadata, "uid"): ptext = f'
{ptext}
' text += ptext @@ -259,11 +256,10 @@ def table2html(table: Table, table2id: Dict[str, int]) -> str: text += ' style="display: none" ' cell_node = TreeNode( node_id="0", - text=cell.get_text(), - annotations=cell.get_annotations(), - metadata=LineMetadata(page_id=table.metadata.page_id, line_id=0), - subparagraphs=[], - parent=None + text="\n".join([line.text for line in cell.lines]), + annotations=cell.lines[0].annotations if cell.lines else [], + metadata=LineMetadata(page_id=0, line_id=0, paragraph_type=HierarchyLevel.raw_text), + subparagraphs=[] ) text += f' colspan="{cell.colspan}" rowspan="{cell.rowspan}">{__annotations2html(cell_node, {}, {})}\n'
diff --git a/dedoc/api/cancellation.py b/dedoc/api/cancellation.py
new file mode 100644
index 00000000..e9a6ddbb
--- /dev/null
+++ b/dedoc/api/cancellation.py
@@ -0,0 +1,39 @@
+import logging
+from contextlib import asynccontextmanager
+from typing import AsyncIterator
+
+from anyio import create_task_group
+from fastapi import Request
+
+
+@asynccontextmanager
+async def cancel_on_disconnect(request: Request, logger: logging.Logger) -> AsyncIterator[None]:
+    """
+    Async context manager for async code that needs to be cancelled if client disconnects prematurely.
+    The client disconnect is monitored through the Request object.
+
+    :param request: request whose incoming messages are watched for the "http.disconnect" message
+    :param logger: logger used to report the client disconnection
+
+    Source: https://github.com/dorinclisu/runner-with-api
+    See discussion: https://github.com/fastapi/fastapi/discussions/8805
+    """
+    async with create_task_group() as task_group:
+        async def watch_disconnect() -> None:
+            while True:
+                message = await request.receive()
+
+                if message["type"] == "http.disconnect":
+                    client = f"{request.client.host}:{request.client.port}" if request.client else "-:-"
+                    logger.warning(f"{client} - `{request.method} {request.url.path}` 499 DISCONNECTED")
+
+                    task_group.cancel_scope.cancel()
+                    break
+
+        task_group.start_soon(watch_disconnect)
+
+        try:
+            yield
+        finally:
+            # the wrapped code has finished (or failed): stop the watcher task so that the task group can exit
+            task_group.cancel_scope.cancel()
diff --git a/dedoc/api/dedoc_api.py b/dedoc/api/dedoc_api.py
index 1458ffd3..545b8efc 100644
--- a/dedoc/api/dedoc_api.py
+++ b/dedoc/api/dedoc_api.py
@@ -1,10 +1,8 @@
-import base64
 import dataclasses
 import importlib
 import json
 import os
 import tempfile
-import traceback
 from typing import Optional
 
 from fastapi import Depends, FastAPI, File, Request, Response, UploadFile
@@ -15,24 +13,23 @@
 import dedoc.version
 from dedoc.api.api_args import QueryParameters
 from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
+from dedoc.api.process_handler import ProcessHandler
 from dedoc.api.schema.parsed_document import ParsedDocument
 from dedoc.common.exceptions.dedoc_error import DedocError
 from dedoc.common.exceptions.missing_file_error import MissingFileError
 from dedoc.config import get_config
-from dedoc.dedoc_manager import DedocManager
 from dedoc.utils.utils import save_upload_file
 
 config = get_config()
+logger = config["logger"]
 PORT = config["api_port"]
 static_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "web")
 static_files_dirs = config.get("static_files_dirs")
 
 app = FastAPI()
 app.mount("/web", StaticFiles(directory=config.get("static_path", static_path)), name="web")
-
 module_api_args = importlib.import_module(config["import_path_init_api_args"])
-logger = config["logger"]
-manager = 
DedocManager(config=config)
+process_handler = ProcessHandler(logger=logger)
 
 
 @app.get("/")
@@ -62,27 +59,20 @@ def _get_static_file_path(request: Request) -> str:
     return os.path.abspath(os.path.join(directory, file))
 
 
-def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None:
-    for attachment in document_tree.attachments:
-        with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
-            attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
-
-
 @app.post("/upload", response_model=ParsedDocument)
-async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
+async def upload(request: Request, file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response:
     parameters = dataclasses.asdict(query_params)
     if not file or file.filename == "":
         raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.version.__version__)
 
-    return_format = str(parameters.get("return_format", "json")).lower()
-
     with tempfile.TemporaryDirectory() as tmpdir:
         file_path = save_upload_file(file, tmpdir)
-        document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
+        document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir)
 
-        if return_format == "html":
-            __add_base64_info_to_attachments(document_tree, tmpdir)
+    if document_tree is None:
+        return JSONResponse(status_code=499, content={})
 
+    return_format = str(parameters.get("return_format", "json")).lower()
     if return_format == "html":
         html_content = json2html(
             text="",
@@ -102,24 +92,29 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
         return HTMLResponse(content=html_content)
 
     if return_format == "ujson":
-        return UJSONResponse(content=document_tree.to_api_schema().model_dump())
+        return UJSONResponse(content=document_tree.model_dump())
 
     if return_format == "collapsed_tree":
         html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
         return HTMLResponse(content=html_content)
 
     if return_format == "pretty_json":
-        return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
+        return PlainTextResponse(content=json.dumps(document_tree.model_dump(), ensure_ascii=False, indent=2))
 
     logger.info(f"Send result. File {file.filename} with parameters {parameters}")
-    return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
+    return ORJSONResponse(content=document_tree.model_dump())
 
 
 @app.get("/upload_example")
-async def upload_example(file_name: str, return_format: Optional[str] = None) -> Response:
+async def upload_example(request: Request, file_name: str, return_format: Optional[str] = None) -> Response:
     file_path = os.path.join(static_path, "examples", file_name)
     parameters = {} if return_format is None else {"return_format": return_format}
-    document_tree = manager.parse(file_path, parameters=parameters)
+    with tempfile.TemporaryDirectory() as tmpdir:
+        document_tree = await process_handler.handle(request=request, parameters=parameters, file_path=file_path, tmpdir=tmpdir)
+
+    # the handler returns None if the client disconnected during parsing: there is nothing to render (same response as in `upload`)
+    if document_tree is None:
+        return JSONResponse(status_code=499, content={})
 
     if return_format == "html":
         html_page = json2html(
@@ -130,12 +125,11 @@ async def upload_example(file_name: str, return_format: Optional[str] = None) ->
             tabs=0
         )
         return HTMLResponse(content=html_page)
-    return ORJSONResponse(content=document_tree.to_api_schema().model_dump(), status_code=200)
+    return ORJSONResponse(content=document_tree.model_dump(), status_code=200)
 
 
 @app.exception_handler(DedocError)
 async def exception_handler(request: Request, exc: DedocError) -> Response:
-    logger.error(f"Exception {exc}\n{traceback.format_exc()}")
     result = {"message": exc.msg}
     if exc.filename:
         result["file_name"] = exc.filename
diff --git a/dedoc/api/process_handler.py b/dedoc/api/process_handler.py
new file mode 100644
index 00000000..2868357d
--- /dev/null
+++ b/dedoc/api/process_handler.py
@@ -0,0 +1,154 @@
+import asyncio
+import base64
+import logging
+import os
+import pickle
+import queue
+import signal
+import threading
+import traceback
+from multiprocessing import Process, Queue
+from typing import Optional
+
+from anyio import get_cancelled_exc_class
+from fastapi import Request
+
+from dedoc.api.cancellation import cancel_on_disconnect
+from dedoc.api.schema import ParsedDocument
+from dedoc.common.exceptions.dedoc_error import DedocError
+from dedoc.config import get_config
+from dedoc.dedoc_manager import DedocManager
+
+
+class ProcessHandler:
+    """
+    Class for file parsing by DedocManager with support for client disconnection.
+    If client disconnects during file parsing, the process of parsing is fully terminated and API is available to receive new connections.
+
+    Handler uses the following algorithm:
+    1. Master process is used for checking current connection (client disconnect)
+    2. Child process is working on the background and waiting for the input file in the input_queue
+    3. Master process calls the child process for parsing and transfers data through the input_queue
+    4. Child process is parsing file using DedocManager
+    5. The result of parsing is transferred to the master process through the output_queue
+    6. If client disconnects, the child process is terminated. The new child process with queues will start with the new request
+    """
+    def __init__(self, logger: logging.Logger) -> None:
+        self.logger = logger
+        self.__start_process()
+
+    def __start_process(self) -> None:
+        """
+        Create new queues and start a new child process that waits for parsing tasks.
+        The queues are re-created together with the process, the queues of a terminated process are never reused.
+        """
+        self.input_queue = Queue()
+        self.output_queue = Queue()
+        self.process = Process(target=self.__parse_file, args=[self.input_queue, self.output_queue])
+        self.process.start()
+
+    async def handle(self, request: Request, parameters: dict, file_path: str, tmpdir: str) -> Optional[ParsedDocument]:
+        """
+        Handle request in a separate process.
+        Checks for client disconnection and terminates the child process if the client has disconnected.
+
+        :param request: incoming request that is monitored for the client disconnection
+        :param parameters: parsing parameters that are passed to DedocManager
+        :param file_path: path to the file to parse
+        :param tmpdir: directory that is passed to DedocManager as "attachments_dir"
+        :return: parsed document or None if the client disconnected before the parsing was finished
+        :raises DedocError: if the parsing process reported an error instead of a parsed document
+        """
+        if self.process is None:
+            self.logger.info("Initialization of a new parsing process")
+            self.__start_process()
+
+        self.logger.info("Putting file to the input queue")
+        self.input_queue.put(pickle.dumps((parameters, file_path, tmpdir)), block=True)
+
+        # the executor thread polls the queue instead of blocking forever, so it can be released if the child process is terminated
+        stop_waiting = threading.Event()
+        loop = asyncio.get_running_loop()
+        async with cancel_on_disconnect(request, self.logger):
+            try:
+                result = await loop.run_in_executor(None, self.__wait_for_result, self.output_queue, stop_waiting)
+            except get_cancelled_exc_class():
+                self.logger.warning("Terminating the parsing process")
+                # a running executor job cannot be cancelled, so the waiting thread is asked to stop instead
+                stop_waiting.set()
+                if self.process is not None:
+                    self.process.terminate()
+                self.process = None
+                return None
+
+        result = pickle.loads(result)
+        if isinstance(result, ParsedDocument):
+            self.logger.info("Got the result from the output queue")
+            return result
+
+        raise DedocError.from_dict(result)
+
+    @staticmethod
+    def __wait_for_result(output_queue: Queue, stop_waiting: threading.Event, poll_interval: float = 0.1) -> Optional[bytes]:
+        """
+        Wait for the pickled result in the output queue, the function is intended to run in an executor thread.
+        The queue is polled with a timeout, so the thread finishes soon after `stop_waiting` is set.
+
+        :param output_queue: queue where the parsing process puts the pickled result
+        :param stop_waiting: event that is set when the result is not needed anymore
+        :param poll_interval: maximum time in seconds of one waiting step
+        :return: pickled result or None if waiting was stopped
+        """
+        while not stop_waiting.is_set():
+            try:
+                return output_queue.get(block=True, timeout=poll_interval)
+            except queue.Empty:
+                continue
+        return None
+
+    def __parse_file(self, input_queue: Queue, output_queue: Queue) -> None:
+        """
+        Function for file parsing in a separate (child) process.
+        It's a background process, i.e. it is waiting for a task in the input queue.
+        The result of parsing is returned in the output queue.
+
+        Operations with `signal` are used for saving master process while killing child process.
+        See the issue for more details: https://github.com/fastapi/fastapi/issues/1487
+        """
+        signal.set_wakeup_fd(-1)
+        signal.signal(signal.SIGTERM, signal.SIG_DFL)
+        signal.signal(signal.SIGINT, signal.SIG_DFL)
+
+        manager = DedocManager(config=get_config())
+        manager.logger.info("Parsing process is waiting for the task in the input queue")
+
+        while True:
+            file_path = None
+            try:
+                parameters, file_path, tmp_dir = pickle.loads(input_queue.get(block=True))
+                manager.logger.info("Parsing process got task from the input queue")
+                return_format = str(parameters.get("return_format", "json")).lower()
+                document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmp_dir})
+
+                if return_format == "html":
+                    self.__add_base64_info_to_attachments(document_tree, tmp_dir)
+
+                output_queue.put(pickle.dumps(document_tree.to_api_schema()), block=True)
+                manager.logger.info("Parsing process put task to the output queue")
+            except DedocError as e:
+                tb = traceback.format_exc()
+                manager.logger.error(f"Exception {e}: {e.msg_api}\n{tb}")
+                output_queue.put(pickle.dumps(e.__dict__), block=True)
+            except Exception as e:
+                exc_message = f"Exception {e}\n{traceback.format_exc()}"
+                filename = "" if file_path is None else os.path.basename(file_path)
+                manager.logger.error(exc_message)
+                output_queue.put(pickle.dumps({"msg": exc_message, "filename": filename}), block=True)
+
+    def __add_base64_info_to_attachments(self, document_tree: ParsedDocument, attachments_dir: str) -> None:
+        """
+        Read each attachment file from `attachments_dir` and add its base64-encoded content to the attachment metadata ("base64" attribute).
+        """
+        for attachment in document_tree.attachments:
+            with open(os.path.join(attachments_dir, attachment.metadata.temporary_file_name), "rb") as attachment_file:
+                attachment.metadata.add_attribute("base64", base64.b64encode(attachment_file.read()).decode("utf-8"))
diff --git a/dedoc/common/exceptions/bad_file_error.py b/dedoc/common/exceptions/bad_file_error.py
index 4b800c9d..2c1176bc 100644
--- a/dedoc/common/exceptions/bad_file_error.py
+++ b/dedoc/common/exceptions/bad_file_error.py
@@ -9,11 +9,7 @@ class BadFileFormatError(DedocError):
     """
 
     def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
-        super(BadFileFormatError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+        super(BadFileFormatError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=415)
 
     def __str__(self) -> str:
         return f"BadFileFormatError({self.msg})"
-
-    @property
-    def code(self) -> int:
-        return 415
diff --git a/dedoc/common/exceptions/bad_parameters_error.py b/dedoc/common/exceptions/bad_parameters_error.py
index dc8c0aa9..98e81d29 100644
--- a/dedoc/common/exceptions/bad_parameters_error.py
+++ b/dedoc/common/exceptions/bad_parameters_error.py
@@ -14,7 +14,3 @@ def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[s
 
     def __str__(self) -> str:
         return f"BadParametersError({self.msg})"
-
-    @property
-    def code(self) -> int:
-        return 400
diff --git a/dedoc/common/exceptions/conversion_error.py b/dedoc/common/exceptions/conversion_error.py
index f95207b3..70551230 100644
--- a/dedoc/common/exceptions/conversion_error.py
+++ b/dedoc/common/exceptions/conversion_error.py
@@ -9,11 +9,7 @@ class ConversionError(DedocError):
     """
 
     def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
-        super(ConversionError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+        super(ConversionError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=415)
 
     def __str__(self) -> str:
         return f"ConversionError({self.msg})"
-
-    @property
-    def code(self) -> int:
-        return 415
diff --git a/dedoc/common/exceptions/dedoc_error.py b/dedoc/common/exceptions/dedoc_error.py
index f91c8bd0..9c793256 100644
--- a/dedoc/common/exceptions/dedoc_error.py
+++ b/dedoc/common/exceptions/dedoc_error.py
@@ -9,17 +9,30 @@ def __init__(self,
                  msg_api: Optional[str] = None,
                  filename: Optional[str] = None,
                  version: Optional[str] = None,
-                 metadata: Optional[dict] = None) -> None:
+                 metadata: Optional[dict] = None,
+                 code: Optional[int] = None) -> None:
         super(DedocError, self).__init__()
         self.msg = msg
         self.msg_api = msg if msg_api is None else msg_api
         self.filename = filename
         self.version = version if version is not None else dedoc.version.__version__
         self.metadata = metadata
+        self.code = 400 if code is None else code
 
     def __str__(self) -> str:
         return f"DedocError({self.msg})"
 
-    @property
-    def code(self) -> int:
-        return 400
+    @staticmethod
+    def from_dict(error_dict: dict) -> "DedocError":
+        """
+        Restore the error from its dictionary representation, e.g. from the `__dict__` of another DedocError.
+        Missing keys get default values: the code is 500, and a missing "msg_api" falls back to "msg" like in the constructor.
+        """
+        return DedocError(
+            msg=error_dict.get("msg", ""),
+            msg_api=error_dict.get("msg_api"),
+            filename=error_dict.get("filename", ""),
+            version=error_dict.get("version", dedoc.version.__version__),
+            metadata=error_dict.get("metadata", {}),
+            code=error_dict.get("code", 500)
+        )
diff --git a/dedoc/common/exceptions/java_not_found_error.py b/dedoc/common/exceptions/java_not_found_error.py
index c6d96384..105556ba 100644
--- a/dedoc/common/exceptions/java_not_found_error.py
+++ b/dedoc/common/exceptions/java_not_found_error.py
@@ -9,11 +9,7 @@ class JavaNotFoundError(DedocError):
     """
 
     def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None:
-        super(JavaNotFoundError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version)
+        super(JavaNotFoundError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=500)
 
     def __str__(self) -> str:
         return f"JavaNotFoundError({self.msg})"
-
-    @property
-    def code(self) -> int:
-        return 500
diff --git a/dedoc/common/exceptions/minio_error.py b/dedoc/common/exceptions/minio_error.py
deleted file mode 100644
index 6d43c64f..00000000
--- a/dedoc/common/exceptions/minio_error.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from typing import Optional
-
-from
dedoc.common.exceptions.dedoc_error import DedocError - - -class MinioError(DedocError): - """ - Raise if there is no file in minio - """ - - def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: - super(MinioError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) - - def __str__(self) -> str: - return f"MinioError({self.msg})" - - @property - def code(self) -> int: - return 404 diff --git a/dedoc/common/exceptions/missing_file_error.py b/dedoc/common/exceptions/missing_file_error.py index 7bc861e9..1272376f 100644 --- a/dedoc/common/exceptions/missing_file_error.py +++ b/dedoc/common/exceptions/missing_file_error.py @@ -13,7 +13,3 @@ def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[s def __str__(self) -> str: return f"MissingFileError({self.msg})" - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/recognize_error.py b/dedoc/common/exceptions/recognize_error.py index 05c388ce..767cba6a 100644 --- a/dedoc/common/exceptions/recognize_error.py +++ b/dedoc/common/exceptions/recognize_error.py @@ -6,11 +6,7 @@ class RecognizeError(DedocError): def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: - super(RecognizeError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + super(RecognizeError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=500) def __str__(self) -> str: return f"RecognizeError({self.msg})" - - @property - def code(self) -> int: - return 500 diff --git a/dedoc/common/exceptions/structure_extractor_error.py b/dedoc/common/exceptions/structure_extractor_error.py index 1bb9bd00..803d4f1c 100644 --- a/dedoc/common/exceptions/structure_extractor_error.py +++ b/dedoc/common/exceptions/structure_extractor_error.py @@ -13,7 +13,3 @@ def __init__(self, 
msg: str, msg_api: Optional[str] = None, filename: Optional[s def __str__(self) -> str: return f"StructureExtractorError({self.msg})" - - @property - def code(self) -> int: - return 400 diff --git a/dedoc/common/exceptions/tabby_pdf_error.py b/dedoc/common/exceptions/tabby_pdf_error.py index eff2ec8d..c3380be1 100644 --- a/dedoc/common/exceptions/tabby_pdf_error.py +++ b/dedoc/common/exceptions/tabby_pdf_error.py @@ -9,11 +9,7 @@ class TabbyPdfError(DedocError): """ def __init__(self, msg: str, msg_api: Optional[str] = None, filename: Optional[str] = None, version: Optional[str] = None) -> None: - super(TabbyPdfError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version) + super(TabbyPdfError, self).__init__(msg_api=msg_api, msg=msg, filename=filename, version=version, code=500) def __str__(self) -> str: return f"TabbyPdfError({self.msg})" - - @property - def code(self) -> int: - return 500 diff --git a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py index 5c8e81b9..eb44051b 100644 --- a/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py +++ b/docs/source/_static/code_examples/dedoc_using_patterns_tutorial.py @@ -34,7 +34,7 @@ docx_document = structure_extractor.extract(document=docx_document, parameters={"patterns": patterns}) docx_document.metadata = docx_metadata_extractor.extract(file_path=docx_file_path) -docx_parsed_document = structure_constructor.construct(document=docx_document) +docx_parsed_document = structure_constructor.construct(document=docx_document).to_api_schema() html = json2html( paragraph=docx_parsed_document.content.structure, attachments=docx_parsed_document.attachments, @@ -46,7 +46,7 @@ def print_document_tree(document: UnstructuredDocument, patterns: List[AbstractPattern]) -> None: document = structure_extractor.extract(document=document, parameters={"patterns": patterns}) - parsed_document = 
structure_constructor.construct(document=document) + parsed_document = structure_constructor.construct(document=document).to_api_schema() html = json2html(paragraph=parsed_document.content.structure, attachments=parsed_document.attachments, tables=parsed_document.content.tables, text="") print(f"\n\nDocument tree\n{html2text.html2text(html)}") diff --git a/docs/source/tutorials/using_patterns.rst b/docs/source/tutorials/using_patterns.rst index e2ea2d71..dc229423 100644 --- a/docs/source/tutorials/using_patterns.rst +++ b/docs/source/tutorials/using_patterns.rst @@ -91,7 +91,7 @@ which applies patterns if lines match them, else line becomes simple raw text li :language: python :lines: 30-37 -Let's see the resulting tree. In the code below we use an auxiliary function to convert :class:`~dedoc.data_structures.ParsedDocument` +Let's see the resulting tree. In the code below we use an auxiliary function to convert :class:`~dedoc.api.schema.ParsedDocument` to the HTML representation and print it: .. 
literalinclude:: ../_static/code_examples/dedoc_using_patterns_tutorial.py diff --git a/resources/benchmarks/time_benchmark.csv b/resources/benchmarks/time_benchmark.csv index 9be87ffa..1e9467a9 100644 --- a/resources/benchmarks/time_benchmark.csv +++ b/resources/benchmarks/time_benchmark.csv @@ -1,13 +1,13 @@ ,Dataset,total_file_size,total_files,total_pages,total_time_raw,throughput_raw,mean_time_on_file_raw,mean_time_cpu_on_page_raw,total_time_indp_cpu,throughput_indp_cpu,mean_time_on_file_indp_cpu,mean_time_cpu_on_page_indp_cpu,cpu_performance,version -0,images,105240044,259,259,819.3893718719482,128437.16017401138,3.1636655284631208,3.1636655284631208,845.0002507880153,124544.39380563155,3.2625492308417576,3.262549230841758,1.0312560545636043,2.1 -0,law_html,215921201,1526,1526,227.0532796382904,950971.5135759128,0.14878982938289018,0.14878982938289018,234.1500693355101,922148.7809623912,0.15344041240859116,0.15344041240859116,1.0312560545636043,2.1 -0,other_html,215921201,1526,1526,156.9773073196411,1375493.0867831479,0.1028684844820715,0.1028684844820715,161.8837986024715,1333803.6472088536,0.10608374744591842,0.1060837474459184,1.0312560545636043,2.1 -0,txt,2483851,999,999,13.047960042953491,190363.16725551253,0.013061021064017509,0.013061021064017509,13.455787793999773,184593.5026641549,0.013469257051050823,0.013469257051050825,1.0312560545636043,2.1 -0,pdf_text_layer_true,109643533,33,445,417.5641739368439,262578.8797115134,12.653459816267997,1.0459708427522103,430.6155825412202,254620.4490626033,13.048957046703642,1.0786637644852126,1.0312560545636043,2.1 -0,pdf_text_layer_auto,109643533,33,445,744.6476347446442,147242.16916045017,22.565079840746794,1.9358688088909384,767.9223818468816,142779.44697523108,23.270375207481262,1.9963764300096132,1.0312560545636043,2.1 
-0,pdf_text_layer_auto_tabby,109643533,33,445,861.5465660095215,127263.61792357055,26.107471697258227,2.408536994270351,888.475112485801,123406.42012271588,26.923488257145486,2.4838183579817246,1.0312560545636043,2.1 -0,pdf_text_layer_false,109643533,33,445,1923.4744081497192,57002.85511231277,58.28710327726422,4.837624405643553,1983.5946292025433,55275.171340869965,60.10892815765283,4.988829458024572,1.0312560545636043,2.1 -0,pdf_text_layer_tabby,109643533,33,445,459.48560762405396,238622.3445973723,13.923806291637998,1.2937336014756313,473.84731484714223,231390.00594604985,14.359009540822491,1.334170609514122,1.0312560545636043,2.1 -0,docx,417727,22,22,16.942837953567505,24655.078514284138,0.770128997889432,0.770128997889432,17.472404221106515,23907.814557963888,0.794200191868478,0.794200191868478,1.0312560545636043,2.1 -0,pdf,6086144,18,117,375.61194705963135,16203.275874592393,20.86733039220174,3.0367271868588284,387.35209457166883,15712.175267130062,21.519560809537158,3.131643297506068,1.0312560545636043,2.1 -0,pdf_tables,16536264,2,267,1197.7023212909698,13806.656049706928,598.8511606454849,4.039958413717207,1235.137770396196,13388.193929731136,617.568885198098,4.166231574331044,1.0312560545636043,2.1 +0,images,105240044,259,259,780.3763222694397,134858.06910946214,3.0130359933183,3.0130359933183,1066.0429167915163,98720.2694585152,4.115995817727862,4.115995817727862,1.366062611550437,2.3.1 +0,law_html,215921201,1526,1526,204.2208013534546,1057292.8887214332,0.13382752382270943,0.13382752382270943,278.9784012298232,773971.0316216326,0.18281677669057877,0.18281677669057877,1.366062611550437,2.3.1 +0,other_html,215921201,1526,1526,152.16186046600342,1419023.139824463,0.0997128836605527,0.0997128836605527,207.86262848656185,1038768.7415102572,0.13621404225855954,0.13621404225855951,1.366062611550437,2.3.1 
+0,txt,2483851,999,999,12.656875133514404,196245.20063589464,0.012669544678192597,0.012669544678192597,17.290083898956475,143657.5446663917,0.01730739129024672,0.01730739129024672,1.366062611550437,2.3.1 +0,pdf_text_layer_true,109643533,33,445,294.70041608810425,372050.825225916,8.930315639033463,0.7666412830448923,402.57922002631614,272352.6887275323,12.199370303827761,1.0472799932386834,1.366062611550437,2.3.1 +0,pdf_text_layer_auto,109643533,33,445,715.7886617183685,153178.6389809286,21.69056550661723,1.9423069545744724,977.8121285451869,112131.49213349436,29.63067056197536,2.65331291079858,1.366062611550437,2.3.1 +0,pdf_text_layer_auto_tabby,109643533,33,445,844.7789170742035,129789.61806923167,25.59936112346071,2.380779043078811,1154.0208936411366,95010.00684143213,34.97033011033747,3.2522932371127915,1.366062611550437,2.3.1 +0,pdf_text_layer_false,109643533,33,445,1591.9220836162567,68874.93686307219,48.240063139886566,3.9471288925826884,2174.665238929637,50418.57985184248,65.89894663423142,5.392025203127692,1.366062611550437,2.3.1 +0,pdf_text_layer_tabby,109643533,33,445,421.8361530303955,259919.71577670728,12.782913728193803,1.1935813540785523,576.2545968550919,190269.25528816486,17.46226051076036,1.630506861650454,1.366062611550437,2.3.1 +0,docx,417727,22,22,17.311132431030273,24130.541526631885,0.7868696559559215,0.7868696559559215,23.64809077762868,17664.30127184617,1.07491321716494,1.07491321716494,1.366062611550437,2.3.1 +0,pdf,6086144,18,117,310.7921574115753,19582.68204284271,17.26623096730974,2.519494602322346,424.5615462030511,14335.127743974337,23.58675256683617,3.441787376235694,1.366062611550437,2.3.1 +0,pdf_tables,16536264,2,267,1083.6798040866852,15259.363455551895,541.8399020433426,3.6198095974726074,1480.3744632551231,11170.325083586768,740.1872316275616,4.944886552038766,1.366062611550437,2.3.1 diff --git a/scripts/benchmark.py b/scripts/benchmark.py index a82b2131..39cfbce4 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ 
-100,7 +100,10 @@ def get_times(spend_page_times: List, total_size: int, total_time: int, total_fi file_size = os.path.getsize(file_path) total_size += file_size time_start = time.time() - send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) + try: + send_file(host=host, file_name=file, file_path=file_path, parameters=parameters) + except AssertionError as e: + print(f"Error on file {file_path}: {e}") time_finish = time.time() spend_file_time = time_finish - time_start pages = page_func(file_path)
diff --git a/tests/api_tests/test_api_misc_main.py b/tests/api_tests/test_api_misc_main.py
index cbc47976..550ebc59 100644
--- a/tests/api_tests/test_api_misc_main.py
+++ b/tests/api_tests/test_api_misc_main.py
@@ -1,7 +1,9 @@
 import json
 import os
+import time
 
 import requests
+from requests import ReadTimeout
 
 from tests.api_tests.abstract_api_test import AbstractTestApiDocReader
 
@@ -14,6 +16,31 @@ def __get_version(self) -> str:
             version = file.read().strip()
         return version
 
+    def test_cancellation(self) -> None:
+        """
+        Send a request that is dropped by the client after 1 second, then send a second request for a small text file.
+        The second request should return code 200 and both requests together should take less than 60 seconds.
+        """
+        file_name = "article.pdf"
+        start_time = time.time()
+        with open(self._get_abs_path(os.path.join("pdf_with_text_layer", file_name)), "rb") as file:
+            files = {"file": (file_name, file)}
+            parameters = dict(pdf_with_text_layer=False)
+            try:
+                requests.post(f"http://{self._get_host()}:{self._get_port()}/upload", files=files, data=parameters, timeout=1)
+            except ReadTimeout:
+                # the timeout is the expected outcome here (presumably the parsing of this file takes longer than 1 second)
+                pass
+
+        file_name = "example.txt"
+        with open(self._get_abs_path(os.path.join("txt", file_name)), "rb") as file:
+            files = {"file": (file_name, file)}
+            r = requests.post(f"http://{self._get_host()}:{self._get_port()}/upload", files=files, data={}, timeout=60)
+
+        end_time = time.time()
+        self.assertLess(end_time - start_time, 60)
+        self.assertEqual(200, r.status_code)
+
     def test_bin_file(self) -> None:
         file_name = "file.bin"
         result = self._send_request(file_name, expected_code=415)