From 2ff929b33e8a15906ac1bae279502fc1297925ca Mon Sep 17 00:00:00 2001 From: Bogatenkova Anastasiya Date: Wed, 20 Sep 2023 11:12:57 +0300 Subject: [PATCH] TLDR-405 remove is_one_column_document_list (#332) * TLDR-405 remove is_one_column_document_list * TLDR-405 fix tests * TLDR-405 review fix --- README.md | 16 ++++---- dedoc/readers/pdf_reader/pdf_base_reader.py | 6 +-- .../pdf_image_reader/pdf_image_reader.py | 39 +++++++------------ .../pdf_txtlayer_reader/pdf_tabby_reader.py | 10 ++--- .../pdf_txtlayer_reader.py | 6 +-- .../pdfminer_reader/pdfminer_extractor.py | 16 ++++---- dedoc/utils/parameter_utils.py | 4 -- 7 files changed, 38 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 02b9cb26..13dd88a0 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ There are two ways to install and run dedoc as a web application or a library th ## Install and run dedoc using docker -You should have [`git`] (https://git-scm.com) and [`docker`](https://www.docker.com) installed for running dedoc by this method. +You should have [`git`](https://git-scm.com) and [`docker`](https://www.docker.com) installed for running dedoc by this method. This method is more flexible because it doesn't depend on the operating system and other user's limitations, still, the docker application should be installed and configured properly. @@ -130,7 +130,7 @@ export TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/ ## Install the dedoc library via pip. -You need torch~=1.11.0 and torchvision~=0.12.0 installed. If you already have torch and torchvision in your environment: +You need `torch~=1.11.0` and `torchvision~=0.12.0` installed. If you already have torch and torchvision in your environment: ```bash pip install dedoc @@ -144,10 +144,10 @@ pip install "dedoc[torch]" ## Install and run dedoc from sources -If you want to run dedoc as a service from sources. it's possible to run dedoc locally. 
-However, it isn't suitable for any operating system (Ubuntu 20+ is recommended) and +If you want to run dedoc as a service from sources, it's possible to run dedoc locally. +However, it is not suitable for all operating systems (`Ubuntu 20+` is recommended) and there may be not enough machine's resources for its work. -You should have `python` (python3.8, python3.9 are recommended) and `pip` installed. +You should have `python` (`python3.8`, `python3.9` are recommended) and `pip` installed. ### 1. Install necessary packages: according to instructions [install necessary packages](#1-Install-necessary-packages) @@ -183,14 +183,14 @@ python dedoc/main.py -c ./dedoc/config.py Now you can go to the `localhost:1231` and look at the docs and examples. ## Option: You can change the port of service: -you need to change environment DOCREADER_PORT +You need to change the environment variable `DOCREADER_PORT` -1. For local service launching on your_port (1166 example). [Install instruction from sources](#Install-and-run-dedoc-from-sources) and launch with environment: +1. To launch the service locally on `your_port` (e.g. `1166`), install dedoc ([installation instruction](#Install-and-run-dedoc-from-sources)) and launch it with the environment variable set: ```bash DOCREADER_PORT=1166 python dedoc/main.py -c ./dedoc/config.py ``` -2. For service launching in docker-container you need to change port value in DOCREADER_PORT env and field 'ports' in docker-compose.yml file: +2. To launch the service in a docker container, change the port value in the `DOCREADER_PORT` environment variable and in the `ports` field of the `docker-compose.yml` file: ```yaml ... 
dedoc: diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py index 43dea82d..0f1ad924 100644 --- a/dedoc/readers/pdf_reader/pdf_base_reader.py +++ b/dedoc/readers/pdf_reader/pdf_base_reader.py @@ -44,8 +44,7 @@ "first_page", "last_page", "need_binarization", - "table_type", - "is_one_column_document_list"]) + "table_type"]) class PdfBaseReader(BaseReader): @@ -84,8 +83,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio first_page=first_page, last_page=last_page, need_binarization=param_utils.get_param_need_binarization(parameters), - table_type=param_utils.get_param_table_type(parameters), - is_one_column_document_list=param_utils.get_is_one_column_document_list(parameters) + table_type=param_utils.get_param_table_type(parameters) ) lines, scan_tables, attachments, warnings, other_fields = self._parse_document(path, params_for_parse) diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py index bffe9940..0cab0781 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/pdf_image_reader.py @@ -72,7 +72,7 @@ def _process_one_page(self, page_number: int, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]: # --- Step 1: correct orientation and detect column count --- - rotated_image, is_one_column_document = self._detect_columncount_and_orientation(image, parameters) + rotated_image, is_one_column_document = self._detect_column_count_and_orientation(image, parameters) # --- Step 2: do binarization --- if parameters.need_binarization: @@ -102,37 +102,28 @@ def _process_one_page(self, return lines, tables, page.attachments - def _detect_columncount_and_orientation(self, image: np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool]: + def _detect_column_count_and_orientation(self, image: 
np.ndarray, parameters: ParametersForParseDoc) -> Tuple[np.ndarray, bool]: """ Function : - - detects the count of the column - - detects document orientation angle - - rotates document on detected angle - - updates a parameters.is_one_column_document - Return: rotated_image + - detects the number of page columns + - detects page orientation angle + - rotates the page on detected angle + Return: rotated_image and indicator if the page is one-column """ - angle = 0 # parameters.document_orientation is False - columns = None + columns, angle = None, None + if parameters.is_one_column_document is None or parameters.document_orientation is None: - self.logger.info("Call orientation and columns classifier") columns, angle = self.column_orientation_classifier.predict(image) + self.logger.info(f"Predicted orientation angle = {angle}, columns = {columns}") - self.logger.debug(f"Predict {angle}") - if columns is not None: - self.logger.info(f"Final number of columns: {columns}") - else: - self.logger.info("Final number of columns: not detected") - - if parameters.is_one_column_document is not None: - is_one_column_document = parameters.is_one_column_document - else: - is_one_column_document = True if columns == 1 else False - - self.logger.info(f"Final orientation angle: {angle}") + is_one_column_document = columns == 1 if parameters.is_one_column_document is None else parameters.is_one_column_document + angle = angle if parameters.document_orientation is None else 0 + self.logger.info(f"Final orientation angle = {angle}, is_one_column_document = {is_one_column_document}") rotated_image, _ = self.scan_rotator.auto_rotate(image, angle) if self.config.get("debug_mode"): - self.logger.info(self.config["path_debug"]) - cv2.imwrite(os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg"), rotated_image) + img_path = os.path.join(self.config["path_debug"], f"{datetime.now().strftime('%H-%M-%S')}_result_orientation.jpg") + 
self.logger.info(f"Save image to {img_path}") + cv2.imwrite(img_path, rotated_image) return rotated_image, is_one_column_document diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py index e8438f03..650d25cf 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py @@ -117,7 +117,8 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio return self._postprocess(result) - def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]: + def __extract(self, path: str, start_page: int = None, end_page: int = None) \ + -> Tuple[List[LineWithMeta], List[ScanTable], List[List[List[CellPropertyInfo]]]]: file_hash = calculate_file_hash(path=path) document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page) all_lines = [] @@ -134,7 +135,7 @@ def __extract(self, path: str, start_page: int = None, end_page: int = None) -> return all_lines, all_tables, all_cell_properties - def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]: + def __get_tables(self, page: dict, file_hash: str) -> Tuple[List[ScanTable], List[List[List[CellPropertyInfo]]]]: tables = [] cell_properties = [] page_number = page["number"] @@ -154,10 +155,7 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]: cell_property_row_list = [] for cell_property in cell_properties_row: - cell_property_info = CellPropertyInfo(cell_property["col_span"], - cell_property["row_span"], - bool(cell_property["invisible"])) - + cell_property_info = CellPropertyInfo(cell_property["col_span"], cell_property["row_span"], bool(cell_property["invisible"])) cell_property_row_list.append(cell_property_info) cell_property_list.append(cell_property_row_list) diff --git 
a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py index 0f17ce23..27b76d5f 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py @@ -44,8 +44,8 @@ def _process_one_page(self, parameters: ParametersForParseDoc, page_number: int, path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment]]: - gray_image = self._convert_to_gray(image) if parameters.need_pdf_table_analysis: + gray_image = self._convert_to_gray(image) cleaned_image, tables = self.table_recognizer.recognize_tables_from_image( image=gray_image, page_number=page_number, @@ -57,9 +57,7 @@ def _process_one_page(self, else: tables = [] - is_one_column_document_list = None if parameters.is_one_column_document_list is None else parameters.is_one_column_document_list[page_number] - - page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, is_one_column_document=is_one_column_document_list) + page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number) if page is None: return [], [], [] unreadable_blocks = [location.bbox for table in tables for location in table.locations] diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py index d91d8439..f467f063 100644 --- a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py +++ b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/pdfminer_extractor.py @@ -44,10 +44,11 @@ def __init__(self, *, config: dict) -> None: self.config = config self.logger = self.config.get("logger", logging.getLogger()) - def extract_text_layer(self, path: str, page_number: int, is_one_column_document: bool) -> Optional[PageWithBBox]: + def extract_text_layer(self, path: 
str, page_number: int) -> Optional[PageWithBBox]: """ Extract text information with metadata from pdf with help pdfminer.six :param path: path to pdf + :param page_number: number of the page to read :return: pages_with_bbox - page with extracted text """ with open(path, "rb") as fp: @@ -55,11 +56,11 @@ def extract_text_layer(self, path: str, page_number: int, is_one_column_document for page_num, page in enumerate(pages): if page_num != page_number: continue - return self.__handle_page(page=page, page_number=page_number, path=path, is_one_column_document=is_one_column_document) + return self.__handle_page(page=page, page_number=page_number, path=path) - def __handle_page(self, page: PDFPage, page_number: int, path: str, is_one_column_document: bool) -> PageWithBBox: + def __handle_page(self, page: PDFPage, page_number: int, path: str) -> PageWithBBox: directory = os.path.dirname(path) - device, interpreter = self.__get_interpreter(is_one_column_document=is_one_column_document) + device, interpreter = self.__get_interpreter() try: interpreter.process_page(page) except Exception as e: @@ -139,12 +140,9 @@ def __get_image(path: str, page_num: int) -> np.ndarray: image_page = cv2.cvtColor(image_page, cv2.COLOR_GRAY2BGR) return image_page - def __get_interpreter(self, is_one_column_document: bool) -> Tuple[PDFPageAggregator, PDFPageInterpreter]: + def __get_interpreter(self) -> Tuple[PDFPageAggregator, PDFPageInterpreter]: rsrcmgr = PDFResourceManager() - if is_one_column_document is not None and is_one_column_document: - laparams = LAParams(line_margin=3.0, line_overlap=0.1, boxes_flow=0.5, word_margin=1.5, char_margin=100.0, detect_vertical=False) - else: - laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False) + laparams = LAParams(line_margin=1.5, line_overlap=0.5, boxes_flow=0.5, word_margin=0.1, detect_vertical=False) # TODO find the best parameters device = PDFPageAggregator(rsrcmgr, laparams=laparams) 
interpreter = PDFPageInterpreter(rsrcmgr, device) return device, interpreter diff --git a/dedoc/utils/parameter_utils.py b/dedoc/utils/parameter_utils.py index e15d0015..93fc4da1 100644 --- a/dedoc/utils/parameter_utils.py +++ b/dedoc/utils/parameter_utils.py @@ -103,10 +103,6 @@ def get_param_table_type(parameters: Optional[dict]) -> str: return str(parameters.get("table_type", "")) -def get_is_one_column_document_list(parameters: Optional[dict]) -> Optional[bool]: - return None if parameters is None else parameters.get("is_one_column_document_list") - - def get_param_page_slice(parameters: Dict[str, Any]) -> Tuple[Optional[int], Optional[int]]: """ Parse parameter pages = ["page_number:page_number" | "" | "page_number:" | ":page_number" : ":"]