diff --git a/.github/workflows/conventionalcommits.yml b/.github/workflows/conventionalcommits.yml index add48c6..ae2d6a7 100644 --- a/.github/workflows/conventionalcommits.yml +++ b/.github/workflows/conventionalcommits.yml @@ -3,6 +3,7 @@ name: Conventional Commits on: pull_request: branches: [ main ] + types: [opened, reopened, edited] jobs: build: diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index 0312cb2..05a9b9a 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -8,6 +8,7 @@ on: branches: [ main ] pull_request: branches: [ main ] + types: [opened, reopened, edited] jobs: build: diff --git a/.gitignore b/.gitignore index ce0145f..f72a6a7 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +.debug-* # Translations *.mo diff --git a/beancount_reds_importers/importers/bamboohr/__init__.py b/beancount_reds_importers/importers/bamboohr/__init__.py new file mode 100644 index 0000000..a1ac1bc --- /dev/null +++ b/beancount_reds_importers/importers/bamboohr/__init__.py @@ -0,0 +1,64 @@ +"""BambooHR paycheck importer""" + +import re + +from dateparser.search import search_dates + +from beancount_reds_importers.libreader import pdfreader +from beancount_reds_importers.libtransactionbuilder import paycheck + +# BambooHR exports paycheck stubs to pdf, with multiple tables across multiple pages. 
# Call this importer with a config that looks like:
#
# bamboohr.Importer({"desc":"Paycheck (My Company)",
#                    "main_account":"Income:Employment",
#                    "paycheck_template": {}, # See beancount_reds_importers/libtransactionbuilder/paycheck.py for sample template
#                    "currency": "PENNIES",
#                   }),
#


class Importer(paycheck.Importer, pdfreader.Importer):
    """Paycheck importer for BambooHR pay stub PDFs.

    Combines the pdf reader (which fills ``self.meta_text`` and
    ``self.alltables``) with the paycheck transaction builder. This class only
    supplies the BambooHR-specific extraction settings, the column-name
    normalization and the way the paycheck date is located.
    """

    IMPORTER_NAME = "BambooHR Paycheck"

    def custom_init(self):
        """Set the BambooHR-specific reader and builder settings."""
        self.max_rounding_error = 0.04
        self.filename_pattern_def = r"PayStub.*\.pdf"
        self.pdf_table_extraction_settings = {"join_tolerance": 4, "snap_tolerance": 4}
        # (left, top, right, bottom) margins removed from each page before
        # tables are searched for; here only the top 40 units are skipped.
        self.pdf_table_extraction_crop = (0, 40, 0, 0)
        self.debug = False

        # Column titles as printed in the pdf -> normalized column names.
        self.header_map = {
            "Deduction Type": "description",
            "Pay Type": "description",
            "Paycheck Total": "amount",
            "Tax Type": "description",
        }

        self.currency_fields = ["ytd_total", "amount"]

    def paycheck_date(self, input_file):
        """Return the paycheck date found in the text outside the tables.

        The third date detected by ``search_dates`` in ``self.meta_text`` is
        used.

        Raises:
            ValueError: if fewer than three dates can be found, so the failure
                names the real problem instead of surfacing as a bare
                ``TypeError``/``IndexError``.
        """
        if not self.file_read_done:
            self.read_file(input_file)

        # search_dates returns None (not an empty list) when nothing matches.
        found = search_dates(self.meta_text) or []
        dates = [date for _, date in found]
        if len(dates) < 3:
            raise ValueError(
                "Expected at least 3 dates in the pay stub text, found {}".format(len(dates))
            )
        return dates[2].date()

    def prepare_tables(self):
        """Normalize the column names of every extracted table and convert columns."""

        def valid_header(label):
            # Explicit mapping wins. Look up the argument itself rather than
            # relying on a same-valued variable in the enclosing loop.
            if label in self.header_map:
                return self.header_map[label]

            # Otherwise: lower-case, underscores for spaces, and any 20xx year
            # in the title is replaced with "ytd".
            label = label.lower().replace(" ", "_")
            return re.sub(r"20\d{2}", "ytd", label)

        for section, table in self.alltables.items():
            # rename columns
            for header in table.header():
                table = table.rename(header, valid_header(header))
            # convert columns (inherited helper, not defined in this module)
            table = self.convert_columns(table)

            self.alltables[section] = table

    def build_metadata(self, file, metatype=None, data=None):
        """Return the metadata attached to generated entries.

        ``data`` is accepted for interface compatibility and is not used; its
        default is ``None`` instead of a shared mutable ``{}``.
        """
        return {"filing_account": self.config["main_account"]}
# ---- beancount_reds_importers/importers/genericpdf/__init__.py ----
import datetime

from beancount_reds_importers.libreader import pdfreader
from beancount_reds_importers.libtransactionbuilder import paycheck

# Generic pdf paystub importer. Use this to build your own pdf paystub importer.
# Call this importer with a config that looks like:
#
# genericpdf.Importer({"desc":"Paycheck (My Company)",
#                      "main_account":"Income:Employment",
#                      "paycheck_template": {}, # See beancount_reds_importers/libtransactionbuilder/paycheck.py for sample template
#                      "currency": "PENNIES",
#                     }),
#


class Importer(paycheck.Importer, pdfreader.Importer):
    """Template paycheck importer for pay stubs delivered as PDF.

    Combines the pdf reader (which fills ``self.alltables``) with the paycheck
    transaction builder. Tables are addressed by their positional names
    (``table_1``, ``table_2``, ...) because title extraction is switched off
    via ``pdf_table_title_height = 0``.
    """

    IMPORTER_NAME = "Generic PDF Paycheck"

    def custom_init(self):
        """Set the reader and builder settings for the sample pay stub layout."""
        self.max_rounding_error = 0.04
        self.filename_pattern_def = r"paystub.*\.pdf"
        self.pdf_table_extraction_settings = {"join_tolerance": 4, "snap_tolerance": 4}
        self.pdf_table_extraction_crop = (0, 0, 0, 0)
        self.pdf_table_title_height = 0
        # Set this true as you play with the extraction settings and crop to view images of what the pdf parser detects
        # NOTE(review): this is left enabled, so debug output is produced on
        # every read -- confirm that is the intended default for a template.
        self.debug = True

        # Column titles as printed in the pdf -> normalized column names.
        self.header_map = {
            "CURRENT": "amount",
            "CURRENT PAY": "amount",
            "PAY DESCRIPTION": "description",
            "DEDUCTIONS": "description",
            "TAX TYPE": "description",
            "TOTAL NET PAY": "description",
            "YTD": "ytd",
            "YTD PAY": "ytd",
        }

        self.currency_fields = ["ytd", "amount"]
        self.date_format = "%m/%d/%Y"

    def paycheck_date(self, input_file):
        """Return the paycheck date, parsed from the last header cell of ``table_1``.

        The parsed ``datetime`` is also stored on ``self.date``.
        """
        if not self.file_read_done:
            self.read_file(input_file)
        *_, d = self.alltables["table_1"].header()
        self.date = datetime.datetime.strptime(d, self.date_format)
        return self.date.date()

    def prepare_tables(self):
        """Normalize the column names of every extracted table and convert columns."""

        def valid_header(label):
            # Explicit mapping wins. Look up the argument itself rather than
            # relying on a same-valued variable in the enclosing loop.
            if label in self.header_map:
                return self.header_map[label]

            return label.lower().replace(" ", "_")

        for section, table in self.alltables.items():
            # rename columns
            for header in table.header():
                if section == "table_6" and header == "":
                    # table_6 has an untitled column; it is used as the amount
                    table = table.rename(header, "amount")
                else:
                    table = table.rename(header, valid_header(header))
            # convert columns (inherited helper, not defined in this module)
            table = self.convert_columns(table)

            self.alltables[section] = table

    def build_metadata(self, file, metatype=None, data=None):
        """Return the metadata attached to generated entries.

        ``data`` is accepted for interface compatibility and is not used; its
        default is ``None`` instead of a shared mutable ``{}``.
        """
        return {"filing_account": self.config["main_account"]}


# ---- beancount_reds_importers/importers/genericpdf/tests/genericpdf_test.py ----
from os import path

from beancount.ingest import regression_pytest as regtest

from beancount_reds_importers.importers import genericpdf


# Regression test: runs the generic pdf importer over the sample files that sit
# next to this test module and compares against the stored expected outputs.
@regtest.with_importer(
    genericpdf.Importer(
        {
            "desc": "Paycheck",
            "main_account": "Income:Salary:FakeCompany",
            "paycheck_template": {
                "table_4": {
                    "Bonus": "Income:Bonus:FakeCompany",
                    "Overtime": "Income:Overtime:FakeCompany",
                    "Regular": "Income:Salary:FakeCompany",
                },
                "table_5": {
                    "Federal MED/EE": "Expenses:Taxes:Medicare",
                    "Federal OASDI/EE": "Expenses:Taxes:SocialSecurity",
                    "Federal Withholding": "Expenses:Taxes:FederalIncome",
                    "State Withholding": "Expenses:Taxes:StateIncome",
                },
                "table_6": {"CURRENT": "Assets:Checking:ABCBank"},
            },
            "currency": "USD",
        }
    )
)
@regtest.with_testdir(path.dirname(__file__))
class TestGenericPDF(regtest.ImporterTestBase):
    pass
# ---- beancount_reds_importers/libreader/pdfreader.py ----
from pprint import pformat

import pdfplumber
import petl as etl

from beancount_reds_importers.libreader import csvreader

# Positions inside a bounding-box / crop tuple: (left, top, right, bottom)
LEFT = 0
TOP = 1
RIGHT = 2
BOTTOM = 3

# Colors used for the debug overlays
BLACK = (0, 0, 0)
RED = (255, 0, 0)
PURPLE = (135, 0, 255)
TRANSPARENT = (0, 0, 0, 0)


class Importer(csvreader.Importer):
    """
    A reader that converts a pdf with tables into a multi-petl-table format understood by transaction builders.


    ### Attributes customized in `custom_init`
    self.pdf_table_extraction_settings: `{}`
        a dictionary containing settings used to extract tables, see [pdfplumber documentation](https://github.com/jsvine/pdfplumber?tab=readme-ov-file#table-extraction-settings) for what settings are available

    self.pdf_table_extraction_crop: `(int,int,int,int)`
        a tuple with 4 values representing distance from left, top, right, bottom of the page respectively,
        this will crop the input (each page) before searching for tables

    self.pdf_table_title_height: `int`
        an integer representing how far up from the top of the table should we look for a table title.
        Set to 0 to not extract table titles, in which case sections will be labelled as `table_#` in the order
        they were encountered

    self.pdf_page_break_top: `int`
        an integer representing the threshold where a table can be considered page-broken. If the top of a table is
        lower than the provided value, it will be in consideration for amending to the previous page's last table.
        Set to 0 to never consider page-broken tables

    self.debug: `boolean`
        When debug is True a few images and text files are generated:
            .debug-pdf-metadata-page_#.png
                shows the text available in self.meta_text with table data blacked out

            .debug-pdf-table-detection-page_#.png
                shows the tables detected with cells outlined in red, and the background light blue. The purple box shows where we are looking for the table title.

            .debug-pdf-data.txt
                is a printout of the meta_text and table data found before being processed into petl tables, as well as some generated helper objects to add to new importers or import configs

            .debug-pdf-prepared-tables.txt
                is a printout of self.alltables after `prepare_tables` has run

    ### Outputs
    self.meta_text: `str`
        contains all text found in the document outside of tables

    self.alltables: `{'table_1': <petl table>, ...}`
        contains all the tables found in the document keyed by the extracted title if available, otherwise by the 1-based index in the form of `table_#`
    """

    FILE_EXTS = ["pdf"]

    def initialize_reader(self, file):
        """Reset reader settings and state when a new file is presented."""
        if getattr(self, "file", None) != file:
            self.pdf_table_extraction_settings = {}
            self.pdf_table_extraction_crop = (0, 0, 0, 0)
            self.pdf_table_title_height = 20
            self.pdf_page_break_top = 45
            self.debug = False

            self.meta_text = ""
            self.file = file
            self.file_read_done = False
            self.reader_ready = True

    def file_date(self, file):
        """Placeholder: importers must provide the date themselves.

        Raises:
            NotImplementedError: always. (Raising a plain string, as before, is
                itself a ``TypeError`` in Python 3 and hid this message.)
        """
        raise NotImplementedError(
            "Not implemented, must overwrite, check self.alltables, or self.meta_text for the data"
        )

    def prepare_tables(self):
        """Hook called at the end of `read_file`; the default does nothing."""
        return

    def read_file(self, file):
        """Extract tables and surrounding text from every page of the pdf.

        Fills ``self.meta_text`` (text outside tables, appended page by page)
        and ``self.alltables`` (section name -> petl table), calls
        ``prepare_tables`` and finally sets ``self.file_read_done``. Debug
        images and text files are only produced when ``self.debug`` is true.
        """
        tables = []
        crop = self.pdf_table_extraction_crop

        with pdfplumber.open(file.name) as pdf:
            for page_idx, page in enumerate(pdf.pages):
                # all bounding boxes are (left, top, right, bottom)
                adjusted_crop = (
                    min(crop[LEFT], page.width),
                    min(crop[TOP], page.height),
                    max(page.width - crop[RIGHT], 0),
                    max(page.height - crop[BOTTOM], 0),
                )
                cropped_page = page.crop(adjusted_crop)

                found = cropped_page.find_tables(
                    table_settings=self.pdf_table_extraction_settings
                )
                page_tables = [{"table": t.extract(), "bbox": t.bbox} for t in found]

                # Rendering pages is only useful for the debug output, so it is
                # skipped entirely unless debug is switched on.
                image = meta_image = None
                if self.debug:
                    image = cropped_page.to_image()
                    image.debug_tablefinder(tf=self.pdf_table_extraction_settings)
                    meta_image = page.to_image()

                # Get Metadata (all data outside tables)
                meta_page = page
                for table in page_tables:
                    meta_page = meta_page.outside_bbox(table["bbox"])
                    if meta_image is not None:
                        meta_image.draw_rect(table["bbox"], BLACK, RED)

                self.meta_text = self.meta_text + meta_page.extract_text()

                # Attach section headers
                for table in page_tables:
                    bbox = table["bbox"]
                    # strip of pdf_table_title_height directly above the table
                    section_title_bbox = (
                        bbox[LEFT],
                        max(bbox[TOP] - self.pdf_table_title_height, 0),
                        bbox[RIGHT],
                        bbox[TOP],
                    )

                    if pdfplumber.utils.calculate_area(section_title_bbox) > 0:
                        table["section"] = meta_page.crop(section_title_bbox).extract_text()
                        if image is not None:
                            image.draw_rect(section_title_bbox, TRANSPARENT, PURPLE)
                    else:
                        table["section"] = ""

                    # replace None with ''
                    table["table"] = [
                        ["" if v is None else v for v in row] for row in table["table"]
                    ]

                tables.extend(page_tables)

                if self.debug:
                    image.save(".debug-pdf-table-detection-page_{}.png".format(page_idx))
                    meta_image.save(".debug-pdf-metadata-page_{}.png".format(page_idx))

        tables = self._merge_page_broken_tables(tables)

        if self.debug:
            self._write_debug_data(tables)

        self.alltables = {table["section"]: etl.wrap(table["table"]) for table in tables}
        self.prepare_tables()

        if self.debug:
            with open(".debug-pdf-prepared-tables.txt", "w", encoding="utf-8") as debug_file:
                debug_file.write(pformat({"prepared_tables": self.alltables}))

        self.file_read_done = True

    def _merge_page_broken_tables(self, tables):
        """Join tables that continue across a page break and name untitled ones.

        A table is treated as the continuation of the one before it when it
        starts above ``pdf_page_break_top``, has no section title, and repeats
        the previous table's header row; its data rows are appended to the
        previous table. A new list is built instead of deleting from the list
        being walked, which previously let indexes drift after the first merge.
        Remaining untitled tables are named ``table_#`` by 1-based position.
        """
        merged = []
        for table in tables:
            previous = merged[-1] if merged else None
            if (
                # if not the first table,
                previous is not None
                # and the top of the table is close to the top of the page
                and table["bbox"][TOP] < self.pdf_page_break_top
                # and there is no section title
                and table["section"] == ""
                # and both tables have rows, with identical header rows,
                and previous["table"]
                and table["table"]
                and previous["table"][0] == table["table"][0]
            ):  # assume a page break
                previous["table"] = previous["table"] + table["table"][1:]
                continue
            merged.append(table)

        # if there is no table section give it one
        for position, table in enumerate(merged, start=1):
            if table["section"] == "":
                table["section"] = "table_{}".format(position)

        return merged

    def _write_debug_data(self, tables):
        """Write the raw extraction result and generated config helpers to disk."""
        # generate helpers
        paycheck_template = {}
        header_map = {}
        for table in tables:
            rows = table["table"]
            for header_row in rows[:1]:
                for header in header_row:
                    header_map[header] = "overwrite_me"
            paycheck_template[table["section"]] = {}
            for row in rows[1:]:
                paycheck_template[table["section"]][row[0]] = "overwrite_me"

        with open(".debug-pdf-data.txt", "w", encoding="utf-8") as debug_file:
            debug_file.write(
                pformat(
                    {
                        "_output": {"tables": tables, "meta_text": self.meta_text},
                        "_input": {
                            "table_settings": self.pdf_table_extraction_settings,
                            "crop_settings": self.pdf_table_extraction_crop,
                            "pdf_table_title_height": self.pdf_table_title_height,
                            "pdf_page_break_top": self.pdf_page_break_top,
                        },
                        "helpers": {
                            "header_map_generated": header_map,
                            "paycheck_template_generated": paycheck_template,
                        },
                    }
                )
            )