class Importer(banking.Importer, pdfreader.Importer):
    """Mercury Cards pdf statement importer.

    Reads the transaction table out of a Mercury Cards statement with the pdf
    reader and passes the rows on to the banking transaction builder.
    """

    IMPORTER_NAME = "Mercury Cards"

    def custom_init(self):
        """Set the reader and builder configuration, once per importer instance."""
        if not self.custom_init_run:
            self.max_rounding_error = 0.04
            self.filename_pattern_def = "Mercury Statement *.pdf"
            self.pdf_table_extraction_settings = {
                "vertical_strategy": "text",
                "horizontal_strategy": "text",
            }
            self.pdf_table_extraction_crop = (0, 0, 0, 0)
            self.pdf_table_title_height = 0
            self.pdf_page_break_top = 0
            self.date_format = "%m/%d/%Y"
            self.transaction_table_section = "table_1"
            self.meta_text = ""
            self.skip_transaction_types = {}
            self.header_map = {
                "Post Date": "settleDate",
                "Trans Date": "date",
                "Description": "memo",
                "Reference": "reference",
                "Amount": "amount",
            }

            # payee and narration are swapped
            # We need to swap them back. See banking.py
            self.get_payee = lambda ot: ot.memo
            self.get_narration = lambda ot: None  # setting to none to use smart importer

            # NOTE(review): debug output is left switched on; the pdf reader
            # writes .debug-pdf-* files while self.debug is set. Confirm this
            # is intended outside of development.
            self.debug = True
            self.custom_init_run = True

    def file_date(self, file):
        """Return the statement closing date, reading the file first if needed."""
        if not self.file_read_done:
            self.read_file(file)

        return self.get_closing_date()

    def get_closing_date(self):
        """Return the statement closing date found in ``self.meta_text``.

        Returns:
            A ``datetime`` parsed from the first "Closing Date mm/dd/yyyy"
            occurrence in the text outside of the tables.

        Raises:
            ValueError: if no meta text has been read yet, or if the text
                contains no closing date.
        """
        if self.meta_text == "":
            raise ValueError("No meta_text has been found")

        # "Closing Date" followed by a date in mm/dd/yyyy format
        match = re.search(r"Closing Date\s+(\d{2}/\d{2}/\d{4})", self.meta_text)
        if match is None:
            # Previously this surfaced as a bare IndexError on matches[0].
            raise ValueError("No closing date has been found in meta_text")

        return datetime.strptime(match.group(1), "%m/%d/%Y")

    def get_adjusted_crop(self, page_idx, page):
        """Dynamically find the crop position based on the position of text found on the page.

        The crop starts just below the first "TRANSACTIONS" match and ends
        just above the first "YEAR-TO-DATE" match, or at the bottom of the
        page when that text is absent. Pages without "TRANSACTIONS" get a
        one-by-one crop.

        Returns:
            A (left, top, right, bottom) tuple.
        """
        adjusted_crop = (0, 0, 1, 1)
        table_start_search_results = page.search("TRANSACTIONS")
        if table_start_search_results:
            table_start = table_start_search_results[0]
            table_start_x = table_start["x0"] - 30
            table_start_y = table_start["bottom"] + 50

            table_end_search_results = page.search("YEAR-TO-DATE")
            if table_end_search_results:
                table_end_y = table_end_search_results[0]["top"] - 10
            else:
                table_end_y = page.bbox[3]  # if no end text is found use the whole page

            adjusted_crop = (
                table_start_x,
                table_start_y,
                page.bbox[2],
                table_end_y,
            )
        return adjusted_crop

    def fix_years(self, table):
        """Append a year to the MM/DD values of the date columns.

        The year of the statement closing date is used, unless that would put
        the date after the closing date (for example a December transaction on
        a statement closing in January); such dates get the previous year.
        Values that do not form a valid date are passed through with the
        closing year appended, for the later column conversion to deal with.

        Args:
            table: petl table that may contain "Post Date" / "Trans Date".

        Returns:
            The table with MM/DD/YYYY strings in its date columns.
        """
        date_headers = [h for h in ("Post Date", "Trans Date") if h in table.header()]
        if not date_headers:
            return table

        # Look the closing date up once instead of once per converted cell.
        closing_date = self.get_closing_date()

        def add_year(d):
            dated = f"{d}/{closing_date.year}"
            try:
                parsed = datetime.strptime(dated, "%m/%d/%Y")
            except ValueError:
                return dated
            if parsed > closing_date:
                # A statement cannot list a date after it closed: the
                # transaction belongs to the previous calendar year.
                return f"{d}/{closing_date.year - 1}"
            return dated

        for header in date_headers:
            table = table.convert(header, add_year)

        return table

    def prepare_tables(self):
        """Make final adjustments to tables before processing by the transaction builder."""
        for section, table in self.alltables.items():
            # set table headers. table was goofy, so they had to be cropped out
            headers = [
                "Post Date",
                "Trans Date",
                "Description",
                "City",
                "State",
                "Reference",
                "Amount",
            ]
            table = etl.wrap(etl.pushheader(table, headers))

            # add year to mm/dd formatted date
            table = self.fix_years(table)

            table = table.rename(self.header_map)
            table = self.convert_columns(table)

            # the amounts should be negative since they're charges
            table = etl.convert(table, "amount", lambda a: a * -1)

            table = self.fix_column_names(table)
            table = self.prepare_processed_table(
                table
            )  # override this to make additional adjustments

            self.alltables[section] = table

        # NOTE(review): the combined table is not stored anywhere, so only the
        # table named by self.transaction_table_section is used afterwards.
        # Confirm whether statements spanning several tables need it.
        self.combine_tables()
        return

    def combine_tables(self):
        """Concatenate every table in ``self.alltables`` into one petl table.

        Returns:
            The combined petl table, or ``None`` when there are no tables.
        """
        combined_table = None

        for table in self.alltables.values():
            petl_table = etl.wrap(table)
            if combined_table is None:
                combined_table = petl_table  # First table initializes the combined table
            else:
                combined_table = etl.cat(combined_table, petl_table)

        return combined_table
a/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.extract b/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.extract new file mode 100644 index 0000000..198ebd7 --- /dev/null +++ b/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.extract @@ -0,0 +1,120 @@ + +2024-10-05 * "Whole Foods" + Liabilities:Credit-Cards:Mercury -15.01 USD + +2024-10-06 * "Car Wash" + Liabilities:Credit-Cards:Mercury -35.30 USD + +2024-10-07 * "Taco Bell" + Liabilities:Credit-Cards:Mercury -22.76 USD + +2024-10-07 * "Mcdonald's" + Liabilities:Credit-Cards:Mercury -10.11 USD + +2024-10-08 * "Papa John's" + Liabilities:Credit-Cards:Mercury -30.16 USD + +2024-10-10 * "Paypal" + Liabilities:Credit-Cards:Mercury -33.97 USD + +2024-10-11 * "Amazon" + Liabilities:Credit-Cards:Mercury -101.55 USD + +2024-10-11 * "Wm Supercenter" + Liabilities:Credit-Cards:Mercury -53.44 USD + +2024-10-12 * "Amazon" + Liabilities:Credit-Cards:Mercury -204.32 USD + +2024-10-12 * "Amazon" + Liabilities:Credit-Cards:Mercury -4.90 USD + +2024-10-12 * "Target" + Liabilities:Credit-Cards:Mercury -10.90 USD + +2024-10-13 * "Bp" + Liabilities:Credit-Cards:Mercury -106.95 USD + +2024-10-14 * "Circle K" + Liabilities:Credit-Cards:Mercury -50.69 USD + +2024-10-15 * "Amazon" + Liabilities:Credit-Cards:Mercury -10.11 USD + +2024-10-15 * "Doordash" + Liabilities:Credit-Cards:Mercury -73.82 USD + +2024-10-16 * "Mcdonald's" + Liabilities:Credit-Cards:Mercury -10.68 USD + +2024-10-16 * "7-Eleven" + Liabilities:Credit-Cards:Mercury -10.11 USD + +2024-10-17 * "Advance Auto Parts" + Liabilities:Credit-Cards:Mercury -13.86 USD + +2024-10-18 * "Mcdonald's" + Liabilities:Credit-Cards:Mercury -15.11 USD + +2024-10-18 * "Papa John's" + Liabilities:Credit-Cards:Mercury -31.29 USD + +2024-10-18 * "Mcdonald's" + Liabilities:Credit-Cards:Mercury -12.62 USD + +2024-10-19 * "Amazon Prime" + Liabilities:Credit-Cards:Mercury -23.52 USD + 
+2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -30.46 USD + +2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -44.21 USD + +2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -23.57 USD + +2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -22.24 USD + +2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -69.23 USD + +2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -60.00 USD + +2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -57.07 USD + +2024-10-19 * "Amazon" + Liabilities:Credit-Cards:Mercury -224.05 USD + +2024-10-20 * "Whole Foods" + Liabilities:Credit-Cards:Mercury -6.39 USD + +2024-10-21 * "Doordash" + Liabilities:Credit-Cards:Mercury -79.56 USD + +2024-10-21 * "Amazon" + Liabilities:Credit-Cards:Mercury -20.24 USD + +2024-10-22 * "Papa John's" + Liabilities:Credit-Cards:Mercury -37.85 USD + +2024-10-22 * "Mcdonald's" + Liabilities:Credit-Cards:Mercury -10.11 USD + +2024-10-22 * "Racetrac" + Liabilities:Credit-Cards:Mercury -32.90 USD + +2024-10-23 * "Mcdonald's" + Liabilities:Credit-Cards:Mercury -10.11 USD + +2024-10-24 * "Doordash" + Liabilities:Credit-Cards:Mercury -28.41 USD + +2024-10-24 * "Doordash" + Liabilities:Credit-Cards:Mercury -40.83 USD + +2024-10-25 * "Doordash" + Liabilities:Credit-Cards:Mercury -68.35 USD diff --git a/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.file_account b/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.file_account new file mode 100644 index 0000000..7260519 --- /dev/null +++ b/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.file_account @@ -0,0 +1 @@ +Liabilities:Credit-Cards:Mercury diff --git a/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.file_date b/beancount_reds_importers/importers/mercurycards/tests/mercury_statement_20241105.pdf.file_date new file mode 100644 index 0000000..ec271d0 --- /dev/null +++ 
# Importer configuration used by the regression test below.
_MERCURY_TEST_CONFIG = {
    "main_account": "Liabilities:Credit-Cards:Mercury",
    "emit_filing_account_metadata": False,
    "filename_pattern": "mercury_statement_20241105.pdf",
    "skip_transaction_types": {},
    "currency": "USD",
}


@regtest.with_importer(mercurycards.Importer(_MERCURY_TEST_CONFIG))
@regtest.with_testdir(path.dirname(__file__))
class TestMercuryCards(regtest.ImporterTestBase):
    """Regression test for the Mercury Cards importer.

    Adds no tests of its own; everything comes from ``regtest.ImporterTestBase``
    and the two decorators, which are given the importer instance and the
    directory containing this file.
    """
Eg: 'section1', 'section2', ... diff --git a/beancount_reds_importers/libreader/pdfreader.py b/beancount_reds_importers/libreader/pdfreader.py index 20a2815..21f7862 100644 --- a/beancount_reds_importers/libreader/pdfreader.py +++ b/beancount_reds_importers/libreader/pdfreader.py @@ -3,7 +3,7 @@ import pdfplumber import petl as etl -from beancount_reds_importers.libreader import csvreader +from beancount_reds_importers.libreader import csv_multitable_reader LEFT = 0 TOP = 1 @@ -16,7 +16,7 @@ TRANSPARENT = (0, 0, 0, 0) -class Importer(csvreader.Importer): +class Importer(csv_multitable_reader.Importer): """ A reader that converts a pdf with tables into a multi-petl-table format understood by transaction builders. @@ -50,6 +50,10 @@ class Importer(csvreader.Importer): .debug-pdf-data.txt is a printout of the meta_text and table data found before being processed into petl tables, as well as some generated helper objects to add to new importers or import configs + self.transaction_table_section: `str` + When reading a pdf that contains transactions, set this setting to the name of the table section that + contains the transactions. This is the key for the table in the self.alltables dictionary. 
def get_transactions(self):
    """Provide the transactions to the transaction builder.

    Transactions are usually in a single table, with other tables containing
    additional context information for the institution or statement period
    (see the csv_multitable_reader definition). The table to read is the one
    stored in ``self.alltables`` under ``self.transaction_table_section``.

    Yields:
        One named tuple per row of the transaction table, leaving out rows
        for which ``self.skip_transaction`` returns a true value.

    Raises:
        KeyError: if ``self.transaction_table_section`` is not a key of
            ``self.alltables``.
    """
    section = self.transaction_table_section
    try:
        transaction_table = self.alltables[section]
    except KeyError as err:
        # The two sentences are separate literals; the trailing space keeps
        # them from running together in the message.
        raise KeyError(
            f"Table section '{section}' not found in self.alltables. "
            "Check the configuration value set in self.transaction_table_section."
        ) from err

    for ot in transaction_table.namedtuples():
        if not self.skip_transaction(ot):
            yield ot
def find_and_fix_broken_tables(self, tables):
    """Combine tables that are split up by page breaks and name untitled tables.

    A table is treated as the continuation of the table kept before it when
    its top is above ``self.pdf_page_break_top``, it has no section title,
    and its first row equals the first row of that previous table. Its
    remaining rows are appended to the previous table and the fragment itself
    is dropped, so a table spread over any number of pages ends up as one.

    Every table that is kept and has an empty section title is named
    ``table_<n>``, where ``n`` is its 1-based position in the list as passed
    in (fragments included).

    Args:
        tables: list of dicts with a "table" entry (list of rows), a "bbox"
            entry (indexed with ``TOP``) and a "section" entry (str).

    Returns:
        The same list object, updated in place.
    """
    # Collect the surviving tables in a fresh list. Deleting from ``tables``
    # while walking it by index shifts every later index, which used to
    # compare and rename the wrong table or run off the end of the list.
    merged = []
    for table_idx, table in enumerate(tables):
        previous = merged[-1] if merged else None
        if (
            # if not the first table,
            previous is not None
            # and the top of the table is close to the top of the page
            and table["bbox"][TOP] < self.pdf_page_break_top
            # and there is no section title
            and table["section"] == ""
            # and both tables have rows, with identical header rows,
            and previous["table"]
            and table["table"]
            and previous["table"][0] == table["table"][0]
        ):  # assume a page break
            previous["table"] = previous["table"] + table["table"][1:]
            continue

        # if there is no table section give it one
        if table["section"] == "":
            table["section"] = "table_{}".format(table_idx + 1)
        merged.append(table)

    tables[:] = merged
    return tables
pdfplumber.open(file.name) as pdf: + for page_idx, page in enumerate(pdf.pages): + adjusted_crop = self.get_adjusted_crop(page_idx, page) + page_tables = self.extract_tables(page_idx, page, adjusted_crop) + self.meta_text += self.extract_metadata(page_idx, page, page_tables) + page_tables = self.attach_section_headers(page_idx, page_tables, page) + + if self.debug: + self.debug_images[page_idx].save( + ".debug-pdf-table-detection-page_{}.png".format(page_idx) + ) # debug + + tables.extend(page_tables) + + tables = self.find_and_fix_broken_tables(tables) + self.generate_debug_helpers(tables) # debug + self.alltables = {table["section"]: etl.wrap(table["table"]) for table in tables} self.prepare_tables() - if self.debug: + if self.debug: # debug with open(".debug-pdf-prepared-tables.txt", "w") as debug_file: debug_file.write(pformat({"prepared_tables": self.alltables}))