import re
from datetime import datetime

import pdfplumber

# One transaction row, matched from the start of a stripped line:
#   DD/MM/YYYY <vendor> <amount> <description>
# The vendor group is lazy, so it ends at the first whitespace-delimited token
# made only of digits, commas and dots that is itself followed by more text.
# Compiled once here instead of being rebuilt for every line of every page.
_TRANSACTION_LINE_RE = re.compile(
    r"(\d{2}/\d{2}/\d{4})\s+(.+?)\s+([\d,.]+)\s+(.+)"
)


def parse_invoice(pdf_file):
    """Extract transaction rows from the text of every page of a PDF.

    Each page's text is split on newlines, and every line that matches the
    ``DD/MM/YYYY vendor amount description`` layout becomes one record.
    Lines that do not match the layout are ignored silently. Lines that
    match but whose date or amount cannot be converted are reported on
    stdout and skipped.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts as its first argument.

    Returns:
        A list of dicts, in page and line order, each with the keys
        ``"date"`` (``datetime`` parsed with ``%d/%m/%Y``), ``"vendor"``
        (stripped ``str``), ``"amount"`` (``float``, commas removed before
        conversion) and ``"description"`` (stripped ``str``).
    """
    transactions = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without extractable text (e.g. a scanned image) may give
            # a falsy result such as None; treat it as empty rather than
            # crashing on ``.split``.
            text = page.extract_text() or ""
            for line in text.split("\n"):
                match = _TRANSACTION_LINE_RE.match(line.strip())
                if not match:
                    continue

                date_str, vendor, amount, description = match.groups()
                try:
                    date = datetime.strptime(date_str, "%d/%m/%Y")
                    # "1,234.50" -> 1234.5; a token like "1.2.3" or "," still
                    # matches the regex but is rejected here by float().
                    amount = float(amount.replace(",", ""))
                except ValueError as e:
                    # Both strptime and float raise ValueError on bad input;
                    # report the offending line and carry on with the rest.
                    print(f"Error parsing line: {line}, Error: {e}")
                    continue

                transactions.append({
                    "date": date,
                    "vendor": vendor.strip(),
                    "amount": amount,
                    "description": description.strip(),
                })
    return transactions