import re
from datetime import datetime
from typing import Any, Dict, List

import pdfplumber
# One transaction per line: "<dd/mm/yyyy> <vendor> <amount> <description>".
# The vendor group is lazy, so the amount is the first token consisting only
# of digits, commas and dots that is still followed by more text.
# Compiled once at import time instead of being rebuilt for every line.
_TRANSACTION_LINE = re.compile(
    r"(\d{2}/\d{2}/\d{4})\s+(.+?)\s+([\d,.]+)\s+(.+)"
)


def parse_invoice(pdf_file) -> List[Dict[str, Any]]:
    """Extract transaction rows from the text of a PDF invoice.

    Every page is converted to text and scanned line by line. A line is
    treated as a transaction when, after stripping surrounding whitespace,
    it starts with a ``dd/mm/yyyy`` date followed by a vendor, a numeric
    amount and a description, all separated by whitespace.

    Args:
        pdf_file: Whatever ``pdfplumber.open`` accepts; it is passed through
            unchanged.

    Returns:
        A list of dicts in document order, each with the keys ``"date"``
        (``datetime``), ``"vendor"`` (``str``), ``"amount"`` (``float``)
        and ``"description"`` (``str``).

    Lines that do not match the expected layout are skipped silently. Lines
    that match but carry an invalid date or amount are reported with
    ``print`` and skipped, so one bad row never aborts the whole parse.
    """
    transactions: List[Dict[str, Any]] = []
    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # A page without a text layer (e.g. a scanned image) may yield
            # None or an empty string; treat both as "no lines" rather than
            # crashing on None.split().
            text = page.extract_text() or ""
            for line in text.split("\n"):
                match = _TRANSACTION_LINE.match(line.strip())
                if match is None:
                    continue
                date_str, vendor, amount_str, description = match.groups()
                try:
                    parsed_date = datetime.strptime(date_str, "%d/%m/%Y")
                    # Commas are dropped as thousands separators.
                    # NOTE(review): a decimal-comma amount such as
                    # "1.234,56" would be misread -- confirm the invoice
                    # number format.
                    amount = float(amount_str.replace(",", ""))
                except ValueError as e:
                    # Both strptime() and float() signal bad input with
                    # ValueError; anything else is a real bug and should
                    # propagate.
                    print(f"Error parsing line: {line}, Error: {e}")
                    continue
                transactions.append({
                    "date": parsed_date,
                    "vendor": vendor.strip(),
                    "amount": amount,
                    "description": description.strip(),
                })
    return transactions