
import pdfplumber

import re

from datetime import datetime


# One transaction row: "<dd/mm/yyyy> <vendor> <amount> <description>".
# The vendor group is lazy, so it ends at the first whitespace-delimited token
# made only of digits, commas and dots; the rest of the line is the description.
# Compiled once here instead of on every line of every page.
_INVOICE_LINE_RE = re.compile(
    r"(\d{2}/\d{2}/\d{4})\s+(.+?)\s+([\d,.]+)\s+(.+)"
)


def _parse_invoice_line(line):
    """Convert one line of page text into a transaction dict.

    Returns:
        A dict with keys "date", "vendor", "amount" and "description", or
        None when the line does not look like a transaction row.

    Raises:
        ValueError: the line has the shape of a transaction row but its date
            is not a valid day/month/year date or its amount is not a number
            once commas are removed.
    """
    match = _INVOICE_LINE_RE.match(line.strip())
    if match is None:
        return None

    date_str, vendor, amount, description = match.groups()
    return {
        "date": datetime.strptime(date_str, "%d/%m/%Y"),
        "vendor": vendor.strip(),
        # Commas are dropped before conversion, i.e. treated as grouping
        # separators ("1,234.50" -> 1234.5).
        "amount": float(amount.replace(",", "")),
        "description": description.strip(),
    }


def parse_invoice(pdf_file):
    """Extract transaction rows from the text of every page of a PDF.

    Each page's text is split on newlines and each line is matched against
    the row layout "<dd/mm/yyyy> <vendor> <amount> <description>". Lines that
    do not match are ignored. Lines that match but hold an invalid date or
    amount are reported on standard output and skipped, so one bad row does
    not abort the whole document.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        A list of dicts, in document order, each with the keys "date"
        (``datetime``), "vendor" (``str``), "amount" (``float``) and
        "description" (``str``). The list is empty if no row matched.
    """
    transactions = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            # A page without extractable text (e.g. a scanned image) may give
            # None or an empty string; skip it instead of failing on
            # None.split().
            if not text:
                continue

            for line in text.split("\n"):
                try:
                    transaction = _parse_invoice_line(line)
                except ValueError as e:
                    # Only strptime/float can fail here, and both raise
                    # ValueError; anything else is a real bug and propagates.
                    print(f"Error parsing line: {line}, Error: {e}")
                    continue

                if transaction is not None:
                    transactions.append(transaction)

    return transactions