varun321's picture
Switch to saving images as files instead of base64, optimize LLM parameters
0004b69
import pdfplumber
import re
from datetime import datetime
def parse_invoice(pdf_file):
    """Extract transaction rows from the text of a PDF invoice.

    Every text line of every page is matched against the layout
    ``DD/MM/YYYY <vendor> <amount> <description>``. Lines that do not fit
    the layout are ignored; lines that fit but carry an invalid date or
    amount are reported on stdout and skipped.

    Args:
        pdf_file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        A list of dicts with the keys ``"date"`` (``datetime``),
        ``"vendor"`` (``str``), ``"amount"`` (``float``) and
        ``"description"`` (``str``), in document order.
    """
    # Compiled once up front instead of being rebuilt for every line.
    line_pattern = re.compile(
        r"(\d{2}/\d{2}/\d{4})\s+(.+?)\s+([\d,.]+)\s+(.+)"
    )
    transactions = []

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            # extract_text() may give None for a page without extractable
            # text (e.g. a scanned image); treat that as an empty page
            # rather than crashing on None.split().
            text = page.extract_text() or ""

            for line in text.split("\n"):
                match = line_pattern.match(line.strip())
                if not match:
                    continue

                date_str, vendor, amount, description = match.groups()
                try:
                    date = datetime.strptime(date_str, "%d/%m/%Y")
                    # Commas are dropped as thousands separators before
                    # the numeric conversion.
                    amount = float(amount.replace(",", ""))
                except ValueError as e:
                    # Both strptime and float signal bad input with
                    # ValueError; report the line and keep going.
                    print(f"Error parsing line: {line}, Error: {e}")
                else:
                    transactions.append({
                        "date": date,
                        "vendor": vendor.strip(),
                        "amount": amount,
                        "description": description.strip(),
                    })

    return transactions