Spaces:

varun321
/

invoice-reader-budget-categorizer

Sleeping

Switch to saving images as files instead of base64, optimize LLM parameters

0004b69 4 months ago

1.17 kB

	import pdfplumber
	import re
	from datetime import datetime

	# One transaction row: "DD/MM/YYYY  <vendor>  <amount>  <description>".
	# Compiled once so the per-line loop does not repeat the pattern lookup.
	_TRANSACTION_LINE = re.compile(
	    r"(\d{2}/\d{2}/\d{4})\s+(.+?)\s+([\d,.]+)\s+(.+)"
	)


	def _parse_transaction_line(line):
	    """Convert one line of page text into a transaction dict.

	    Args:
	        line: A single line of text taken from a PDF page.

	    Returns:
	        A dict with keys "date" (datetime), "vendor" (str), "amount"
	        (float) and "description" (str), or None when the line does not
	        have the transaction-row shape.

	    Raises:
	        ValueError: The line has the row shape but its date is not a valid
	            day/month/year date or its amount is not a valid number.
	    """
	    match = _TRANSACTION_LINE.match(line.strip())
	    if match is None:
	        return None

	    date_str, vendor, amount, description = match.groups()
	    return {
	        "date": datetime.strptime(date_str, "%d/%m/%Y"),
	        "vendor": vendor.strip(),
	        # Commas are treated as thousands separators and dropped.
	        "amount": float(amount.replace(",", "")),
	        "description": description.strip(),
	    }


	def parse_invoice(pdf_file):
	    """Extract transaction rows from every page of a PDF invoice.

	    Each page's text is scanned line by line. Lines that start with a
	    DD/MM/YYYY date followed by a vendor, an amount and a description are
	    collected; all other lines are ignored. Lines that look like a
	    transaction but whose date or amount cannot be converted are reported
	    on stdout and skipped.

	    Args:
	        pdf_file: Whatever ``pdfplumber.open`` accepts as its first
	            argument (presumably a path or a binary file object).

	    Returns:
	        A list of transaction dicts in the order they appear in the PDF,
	        each with keys "date", "vendor", "amount" and "description".
	        The list is empty when no line matches.
	    """
	    transactions = []

	    with pdfplumber.open(pdf_file) as pdf:
	        for page in pdf.pages:
	            # Pages without a text layer (e.g. scanned images) may give
	            # None (older pdfplumber releases) or "" here; treat both as
	            # "no lines" instead of crashing on None.split.
	            text = page.extract_text() or ""

	            for line in text.split("\n"):
	                try:
	                    transaction = _parse_transaction_line(line)
	                except ValueError as e:
	                    print(f"Error parsing line: {line}, Error: {e}")
	                    continue

	                if transaction is not None:
	                    transactions.append(transaction)

	    return transactions