Spaces:

dschandra
/

ToshibaPO

Sleeping

App Files Files Community

ToshibaPO / app.py

dschandra

Update app.py

f705371 verified 9 months ago

raw

history blame contribute delete

4.31 kB

	import gradio as gr
	import pdfplumber
	import pandas as pd
	import re

	def extract_cleaned_po_data(pdf_file):
	"""
	Extract and clean data from a Toshiba PO PDF file.
	"""
	extracted_data = []
	current_row = {}

	with pdfplumber.open(pdf_file) as pdf:
	for page in pdf.pages:
	text = page.extract_text()
	if text:
	lines = text.split("\n")
	for line in lines:
	line = line.strip()

	# Match rows starting with POS and numeric Item Code
	if re.match(r"^\d+\s+\d{12}\s+", line):
	parts = re.split(r'\s+', line, maxsplit=9) # Split only the first 9 elements to handle descriptions correctly
	if len(parts) >= 9:
	# Save the previous row if exists
	if current_row:
	extracted_data.append(current_row)
	current_row = {
	"Pos": parts[0],
	"Item Code": parts[1],
	"Description": "",
	"Unit": parts[2],
	"Delivery Date": parts[3],
	"Quantity": parts[4],
	"Basic Price": parts[5],
	"Discount": parts[6],
	"Cur": parts[7],
	"Amount": parts[8],
	"Sub Total": ""
	}
	elif "SUB TOTAL" in line and current_row:
	# Capture the Sub Total
	sub_total_match = re.search(r"SUB TOTAL\s:\s(\d+\.\d+)", line)
	if sub_total_match:
	current_row["Sub Total"] = sub_total_match.group(1)
	extracted_data.append(current_row)
	current_row = {}
	else:
	# Clean and append descriptions only
	if current_row and "Item Code" in current_row:
	clean_line = re.sub(
	r"(Calculation Method.\|Landed Cost.\|Central GST.\|State GST.\|Perc:.\|\d+\/\d+\|\d+-\d+-\d+\|Cal.Method:.*\|\/\d+\|\s{2,}\|[A-Za-z]+:[0-9\.]+)",
	"",
	line
	).strip()
	if clean_line:
	current_row["Description"] += f" {clean_line}".strip()

	# Add the last row if exists
	if current_row:
	extracted_data.append(current_row)

	# Combine Item Code and Description
	for row in extracted_data:
	if "Description" in row:
	row["Item Code"] = f"{row['Item Code']}\n{row['Description']}".strip()
	del row["Description"]

	# Convert to DataFrame
	columns = [
	"Pos", "Item Code", "Unit", "Delivery Date", "Quantity",
	"Basic Price", "Discount", "Cur", "Amount", "Sub Total"
	]
	df = pd.DataFrame(extracted_data, columns=columns)

	# Ensure Pos is numeric and filter rows for POS 10 to POS 450
	df['Pos'] = pd.to_numeric(df['Pos'], errors='coerce')
	df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]

	# Identify missing POS numbers
	expected_pos = set(range(10, 451))
	extracted_pos = set(df['Pos'].dropna().astype(int))
	missing_pos = sorted(expected_pos - extracted_pos)

	print("Missing POS numbers:", missing_pos) # Debug output to identify skipped POS numbers

	# Save as Excel for download
	output_path = "cleaned_extracted_po_data.xlsx"
	df.to_excel(output_path, index=False)
	return output_path

	# Gradio interface
	def process_pdf(file):
	excel_path = extract_cleaned_po_data(file.name)
	return excel_path

	iface = gr.Interface(
	fn=process_pdf,
	inputs=gr.File(label="Upload Toshiba PO PDF"),
	outputs=gr.File(label="Download Cleaned Extracted Excel"),
	title="Toshiba PO Data Extraction",
	description="Upload a Toshiba PO PDF file to extract cleaned data in the specified format and download as an Excel file.",
	)

	if __name__ == "__main__":
	iface.launch()