ToshibaPO / app.py
dschandra's picture
Update app.py
f705371 verified
import gradio as gr
import pdfplumber
import pandas as pd
import re
def extract_cleaned_po_data(pdf_file):
"""
Extract and clean data from a Toshiba PO PDF file.
"""
extracted_data = []
current_row = {}
with pdfplumber.open(pdf_file) as pdf:
for page in pdf.pages:
text = page.extract_text()
if text:
lines = text.split("\n")
for line in lines:
line = line.strip()
# Match rows starting with POS and numeric Item Code
if re.match(r"^\d+\s+\d{12}\s+", line):
parts = re.split(r'\s+', line, maxsplit=9) # Split only the first 9 elements to handle descriptions correctly
if len(parts) >= 9:
# Save the previous row if exists
if current_row:
extracted_data.append(current_row)
current_row = {
"Pos": parts[0],
"Item Code": parts[1],
"Description": "",
"Unit": parts[2],
"Delivery Date": parts[3],
"Quantity": parts[4],
"Basic Price": parts[5],
"Discount": parts[6],
"Cur": parts[7],
"Amount": parts[8],
"Sub Total": ""
}
elif "SUB TOTAL" in line and current_row:
# Capture the Sub Total
sub_total_match = re.search(r"SUB TOTAL\s*:\s*(\d+\.\d+)", line)
if sub_total_match:
current_row["Sub Total"] = sub_total_match.group(1)
extracted_data.append(current_row)
current_row = {}
else:
# Clean and append descriptions only
if current_row and "Item Code" in current_row:
clean_line = re.sub(
r"(Calculation Method.*|Landed Cost.*|Central GST.*|State GST.*|Perc:.*|\d+\/\d+|\d+-\d+-\d+|Cal.*Method:.*|\/\d+|\s{2,}|[A-Za-z]+:[0-9\.]+)",
"",
line
).strip()
if clean_line:
current_row["Description"] += f" {clean_line}".strip()
# Add the last row if exists
if current_row:
extracted_data.append(current_row)
# Combine Item Code and Description
for row in extracted_data:
if "Description" in row:
row["Item Code"] = f"{row['Item Code']}\n{row['Description']}".strip()
del row["Description"]
# Convert to DataFrame
columns = [
"Pos", "Item Code", "Unit", "Delivery Date", "Quantity",
"Basic Price", "Discount", "Cur", "Amount", "Sub Total"
]
df = pd.DataFrame(extracted_data, columns=columns)
# Ensure Pos is numeric and filter rows for POS 10 to POS 450
df['Pos'] = pd.to_numeric(df['Pos'], errors='coerce')
df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]
# Identify missing POS numbers
expected_pos = set(range(10, 451))
extracted_pos = set(df['Pos'].dropna().astype(int))
missing_pos = sorted(expected_pos - extracted_pos)
print("Missing POS numbers:", missing_pos) # Debug output to identify skipped POS numbers
# Save as Excel for download
output_path = "cleaned_extracted_po_data.xlsx"
df.to_excel(output_path, index=False)
return output_path
# Gradio interface
def process_pdf(file):
excel_path = extract_cleaned_po_data(file.name)
return excel_path
iface = gr.Interface(
fn=process_pdf,
inputs=gr.File(label="Upload Toshiba PO PDF"),
outputs=gr.File(label="Download Cleaned Extracted Excel"),
title="Toshiba PO Data Extraction",
description="Upload a Toshiba PO PDF file to extract cleaned data in the specified format and download as an Excel file.",
)
if __name__ == "__main__":
iface.launch()