|
import gradio as gr |
|
import pdfplumber |
|
import pandas as pd |
|
import re |
|
|
|
def extract_cleaned_po_data(pdf_file): |
|
""" |
|
Extract and clean data from a Toshiba PO PDF file. |
|
""" |
|
extracted_data = [] |
|
current_row = {} |
|
|
|
with pdfplumber.open(pdf_file) as pdf: |
|
for page in pdf.pages: |
|
text = page.extract_text() |
|
if text: |
|
lines = text.split("\n") |
|
for line in lines: |
|
line = line.strip() |
|
|
|
|
|
if re.match(r"^\d+\s+\d{12}\s+", line): |
|
parts = re.split(r'\s+', line, maxsplit=9) |
|
if len(parts) >= 9: |
|
|
|
if current_row: |
|
extracted_data.append(current_row) |
|
current_row = { |
|
"Pos": parts[0], |
|
"Item Code": parts[1], |
|
"Description": "", |
|
"Unit": parts[2], |
|
"Delivery Date": parts[3], |
|
"Quantity": parts[4], |
|
"Basic Price": parts[5], |
|
"Discount": parts[6], |
|
"Cur": parts[7], |
|
"Amount": parts[8], |
|
"Sub Total": "" |
|
} |
|
elif "SUB TOTAL" in line and current_row: |
|
|
|
sub_total_match = re.search(r"SUB TOTAL\s*:\s*(\d+\.\d+)", line) |
|
if sub_total_match: |
|
current_row["Sub Total"] = sub_total_match.group(1) |
|
extracted_data.append(current_row) |
|
current_row = {} |
|
else: |
|
|
|
if current_row and "Item Code" in current_row: |
|
clean_line = re.sub( |
|
r"(Calculation Method.*|Landed Cost.*|Central GST.*|State GST.*|Perc:.*|\d+\/\d+|\d+-\d+-\d+|Cal.*Method:.*|\/\d+|\s{2,}|[A-Za-z]+:[0-9\.]+)", |
|
"", |
|
line |
|
).strip() |
|
if clean_line: |
|
current_row["Description"] += f" {clean_line}".strip() |
|
|
|
|
|
if current_row: |
|
extracted_data.append(current_row) |
|
|
|
|
|
for row in extracted_data: |
|
if "Description" in row: |
|
row["Item Code"] = f"{row['Item Code']}\n{row['Description']}".strip() |
|
del row["Description"] |
|
|
|
|
|
columns = [ |
|
"Pos", "Item Code", "Unit", "Delivery Date", "Quantity", |
|
"Basic Price", "Discount", "Cur", "Amount", "Sub Total" |
|
] |
|
df = pd.DataFrame(extracted_data, columns=columns) |
|
|
|
|
|
df['Pos'] = pd.to_numeric(df['Pos'], errors='coerce') |
|
df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)] |
|
|
|
|
|
expected_pos = set(range(10, 451)) |
|
extracted_pos = set(df['Pos'].dropna().astype(int)) |
|
missing_pos = sorted(expected_pos - extracted_pos) |
|
|
|
print("Missing POS numbers:", missing_pos) |
|
|
|
|
|
output_path = "cleaned_extracted_po_data.xlsx" |
|
df.to_excel(output_path, index=False) |
|
return output_path |
|
|
|
|
|
def process_pdf(file): |
|
excel_path = extract_cleaned_po_data(file.name) |
|
return excel_path |
|
|
|
iface = gr.Interface( |
|
fn=process_pdf, |
|
inputs=gr.File(label="Upload Toshiba PO PDF"), |
|
outputs=gr.File(label="Download Cleaned Extracted Excel"), |
|
title="Toshiba PO Data Extraction", |
|
description="Upload a Toshiba PO PDF file to extract cleaned data in the specified format and download as an Excel file.", |
|
) |
|
|
|
if __name__ == "__main__": |
|
iface.launch() |
|
|