import gradio as gr import pdfplumber import pandas as pd import re def extract_cleaned_po_data(pdf_file): """ Extract and clean data from a Toshiba PO PDF file. """ extracted_data = [] current_row = {} with pdfplumber.open(pdf_file) as pdf: for page in pdf.pages: text = page.extract_text() if text: lines = text.split("\n") for line in lines: line = line.strip() # Match rows starting with POS and numeric Item Code if re.match(r"^\d+\s+\d{12}\s+", line): parts = re.split(r'\s+', line, maxsplit=9) # Split only the first 9 elements to handle descriptions correctly if len(parts) >= 9: # Save the previous row if exists if current_row: extracted_data.append(current_row) current_row = { "Pos": parts[0], "Item Code": parts[1], "Description": "", "Unit": parts[2], "Delivery Date": parts[3], "Quantity": parts[4], "Basic Price": parts[5], "Discount": parts[6], "Cur": parts[7], "Amount": parts[8], "Sub Total": "" } elif "SUB TOTAL" in line and current_row: # Capture the Sub Total sub_total_match = re.search(r"SUB TOTAL\s*:\s*(\d+\.\d+)", line) if sub_total_match: current_row["Sub Total"] = sub_total_match.group(1) extracted_data.append(current_row) current_row = {} else: # Clean and append descriptions only if current_row and "Item Code" in current_row: clean_line = re.sub( r"(Calculation Method.*|Landed Cost.*|Central GST.*|State GST.*|Perc:.*|\d+\/\d+|\d+-\d+-\d+|Cal.*Method:.*|\/\d+|\s{2,}|[A-Za-z]+:[0-9\.]+)", "", line ).strip() if clean_line: current_row["Description"] += f" {clean_line}".strip() # Add the last row if exists if current_row: extracted_data.append(current_row) # Combine Item Code and Description for row in extracted_data: if "Description" in row: row["Item Code"] = f"{row['Item Code']}\n{row['Description']}".strip() del row["Description"] # Convert to DataFrame columns = [ "Pos", "Item Code", "Unit", "Delivery Date", "Quantity", "Basic Price", "Discount", "Cur", "Amount", "Sub Total" ] df = pd.DataFrame(extracted_data, columns=columns) # Ensure Pos is numeric and filter rows for POS 10 to POS 450 df['Pos'] = pd.to_numeric(df['Pos'], errors='coerce') df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)] # Identify missing POS numbers expected_pos = set(range(10, 451)) extracted_pos = set(df['Pos'].dropna().astype(int)) missing_pos = sorted(expected_pos - extracted_pos) print("Missing POS numbers:", missing_pos) # Debug output to identify skipped POS numbers # Save as Excel for download output_path = "cleaned_extracted_po_data.xlsx" df.to_excel(output_path, index=False) return output_path # Gradio interface def process_pdf(file): excel_path = extract_cleaned_po_data(file.name) return excel_path iface = gr.Interface( fn=process_pdf, inputs=gr.File(label="Upload Toshiba PO PDF"), outputs=gr.File(label="Download Cleaned Extracted Excel"), title="Toshiba PO Data Extraction", description="Upload a Toshiba PO PDF file to extract cleaned data in the specified format and download as an Excel file.", ) if __name__ == "__main__": iface.launch()