import gradio as gr
import pdfplumber
import pandas as pd
import re

def extract_cleaned_po_data(pdf_file):
    """
    Extract and clean data from a Toshiba PO PDF file.
    """
    extracted_data = []
    current_row = {}

    with pdfplumber.open(pdf_file) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                lines = text.split("\n")
                for line in lines:
                    line = line.strip()

                    # Match rows starting with POS and numeric Item Code
                    if re.match(r"^\d+\s+\d{12}\s+", line):
                        parts = re.split(r'\s+', line, maxsplit=9)  # Split only the first 9 elements to handle descriptions correctly
                        if len(parts) >= 9:
                            # Save the previous row if exists
                            if current_row:
                                extracted_data.append(current_row)
                            current_row = {
                                "Pos": parts[0],
                                "Item Code": parts[1],
                                "Description": "",
                                "Unit": parts[2],
                                "Delivery Date": parts[3],
                                "Quantity": parts[4],
                                "Basic Price": parts[5],
                                "Discount": parts[6],
                                "Cur": parts[7],
                                "Amount": parts[8],
                                "Sub Total": ""
                            }
                    elif "SUB TOTAL" in line and current_row:
                        # Capture the Sub Total
                        sub_total_match = re.search(r"SUB TOTAL\s*:\s*(\d+\.\d+)", line)
                        if sub_total_match:
                            current_row["Sub Total"] = sub_total_match.group(1)
                        extracted_data.append(current_row)
                        current_row = {}
                    else:
                        # Clean and append descriptions only
                        if current_row and "Item Code" in current_row:
                            clean_line = re.sub(
                                r"(Calculation Method.*|Landed Cost.*|Central GST.*|State GST.*|Perc:.*|\d+\/\d+|\d+-\d+-\d+|Cal.*Method:.*|\/\d+|\s{2,}|[A-Za-z]+:[0-9\.]+)",
                                "",
                                line
                            ).strip()
                            if clean_line:
                                current_row["Description"] += f" {clean_line}".strip()

    # Add the last row if exists
    if current_row:
        extracted_data.append(current_row)

    # Combine Item Code and Description
    for row in extracted_data:
        if "Description" in row:
            row["Item Code"] = f"{row['Item Code']}\n{row['Description']}".strip()
            del row["Description"]

    # Convert to DataFrame
    columns = [
        "Pos", "Item Code", "Unit", "Delivery Date", "Quantity",
        "Basic Price", "Discount", "Cur", "Amount", "Sub Total"
    ]
    df = pd.DataFrame(extracted_data, columns=columns)

    # Ensure Pos is numeric and filter rows for POS 10 to POS 450
    df['Pos'] = pd.to_numeric(df['Pos'], errors='coerce')
    df = df[(df['Pos'] >= 10) & (df['Pos'] <= 450)]

    # Identify missing POS numbers
    expected_pos = set(range(10, 451))
    extracted_pos = set(df['Pos'].dropna().astype(int))
    missing_pos = sorted(expected_pos - extracted_pos)

    print("Missing POS numbers:", missing_pos)  # Debug output to identify skipped POS numbers

    # Save as Excel for download
    output_path = "cleaned_extracted_po_data.xlsx"
    df.to_excel(output_path, index=False)
    return output_path

# Gradio interface
def process_pdf(file):
    excel_path = extract_cleaned_po_data(file.name)
    return excel_path

iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload Toshiba PO PDF"),
    outputs=gr.File(label="Download Cleaned Extracted Excel"),
    title="Toshiba PO Data Extraction",
    description="Upload a Toshiba PO PDF file to extract cleaned data in the specified format and download as an Excel file.",
)

if __name__ == "__main__":
    iface.launch()