"""Turn pasted purchase-order text into an Excel sheet via a small Gradio app.

The parsing helpers (``extract_po_data``, ``format_description``) work on
plain strings and only need ``re`` and ``pandas``; Gradio is used solely by
``create_interface``.
"""

import re

import pandas as pd

try:
    import gradio as gr
except ImportError:
    # Gradio is only needed to build the web UI. Keeping the import optional
    # lets the parsing helpers be imported without the UI toolkit installed;
    # create_interface() reports the missing dependency when it is called.
    gr = None


# One table row, anchored to the whole line:
#   ITEM  DESCRIPTION  QTY  UNIT  UNIT PRICE  TOTAL PRICE
# The description is matched lazily so that the trailing
# "<qty> <unit> <unit price> <total price>" columns are peeled off the end.
# Only the units "Nos." and "Set" are recognised.
_ROW_PATTERN = re.compile(
    r"^\s*(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+"
    r"(?P<Unit>(Nos\.|Set))\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)\s*$"
)

# Fragments of a description, in the order they are emitted (one per line).
_DESCRIPTION_PATTERNS = (
    re.compile(r"Stainless Steel RATING AND DIAGRAM PLATE"),
    # The lookahead accepts a drawing number followed by whitespace OR by the
    # end of the description; requiring a whitespace character here would drop
    # this line whenever the drawing number is the last thing in the text.
    re.compile(r"As per Drg\.No\..*?[A-Z0-9]+(?=\s|$)"),
    re.compile(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick"),
    re.compile(r"With Serial No:.*"),
)


def extract_po_data(text):
    """
    Extracts purchase order data from the text into structured rows with
    ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.

    Each line is matched independently against the row pattern; lines that do
    not match are reported on stdout and skipped. All extracted values are
    kept as strings, exactly as they appear in the text (the description is
    additionally passed through ``format_description``).

    Args:
        text (str): Raw text extracted from the PDF. ``None`` or an empty
            string is treated as "no lines".

    Returns:
        tuple: ``(DataFrame, status message)`` when at least one row matched,
        otherwise ``(None, status message)``.
    """
    rows = []
    for line in (text or "").splitlines():
        row_match = _ROW_PATTERN.match(line)
        if row_match is None:
            # Log invalid row for debugging
            print(f"Skipping line (does not match expected format): {line}")
            continue

        rows.append(
            {
                "ITEM": row_match.group("Item"),
                "DESCRIPTION": format_description(row_match.group("Description")),
                "QTY": row_match.group("Qty"),
                "UNIT": row_match.group("Unit"),
                "UNIT PRICE": row_match.group("UnitPrice"),
                "TOTAL PRICE": row_match.group("TotalPrice"),
            }
        )

    if not rows:
        return None, "No valid data found in the provided text."

    return pd.DataFrame(rows), "Data extracted successfully."


def format_description(description):
    """
    Formats the description field into multiple lines based on predefined
    structure.

    Each known fragment (plate name, drawing number, size, serial number) is
    searched for in ``description``; the fragments that are found are joined
    with newlines in that fixed order, regardless of where they occur in the
    input.

    Args:
        description (str): Raw description text.

    Returns:
        str: Formatted description with line breaks. If none of the known
        fragments is present, the stripped raw description is returned so the
        text is not lost.
    """
    found = (pattern.search(description) for pattern in _DESCRIPTION_PATTERNS)
    parts = [match.group().strip() for match in found if match]

    if not parts:
        # Unrecognised layout: keep the original text instead of emitting an
        # empty DESCRIPTION cell.
        return description.strip()

    return "\n".join(parts)


def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Saves the extracted data to an Excel file.

    Args:
        df (pd.DataFrame): DataFrame containing the structured data.
        output_path (str): Path to save the Excel file.

    Returns:
        str: Path to the saved file.
    """
    df.to_excel(output_path, index=False)
    return output_path


def process_text_input(text):
    """
    Processes the raw text input, extracts data, and saves it to an Excel file.

    Args:
        text (str): Raw text input.

    Returns:
        tuple: ``(path to the saved Excel file, status message)`` when rows
        were extracted, otherwise ``(None, status message)``; no file is
        written in the latter case.
    """
    df, status = extract_po_data(text)
    if df is None:
        return None, status

    return save_to_excel(df), status


# Gradio Interface
def create_interface():
    """
    Creates a Gradio interface for processing PO data.

    The interface takes pasted text and returns the generated Excel file
    together with a status message (the two values produced by
    ``process_text_input``).

    Returns:
        gr.Interface: The configured, not yet launched, interface.

    Raises:
        ImportError: If Gradio is not installed.
    """
    if gr is None:
        raise ImportError(
            "gradio is required to build the web interface; "
            "install it with 'pip install gradio'."
        )

    interface = gr.Interface(
        fn=process_text_input,
        inputs=gr.Textbox(
            label="Paste Raw Text from PDF",
            lines=10,
            placeholder="Paste extracted text here...",
        ),
        outputs=[
            gr.File(label="Download Extracted Excel"),
            gr.Textbox(label="Status"),
        ],
        title="PO Data Extraction",
        description="Paste the raw text from the PDF to extract purchase order data into an Excel file.",
    )
    return interface


if __name__ == "__main__":
    # Run the Gradio app
    app = create_interface()
    app.launch()