import pdfplumber import pandas as pd import re import gradio as gr # Function: Extract Text from PDF def extract_text_from_pdf(pdf_file): """ Extracts raw text from a PDF file. """ with pdfplumber.open(pdf_file.name) as pdf: text = "" for page in pdf.pages: text += page.extract_text() return text # Function: Clean Description def clean_description(description, item_number=None): """ Cleans the description by removing unwanted patterns dynamically. """ # General unwanted patterns description = re.sub(r"Page \d+ of \d+.*", "", description) # Remove page references description = re.sub(r"TOTAL EX-WORK.*", "", description) # Remove EX-WORK-related text description = re.sub(r"NOTES:.*", "", description) # Remove notes section description = re.sub(r"HS CODE.*", "", description) # Remove HS CODE-related data description = re.sub(r"DELIVERY:.*", "", description) # Remove delivery instructions # Remove redundant quantity/price in descriptions description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description) # Specific fix for Item 7 if item_number == 7: description = re.sub(r"300 Sets 4.20 1260.00", "", description) return description.strip() # Function to clean item description def clean_description(description): """ Cleans up the description for an item to ensure it's correctly formatted. """ return description.strip() # Function to extract PO items with better error handling and checks def parse_po_items_with_filters(text): """ Parses purchase order items from the extracted text systematically. """ lines = text.splitlines() data = [] current_item = None description_accumulator = [] # Regex patterns qty_pattern = r"(?P\d+)\s+(Nos\.|Set|pcs)" # Match quantities price_pattern = r"(?P[\d.]+)\s+(?P[\d.]+)$" # Match prices for line in lines: print(f"Processing Line: {line}") # Debugging # Match the start of a new item item_match = re.match(r"^(?P\d+)\s+(?P.+)", line) if item_match: print(f"Item match found: {item_match.group('Item')}") # Debugging # Save the previous item if current_item is not None if current_item is not None: # Clean and add the description to the current item current_item["Description"] = clean_description(" ".join(description_accumulator).strip()) data.append(current_item) description_accumulator = [] # Reset description accumulator print(f"Item {current_item['Item']} added to data.") # Debugging # Start a new item current_item = { "Item": item_match.group("Item"), "Description": "", "Qty": "", "Unit": "", "Unit Price": "", "Total Price": "", } description_accumulator.append(item_match.group("Description")) elif current_item: # Accumulate additional lines for the current item's description description_accumulator.append(line.strip()) # Match Qty (quantity) in the line qty_match = re.search(qty_pattern, line) if qty_match: print(f"Qty match found: {qty_match.group('Qty')} {qty_match.group(2)}") # Debugging current_item["Qty"] = qty_match.group("Qty") current_item["Unit"] = qty_match.group(2) else: print(f"No Qty match found in line: {line}") # Debugging # Match Unit Price and Total Price price_match = re.search(price_pattern, line) if price_match: print(f"Price match found: {price_match.group('UnitPrice')} {price_match.group('TotalPrice')}") # Debugging current_item["Unit Price"] = price_match.group("UnitPrice") current_item["Total Price"] = price_match.group("TotalPrice") else: print(f"No price match found in line: {line}") # Debugging # Finalize the last item if current_item is not None: # Clean and add the description to the current item current_item["Description"] = clean_description(" ".join(description_accumulator).strip()) data.append(current_item) print(f"Finalized Item {current_item['Item']}") # Debugging # Split merged descriptions and assign items (if necessary) for i, row in enumerate(data): if row["Item"] == "2" and "Mfd:-2022" in row["Description"]: # Find the item description boundary item_3_match = re.search( r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)", row["Description"] ) if item_3_match: data.insert( i + 1, { "Item": "3", "Description": item_3_match.group().strip(), "Qty": "12", "Unit": "Nos.", "Unit Price": "3.80", # Extracted from the description "Total Price": "45.60", # Extracted from the description }, ) row["Description"] = row["Description"].replace(item_3_match.group(), "").strip() # Clean up the data to remove empty items or incomplete data data = [row for row in data if row["Description"]] # Return data as a DataFrame if not data: print("No items found.") # Debugging return None, "No items found. Please check the PDF file format." # Create DataFrame from the extracted data df = pd.DataFrame(data) return df, "Data extracted successfully." # Example text (as provided) text = """ ITEM 1 Stainless Steel RATING AND DIAGRAM PLATE 24 Nos. 3.00 72.00 As per Drg.No. G 000822 RI RDP 50KVA NT001 51 SIZE : 150mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022 ITEM 2 Stainless Steel RATING AND DIAGRAM PLATE 12 Nos. 3.80 45.60 As per Drg.to.G 000816 R2 RDP 600KVA NT00152 SIZE : 150mm X 260mm X 1.00mm Thick With Serial No:NT00I53 38 to 50 Mfd:-2022 """ # Running the function df, status = parse_po_items_with_filters(text) print(status) if df is not None: print(df) # Function: Save to Excel def save_to_excel(df, output_path="extracted_po_data.xlsx"): """ Saves the extracted data to an Excel file. """ df.to_excel(output_path, index=False) return output_path # Gradio Interface Function def process_pdf(file): """ Processes the uploaded PDF file and returns extracted data and status. """ try: text = extract_text_from_pdf(file) df, status = parse_po_items_with_filters(text) if df is not None: output_path = save_to_excel(df) return output_path, status return None, status except Exception as e: return None, f"Error during processing: {str(e)}" # Gradio Interface Setup def create_gradio_interface(): """ Creates a Gradio interface for PO data extraction. """ return gr.Interface( fn=process_pdf, inputs=gr.File(label="Upload PDF", file_types=[".pdf"]), outputs=[ gr.File(label="Download Extracted Data"), gr.Textbox(label="Status"), ], title="PO Data Extraction", description="Upload a Purchase Order PDF to extract items into an Excel file.", ) if __name__ == "__main__": interface = create_gradio_interface() interface.launch()