|
import re |
|
import pandas as pd |
|
import gradio as gr |
|
|
|
|
|
def extract_po_data(text): |
|
""" |
|
Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE. |
|
Args: |
|
text (str): Raw text extracted from the PDF. |
|
Returns: |
|
tuple: A DataFrame containing structured data and a status message. |
|
""" |
|
lines = text.splitlines() |
|
data = [] |
|
|
|
for line in lines: |
|
|
|
row_match = re.match( |
|
r"^\s*(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>(Nos\.|Set))\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)\s*$", |
|
line, |
|
) |
|
if row_match: |
|
|
|
item = row_match.group("Item") |
|
description = format_description(row_match.group("Description")) |
|
qty = row_match.group("Qty") |
|
unit = row_match.group("Unit") |
|
unit_price = row_match.group("UnitPrice") |
|
total_price = row_match.group("TotalPrice") |
|
|
|
|
|
data.append( |
|
{ |
|
"ITEM": item, |
|
"DESCRIPTION": description, |
|
"QTY": qty, |
|
"UNIT": unit, |
|
"UNIT PRICE": unit_price, |
|
"TOTAL PRICE": total_price, |
|
} |
|
) |
|
else: |
|
|
|
print(f"Skipping line (does not match expected format): {line}") |
|
|
|
|
|
if not data: |
|
return None, "No valid data found in the provided text." |
|
df = pd.DataFrame(data) |
|
return df, "Data extracted successfully." |
|
|
|
|
|
def format_description(description): |
|
""" |
|
Formats the description field into multiple lines based on predefined structure. |
|
Args: |
|
description (str): Raw description text. |
|
Returns: |
|
str: Formatted description with line breaks. |
|
""" |
|
|
|
line1 = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description) |
|
line2 = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description) |
|
line3 = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description) |
|
line4 = re.search(r"With Serial No:.*", description) |
|
|
|
|
|
lines = [] |
|
if line1: |
|
lines.append(line1.group().strip()) |
|
if line2: |
|
lines.append(line2.group().strip()) |
|
if line3: |
|
lines.append(line3.group().strip()) |
|
if line4: |
|
lines.append(line4.group().strip()) |
|
|
|
return "\n".join(lines) |
|
|
|
|
|
def save_to_excel(df, output_path="extracted_po_data.xlsx"): |
|
""" |
|
Saves the extracted data to an Excel file. |
|
Args: |
|
df (pd.DataFrame): DataFrame containing the structured data. |
|
output_path (str): Path to save the Excel file. |
|
Returns: |
|
str: Path to the saved file. |
|
""" |
|
df.to_excel(output_path, index=False) |
|
return output_path |
|
|
|
|
|
def process_text_input(text): |
|
""" |
|
Processes the raw text input, extracts data, and saves it to an Excel file. |
|
Args: |
|
text (str): Raw text input. |
|
Returns: |
|
str: Path to the saved Excel file. |
|
""" |
|
df, status = extract_po_data(text) |
|
if df is not None: |
|
output_path = save_to_excel(df) |
|
return output_path, status |
|
return None, status |
|
|
|
|
|
|
|
def create_interface(): |
|
""" |
|
Creates a Gradio interface for processing PO data. |
|
""" |
|
interface = gr.Interface( |
|
fn=process_text_input, |
|
inputs=gr.Textbox(label="Paste Raw Text from PDF", lines=10, placeholder="Paste extracted text here..."), |
|
outputs=[ |
|
gr.File(label="Download Extracted Excel"), |
|
gr.Textbox(label="Status"), |
|
], |
|
title="PO Data Extraction", |
|
description="Paste the raw text from the PDF to extract purchase order data into an Excel file.", |
|
) |
|
return interface |
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
app = create_interface() |
|
app.launch() |
|
|