ALNISFPO / app.py
dschandra's picture
Update app.py
4fdb3ac verified
raw
history blame
4.13 kB
import re
import pandas as pd
import gradio as gr
def extract_po_data(text):
"""
Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
Args:
text (str): Raw text extracted from the PDF.
Returns:
tuple: A DataFrame containing structured data and a status message.
"""
lines = text.splitlines()
data = []
for line in lines:
# Match table row patterns with a flexible regex
row_match = re.match(
r"^\s*(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>(Nos\.|Set))\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)\s*$",
line,
)
if row_match:
# Extract fields
item = row_match.group("Item")
description = format_description(row_match.group("Description"))
qty = row_match.group("Qty")
unit = row_match.group("Unit")
unit_price = row_match.group("UnitPrice")
total_price = row_match.group("TotalPrice")
# Append to the data list
data.append(
{
"ITEM": item,
"DESCRIPTION": description,
"QTY": qty,
"UNIT": unit,
"UNIT PRICE": unit_price,
"TOTAL PRICE": total_price,
}
)
else:
# Log invalid row for debugging
print(f"Skipping line (does not match expected format): {line}")
# Convert to DataFrame
if not data:
return None, "No valid data found in the provided text."
df = pd.DataFrame(data)
return df, "Data extracted successfully."
def format_description(description):
"""
Formats the description field into multiple lines based on predefined structure.
Args:
description (str): Raw description text.
Returns:
str: Formatted description with line breaks.
"""
# Define patterns for splitting the description
line1 = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
line2 = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
line3 = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
line4 = re.search(r"With Serial No:.*", description)
# Build the formatted description
lines = []
if line1:
lines.append(line1.group().strip())
if line2:
lines.append(line2.group().strip())
if line3:
lines.append(line3.group().strip())
if line4:
lines.append(line4.group().strip())
return "\n".join(lines)
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
"""
Saves the extracted data to an Excel file.
Args:
df (pd.DataFrame): DataFrame containing the structured data.
output_path (str): Path to save the Excel file.
Returns:
str: Path to the saved file.
"""
df.to_excel(output_path, index=False)
return output_path
def process_text_input(text):
"""
Processes the raw text input, extracts data, and saves it to an Excel file.
Args:
text (str): Raw text input.
Returns:
str: Path to the saved Excel file.
"""
df, status = extract_po_data(text)
if df is not None:
output_path = save_to_excel(df)
return output_path, status
return None, status
# Gradio Interface
def create_interface():
"""
Creates a Gradio interface for processing PO data.
"""
interface = gr.Interface(
fn=process_text_input,
inputs=gr.Textbox(label="Paste Raw Text from PDF", lines=10, placeholder="Paste extracted text here..."),
outputs=[
gr.File(label="Download Extracted Excel"),
gr.Textbox(label="Status"),
],
title="PO Data Extraction",
description="Paste the raw text from the PDF to extract purchase order data into an Excel file.",
)
return interface
if __name__ == "__main__":
# Run the Gradio app
app = create_interface()
app.launch()