Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

ALNISFPO / app.py

dschandra

Update app.py

4fdb3ac verified 10 months ago

raw

history blame

4.13 kB

	import re
	import pandas as pd
	import gradio as gr


	def extract_po_data(text):
	    """
	    Extract purchase order rows from raw text into a structured table.

	    Each line of ``text`` is tested against a single row pattern made of
	    whitespace-separated fields: item number, description, quantity, unit
	    (``Nos.`` or ``Set``), unit price and total price. Lines that do not
	    match are printed for debugging and skipped. Captured values are stored
	    as strings; the description is passed through ``format_description``.

	    Args:
	        text (str): Raw text extracted from the PDF. Empty or ``None`` input
	            yields no rows.
	    Returns:
	        tuple: ``(DataFrame, status message)`` when at least one row matched,
	        otherwise ``(None, status message)``.
	    """
	    # Leading/trailing whitespace is optional (``\s*``) and the unit is a real
	    # alternation (``Nos\.|Set``); the earlier pattern required exactly one
	    # space at each end and a literal "|" in the unit, so real rows never matched.
	    row_pattern = re.compile(
	        r"^\s*(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+"
	        r"(?P<Unit>Nos\.|Set)\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)\s*$"
	    )

	    data = []
	    for line in (text or "").splitlines():
	        row_match = row_pattern.match(line)
	        if row_match is None:
	            # Log invalid row for debugging
	            print(f"Skipping line (does not match expected format): {line}")
	            continue

	        data.append(
	            {
	                "ITEM": row_match.group("Item"),
	                "DESCRIPTION": format_description(row_match.group("Description")),
	                "QTY": row_match.group("Qty"),
	                "UNIT": row_match.group("Unit"),
	                "UNIT PRICE": row_match.group("UnitPrice"),
	                "TOTAL PRICE": row_match.group("TotalPrice"),
	            }
	        )

	    if not data:
	        return None, "No valid data found in the provided text."
	    return pd.DataFrame(data), "Data extracted successfully."


	def format_description(description):
	    """
	    Format a raw description into one line per recognised segment.

	    Four fixed patterns are searched for in ``description``: the plate title,
	    the drawing-number reference, the size specification and the serial-number
	    note. Every segment that is found is stripped and emitted on its own line,
	    always in that fixed order regardless of where it occurs in the input.

	    Args:
	        description (str): Raw description text.
	    Returns:
	        str: The found segments joined with newlines. If none of the segments
	        is present, the stripped raw description is returned so the text is
	        not lost from the output.
	    """
	    # Patterns for the known segments, in output order.
	    segment_patterns = (
	        r"Stainless Steel RATING AND DIAGRAM PLATE",
	        r"As per Drg\.No\..*?[A-Z0-9]+\s",
	        r"SIZE\s:\s\d+mm\sX\s\d+mm\sX\s[\d.]+mm\s*Thick",
	        r"With Serial No:.*",
	    )

	    matches = (re.search(pattern, description) for pattern in segment_patterns)
	    lines = [match.group().strip() for match in matches if match]

	    if not lines:
	        # Unrecognised layout: keep the original text instead of returning "".
	        return description.strip()
	    return "\n".join(lines)


	def save_to_excel(df, output_path="extracted_po_data.xlsx"):
	    """
	    Write the structured data to an Excel workbook.

	    The DataFrame index is not written to the sheet.

	    Args:
	        df (pd.DataFrame): Table of extracted rows to persist.
	        output_path (str): Destination of the Excel file.
	    Returns:
	        str: The same ``output_path`` that was written to.
	    """
	    df.to_excel(excel_writer=output_path, index=False)
	    return output_path


	def process_text_input(text):
	    """
	    Run extraction on pasted text and, on success, export it to Excel.

	    Args:
	        text (str): Raw text input.
	    Returns:
	        tuple: ``(path to the saved Excel file, status message)`` when rows
	        were extracted, otherwise ``(None, status message)``.
	    """
	    frame, message = extract_po_data(text)

	    # Nothing extracted: report the status without writing a file.
	    if frame is None:
	        return None, message

	    return save_to_excel(frame), message


	# Gradio Interface
	def create_interface():
	    """
	    Build the Gradio interface that wires ``process_text_input`` to the UI.

	    The UI takes one multi-line text box as input and shows a downloadable
	    file plus a status text box as outputs.

	    Returns:
	        gr.Interface: The configured (not yet launched) interface.
	    """
	    raw_text_box = gr.Textbox(
	        label="Paste Raw Text from PDF",
	        lines=10,
	        placeholder="Paste extracted text here...",
	    )
	    result_components = [
	        gr.File(label="Download Extracted Excel"),
	        gr.Textbox(label="Status"),
	    ]

	    return gr.Interface(
	        fn=process_text_input,
	        inputs=raw_text_box,
	        outputs=result_components,
	        title="PO Data Extraction",
	        description="Paste the raw text from the PDF to extract purchase order data into an Excel file.",
	    )


	if __name__ == "__main__":
	    # Script entry point: build the interface and start serving it.
	    app = create_interface()
	    app.launch()