|
import logging
import re

import gradio as gr
import pandas as pd
import pdfplumber
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Extract the raw text of every page of a PDF, one page after another.

    Args:
        pdf_file: Either an object exposing the PDF's filesystem path as
            ``.name`` (what the original code required) or the path itself.

    Returns:
        The text of all pages joined with newlines. Pages that yield no
        text contribute an empty string.
    """
    # Fall back to the argument itself when it has no ``.name`` attribute,
    # so a plain path string works as well as an upload object.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    with pdfplumber.open(pdf_path) as pdf:
        # ``extract_text()`` can come back empty/None for a page without
        # extractable text; ``or ""`` keeps the join from failing.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Join with a newline so the last line of one page is not fused with the
    # first line of the next (the downstream parser works line by line).
    return "\n".join(page_texts)
|
|
|
|
|
# Boilerplate markers: each one, and everything after it, is dropped.
_DESCRIPTION_NOISE_RES = tuple(
    re.compile(pattern)
    for pattern in (
        r"Page \d+ of \d+.*",
        r"TOTAL EX-WORK.*",
        r"NOTES:.*",
        r"HS CODE.*",
        r"DELIVERY:.*",
    )
)

# "<qty> <unit> <unit price> <total price>" fragments. ``Sets?`` covers both
# "Set" and "Sets", which makes the former item-7-only literal removal of
# "300 Sets 4.20 1260.00" unnecessary.
_QTY_PRICE_RE = re.compile(r"\d+\s+(?:Nos\.|Sets?)\s+[\d.]+\s+[\d.]+")

_WHITESPACE_RUN_RE = re.compile(r"\s{2,}")


def clean_description(description, item_number=None):
    """Strip page/footer boilerplate and quantity/price fragments.

    Args:
        description: The accumulated description text of one item.
        item_number: Accepted for backward compatibility. It used to enable
            a hard-coded removal for item 7; that case is now handled by the
            general quantity/price pattern, so the value is no longer used.

    Returns:
        The cleaned description with runs of whitespace collapsed to a
        single space and surrounding whitespace removed.
    """
    for noise_re in _DESCRIPTION_NOISE_RES:
        description = noise_re.sub("", description)

    description = _QTY_PRICE_RE.sub("", description)

    # Removing a fragment from the middle leaves a double space behind.
    return _WHITESPACE_RUN_RE.sub(" ", description).strip()
|
|
|
|
|
# A line that opens a new item: leading item number, then description text.
_ITEM_START_RE = re.compile(r"^(?P<Item>\d+)\s+(?P<Description>.+)")
# Quantity followed by its unit.
_QTY_UNIT_RE = re.compile(r"(?P<Qty>\d+)\s+(Nos\.|Set)")
# Two numbers at the very end of a line: unit price, then total price.
_PRICE_PAIR_RE = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$")
# Markers that close the item currently being collected.
_ITEM_END_RE = re.compile(r"(Mfd:-2022|\(NT00192\)|SIZE)")

_NO_ITEMS_MESSAGE = "No items found. Please check the PDF file format."


def _finalize_item(item, description_parts):
    """Store the cleaned, space-joined description on *item* and return it."""
    item["Description"] = clean_description(
        " ".join(description_parts).strip(),
        item_number=int(item["Item"]),
    )
    return item


def parse_po_items_with_filters(text):
    """Parse purchase-order item rows out of extracted PDF text.

    The text is scanned line by line. A line starting with a number opens a
    new item; following lines are appended to its description until the next
    item starts or an end marker (``Mfd:-2022``, ``(NT00192)``, ``SIZE``) is
    seen. Quantity/unit and unit/total price are picked up from any line that
    belongs to the open item; prices are not read from end-marker lines.

    Args:
        text: The extracted text, lines separated by newlines.

    Returns:
        ``(DataFrame, status)`` with the columns Item, Description, Qty,
        Unit, Unit Price and Total Price when at least one item with a
        non-empty description was found, otherwise ``(None, status)``.
    """
    if not text:
        return None, _NO_ITEMS_MESSAGE

    logger = logging.getLogger(__name__)
    data = []
    current_item = None
    description_accumulator = []

    for line in text.splitlines():
        # Debug-level logging instead of printing every line to stdout.
        logger.debug("Processing Line: %s", line)

        item_match = _ITEM_START_RE.match(line)
        if item_match:
            # A new item begins: flush the one collected so far.
            if current_item:
                data.append(_finalize_item(current_item, description_accumulator))
                description_accumulator = []

            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_accumulator.append(item_match.group("Description"))
        elif current_item:
            # Continuation line of the open item's description.
            description_accumulator.append(line.strip())

        # Lines outside any item (headers, or text after an end marker) have
        # nothing to update. Without this guard a quantity or price match on
        # such a line would subscript ``None`` and raise TypeError.
        if current_item is None:
            continue

        qty_match = _QTY_UNIT_RE.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group(2)

        if _ITEM_END_RE.search(line):
            # End marker: close the item; trailing numbers on this line are
            # deliberately not treated as prices.
            data.append(_finalize_item(current_item, description_accumulator))
            description_accumulator = []
            current_item = None
        else:
            price_match = _PRICE_PAIR_RE.search(line)
            if price_match:
                current_item["Unit Price"] = price_match.group("UnitPrice")
                current_item["Total Price"] = price_match.group("TotalPrice")

    # Flush an item still open when the text ends.
    if current_item:
        data.append(_finalize_item(current_item, description_accumulator))

    # Drop rows whose description was cleaned down to nothing.
    data = [row for row in data if row["Description"]]

    if not data:
        return None, _NO_ITEMS_MESSAGE
    return pd.DataFrame(data), "Data extracted successfully."
|
|
|
|
|
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """Write *df* to an Excel file and hand back the path it was written to.

    Args:
        df: The DataFrame to export; its index is not written.
        output_path: Destination of the Excel file.

    Returns:
        ``output_path``, unchanged.
    """
    include_index = False
    df.to_excel(output_path, index=include_index)
    return output_path
|
|
|
|
|
def process_pdf(file):
    """Run the full PDF -> Excel extraction for one uploaded file.

    Args:
        file: The uploaded PDF as passed in by the UI, or ``None`` when
            nothing was uploaded.

    Returns:
        ``(excel_path, status)`` on success, ``(None, status)`` when no file
        was given, no items were found, or processing failed.
    """
    if file is None:
        # Submitting without an upload used to surface as a cryptic
        # "'NoneType' object has no attribute 'name'" error message.
        return None, "No file uploaded. Please upload a PDF file."

    try:
        text = extract_text_from_pdf(file)
        df, status = parse_po_items_with_filters(text)
        if df is None:
            return None, status
        # NOTE(review): save_to_excel writes to its fixed default path, so
        # concurrent requests would overwrite each other's output - confirm
        # whether per-request paths are needed.
        output_path = save_to_excel(df)
        return output_path, status
    except Exception as e:
        # UI boundary: report the failure as a status message, but keep the
        # traceback in the log instead of discarding it.
        logging.getLogger(__name__).exception("Error during PDF processing")
        return None, f"Error during processing: {e}"
|
|
|
|
|
def create_gradio_interface():
    """Build the Gradio interface around ``process_pdf``.

    The interface takes a single PDF upload and shows two outputs: the
    downloadable extracted file and a status text box.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_components = [
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ]

    interface_options = {
        "fn": process_pdf,
        "inputs": pdf_upload,
        "outputs": result_components,
        "title": "PO Data Extraction",
        "description": "Upload a Purchase Order PDF to extract items into an Excel file.",
    }
    return gr.Interface(**interface_options)
|
|
|
if __name__ == "__main__":
    # Script entry point: build the interface and start serving it.
    create_gradio_interface().launch()
|
|