|
import pdfplumber |
|
import pandas as pd |
|
import re |
|
import gradio as gr |
|
|
|
|
|
def extract_text_from_pdf(pdf_file): |
|
""" |
|
Extracts raw text from a PDF file. |
|
""" |
|
with pdfplumber.open(pdf_file.name) as pdf: |
|
text = "" |
|
for page in pdf.pages: |
|
text += page.extract_text() |
|
return text |
|
|
|
|
|
def clean_description(description, item_number=None): |
|
""" |
|
Cleans the description by removing unwanted patterns dynamically. |
|
""" |
|
|
|
description = re.sub(r"Page \d+ of \d+.*", "", description) |
|
description = re.sub(r"TOTAL EX-WORK.*", "", description) |
|
description = re.sub(r"NOTES:.*", "", description) |
|
description = re.sub(r"HS CODE.*", "", description) |
|
description = re.sub(r"DELIVERY:.*", "", description) |
|
|
|
|
|
description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description) |
|
|
|
|
|
if item_number == 7: |
|
description = re.sub(r"300 Sets 4.20 1260.00", "", description) |
|
|
|
return description.strip() |
|
|
|
|
|
def clean_description(description): |
|
""" |
|
Cleans up the description for an item to ensure it's correctly formatted. |
|
""" |
|
return description.strip() |
|
|
|
|
|
def parse_po_items_with_filters(text): |
|
""" |
|
Parses purchase order items from the extracted text systematically. |
|
""" |
|
lines = text.splitlines() |
|
data = [] |
|
current_item = None |
|
description_accumulator = [] |
|
|
|
|
|
qty_pattern = r"(?P<Qty>\d+)\s+(Nos\.|Set|pcs)" |
|
price_pattern = r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$" |
|
|
|
for line in lines: |
|
print(f"Processing Line: {line}") |
|
|
|
|
|
item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line) |
|
if item_match: |
|
print(f"Item match found: {item_match.group('Item')}") |
|
|
|
|
|
if current_item is not None: |
|
|
|
current_item["Description"] = clean_description(" ".join(description_accumulator).strip()) |
|
data.append(current_item) |
|
description_accumulator = [] |
|
print(f"Item {current_item['Item']} added to data.") |
|
|
|
|
|
current_item = { |
|
"Item": item_match.group("Item"), |
|
"Description": "", |
|
"Qty": "", |
|
"Unit": "", |
|
"Unit Price": "", |
|
"Total Price": "", |
|
} |
|
description_accumulator.append(item_match.group("Description")) |
|
elif current_item: |
|
|
|
description_accumulator.append(line.strip()) |
|
|
|
|
|
qty_match = re.search(qty_pattern, line) |
|
if qty_match: |
|
print(f"Qty match found: {qty_match.group('Qty')} {qty_match.group(2)}") |
|
current_item["Qty"] = qty_match.group("Qty") |
|
current_item["Unit"] = qty_match.group(2) |
|
else: |
|
print(f"No Qty match found in line: {line}") |
|
current_item["Qty"] = "Not Found" |
|
|
|
|
|
price_match = re.search(price_pattern, line) |
|
if price_match: |
|
print(f"Price match found: {price_match.group('UnitPrice')} {price_match.group('TotalPrice')}") |
|
current_item["Unit Price"] = price_match.group("UnitPrice") |
|
current_item["Total Price"] = price_match.group("TotalPrice") |
|
else: |
|
print(f"No price match found in line: {line}") |
|
current_item["Unit Price"] = "Not Found" |
|
current_item["Total Price"] = "Not Found" |
|
|
|
|
|
if current_item is not None: |
|
|
|
current_item["Description"] = clean_description(" ".join(description_accumulator).strip()) |
|
data.append(current_item) |
|
print(f"Finalized Item {current_item['Item']}") |
|
|
|
|
|
for i, row in enumerate(data): |
|
if row["Item"] == "2" and "Mfd:-2022" in row["Description"]: |
|
item_3_match = re.search( |
|
r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)", |
|
row["Description"] |
|
) |
|
if item_3_match: |
|
|
|
data.insert( |
|
i + 1, |
|
{ |
|
"Item": "3", |
|
"Description": item_3_match.group().strip(), |
|
"Qty": "12", |
|
"Unit": "Nos.", |
|
"Unit Price": "3.80", |
|
"Total Price": "45.60", |
|
}, |
|
) |
|
row["Description"] = row["Description"].replace(item_3_match.group(), "").strip() |
|
|
|
|
|
data = [row for row in data if row["Description"]] |
|
|
|
|
|
if not data: |
|
print("No items found.") |
|
return None, "No items found. Please check the PDF file format." |
|
|
|
|
|
df = pd.DataFrame(data) |
|
return df, "Data extracted successfully." |
|
|
|
|
|
text = """ |
|
ITEM 1 Stainless Steel RATING AND DIAGRAM PLATE 24 Nos. 3.00 72.00 |
|
As per Drg.No. G 000822 RI RDP 50KVA NT001 51 SIZE : 150mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022 |
|
ITEM 2 Stainless Steel RATING AND DIAGRAM PLATE 12 Nos. 3.80 45.60 |
|
As per Drg.to.G 000816 R2 RDP 600KVA NT00152 SIZE : 150mm X 260mm X 1.00mm Thick With Serial No:NT00I53 38 to 50 Mfd:-2022 |
|
""" |
|
|
|
|
|
df, status = parse_po_items_with_filters(text) |
|
print(status) |
|
if df is not None: |
|
print(df) |
|
|
|
|
|
def save_to_excel(df, output_path="extracted_po_data.xlsx"): |
|
""" |
|
Saves the extracted data to an Excel file. |
|
""" |
|
df.to_excel(output_path, index=False) |
|
return output_path |
|
|
|
|
|
def process_pdf(file): |
|
""" |
|
Processes the uploaded PDF file and returns extracted data and status. |
|
""" |
|
try: |
|
text = extract_text_from_pdf(file) |
|
df, status = parse_po_items_with_filters(text) |
|
if df is not None: |
|
output_path = save_to_excel(df) |
|
return output_path, status |
|
return None, status |
|
except Exception as e: |
|
return None, f"Error during processing: {str(e)}" |
|
|
|
|
|
def create_gradio_interface(): |
|
""" |
|
Creates a Gradio interface for PO data extraction. |
|
""" |
|
return gr.Interface( |
|
fn=process_pdf, |
|
inputs=gr.File(label="Upload PDF", file_types=[".pdf"]), |
|
outputs=[ |
|
gr.File(label="Download Extracted Data"), |
|
gr.Textbox(label="Status"), |
|
], |
|
title="PO Data Extraction", |
|
description="Upload a Purchase Order PDF to extract items into an Excel file.", |
|
) |
|
|
|
if __name__ == "__main__": |
|
|
|
interface = create_gradio_interface() |
|
interface.launch() |
|
|