Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 3, 2024

Commit

f09760f

verified ·

1 Parent(s): 21b7e40

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -123

app.py CHANGED Viewed

@@ -1,118 +1,64 @@
-import pdfplumber
-import pandas as pd
 import re
-import gradio as gr
-# Function: Extract Text from PDF
-def extract_text_from_pdf(pdf_file):
-    with pdfplumber.open(pdf_file.name) as pdf:
-        text = ""
-        for page in pdf.pages:
-            text += page.extract_text()
-    return text
-# Function: Clean Description
-def clean_description(description, item_number=None):
-    """
-    Cleans the description by removing unwanted data such as Qty, Unit, Unit Price, Total Price, and other invalid entries.
-    Args:
-        description (str): Raw description string.
-        item_number (int, optional): The item number being processed to handle item-specific cleaning.
-    Returns:
-        str: Cleaned description.
-    """
-    # Remove common unwanted patterns
-    description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description)  # Remove Qty + Unit + Price
-    description = re.sub(r"Page \d+ of \d+.*", "", description)  # Remove page references
-    description = re.sub(r"\(Q\. No:.*?\)", "", description)  # Remove Q.No-related data
-    description = re.sub(r"TOTAL EX-WORK.*", "", description)  # Remove EX-WORK-related text
-    description = re.sub(r"NOTES:.*", "", description)  # Remove notes section
-    description = re.sub(r"HS CODE.*", "", description)  # Remove HS CODE-related data
-    description = re.sub(r"DELIVERY:.*", "", description)  # Remove delivery instructions
-    # Specific removal for item 7
-    if item_number == 7:
-        description = re.sub(r"\b300 Sets 4.20 1260.00\b", "", description)
-    return description.strip()
-def parse_po_items_with_filters(text):
     """
-    Parses purchase order items from the extracted text using regex with filters.
-    Ensures items are formatted correctly into rows and columns.
     Args:
-        text (str): Extracted text from the PDF.
     Returns:
-        tuple: A DataFrame with parsed data and a status message.
     """
     lines = text.splitlines()
     data = []
-    current_item = None
-    description_accumulator = []
     for line in lines:
-        # Match the start of a new item row (e.g., Item No. followed by description)
-        item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
-        if item_match:
-            # Save the previous item
-            if current_item:
-                current_item["Description"] = format_description(
-                    " ".join(description_accumulator).strip()
-                )
-                data.append(current_item)
-                description_accumulator = []
-            # Start a new item
-            current_item = {
-                "Item": item_match.group("Item"),
-                "Description": "",
-                "Qty": "",
-                "Unit": "",
-                "Unit Price": "",
-                "Total Price": "",
-            }
-            description_accumulator.append(item_match.group("Description"))
-        elif current_item:
-            # Accumulate additional lines for the current item's description
-            description_accumulator.append(line.strip())
-        # Match Quantity, Unit, Unit Price, and Total Price
-        qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
-        if qty_match:
-            current_item["Qty"] = qty_match.group("Qty")
-            current_item["Unit"] = qty_match.group(2)
-        price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$", line)
-        if price_match:
-            current_item["Unit Price"] = price_match.group("UnitPrice")
-            current_item["Total Price"] = price_match.group("TotalPrice")
-    # Save the last item
-    if current_item:
-        current_item["Description"] = format_description(
-            " ".join(description_accumulator).strip()
         )
-        data.append(current_item)
-    # Remove empty rows
-    data = [row for row in data if row["Description"]]
-    # Return data as a DataFrame
     if not data:
-        return None, "No items found. Please check the PDF file format."
     df = pd.DataFrame(data)
     return df, "Data extracted successfully."
 def format_description(description):
     """
-    Formats the description into multiple lines based on patterns.
     Args:
         description (str): Raw description text.
     Returns:
-        str: Formatted description.
     """
-    # Break the description into multiple lines
     line1 = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
     line2 = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
     line3 = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
@@ -132,36 +78,18 @@ def format_description(description):
     return "\n".join(lines)
-# Function: Save to Excel
-def save_to_excel(df, output_path="extracted_po_data.xlsx"):
-    df.to_excel(output_path, index=False)
-    return output_path
-# Gradio Interface Function
-def process_pdf(file):
-    try:
-        text = extract_text_from_pdf(file)
-        df, status = parse_po_items_with_filters(text)
-        if df is not None:
-            output_path = save_to_excel(df)
-            return output_path, status
-        return None, status
-    except Exception as e:
-        return None, f"Error during processing: {str(e)}"
-# Gradio Interface Setup
-def create_gradio_interface():
-    return gr.Interface(
-        fn=process_pdf,
-        inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
-        outputs=[
-            gr.File(label="Download Extracted Data"),
-            gr.Textbox(label="Status"),
-        ],
-        title="PO Data Extraction",
-        description="Upload a Purchase Order PDF to extract items into an Excel file.",
-    )
-if __name__ == "__main__":
-    interface = create_gradio_interface()
-    interface.launch()

 import re
+import pandas as pd
+def extract_po_data(text):
     """
+    Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
     Args:
+        text (str): Raw text extracted from the PDF.
     Returns:
+        tuple: A DataFrame containing structured data and a status message.
     """
     lines = text.splitlines()
     data = []
     for line in lines:
+        # Match table row patterns
+        row_match = re.match(
+            r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>(Nos\.|Set))\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$",
+            line,
         )
+        if row_match:
+            # Extract fields
+            item = row_match.group("Item")
+            description = format_description(row_match.group("Description"))
+            qty = row_match.group("Qty")
+            unit = row_match.group("Unit")
+            unit_price = row_match.group("UnitPrice")
+            total_price = row_match.group("TotalPrice")
+            # Append to the data list
+            data.append(
+                {
+                    "ITEM": item,
+                    "DESCRIPTION": description,
+                    "QTY": qty,
+                    "UNIT": unit,
+                    "UNIT PRICE": unit_price,
+                    "TOTAL PRICE": total_price,
+                }
+            )
+        else:
+            # Log invalid row for debugging
+            print(f"Skipping line (does not match expected format): {line}")
+    # Convert to DataFrame
     if not data:
+        return None, "No valid data found in the provided text."
     df = pd.DataFrame(data)
     return df, "Data extracted successfully."
 def format_description(description):
     """
+    Formats the description field into multiple lines based on predefined structure.
     Args:
         description (str): Raw description text.
     Returns:
+        str: Formatted description with line breaks.
     """
+    # Define patterns for splitting the description
     line1 = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
     line2 = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
     line3 = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
     return "\n".join(lines)
+# Example Usage
+if __name__ == "__main__":
+    # Example raw text (replace this with actual extracted text from PDF)
+    raw_text = """
+    1 Stainless Steel RATING AND DIAGRAM PLATE As per Drg.No. G 000822 RI RDP 50KVA NT00l 51 SIZE : l50mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022 24 Nos. 3.00 72.00
+    """
+    # Extract data
+    df, status = extract_po_data(raw_text)
+    # Output results
+    if df is not None:
+        print(df)
+    else:
+        print(status)