Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 3, 2024

Commit

704df50

verified ·

1 Parent(s): 391b6b5

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -23

app.py CHANGED Viewed

@@ -3,41 +3,42 @@ import pandas as pd
 import re
 import gradio as gr
 # Function: Extract Text from PDF
 def extract_text_from_pdf(pdf_file):
     """
     Extracts text from an uploaded PDF file.

     Args:
         pdf_file: Object exposing a ``name`` attribute; that value is handed
             to ``pdfplumber.open`` (presumably the path of the uploaded file).

     Returns:
         str: The text of every page in document order, each page followed by
         a newline. Pages that yield no text contribute only the newline.
     """
     with pdfplumber.open(pdf_file.name) as pdf:
         # extract_text() can return None for a page with no extractable text
         # (e.g. a scanned image); fall back to "" so the concatenation does
         # not raise TypeError.
         text = "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
     print("\nExtracted Text:\n", text)  # Debugging
     return text
 # Function: Clean Description
 def clean_description(description, item_number=None):
     """
     Cleans the description by removing unwanted data such as page references,
     EX-WORK totals, notes, HS codes, delivery instructions and Q.No references.

     Args:
         description: Raw description text of a purchase-order item.
         item_number: Optional item identifier. When it is 7 (either the int
             or the string "7"), the literal text "300 Sets 4.20 1260.00" is
             removed as well.

     Returns:
         str: The cleaned description with surrounding whitespace stripped.
     """
     # Applied in this order; each pattern removes from its marker to the end
     # of the line, except the Q.No pattern which removes one parenthesised group.
     unwanted_patterns = (
         r"Page \d+ of \d+.*",  # Remove page references
         r"TOTAL EX-WORK.*",  # Remove EX-WORK-related text
         r"NOTES:.*",  # Remove notes section
         r"HS CODE.*",  # Remove HS CODE-related data
         r"DELIVERY:.*",  # Remove delivery instructions
         r"\(Q\. No:.*?\)",  # Remove Q.No-related data
     )
     for pattern in unwanted_patterns:
         description = re.sub(pattern, "", description)
     # Item ids are handled as strings elsewhere in this file, so accept both forms.
     if item_number in (7, "7"):
         # Plain substring removal: the text is a literal, and using it as a
         # regex would let each "." match any character.
         description = description.replace("300 Sets 4.20 1260.00", "")
     return description.strip()
-# Function: Parse PO Items
 def parse_po_items_with_filters(text):
     """
-    Parses purchase order items from the extracted text systematically, avoiding merging issues.
-    Ensures Item 3 is split correctly from Item 2.
     """
     lines = text.splitlines()
     data = []
@@ -95,8 +96,11 @@ def parse_po_items_with_filters(text):
     # Split merged descriptions and assign items
     for i, row in enumerate(data):
         if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
-            # Dynamically split merged descriptions for Item 3
-            item_3_match = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
             if item_3_match:
                 # Insert Item 3 into the data list
                 data.insert(
@@ -110,10 +114,10 @@ def parse_po_items_with_filters(text):
                         "Total Price": "45.60",
                     },
                 )
-                # Remove the extracted portion from Item 2's description
                 row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
-    # Clean and finalize descriptions
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
@@ -122,18 +126,18 @@ def parse_po_items_with_filters(text):
     df = pd.DataFrame(data)
     return df, "Data extracted successfully."
 # Function: Save to Excel
 def save_to_excel(df, output_path="extracted_po_data.xlsx"):
     """
     Writes ``df`` to an Excel file and hands back the file's location.

     Args:
         df: Object providing ``to_excel`` (a pandas DataFrame in this app).
         output_path: Destination of the workbook.

     Returns:
         The same ``output_path`` that was written to.
     """
     destination = output_path
     # index=False keeps the row index out of the sheet.
     df.to_excel(destination, index=False)
     return destination
 # Gradio Interface Function
 def process_pdf(file):
     """
-    Processes the uploaded PDF file and extracts PO items.
     """
     try:
         text = extract_text_from_pdf(file)
@@ -145,18 +149,22 @@ def process_pdf(file):
     except Exception as e:
         return None, f"Error during processing: {str(e)}"
 # Gradio Interface Setup
 def create_gradio_interface():
     """
     Assembles the Gradio interface for PO data extraction.

     Returns:
         A ``gr.Interface`` that feeds one uploaded PDF to ``process_pdf`` and
         shows a downloadable file plus a status text box.
     """
     # Components are created in the same order as they are wired below.
     pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
     result_components = [
         gr.File(label="Download Extracted Data"),
         gr.Textbox(label="Status"),
     ]
     interface_options = {
         "fn": process_pdf,
         "inputs": pdf_upload,
         "outputs": result_components,
         "title": "PO Data Extraction",
         "description": "Upload a Purchase Order PDF to extract items into an Excel file.",
     }
     return gr.Interface(**interface_options)
 # Script entry point: build the interface and launch it only when this file is
 # executed directly, not when it is imported.
 if __name__ == "__main__":
     interface = create_gradio_interface()
     interface.launch()

 import re
 import gradio as gr
 # Function: Extract Text from PDF
 def extract_text_from_pdf(pdf_file):
     """
     Extracts raw text from a PDF file.

     Args:
         pdf_file: Object exposing a ``name`` attribute; that value is handed
             to ``pdfplumber.open`` (presumably the path of the uploaded file).

     Returns:
         str: The text of all pages in document order, joined by newlines.
         Pages that yield no text contribute an empty string.
     """
     with pdfplumber.open(pdf_file.name) as pdf:
         # Join pages with a newline so the last line of one page is not fused
         # with the first line of the next; the parser in this file splits the
         # text with splitlines(). extract_text() can return None for a page
         # with no extractable text, so fall back to "".
         text = "\n".join(page.extract_text() or "" for page in pdf.pages)
     return text
 # Function: Clean Description
 def clean_description(description, item_number=None):
     """
     Cleans the description by removing unwanted patterns dynamically.

     Args:
         description: Raw description text of a purchase-order item.
         item_number: Optional item identifier. When it is 7 (either the int
             or the string "7"), the literal text "300 Sets 4.20 1260.00" is
             removed as well.

     Returns:
         str: The cleaned description with surrounding whitespace stripped.
     """
     # Applied in this order. The first five remove from their marker to the
     # end of the line; the last removes an embedded
     # "<qty> Nos.|Set <unit price> <total>" run.
     unwanted_patterns = (
         r"Page \d+ of \d+.*",  # Remove page references
         r"TOTAL EX-WORK.*",  # Remove EX-WORK-related text
         r"NOTES:.*",  # Remove notes section
         r"HS CODE.*",  # Remove HS CODE-related data
         r"DELIVERY:.*",  # Remove delivery instructions
         r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+",  # Remove redundant quantity/price
     )
     for pattern in unwanted_patterns:
         description = re.sub(pattern, "", description)
     # Specific fix for Item 7: the general quantity pattern does not match
     # "Sets". Item ids are handled as strings elsewhere in this file, so
     # accept both forms.
     if item_number in (7, "7"):
         # Plain substring removal: the text is a literal, and using it as a
         # regex would let each "." match any character.
         description = description.replace("300 Sets 4.20 1260.00", "")
     return description.strip()
+# Function: Parse PO Items with Filters
 def parse_po_items_with_filters(text):
     """
+    Parses purchase order items from the extracted text systematically.
     """
     lines = text.splitlines()
     data = []
     # Split merged descriptions and assign items
     for i, row in enumerate(data):
         if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
+            # Dynamically identify and split Item 3's description
+            item_3_match = re.search(
+                r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
+                row["Description"]
+            )
             if item_3_match:
                 # Insert Item 3 into the data list
                 data.insert(
                         "Total Price": "45.60",
                     },
                 )
+                # Remove extracted Item 3 description from Item 2's description
                 row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
+    # Remove invalid rows
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
     df = pd.DataFrame(data)
     return df, "Data extracted successfully."
 # Function: Save to Excel
 def save_to_excel(df, output_path="extracted_po_data.xlsx"):
     """
     Writes ``df`` to an Excel file and hands back the file's location.

     Args:
         df: Object providing ``to_excel`` (a pandas DataFrame in this app).
         output_path: Destination of the workbook.

     Returns:
         The same ``output_path`` that was written to.
     """
     destination = output_path
     # index=False keeps the row index out of the sheet.
     df.to_excel(destination, index=False)
     return destination
 # Gradio Interface Function
 def process_pdf(file):
     """
+    Processes the uploaded PDF file and returns extracted data and status.
     """
     try:
         text = extract_text_from_pdf(file)
     except Exception as e:
         return None, f"Error during processing: {str(e)}"
 # Gradio Interface Setup
 def create_gradio_interface():
     """
     Assembles the Gradio interface for PO data extraction.

     Returns:
         A ``gr.Interface`` that feeds one uploaded PDF to ``process_pdf`` and
         shows a downloadable file plus a status text box.
     """
     # Components are created in the same order as they are wired below.
     pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
     result_components = [
         gr.File(label="Download Extracted Data"),
         gr.Textbox(label="Status"),
     ]
     interface_options = {
         "fn": process_pdf,
         "inputs": pdf_upload,
         "outputs": result_components,
         "title": "PO Data Extraction",
         "description": "Upload a Purchase Order PDF to extract items into an Excel file.",
     }
     return gr.Interface(**interface_options)
 # Script entry point: build the interface and launch it only when this file is
 # executed directly, not when it is imported.
 if __name__ == "__main__":
     interface = create_gradio_interface()
     interface.launch()