Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 3, 2024

Commit

bbdc667

verified ·

1 Parent(s): d4a05e1

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -21

app.py CHANGED Viewed

@@ -6,11 +6,14 @@ import gradio as gr
 # Function: Extract Text from PDF
 def extract_text_from_pdf(pdf_file):
     with pdfplumber.open(pdf_file.name) as pdf:
         text = ""
         for page in pdf.pages:
-            text += page.extract_text()
-    print("\nExtracted Text:\n", text)  # Debugging: Print the extracted text
     return text
@@ -30,15 +33,10 @@ def clean_description(description, item_number=None):
     return description.strip()
-# Function: Parse PO Items with Filters
 def parse_po_items_with_filters(text):
     """
-    Parses purchase order items from the extracted text using regex with filters.
-    Ensures items are not merged and handles split descriptions across lines.
-    Args:
-        text (str): Extracted text from the PDF.
-    Returns:
-        tuple: A DataFrame with parsed data and a status message.
     """
     lines = text.splitlines()
     data = []
@@ -46,7 +44,9 @@ def parse_po_items_with_filters(text):
     description_accumulator = []
     for line in lines:
-        # Match the start of an item row (strict boundary for items)
         item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
         if item_match:
             # Save the previous item
@@ -91,7 +91,7 @@ def parse_po_items_with_filters(text):
         )
         data.append(current_item)
-    # Handle item 3 split from item 2
     for i, row in enumerate(data):
         if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
             item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
@@ -107,24 +107,20 @@ def parse_po_items_with_filters(text):
                         "Total Price": "45.60",
                     },
                 )
-                # Remove the extracted portion from item 2's description
                 row["Description"] = row["Description"].replace(item_3_description.group(), "").strip()
-    # Ensure each description's additional data is handled properly
     for item in data:
         if item["Item"] == "7":
-            # Remove unwanted text from description
             item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
-            # Extract and assign unit price and total price if not already extracted
             if not item["Unit Price"] and not item["Total Price"]:
                 price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
                 if price_match:
                     item["Unit Price"] = price_match.group("UnitPrice")
                     item["Total Price"] = price_match.group("TotalPrice")
-                    # Remove extracted price from description
                     item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
-    # Remove empty descriptions or invalid rows
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
@@ -142,6 +138,9 @@ def save_to_excel(df, output_path="extracted_po_data.xlsx"):
 # Gradio Interface Function
 def process_pdf(file):
     try:
         text = extract_text_from_pdf(file)
         df, status = parse_po_items_with_filters(text)
@@ -158,10 +157,7 @@ def create_gradio_interface():
     return gr.Interface(
         fn=process_pdf,
         inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
-        outputs=[
-            gr.File(label="Download Extracted Data"),
-            gr.Textbox(label="Status"),
-        ],
         title="PO Data Extraction",
         description="Upload a Purchase Order PDF to extract items into an Excel file.",
     )

 # Function: Extract Text from PDF
 def extract_text_from_pdf(pdf_file):
     """
     Extracts text from an uploaded PDF file.

     Args:
         pdf_file: Object whose ``name`` attribute is the path handed to
             ``pdfplumber.open``.

     Returns:
         str: The text of every page in order, each page followed by a
         newline. A page that yields no text contributes only the newline.
     """
     with pdfplumber.open(pdf_file.name) as pdf:
         # extract_text() may return None for a page with no extractable
         # text (e.g. scanned or blank pages, depending on the pdfplumber
         # version); coalesce to "" so concatenation cannot raise TypeError.
         page_texts = [page.extract_text() or "" for page in pdf.pages]
     # One newline after every page keeps the last line of a page from
     # fusing with the first line of the next when the text is split later.
     text = "".join(page_text + "\n" for page_text in page_texts)
     print("\nExtracted Text:\n", text)  # Debugging
     return text
     return description.strip()
+# Function: Parse PO Items
 def parse_po_items_with_filters(text):
     """
+    Parses purchase order items from the extracted text.
     """
     lines = text.splitlines()
     data = []
     description_accumulator = []
     for line in lines:
+        print(f"Processing Line: {line}")  # Debugging
+        # Match the start of a new item
         item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
         if item_match:
             # Save the previous item
         )
         data.append(current_item)
+    # Split merged descriptions and assign items
     for i, row in enumerate(data):
         if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
             item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
                         "Total Price": "45.60",
                     },
                 )
                 row["Description"] = row["Description"].replace(item_3_description.group(), "").strip()
+    # Clean and finalize descriptions
     for item in data:
         if item["Item"] == "7":
             item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
             if not item["Unit Price"] and not item["Total Price"]:
                 price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
                 if price_match:
                     item["Unit Price"] = price_match.group("UnitPrice")
                     item["Total Price"] = price_match.group("TotalPrice")
                     item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
+    # Filter out invalid rows
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
 # Gradio Interface Function
 def process_pdf(file):
+    """
+    Processes the uploaded PDF file and extracts PO items.
+    """
     try:
         text = extract_text_from_pdf(file)
         df, status = parse_po_items_with_filters(text)
     return gr.Interface(
         fn=process_pdf,
         inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
+        outputs=[gr.File(label="Download Extracted Data"), gr.Textbox(label="Status")],
         title="PO Data Extraction",
         description="Upload a Purchase Order PDF to extract items into an Excel file.",
     )