Spaces:

dschandra
/

ALNISFPO

Sleeping

App Files Files Community

dschandra committed on Dec 7, 2024

Commit

2736e3b

verified ·

1 Parent(s): e9d8f2a

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -5

app.py CHANGED Viewed

@@ -14,6 +14,14 @@ def extract_text_from_pdf(pdf_file):
             text += page.extract_text()
     return text
 # Function: Clean Description
 def clean_description(description, item_number=None):
     """
@@ -35,7 +43,7 @@ def clean_description(description, item_number=None):
     return description.strip()
-# Function: Parse PO Items with Filters
 def parse_po_items_with_filters(text):
     """
     Parses purchase order items from the extracted text systematically.
@@ -101,7 +109,7 @@ def parse_po_items_with_filters(text):
     # Split merged descriptions and assign items
     for i, row in enumerate(data):
-        if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
             item_3_match = re.search(
                 r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
                 row["Description"]
@@ -114,22 +122,38 @@ def parse_po_items_with_filters(text):
                         "Description": item_3_match.group().strip(),
                         "Qty": "12",
                         "Unit": "Nos.",
-                        "Unit Price": "3.80",
-                        "Total Price": "45.60",
                     },
                 )
                 row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
-    # Remove invalid rows
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
     if not data:
         print("No items found.")  # Debugging
         return None, "No items found. Please check the PDF file format."
     df = pd.DataFrame(data)
     return df, "Data extracted successfully."
 # Function: Save to Excel

             text += page.extract_text()
     return text
+# Function: Clean Description (basic cleaning logic)
+def clean_description(description, item_number):
+    """
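+    Returns the description with leading and trailing whitespace stripped.
+
+    item_number is accepted but not used. NOTE: a second clean_description
+    (with item_number=None) is defined directly below and overrides this one.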
+    """
+    # Placeholder for actual cleaning process (e.g., removing unwanted characters)
+    return description.strip()
 # Function: Clean Description
 def clean_description(description, item_number=None):
     """
     return description.strip()
+# Function to extract PO Items with splitting
 def parse_po_items_with_filters(text):
     """
     Parses purchase order items from the extracted text systematically.
     # Split merged descriptions and assign items
     for i, row in enumerate(data):
+        if row["Item"] == "2" and "Mfd:-2022" in row["Description"]:  # Find the item description boundary
             item_3_match = re.search(
                 r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
                 row["Description"]
                         "Description": item_3_match.group().strip(),
                         "Qty": "12",
                         "Unit": "Nos.",
+                        "Unit Price": "3.80",  # Hard-coded literal; not parsed from the text
+                        "Total Price": "45.60",  # Hard-coded literal; not parsed from the text
                     },
                 )
                 row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
+    # Drop rows whose Description is empty
     data = [row for row in data if row["Description"]]
     # Return data as a DataFrame
     if not data:
         print("No items found.")  # Debugging
         return None, "No items found. Please check the PDF file format."
+    # Create DataFrame from the extracted data
     df = pd.DataFrame(data)
     return df, "Data extracted successfully."
+# Example text (as provided)
+text = """
+ITEM 1 Stainless Steel RATING AND DIAGRAM PLATE 24 Nos. 3.00 72.00
+As per Drg.No. G 000822 RI RDP 50KVA NT001 51 SIZE : 150mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022
+ITEM 2 Stainless Steel RATING AND DIAGRAM PLATE 12 Nos. 3.80 45.60
+As per Drg.to.G 000816 R2 RDP 600KVA NT00152 SIZE : 150mm X 260mm X 1.00mm Thick With Serial No:NT00I53 38 to 50 Mfd:-2022
+"""
+# Running the function
+df, status = parse_po_items_with_filters(text)
+print(status)
+if df is not None:
+    print(df)
 # Function: Save to Excel