Update app.py
Browse files
app.py
CHANGED
|
@@ -14,6 +14,14 @@ def extract_text_from_pdf(pdf_file):
|
|
| 14 |
text += page.extract_text()
|
| 15 |
return text
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# Function: Clean Description
|
| 18 |
def clean_description(description, item_number=None):
|
| 19 |
"""
|
|
@@ -35,7 +43,7 @@ def clean_description(description, item_number=None):
|
|
| 35 |
|
| 36 |
return description.strip()
|
| 37 |
|
| 38 |
-
# Function
|
| 39 |
def parse_po_items_with_filters(text):
|
| 40 |
"""
|
| 41 |
Parses purchase order items from the extracted text systematically.
|
|
@@ -101,7 +109,7 @@ def parse_po_items_with_filters(text):
|
|
| 101 |
|
| 102 |
# Split merged descriptions and assign items
|
| 103 |
for i, row in enumerate(data):
|
| 104 |
-
if row["Item"] == "2" and "
|
| 105 |
item_3_match = re.search(
|
| 106 |
r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
|
| 107 |
row["Description"]
|
|
@@ -114,22 +122,38 @@ def parse_po_items_with_filters(text):
|
|
| 114 |
"Description": item_3_match.group().strip(),
|
| 115 |
"Qty": "12",
|
| 116 |
"Unit": "Nos.",
|
| 117 |
-
"Unit Price": "3.80",
|
| 118 |
-
"Total Price": "45.60",
|
| 119 |
},
|
| 120 |
)
|
| 121 |
row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
|
| 122 |
|
| 123 |
-
#
|
| 124 |
data = [row for row in data if row["Description"]]
|
| 125 |
|
| 126 |
# Return data as a DataFrame
|
| 127 |
if not data:
|
| 128 |
print("No items found.") # Debugging
|
| 129 |
return None, "No items found. Please check the PDF file format."
|
|
|
|
|
|
|
| 130 |
df = pd.DataFrame(data)
|
| 131 |
return df, "Data extracted successfully."
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
|
| 135 |
# Function: Save to Excel
|
|
|
|
| 14 |
text += page.extract_text()
|
| 15 |
return text
|
| 16 |
|
| 17 |
+
# Function: Clean Description (Basic cleaning logic)
|
| 18 |
+
def clean_description(description, item_number=None):
    """
    Return the item description with surrounding whitespace removed.

    This is a basic placeholder: no character filtering or other cleanup
    is performed beyond ``str.strip()``.

    Args:
        description: Raw description text for a purchase-order item.
        item_number: Accepted but not used by this implementation. It is
            optional so the signature agrees with the other
            ``clean_description(description, item_number=None)`` definition
            in this file.

    Returns:
        The description with leading and trailing whitespace stripped.

    NOTE(review): a second ``clean_description`` is defined further down in
    this file and rebinds the name, so this basic version appears to be
    shadowed at import time -- confirm which one is intended to survive.
    """
    # Placeholder for the real cleaning process (e.g. removing unwanted
    # characters); for now only whitespace trimming is applied.
    return description.strip()
|
| 24 |
+
|
| 25 |
# Function: Clean Description
|
| 26 |
def clean_description(description, item_number=None):
|
| 27 |
"""
|
|
|
|
| 43 |
|
| 44 |
return description.strip()
|
| 45 |
|
| 46 |
+
# Function to extract PO Items with splitting
|
| 47 |
def parse_po_items_with_filters(text):
|
| 48 |
"""
|
| 49 |
Parses purchase order items from the extracted text systematically.
|
|
|
|
| 109 |
|
| 110 |
# Split merged descriptions and assign items
|
| 111 |
for i, row in enumerate(data):
|
| 112 |
+
if row["Item"] == "2" and "Mfd:-2022" in row["Description"]: # Find the item description boundary
|
| 113 |
item_3_match = re.search(
|
| 114 |
r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
|
| 115 |
row["Description"]
|
|
|
|
| 122 |
"Description": item_3_match.group().strip(),
|
| 123 |
"Qty": "12",
|
| 124 |
"Unit": "Nos.",
|
| 125 |
+
"Unit Price": "3.80", # Extracted from the description
|
| 126 |
+
"Total Price": "45.60", # Extracted from the description
|
| 127 |
},
|
| 128 |
)
|
| 129 |
row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
|
| 130 |
|
| 131 |
+
# Clean up the data to remove empty items or incomplete data
|
| 132 |
data = [row for row in data if row["Description"]]
|
| 133 |
|
| 134 |
# Return data as a DataFrame
|
| 135 |
if not data:
|
| 136 |
print("No items found.") # Debugging
|
| 137 |
return None, "No items found. Please check the PDF file format."
|
| 138 |
+
|
| 139 |
+
# Create DataFrame from the extracted data
|
| 140 |
df = pd.DataFrame(data)
|
| 141 |
return df, "Data extracted successfully."
|
| 142 |
|
| 143 |
+
# Sample purchase-order text used to exercise the parser below: two "ITEM"
# header rows, each followed by a continuation line of description details.
text = """
ITEM 1 Stainless Steel RATING AND DIAGRAM PLATE 24 Nos. 3.00 72.00
As per Drg.No. G 000822 RI RDP 50KVA NT001 51 SIZE : 150mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022
ITEM 2 Stainless Steel RATING AND DIAGRAM PLATE 12 Nos. 3.80 45.60
As per Drg.to.G 000816 R2 RDP 600KVA NT00152 SIZE : 150mm X 260mm X 1.00mm Thick With Serial No:NT00I53 38 to 50 Mfd:-2022
"""

# Run the parser on the sample. It returns a (DataFrame-or-None, message)
# pair: always show the message, and show the rows only when a DataFrame
# was actually produced.
df, status = parse_po_items_with_filters(text)
print(status)
if df is not None:
    print(df)
|
| 156 |
+
|
| 157 |
|
| 158 |
|
| 159 |
# Function: Save to Excel
|