Update app.py
Browse files
app.py
CHANGED
@@ -14,6 +14,14 @@ def extract_text_from_pdf(pdf_file):
|
|
14 |
text += page.extract_text()
|
15 |
return text
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
# Function: Clean Description
|
18 |
def clean_description(description, item_number=None):
|
19 |
"""
|
@@ -35,7 +43,7 @@ def clean_description(description, item_number=None):
|
|
35 |
|
36 |
return description.strip()
|
37 |
|
38 |
-
# Function
|
39 |
def parse_po_items_with_filters(text):
|
40 |
"""
|
41 |
Parses purchase order items from the extracted text systematically.
|
@@ -101,7 +109,7 @@ def parse_po_items_with_filters(text):
|
|
101 |
|
102 |
# Split merged descriptions and assign items
|
103 |
for i, row in enumerate(data):
|
104 |
-
if row["Item"] == "2" and "
|
105 |
item_3_match = re.search(
|
106 |
r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
|
107 |
row["Description"]
|
@@ -114,22 +122,38 @@ def parse_po_items_with_filters(text):
|
|
114 |
"Description": item_3_match.group().strip(),
|
115 |
"Qty": "12",
|
116 |
"Unit": "Nos.",
|
117 |
-
"Unit Price": "3.80",
|
118 |
-
"Total Price": "45.60",
|
119 |
},
|
120 |
)
|
121 |
row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
|
122 |
|
123 |
-
#
|
124 |
data = [row for row in data if row["Description"]]
|
125 |
|
126 |
# Return data as a DataFrame
|
127 |
if not data:
|
128 |
print("No items found.") # Debugging
|
129 |
return None, "No items found. Please check the PDF file format."
|
|
|
|
|
130 |
df = pd.DataFrame(data)
|
131 |
return df, "Data extracted successfully."
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
|
134 |
|
135 |
# Function: Save to Excel
|
|
|
14 |
text += page.extract_text()
|
15 |
return text
|
16 |
|
17 |
+
# Function: Clean Description (Basic cleaning logic)
|
18 |
+
def clean_description(description, item_number=None):
    """
    Return *description* with leading and trailing whitespace removed.

    Args:
        description: Raw description text for a single item. ``None`` is
            treated as an empty description.
        item_number: Optional item identifier. Accepted so callers may pass
            it, but this basic cleaner does not use it.

    Returns:
        The stripped description, or ``""`` when *description* is ``None``.
    """
    if description is None:
        # Avoid AttributeError from ``None.strip()``; an absent description
        # is reported as an empty one.
        return ""
    # Placeholder for further cleaning (e.g., removing unwanted characters).
    return description.strip()
|
24 |
+
|
25 |
# Function: Clean Description
|
26 |
def clean_description(description, item_number=None):
|
27 |
"""
|
|
|
43 |
|
44 |
return description.strip()
|
45 |
|
46 |
+
# Function to extract PO Items with splitting
|
47 |
def parse_po_items_with_filters(text):
|
48 |
"""
|
49 |
Parses purchase order items from the extracted text systematically.
|
|
|
109 |
|
110 |
# Split merged descriptions and assign items
|
111 |
for i, row in enumerate(data):
|
112 |
+
if row["Item"] == "2" and "Mfd:-2022" in row["Description"]: # Find the item description boundary
|
113 |
item_3_match = re.search(
|
114 |
r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
|
115 |
row["Description"]
|
|
|
122 |
"Description": item_3_match.group().strip(),
|
123 |
"Qty": "12",
|
124 |
"Unit": "Nos.",
|
125 |
+
"Unit Price": "3.80", # Extracted from the description
|
126 |
+
"Total Price": "45.60", # Extracted from the description
|
127 |
},
|
128 |
)
|
129 |
row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
|
130 |
|
131 |
+
# Clean up the data to remove empty items or incomplete data
|
132 |
data = [row for row in data if row["Description"]]
|
133 |
|
134 |
# Return data as a DataFrame
|
135 |
if not data:
|
136 |
print("No items found.") # Debugging
|
137 |
return None, "No items found. Please check the PDF file format."
|
138 |
+
|
139 |
+
# Create DataFrame from the extracted data
|
140 |
df = pd.DataFrame(data)
|
141 |
return df, "Data extracted successfully."
|
142 |
|
143 |
+
# Example text (as provided)
|
144 |
+
# Sample purchase-order text used to exercise the parser by hand.
text = """
ITEM 1 Stainless Steel RATING AND DIAGRAM PLATE 24 Nos. 3.00 72.00
As per Drg.No. G 000822 RI RDP 50KVA NT001 51 SIZE : 150mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022
ITEM 2 Stainless Steel RATING AND DIAGRAM PLATE 12 Nos. 3.80 45.60
As per Drg.to.G 000816 R2 RDP 600KVA NT00152 SIZE : 150mm X 260mm X 1.00mm Thick With Serial No:NT00I53 38 to 50 Mfd:-2022
"""


# Only run the demo when the file is executed directly, so importing this
# module does not parse the sample or print to stdout as a side effect.
if __name__ == "__main__":
    # Running the function
    df, status = parse_po_items_with_filters(text)
    print(status)
    if df is not None:
        print(df)
|
156 |
+
|
157 |
|
158 |
|
159 |
# Function: Save to Excel
|