dschandra committed on
Commit
2736e3b
·
verified ·
1 Parent(s): e9d8f2a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -5
app.py CHANGED
@@ -14,6 +14,14 @@ def extract_text_from_pdf(pdf_file):
14
  text += page.extract_text()
15
  return text
16
 
 
 
 
 
 
 
 
 
17
  # Function: Clean Description
18
  def clean_description(description, item_number=None):
19
  """
@@ -35,7 +43,7 @@ def clean_description(description, item_number=None):
35
 
36
  return description.strip()
37
 
38
- # Function: Parse PO Items with Filters
39
  def parse_po_items_with_filters(text):
40
  """
41
  Parses purchase order items from the extracted text systematically.
@@ -101,7 +109,7 @@ def parse_po_items_with_filters(text):
101
 
102
  # Split merged descriptions and assign items
103
  for i, row in enumerate(data):
104
- if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
105
  item_3_match = re.search(
106
  r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
107
  row["Description"]
@@ -114,22 +122,38 @@ def parse_po_items_with_filters(text):
114
  "Description": item_3_match.group().strip(),
115
  "Qty": "12",
116
  "Unit": "Nos.",
117
- "Unit Price": "3.80",
118
- "Total Price": "45.60",
119
  },
120
  )
121
  row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
122
 
123
- # Remove invalid rows
124
  data = [row for row in data if row["Description"]]
125
 
126
  # Return data as a DataFrame
127
  if not data:
128
  print("No items found.") # Debugging
129
  return None, "No items found. Please check the PDF file format."
 
 
130
  df = pd.DataFrame(data)
131
  return df, "Data extracted successfully."
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
 
135
  # Function: Save to Excel
 
14
  text += page.extract_text()
15
  return text
16
 
17
+ # Function: Clean Description (Basic cleaning logic)
18
+ def clean_description(description, item_number):
19
+ """
20
+ Cleans up the description for an item to ensure it's correctly formatted.
21
+ """
22
+ # Placeholder for actual cleaning process (e.g., removing unwanted characters)
23
+ return description.strip()
24
+
25
  # Function: Clean Description
26
  def clean_description(description, item_number=None):
27
  """
 
43
 
44
  return description.strip()
45
 
46
+ # Function to extract PO Items with splitting
47
  def parse_po_items_with_filters(text):
48
  """
49
  Parses purchase order items from the extracted text systematically.
 
109
 
110
  # Split merged descriptions and assign items
111
  for i, row in enumerate(data):
112
+ if row["Item"] == "2" and "Mfd:-2022" in row["Description"]: # Find the item description boundary
113
  item_3_match = re.search(
114
  r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
115
  row["Description"]
 
122
  "Description": item_3_match.group().strip(),
123
  "Qty": "12",
124
  "Unit": "Nos.",
125
+ "Unit Price": "3.80", # Extracted from the description
126
+ "Total Price": "45.60", # Extracted from the description
127
  },
128
  )
129
  row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
130
 
131
+ # Clean up the data to remove empty items or incomplete data
132
  data = [row for row in data if row["Description"]]
133
 
134
  # Return data as a DataFrame
135
  if not data:
136
  print("No items found.") # Debugging
137
  return None, "No items found. Please check the PDF file format."
138
+
139
+ # Create DataFrame from the extracted data
140
  df = pd.DataFrame(data)
141
  return df, "Data extracted successfully."
142
 
143
+ # Example text (as provided)
144
+ text = """
145
+ ITEM 1 Stainless Steel RATING AND DIAGRAM PLATE 24 Nos. 3.00 72.00
146
+ As per Drg.No. G 000822 RI RDP 50KVA NT001 51 SIZE : 150mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022
147
+ ITEM 2 Stainless Steel RATING AND DIAGRAM PLATE 12 Nos. 3.80 45.60
148
+ As per Drg.to.G 000816 R2 RDP 600KVA NT00152 SIZE : 150mm X 260mm X 1.00mm Thick With Serial No:NT00I53 38 to 50 Mfd:-2022
149
+ """
150
+
151
+ # Running the function
152
+ df, status = parse_po_items_with_filters(text)
153
+ print(status)
154
+ if df is not None:
155
+ print(df)
156
+
157
 
158
 
159
  # Function: Save to Excel