dschandra committed on
Commit
704df50
·
verified ·
1 Parent(s): 391b6b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -23
app.py CHANGED
@@ -3,41 +3,42 @@ import pandas as pd
3
  import re
4
  import gradio as gr
5
 
6
-
7
  # Function: Extract Text from PDF
8
  def extract_text_from_pdf(pdf_file):
9
  """
10
- Extracts text from an uploaded PDF file.
11
  """
12
  with pdfplumber.open(pdf_file.name) as pdf:
13
  text = ""
14
  for page in pdf.pages:
15
- text += page.extract_text() + "\n"
16
- print("\nExtracted Text:\n", text) # Debugging
17
  return text
18
 
19
-
20
  # Function: Clean Description
21
  def clean_description(description, item_number=None):
22
  """
23
- Cleans the description by removing unwanted data such as Qty, Unit, Unit Price, Total Price, and other invalid entries.
24
  """
 
25
  description = re.sub(r"Page \d+ of \d+.*", "", description) # Remove page references
26
  description = re.sub(r"TOTAL EX-WORK.*", "", description) # Remove EX-WORK-related text
27
  description = re.sub(r"NOTES:.*", "", description) # Remove notes section
28
  description = re.sub(r"HS CODE.*", "", description) # Remove HS CODE-related data
29
  description = re.sub(r"DELIVERY:.*", "", description) # Remove delivery instructions
30
- description = re.sub(r"\(Q\. No:.*?\)", "", description) # Remove Q.No-related data
 
 
 
 
31
  if item_number == 7:
32
- description = re.sub(r"300 Sets 4.20 1260.00", "", description) # Remove unwanted text in item 7
33
- return description.strip()
34
 
 
35
 
36
- # Function: Parse PO Items
37
  def parse_po_items_with_filters(text):
38
  """
39
- Parses purchase order items from the extracted text systematically, avoiding merging issues.
40
- Ensures Item 3 is split correctly from Item 2.
41
  """
42
  lines = text.splitlines()
43
  data = []
@@ -95,8 +96,11 @@ def parse_po_items_with_filters(text):
95
  # Split merged descriptions and assign items
96
  for i, row in enumerate(data):
97
  if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
98
- # Dynamically split merged descriptions for Item 3
99
- item_3_match = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
 
 
 
100
  if item_3_match:
101
  # Insert Item 3 into the data list
102
  data.insert(
@@ -110,10 +114,10 @@ def parse_po_items_with_filters(text):
110
  "Total Price": "45.60",
111
  },
112
  )
113
- # Remove the extracted portion from Item 2's description
114
  row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
115
 
116
- # Clean and finalize descriptions
117
  data = [row for row in data if row["Description"]]
118
 
119
  # Return data as a DataFrame
@@ -122,18 +126,18 @@ def parse_po_items_with_filters(text):
122
  df = pd.DataFrame(data)
123
  return df, "Data extracted successfully."
124
 
125
-
126
-
127
  # Function: Save to Excel
128
  def save_to_excel(df, output_path="extracted_po_data.xlsx"):
 
 
 
129
  df.to_excel(output_path, index=False)
130
  return output_path
131
 
132
-
133
  # Gradio Interface Function
134
  def process_pdf(file):
135
  """
136
- Processes the uploaded PDF file and extracts PO items.
137
  """
138
  try:
139
  text = extract_text_from_pdf(file)
@@ -145,18 +149,22 @@ def process_pdf(file):
145
  except Exception as e:
146
  return None, f"Error during processing: {str(e)}"
147
 
148
-
149
  # Gradio Interface Setup
150
  def create_gradio_interface():
 
 
 
151
  return gr.Interface(
152
  fn=process_pdf,
153
  inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
154
- outputs=[gr.File(label="Download Extracted Data"), gr.Textbox(label="Status")],
 
 
 
155
  title="PO Data Extraction",
156
  description="Upload a Purchase Order PDF to extract items into an Excel file.",
157
  )
158
 
159
-
160
  if __name__ == "__main__":
161
  interface = create_gradio_interface()
162
  interface.launch()
 
3
  import re
4
  import gradio as gr
5
 
 
# Function: Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """
    Extracts raw text from every page of a PDF file.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            uploaded-file object whose ``name`` attribute holds the path.

    Returns:
        str: The text of all pages joined with newlines, so the last line of
        one page never fuses with the first line of the next when the caller
        splits the result into lines.
    """
    import os  # local import: keeps this block self-contained

    # Path-like inputs are used as-is; anything else is expected to expose
    # the path through ``.name`` (as the original code required).
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    with pdfplumber.open(pdf_path) as pdf:
        # NOTE(review): extract_text() may yield None for pages without a
        # text layer on some pdfplumber versions; "or ''" keeps such pages
        # from raising TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    return "\n".join(page_texts)
16
 
 
# Function: Clean Description
def clean_description(description, item_number=None):
    """
    Cleans an item description by removing boilerplate and stray price data.

    Args:
        description (str): Raw description text collected for one PO item.
        item_number: Optional item identifier. Both ``7`` and ``"7"`` trigger
            the Item 7 special case, so the fix applies whether the caller
            tracks item numbers as integers or as strings.

    Returns:
        str: The description with unwanted fragments removed and surrounding
        whitespace stripped.
    """
    # Each marker and everything after it on the same line is dropped
    # ("." does not cross newlines, so later lines are preserved).
    boilerplate_patterns = (
        r"Page \d+ of \d+.*",   # page references
        r"TOTAL EX-WORK.*",     # EX-WORK-related text
        r"NOTES:.*",            # notes section
        r"HS CODE.*",           # HS CODE-related data
        r"DELIVERY:.*",         # delivery instructions
    )
    for pattern in boilerplate_patterns:
        description = re.sub(pattern, "", description)

    # Remove redundant "<qty> <unit> <unit price> <total>" runs that leaked
    # into the description text.
    description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description)

    # Specific fix for Item 7: remove the exact quantity/price text. A literal
    # replace is used so the dots in "4.20" / "1260.00" match only real dots.
    if item_number == 7 or str(item_number).strip() == "7":
        description = description.replace("300 Sets 4.20 1260.00", "")

    return description.strip()
37
 
38
+ # Function: Parse PO Items with Filters
39
  def parse_po_items_with_filters(text):
40
  """
41
+ Parses purchase order items from the extracted text systematically.
 
42
  """
43
  lines = text.splitlines()
44
  data = []
 
96
  # Split merged descriptions and assign items
97
  for i, row in enumerate(data):
98
  if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
99
+ # Dynamically identify and split Item 3's description
100
+ item_3_match = re.search(
101
+ r"(Stainless Steel RATING AND DIAGRAM PLATE.*?With Serial No:NT00I53 38 to 50 Mfd:-2022)",
102
+ row["Description"]
103
+ )
104
  if item_3_match:
105
  # Insert Item 3 into the data list
106
  data.insert(
 
114
  "Total Price": "45.60",
115
  },
116
  )
117
+ # Remove extracted Item 3 description from Item 2's description
118
  row["Description"] = row["Description"].replace(item_3_match.group(), "").strip()
119
 
120
+ # Remove invalid rows
121
  data = [row for row in data if row["Description"]]
122
 
123
  # Return data as a DataFrame
 
126
  df = pd.DataFrame(data)
127
  return df, "Data extracted successfully."
128
 
 
 
# Function: Save to Excel
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Writes the extracted data to an Excel file and reports where it went.

    Args:
        df: Object exposing ``to_excel`` (presumably the DataFrame built by
            the PO parser; confirm against the caller).
        output_path (str): Destination file; defaults to
            "extracted_po_data.xlsx".

    Returns:
        The ``output_path`` that was passed in.
    """
    destination = output_path
    # index=False is forwarded to to_excel so the index is not written.
    df.to_excel(destination, index=False)
    return destination
136
 
 
137
  # Gradio Interface Function
138
  def process_pdf(file):
139
  """
140
+ Processes the uploaded PDF file and returns extracted data and status.
141
  """
142
  try:
143
  text = extract_text_from_pdf(file)
 
149
  except Exception as e:
150
  return None, f"Error during processing: {str(e)}"
151
 
 
# Gradio Interface Setup
def create_gradio_interface():
    """
    Builds the Gradio interface for PO data extraction.

    The interface takes one PDF upload and shows two outputs: a file
    download and a status text box.

    Returns:
        The ``gr.Interface`` instance wired to ``process_pdf``.
    """
    pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_components = [
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ]
    return gr.Interface(
        fn=process_pdf,
        inputs=pdf_upload,
        outputs=result_components,
        title="PO Data Extraction",
        description="Upload a Purchase Order PDF to extract items into an Excel file.",
    )
167
 
 
if __name__ == "__main__":
    # Build the UI and start it only when this file is run as a script.
    create_gradio_interface().launch()