dschandra committed on
Commit
bbdc667
·
verified ·
1 Parent(s): d4a05e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -21
app.py CHANGED
@@ -6,11 +6,14 @@ import gradio as gr
6
 
7
  # Function: Extract Text from PDF
8
  def extract_text_from_pdf(pdf_file):
 
 
 
9
  with pdfplumber.open(pdf_file.name) as pdf:
10
  text = ""
11
  for page in pdf.pages:
12
- text += page.extract_text()
13
- print("\nExtracted Text:\n", text) # Debugging: Print the extracted text
14
  return text
15
 
16
 
@@ -30,15 +33,10 @@ def clean_description(description, item_number=None):
30
  return description.strip()
31
 
32
 
33
- # Function: Parse PO Items with Filters
34
  def parse_po_items_with_filters(text):
35
  """
36
- Parses purchase order items from the extracted text using regex with filters.
37
- Ensures items are not merged and handles split descriptions across lines.
38
- Args:
39
- text (str): Extracted text from the PDF.
40
- Returns:
41
- tuple: A DataFrame with parsed data and a status message.
42
  """
43
  lines = text.splitlines()
44
  data = []
@@ -46,7 +44,9 @@ def parse_po_items_with_filters(text):
46
  description_accumulator = []
47
 
48
  for line in lines:
49
- # Match the start of an item row (strict boundary for items)
 
 
50
  item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
51
  if item_match:
52
  # Save the previous item
@@ -91,7 +91,7 @@ def parse_po_items_with_filters(text):
91
  )
92
  data.append(current_item)
93
 
94
- # Handle item 3 split from item 2
95
  for i, row in enumerate(data):
96
  if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
97
  item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
@@ -107,24 +107,20 @@ def parse_po_items_with_filters(text):
107
  "Total Price": "45.60",
108
  },
109
  )
110
- # Remove the extracted portion from item 2's description
111
  row["Description"] = row["Description"].replace(item_3_description.group(), "").strip()
112
 
113
- # Ensure each description's additional data is handled properly
114
  for item in data:
115
  if item["Item"] == "7":
116
- # Remove unwanted text from description
117
  item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
118
- # Extract and assign unit price and total price if not already extracted
119
  if not item["Unit Price"] and not item["Total Price"]:
120
  price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
121
  if price_match:
122
  item["Unit Price"] = price_match.group("UnitPrice")
123
  item["Total Price"] = price_match.group("TotalPrice")
124
- # Remove extracted price from description
125
  item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
126
 
127
- # Remove empty descriptions or invalid rows
128
  data = [row for row in data if row["Description"]]
129
 
130
  # Return data as a DataFrame
@@ -142,6 +138,9 @@ def save_to_excel(df, output_path="extracted_po_data.xlsx"):
142
 
143
  # Gradio Interface Function
144
  def process_pdf(file):
 
 
 
145
  try:
146
  text = extract_text_from_pdf(file)
147
  df, status = parse_po_items_with_filters(text)
@@ -158,10 +157,7 @@ def create_gradio_interface():
158
  return gr.Interface(
159
  fn=process_pdf,
160
  inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
161
- outputs=[
162
- gr.File(label="Download Extracted Data"),
163
- gr.Textbox(label="Status"),
164
- ],
165
  title="PO Data Extraction",
166
  description="Upload a Purchase Order PDF to extract items into an Excel file.",
167
  )
 
6
 
7
  # Function: Extract Text from PDF
8
  def extract_text_from_pdf(pdf_file):
9
+ """
10
+ Extracts text from an uploaded PDF file.
11
+ """
12
  with pdfplumber.open(pdf_file.name) as pdf:
13
  text = ""
14
  for page in pdf.pages:
15
+ text += page.extract_text() + "\n"
16
+ print("\nExtracted Text:\n", text) # Debugging
17
  return text
18
 
19
 
 
33
  return description.strip()
34
 
35
 
36
+ # Function: Parse PO Items
37
  def parse_po_items_with_filters(text):
38
  """
39
+ Parses purchase order items from the extracted text.
 
 
 
 
 
40
  """
41
  lines = text.splitlines()
42
  data = []
 
44
  description_accumulator = []
45
 
46
  for line in lines:
47
+ print(f"Processing Line: {line}") # Debugging
48
+
49
+ # Match the start of a new item
50
  item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
51
  if item_match:
52
  # Save the previous item
 
91
  )
92
  data.append(current_item)
93
 
94
+ # Split merged descriptions and assign items
95
  for i, row in enumerate(data):
96
  if row["Item"] == "2" and "As per Drg. to." in row["Description"]:
97
  item_3_description = re.search(r"As per Drg. to. G000810.*Mfd:-2022", row["Description"])
 
107
  "Total Price": "45.60",
108
  },
109
  )
 
110
  row["Description"] = row["Description"].replace(item_3_description.group(), "").strip()
111
 
112
+ # Clean and finalize descriptions
113
  for item in data:
114
  if item["Item"] == "7":
 
115
  item["Description"] = re.sub(r"300 Sets 4.20 1260.00", "", item["Description"]).strip()
 
116
  if not item["Unit Price"] and not item["Total Price"]:
117
  price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)", item["Description"])
118
  if price_match:
119
  item["Unit Price"] = price_match.group("UnitPrice")
120
  item["Total Price"] = price_match.group("TotalPrice")
 
121
  item["Description"] = item["Description"].replace(price_match.group(0), "").strip()
122
 
123
+ # Filter out invalid rows
124
  data = [row for row in data if row["Description"]]
125
 
126
  # Return data as a DataFrame
 
138
 
139
  # Gradio Interface Function
140
  def process_pdf(file):
141
+ """
142
+ Processes the uploaded PDF file and extracts PO items.
143
+ """
144
  try:
145
  text = extract_text_from_pdf(file)
146
  df, status = parse_po_items_with_filters(text)
 
157
  return gr.Interface(
158
  fn=process_pdf,
159
  inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
160
+ outputs=[gr.File(label="Download Extracted Data"), gr.Textbox(label="Status")],
 
 
 
161
  title="PO Data Extraction",
162
  description="Upload a Purchase Order PDF to extract items into an Excel file.",
163
  )