dschandra committed on
Commit
f09760f
·
verified ·
1 Parent(s): 21b7e40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -123
app.py CHANGED
@@ -1,118 +1,64 @@
1
- import pdfplumber
2
- import pandas as pd
3
  import re
4
- import gradio as gr
5
-
6
- # Function: Extract Text from PDF
7
- def extract_text_from_pdf(pdf_file):
8
- with pdfplumber.open(pdf_file.name) as pdf:
9
- text = ""
10
- for page in pdf.pages:
11
- text += page.extract_text()
12
- return text
13
-
14
- # Function: Clean Description
15
- def clean_description(description, item_number=None):
16
- """
17
- Cleans the description by removing unwanted data such as Qty, Unit, Unit Price, Total Price, and other invalid entries.
18
- Args:
19
- description (str): Raw description string.
20
- item_number (int, optional): The item number being processed to handle item-specific cleaning.
21
- Returns:
22
- str: Cleaned description.
23
- """
24
- # Remove common unwanted patterns
25
- description = re.sub(r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+", "", description) # Remove Qty + Unit + Price
26
- description = re.sub(r"Page \d+ of \d+.*", "", description) # Remove page references
27
- description = re.sub(r"\(Q\. No:.*?\)", "", description) # Remove Q.No-related data
28
- description = re.sub(r"TOTAL EX-WORK.*", "", description) # Remove EX-WORK-related text
29
- description = re.sub(r"NOTES:.*", "", description) # Remove notes section
30
- description = re.sub(r"HS CODE.*", "", description) # Remove HS CODE-related data
31
- description = re.sub(r"DELIVERY:.*", "", description) # Remove delivery instructions
32
-
33
- # Specific removal for item 7
34
- if item_number == 7:
35
- description = re.sub(r"\b300 Sets 4.20 1260.00\b", "", description)
36
 
37
- return description.strip()
38
 
39
- def parse_po_items_with_filters(text):
40
  """
41
- Parses purchase order items from the extracted text using regex with filters.
42
- Ensures items are formatted correctly into rows and columns.
43
  Args:
44
- text (str): Extracted text from the PDF.
45
  Returns:
46
- tuple: A DataFrame with parsed data and a status message.
47
  """
48
  lines = text.splitlines()
49
  data = []
50
- current_item = None
51
- description_accumulator = []
52
 
53
  for line in lines:
54
- # Match the start of a new item row (e.g., Item No. followed by description)
55
- item_match = re.match(r"^(?P<Item>\d+)\s+(?P<Description>.+)", line)
56
- if item_match:
57
- # Save the previous item
58
- if current_item:
59
- current_item["Description"] = format_description(
60
- " ".join(description_accumulator).strip()
61
- )
62
- data.append(current_item)
63
- description_accumulator = []
64
-
65
- # Start a new item
66
- current_item = {
67
- "Item": item_match.group("Item"),
68
- "Description": "",
69
- "Qty": "",
70
- "Unit": "",
71
- "Unit Price": "",
72
- "Total Price": "",
73
- }
74
- description_accumulator.append(item_match.group("Description"))
75
- elif current_item:
76
- # Accumulate additional lines for the current item's description
77
- description_accumulator.append(line.strip())
78
-
79
- # Match Quantity, Unit, Unit Price, and Total Price
80
- qty_match = re.search(r"(?P<Qty>\d+)\s+(Nos\.|Set)", line)
81
- if qty_match:
82
- current_item["Qty"] = qty_match.group("Qty")
83
- current_item["Unit"] = qty_match.group(2)
84
-
85
- price_match = re.search(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$", line)
86
- if price_match:
87
- current_item["Unit Price"] = price_match.group("UnitPrice")
88
- current_item["Total Price"] = price_match.group("TotalPrice")
89
-
90
- # Save the last item
91
- if current_item:
92
- current_item["Description"] = format_description(
93
- " ".join(description_accumulator).strip()
94
  )
95
- data.append(current_item)
96
-
97
- # Remove empty rows
98
- data = [row for row in data if row["Description"]]
99
-
100
- # Return data as a DataFrame
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  if not data:
102
- return None, "No items found. Please check the PDF file format."
103
  df = pd.DataFrame(data)
104
  return df, "Data extracted successfully."
105
 
106
 
107
  def format_description(description):
108
  """
109
- Formats the description into multiple lines based on patterns.
110
  Args:
111
  description (str): Raw description text.
112
  Returns:
113
- str: Formatted description.
114
  """
115
- # Break the description into multiple lines
116
  line1 = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
117
  line2 = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
118
  line3 = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
@@ -132,36 +78,18 @@ def format_description(description):
132
  return "\n".join(lines)
133
 
134
 
135
- # Function: Save to Excel
136
- def save_to_excel(df, output_path="extracted_po_data.xlsx"):
137
- df.to_excel(output_path, index=False)
138
- return output_path
139
-
140
- # Gradio Interface Function
141
- def process_pdf(file):
142
- try:
143
- text = extract_text_from_pdf(file)
144
- df, status = parse_po_items_with_filters(text)
145
- if df is not None:
146
- output_path = save_to_excel(df)
147
- return output_path, status
148
- return None, status
149
- except Exception as e:
150
- return None, f"Error during processing: {str(e)}"
151
 
152
- # Gradio Interface Setup
153
- def create_gradio_interface():
154
- return gr.Interface(
155
- fn=process_pdf,
156
- inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
157
- outputs=[
158
- gr.File(label="Download Extracted Data"),
159
- gr.Textbox(label="Status"),
160
- ],
161
- title="PO Data Extraction",
162
- description="Upload a Purchase Order PDF to extract items into an Excel file.",
163
- )
164
 
165
- if __name__ == "__main__":
166
- interface = create_gradio_interface()
167
- interface.launch()
 
 
 
 
 
1
  import re
2
+ import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
 
4
 
5
def extract_po_data(text):
    """
    Extract purchase-order line items from raw text into structured rows.

    Every line of ``text`` is examined on its own. A line is accepted as a
    table row when it has the shape::

        ITEM  DESCRIPTION  QTY  UNIT  UNIT PRICE  TOTAL PRICE

    where ITEM and QTY are integers, UNIT is ``Nos``/``Nos.``/``Set``/``Sets``
    and both prices are made of digits, dots and thousands-separator commas.
    Surrounding whitespace on a line is ignored. Lines that do not fit are
    reported on stdout and skipped; blank lines are skipped silently.

    Args:
        text (str): Raw text extracted from the PDF. ``None`` or an empty
            string is treated as "no input".
    Returns:
        tuple: ``(DataFrame, status message)`` when at least one row was
            found, otherwise ``(None, status message)``. The DataFrame has the
            columns ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE and TOTAL PRICE;
            all values are kept as the strings found in the text (DESCRIPTION
            is passed through ``format_description``).
    """
    # Built once, outside the loop. The description is matched lazily so the
    # last four whitespace-separated tokens are always taken as
    # qty / unit / unit price / total price.
    row_pattern = re.compile(
        r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+"
        r"(?P<Unit>Nos\.?|Sets?)\s+"
        r"(?P<UnitPrice>[\d.,]+)\s+(?P<TotalPrice>[\d.,]+)$"
    )

    data = []
    for raw_line in (text or "").splitlines():
        # PDF extraction frequently leaves stray leading/trailing blanks,
        # which would otherwise defeat the ^...$ anchors.
        line = raw_line.strip()
        if not line:
            # Blank separator lines are expected and not worth reporting.
            continue

        row_match = row_pattern.match(line)
        if row_match is None:
            # Report the untouched line so it can be located in the source text.
            print(f"Skipping line (does not match expected format): {raw_line}")
            continue

        data.append(
            {
                "ITEM": row_match.group("Item"),
                "DESCRIPTION": format_description(row_match.group("Description")),
                "QTY": row_match.group("Qty"),
                "UNIT": row_match.group("Unit"),
                "UNIT PRICE": row_match.group("UnitPrice"),
                "TOTAL PRICE": row_match.group("TotalPrice"),
            }
        )

    if not data:
        return None, "No valid data found in the provided text."
    df = pd.DataFrame(data)
    return df, "Data extracted successfully."
51
 
52
 
53
  def format_description(description):
54
  """
55
+ Formats the description field into multiple lines based on predefined structure.
56
  Args:
57
  description (str): Raw description text.
58
  Returns:
59
+ str: Formatted description with line breaks.
60
  """
61
+ # Define patterns for splitting the description
62
  line1 = re.search(r"Stainless Steel RATING AND DIAGRAM PLATE", description)
63
  line2 = re.search(r"As per Drg\.No\..*?[A-Z0-9]+\s", description)
64
  line3 = re.search(r"SIZE\s*:\s*\d+mm\s*X\s*\d+mm\s*X\s*[\d.]+mm\s*Thick", description)
 
78
  return "\n".join(lines)
79
 
80
 
81
# Demo entry point: runs the extractor on one hard-coded sample row.
if __name__ == "__main__":
    # Stand-in for text that would normally come from a PDF extraction step.
    raw_text = """
1 Stainless Steel RATING AND DIAGRAM PLATE As per Drg.No. G 000822 RI RDP 50KVA NT00l 51 SIZE : l50mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022 24 Nos. 3.00 72.00
"""

    df, status = extract_po_data(raw_text)

    # Show the parsed table when there is one, otherwise the reason there is none.
    print(status if df is None else df)