dschandra committed on
Commit
a715551
·
verified ·
1 Parent(s): a537fa5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -106
app.py CHANGED
@@ -1,144 +1,109 @@
1
- import re
2
- import pandas as pd
3
  import pdfplumber
 
 
4
  import gradio as gr
5
 
6
-
7
def extract_text_from_pdf(pdf_file):
    """
    Extracts text from an uploaded PDF file.

    Args:
        pdf_file: The uploaded PDF file; only its ``name`` attribute is read
            and passed to ``pdfplumber.open``.

    Returns:
        str: The extracted text, with a newline appended after each page.
    """
    with pdfplumber.open(pdf_file.name) as pdf:
        # extract_text() can come back as None for a page without any text
        # (depends on the pdfplumber version); fall back to "" so the
        # concatenation below cannot raise TypeError.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    text = "".join(page_texts)
    print("\nExtracted Text:\n", text)  # Debugging: Print the extracted text
    return text
21
 
22
-
23
def preprocess_lines(lines):
    """
    Merge wrapped table rows back into one string per row.

    A line that begins with digits followed by whitespace starts a new row;
    every other line is stripped and appended (space-separated) to the row
    currently being built.

    Args:
        lines (list): Text lines from the PDF.

    Returns:
        list: One stripped string per reconstructed row.
    """
    row_start = re.compile(r"^\d+\s")
    rows = []
    # Seeded with "" so that text seen before the first numbered line is
    # joined with a leading space, exactly like appending to an empty string.
    pieces = [""]

    for raw in lines:
        if not row_start.match(raw):
            pieces.append(raw.strip())
            continue
        pending = " ".join(pieces)
        if pending:
            rows.append(pending.strip())
        pieces = [raw]

    pending = " ".join(pieces)
    if pending:
        rows.append(pending.strip())
    return rows
46
-
47
-
48
def parse_po_items(rows):
    """
    Turn reconstructed purchase-order rows into a table.

    Each row must look like
    ``<item> <description> <qty> <unit> <unit price> <total price>``;
    rows that do not match are reported on stdout and skipped.

    Args:
        rows (list): Reconstructed single-line rows.

    Returns:
        tuple: ``(DataFrame, status message)``, or ``(None, status message)``
        when no row matched.
    """
    # ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE
    row_pattern = re.compile(
        r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>\S+)\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$"
    )
    # (output column, regex group) pairs, in output column order.
    columns = (
        ("ITEM", "Item"),
        ("DESCRIPTION", "Description"),
        ("QTY", "Qty"),
        ("UNIT", "Unit"),
        ("UNIT PRICE", "UnitPrice"),
        ("TOTAL PRICE", "TotalPrice"),
    )

    records = []
    for row in rows:
        try:
            found = row_pattern.match(row)
            if not found:
                print(f"Skipped row: {row}")  # Log skipped rows
                continue
            records.append({header: found.group(group) for header, group in columns})
        except Exception as e:
            print(f"Error parsing row: {row}, Error: {e}")

    if records:
        return pd.DataFrame(records), "Data extracted successfully."
    return None, "No valid data found in the provided text."
83
 
84
-
85
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Write the extracted table to an Excel workbook.

    Args:
        df (pd.DataFrame): Structured data to write.
        output_path (str): Destination file path.

    Returns:
        str: The same ``output_path`` that was written to.
    """
    # index=False keeps the DataFrame's row index out of the sheet.
    df.to_excel(excel_writer=output_path, index=False)
    return output_path
96
 
97
-
98
def process_pdf(file):
    """
    Run the whole pipeline for one uploaded PDF: extract text, rebuild rows,
    parse them, and save the result to Excel.

    Args:
        file: The uploaded PDF file.

    Returns:
        tuple: ``(path to the Excel file, status message)``; the path is
        ``None`` when nothing was parsed or when any step raised.
    """
    try:
        # Text -> lines -> reconstructed rows -> parsed table.
        rows = preprocess_lines(extract_text_from_pdf(file).splitlines())
        df, status = parse_po_items(rows)
        if df is None:
            return None, status
        return save_to_excel(df), status
    except Exception as e:
        return None, f"Error: {e!s}"
121
 
122
-
123
- # Gradio Interface
124
def create_interface():
    """
    Build the Gradio interface that feeds an uploaded PDF to ``process_pdf``.

    Returns:
        gr.Interface: Interface with one PDF file input and two outputs
        (the generated Excel file and a status text box).
    """
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_outputs = [
        gr.File(label="Download Extracted Excel"),
        gr.Textbox(label="Status"),
    ]
    return gr.Interface(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=result_outputs,
        title="PO Data Extraction",
        description="Upload a purchase order PDF file to extract data into an Excel file.",
    )
139
-
140
 
141
if __name__ == "__main__":
    # Only start the Gradio server when this file is run as a script.
    app = create_interface()
    app.launch()
 
 
 
1
  import pdfplumber
2
+ import pandas as pd
3
+ import re
4
  import gradio as gr
5
 
6
+ # Function: Extract Text from PDF
7
def extract_text_from_pdf(pdf_file):
    """
    Extract the text of every page of an uploaded PDF.

    Args:
        pdf_file: The uploaded PDF file; only its ``name`` attribute is read
            and passed to ``pdfplumber.open``.

    Returns:
        str: The extracted text, with a newline appended after each page.
    """
    with pdfplumber.open(pdf_file.name) as pdf:
        # extract_text() can come back as None for a page without any text
        # (depends on the pdfplumber version); fall back to "".
        # The trailing "\n" keeps the last line of one page from being fused
        # with the first line of the next, which matters because the text is
        # later parsed line by line.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    text = "".join(page_texts)
    print("\nExtracted Text:\n", text)  # Debugging: Print extracted text
    return text
14
 
15
+ # Function: Clean Description
16
def clean_description(description, item_number=None):
    """
    Strip non-description noise from an accumulated item description.

    The patterns are applied in order, each removing its matches from the
    result of the previous one; the final string is whitespace-stripped.

    Args:
        description (str): Raw description text for one item.
        item_number: Accepted for callers that pass it; not used here.

    Returns:
        str: The cleaned description.
    """
    noise_patterns = (
        r"\d+\s+(Nos\.|Set)\s+[\d.]+\s+[\d.]+",  # Qty + Unit + Price
        r"Page \d+ of \d+.*",  # page references
        r"\(Q\. No:.*?\)",  # Q.No-related data
        r"TOTAL EX-WORK.*",  # EX-WORK-related text
        r"NOTES:.*",  # notes section
        r"HS CODE.*",  # HS CODE-related data
        r"DELIVERY:.*",  # delivery instructions
    )
    cleaned = description
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()
25
+
26
+ # Function: Parse PO Items with Filters
27
def parse_po_items_with_filters(text):
    """
    Parse purchase-order line items out of raw PDF text.

    A line that starts with a number followed by more text opens a new item.
    Following lines are appended to that item's description until the next
    item starts. For every line belonging to an item, a quantity/unit pair
    ("<n> Nos." or "<n> Set") and a trailing "<unit price> <total price>"
    pair are recorded on the item if present; a later match overwrites an
    earlier one. The description is passed through ``clean_description``
    when the item is closed.

    Args:
        text (str): Text extracted from the PDF.

    Returns:
        tuple: ``(DataFrame, status message)`` with columns Item,
        Description, Qty, Unit, Unit Price, Total Price, or
        ``(None, status message)`` when no item line was found.
    """
    item_pattern = re.compile(r"^\s*(?P<Item>\d+)\s+(?P<Description>.+)")
    qty_pattern = re.compile(r"(?P<Qty>\d+)\s+(Nos\.|Set)")
    price_pattern = re.compile(r"(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$")

    data = []
    current_item = {}
    description_accumulator = []

    for line in text.splitlines():
        print(f"Processing Line: {line}")  # Debugging
        item_match = item_pattern.match(line)
        if item_match:
            # A new item starts: close the one in progress first.
            if current_item:
                current_item["Description"] = clean_description(
                    " ".join(description_accumulator).strip(),
                    item_number=int(current_item["Item"]),
                )
                data.append(current_item)
                description_accumulator = []

            current_item = {
                "Item": item_match.group("Item"),
                "Description": "",
                "Qty": "",
                "Unit": "",
                "Unit Price": "",
                "Total Price": "",
            }
            description_accumulator.append(item_match.group("Description"))
        elif current_item:
            description_accumulator.append(line.strip())
        else:
            # Text before the first item has nothing to attach to. Writing
            # Qty/Price into the empty dict would make it look like an item
            # without an "Item" key and raise KeyError when it is closed.
            continue

        qty_match = qty_pattern.search(line)
        if qty_match:
            current_item["Qty"] = qty_match.group("Qty")
            current_item["Unit"] = qty_match.group(2)

        price_match = price_pattern.search(line)
        if price_match:
            current_item["Unit Price"] = price_match.group("UnitPrice")
            current_item["Total Price"] = price_match.group("TotalPrice")

    # Close the last item, which has no following item line to trigger it.
    if current_item:
        current_item["Description"] = clean_description(
            " ".join(description_accumulator).strip(),
            item_number=int(current_item["Item"]),
        )
        data.append(current_item)

    if not data:
        print("No items found. Check PDF format.")  # Debugging
        return None, "No items found. Please check the PDF file format."
    return pd.DataFrame(data), "Data extracted successfully."
76
 
77
+ # Function: Save to Excel
78
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Write the extracted table to an Excel workbook.

    Args:
        df (pd.DataFrame): Structured data to write.
        output_path (str): Destination file path.

    Returns:
        str: The same ``output_path`` that was written to.
    """
    # index=False keeps the DataFrame's row index out of the sheet.
    df.to_excel(excel_writer=output_path, index=False)
    return output_path
81
 
82
+ # Gradio Interface Function
83
def process_pdf(file):
    """
    Run the whole pipeline for one uploaded PDF: extract text, parse the
    items, and save them to Excel.

    Args:
        file: The uploaded PDF file.

    Returns:
        tuple: ``(path to the Excel file, status message)``; the path is
        ``None`` when no items were found or when any step raised.
    """
    try:
        df, status = parse_po_items_with_filters(extract_text_from_pdf(file))
        if df is None:
            return None, status
        return save_to_excel(df), status
    except Exception as e:
        return None, f"Error during processing: {e!s}"
93
 
94
+ # Gradio Interface Setup
95
def create_gradio_interface():
    """
    Build the Gradio interface that feeds an uploaded PDF to ``process_pdf``.

    Returns:
        gr.Interface: Interface with one PDF file input and two outputs
        (the generated file and a status text box).
    """
    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
    result_outputs = [
        gr.File(label="Download Extracted Data"),
        gr.Textbox(label="Status"),
    ]
    return gr.Interface(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=result_outputs,
        title="PO Data Extraction",
        description="Upload a Purchase Order PDF to extract items into an Excel file.",
    )
 
 
106
 
107
if __name__ == "__main__":
    # Only start the Gradio server when this file is run as a script.
    interface = create_gradio_interface()
    interface.launch()