dschandra committed on
Commit
10eea43
·
verified ·
1 Parent(s): ad5c356

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -14
app.py CHANGED
@@ -1,8 +1,24 @@
1
  import re
2
  import pandas as pd
 
3
  import gradio as gr
4
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  def extract_po_data(text):
7
  """
8
  Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
@@ -92,35 +108,41 @@ def save_to_excel(df, output_path="extracted_po_data.xlsx"):
92
  return output_path
93
 
94
 
95
- def process_text_input(text):
96
  """
97
- Processes the raw text input, extracts data, and saves it to an Excel file.
98
  Args:
99
- text (str): Raw text input.
100
  Returns:
101
- str: Path to the saved Excel file.
102
  """
103
- df, status = extract_po_data(text)
104
- if df is not None:
105
- output_path = save_to_excel(df)
106
- return output_path, status
107
- return None, status
 
 
 
 
 
 
108
 
109
 
110
  # Gradio Interface
111
  def create_interface():
112
  """
113
- Creates a Gradio interface for processing PO data.
114
  """
115
  interface = gr.Interface(
116
- fn=process_text_input,
117
- inputs=gr.Textbox(label="Paste Raw Text from PDF", lines=10, placeholder="Paste extracted text here..."),
118
  outputs=[
119
  gr.File(label="Download Extracted Excel"),
120
  gr.Textbox(label="Status"),
121
  ],
122
  title="PO Data Extraction",
123
- description="Paste the raw text from the PDF to extract purchase order data into an Excel file.",
124
  )
125
  return interface
126
 
@@ -128,4 +150,4 @@ def create_interface():
128
  if __name__ == "__main__":
129
  # Run the Gradio app
130
  app = create_interface()
131
- app.launch()
 
1
  import re
2
  import pandas as pd
3
+ import pdfplumber
4
  import gradio as gr
5
 
6
 
7
def extract_text_from_pdf(pdf_file):
    """
    Extracts text from an uploaded PDF file.

    Args:
        pdf_file: The uploaded PDF file. Either an object exposing the file's
            path on its ``name`` attribute, or the path itself as a string.
    Returns:
        str: The extracted text from the PDF, with a newline appended after
            each page. Pages that yield no text contribute only the newline.
    """
    # NOTE(review): depending on the Gradio version/config the upload may
    # arrive as a plain path string instead of a file wrapper - confirm
    # against the deployed version. Both shapes are accepted here.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can come back as None for a page without a text
        # layer (e.g. a scanned image); fall back to "" so the concatenation
        # does not raise TypeError and the remaining pages are still read.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in pdf.pages
        )
20
+
21
+
22
  def extract_po_data(text):
23
  """
24
  Extracts purchase order data from the text into structured rows with ITEM, DESCRIPTION, QTY, UNIT, UNIT PRICE, TOTAL PRICE.
 
108
  return output_path
109
 
110
 
111
def process_pdf(file):
    """
    Processes the uploaded PDF file, extracts data, and saves it to an Excel file.

    Args:
        file: The uploaded PDF file, or None when nothing was uploaded.
    Returns:
        tuple: Path to the saved Excel file (or None when no file was
            produced) and a status message.
    """
    # Guard the "submitted without a file" case explicitly so the user sees a
    # clear message instead of a leaked AttributeError description.
    if file is None:
        return None, "Error: No PDF file was uploaded."

    try:
        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(file)
        # Extract structured data
        df, status = extract_po_data(text)
        if df is None:
            return None, status
        return save_to_excel(df), status
    except Exception as e:
        # UI boundary: report any failure in the status box rather than
        # letting the exception propagate to the interface.
        return None, f"Error: {e}"
130
 
131
 
# Gradio Interface
def create_interface():
    """
    Builds the Gradio interface that turns an uploaded purchase order PDF
    into a downloadable Excel file plus a status message.

    Returns:
        The configured ``gr.Interface`` wired to ``process_pdf``.
    """
    # Input: a single file upload restricted to PDFs.
    pdf_upload = gr.File(label="Upload PDF", file_types=[".pdf"])

    # Outputs, in the order process_pdf returns them: Excel path, then status.
    result_components = [
        gr.File(label="Download Extracted Excel"),
        gr.Textbox(label="Status"),
    ]

    return gr.Interface(
        fn=process_pdf,
        inputs=pdf_upload,
        outputs=result_components,
        title="PO Data Extraction",
        description="Upload a purchase order PDF file to extract data into an Excel file.",
    )
148
 
 
if __name__ == "__main__":
    # Script entry point: build the interface, then start the Gradio server.
    app = create_interface()
    app.launch()