dschandra committed on
Commit
4fdb3ac
·
verified ·
1 Parent(s): f09760f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -13
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import re
2
  import pandas as pd
 
3
 
4
 
5
  def extract_po_data(text):
@@ -14,9 +15,9 @@ def extract_po_data(text):
14
  data = []
15
 
16
  for line in lines:
17
- # Match table row patterns
18
  row_match = re.match(
19
- r"^(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>(Nos\.|Set))\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)$",
20
  line,
21
  )
22
  if row_match:
@@ -78,18 +79,53 @@ def format_description(description):
78
  return "\n".join(lines)
79
 
80
 
81
- # Example Usage
82
- if __name__ == "__main__":
83
- # Example raw text (replace this with actual extracted text from PDF)
84
- raw_text = """
85
- 1 Stainless Steel RATING AND DIAGRAM PLATE As per Drg.No. G 000822 RI RDP 50KVA NT00l 51 SIZE : l50mm X 160mm X 1.00mm Thick With Serial No:NT00151 97 to 121 Mfd:-2022 24 Nos. 3.00 72.00
 
 
 
86
  """
 
 
87
 
88
- # Extract data
89
- df, status = extract_po_data(raw_text)
90
 
91
- # Output results
 
 
 
 
 
 
 
 
92
  if df is not None:
93
- print(df)
94
- else:
95
- print(status)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
  import pandas as pd
3
+ import gradio as gr
4
 
5
 
6
  def extract_po_data(text):
 
15
  data = []
16
 
17
  for line in lines:
18
+ # Match table row patterns with a flexible regex
19
  row_match = re.match(
20
+ r"^\s*(?P<Item>\d+)\s+(?P<Description>.+?)\s+(?P<Qty>\d+)\s+(?P<Unit>(Nos\.|Set))\s+(?P<UnitPrice>[\d.]+)\s+(?P<TotalPrice>[\d.]+)\s*$",
21
  line,
22
  )
23
  if row_match:
 
79
  return "\n".join(lines)
80
 
81
 
82
def save_to_excel(df, output_path="extracted_po_data.xlsx"):
    """
    Save the extracted data to an Excel file.

    Args:
        df (pd.DataFrame): DataFrame containing the structured data.
        output_path (str): Path to save the Excel file. Any missing parent
            directories are created before writing.

    Returns:
        str: Path to the saved file (``output_path``, returned unchanged).
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import section.
    from pathlib import Path

    # DataFrame.to_excel does not create directories; make sure the target
    # folder exists so a nested output_path does not fail with OSError.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    # index=False keeps the DataFrame's row index out of the spreadsheet.
    df.to_excel(output_path, index=False)
    return output_path
93
 
 
 
94
 
95
def process_text_input(text):
    """
    Extract purchase-order data from raw text and save it to an Excel file.

    Args:
        text (str): Raw text input.

    Returns:
        tuple: ``(output_path, status)``.
            ``output_path`` is the path of the saved Excel file, or ``None``
            when ``extract_po_data`` returned no DataFrame.
            ``status`` is the status value produced by ``extract_po_data``,
            passed through unchanged.
    """
    df, status = extract_po_data(text)

    # Nothing was extracted: report the status without writing a file.
    if df is None:
        return None, status

    return save_to_excel(df), status
108
+
109
+
110
+ # Gradio Interface
111
def create_interface():
    """
    Build the Gradio interface for processing PO data.

    The interface wires ``process_text_input`` to one text input and two
    outputs (a downloadable file and a status text box).

    Returns:
        gr.Interface: The configured, not-yet-launched interface.
    """
    # Input: the raw text the user pastes in.
    raw_text_box = gr.Textbox(
        label="Paste Raw Text from PDF",
        lines=10,
        placeholder="Paste extracted text here...",
    )

    # Outputs, in the same order as the values returned by process_text_input.
    excel_download = gr.File(label="Download Extracted Excel")
    status_box = gr.Textbox(label="Status")

    return gr.Interface(
        fn=process_text_input,
        inputs=raw_text_box,
        outputs=[excel_download, status_box],
        title="PO Data Extraction",
        description="Paste the raw text from the PDF to extract purchase order data into an Excel file.",
    )
126
+
127
+
128
if __name__ == "__main__":
    # Start the Gradio app only when run as a script, not on import.
    create_interface().launch()