Ayesha352 committed on
Commit
46023f9
·
verified ·
1 Parent(s): 0013251

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -6
app.py CHANGED
@@ -116,13 +116,20 @@ def extract_numeric_values(pdf_file, schedule1_file=None, client_name="Unknown C
116
  raw = pytesseract.image_to_string(val_img, config=config_val).strip()
117
  value_text = re.sub(r"[^\d,.\-+]", "", raw)
118
  schedule1_values.append(value_text)
 
 
 
 
 
 
 
119
  output += [f"Schedule 1 Line {i*2+1 if i < 2 else 8}: {val}" for i, val in enumerate(schedule1_values)]
120
- doc.close()
121
 
122
  output_dir = "./Clients_Output_Data_Form_1040.csv"
123
  save_to_csv_flat(all_extracted_values, schedule1_values, client_name=client_name, csv_path=output_dir)
124
 
125
- return "\n".join(output), output_dir, output_pdf_path
126
 
127
  except Exception as e:
128
  return f"Error occurred:\n{str(e)}", None, None
@@ -214,13 +221,15 @@ iface = gr.Interface(
214
  gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
215
  gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
216
  gr.Textbox(label="Client Name", placeholder="Enter client name"),
217
- gr.Checkbox(label="Draw Bounding Boxes on Form 1040?")
218
  ],
219
  outputs=[
220
- gr.Textbox(label="Extracted Numeric Values", lines=20),
221
- gr.File(label="Download Excel Output"),
222
- gr.File(label="Download PDF with Bounding Boxes")
 
223
  ],
 
224
  title="Form 1040 & Schedule 1 Extractor",
225
  description="Upload a 2-page 1040 Form and (optionally) Schedule 1 to extract numeric fields. Optionally draw bounding boxes."
226
  )
 
116
  raw = pytesseract.image_to_string(val_img, config=config_val).strip()
117
  value_text = re.sub(r"[^\d,.\-+]", "", raw)
118
  schedule1_values.append(value_text)
119
+ if draw_bboxes:
120
+ page.draw_rect(rect, color=(0, 1, 0), width=1.5) # green boxes
121
+ schedule1_pdf_path = None
122
+ if draw_bboxes and schedule1_file:
123
+ schedule1_pdf_path = "schedule1_bbox.pdf"
124
+ doc.save(schedule1_pdf_path)
125
+
126
  output += [f"Schedule 1 Line {i*2+1 if i < 2 else 8}: {val}" for i, val in enumerate(schedule1_values)]
127
+ doc.close()
128
 
129
  output_dir = "./Clients_Output_Data_Form_1040.csv"
130
  save_to_csv_flat(all_extracted_values, schedule1_values, client_name=client_name, csv_path=output_dir)
131
 
132
+ return "\n".join(output), output_dir, output_pdf_path, schedule1_pdf_path
133
 
134
  except Exception as e:
135
  return f"Error occurred:\n{str(e)}", None, None
 
221
  gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
222
  gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
223
  gr.Textbox(label="Client Name", placeholder="Enter client name"),
224
+ gr.Checkbox(label="Draw Bounding Boxes on Form 1040 and Schedule 1?")
225
  ],
226
  outputs=[
227
+ gr.Textbox(label="Extracted Numeric Values", lines=20),
228
+ gr.File(label="Download Excel Output"),
229
+ gr.File(label="1040 Bounding Boxes PDF"),
230
+ gr.File(label="Schedule 1 Bounding Boxes PDF")
231
  ],
232
+
233
  title="Form 1040 & Schedule 1 Extractor",
234
  description="Upload a 2-page 1040 Form and (optionally) Schedule 1 to extract numeric fields. Optionally draw bounding boxes."
235
  )