Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -116,13 +116,20 @@ def extract_numeric_values(pdf_file, schedule1_file=None, client_name="Unknown C
|
|
116 |
raw = pytesseract.image_to_string(val_img, config=config_val).strip()
|
117 |
value_text = re.sub(r"[^\d,.\-+]", "", raw)
|
118 |
schedule1_values.append(value_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
output += [f"Schedule 1 Line {i*2+1 if i < 2 else 8}: {val}" for i, val in enumerate(schedule1_values)]
|
120 |
-
|
121 |
|
122 |
output_dir = "./Clients_Output_Data_Form_1040.csv"
|
123 |
save_to_csv_flat(all_extracted_values, schedule1_values, client_name=client_name, csv_path=output_dir)
|
124 |
|
125 |
-
return "\n".join(output), output_dir, output_pdf_path
|
126 |
|
127 |
except Exception as e:
|
128 |
return f"Error occurred:\n{str(e)}", None, None
|
@@ -214,13 +221,15 @@ iface = gr.Interface(
|
|
214 |
gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
|
215 |
gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
|
216 |
gr.Textbox(label="Client Name", placeholder="Enter client name"),
|
217 |
-
gr.Checkbox(label="Draw Bounding Boxes on Form 1040?")
|
218 |
],
|
219 |
outputs=[
|
220 |
-
|
221 |
-
|
222 |
-
|
|
|
223 |
],
|
|
|
224 |
title="Form 1040 & Schedule 1 Extractor",
|
225 |
description="Upload a 2-page 1040 Form and (optionally) Schedule 1 to extract numeric fields. Optionally draw bounding boxes."
|
226 |
)
|
|
|
116 |
raw = pytesseract.image_to_string(val_img, config=config_val).strip()
|
117 |
value_text = re.sub(r"[^\d,.\-+]", "", raw)
|
118 |
schedule1_values.append(value_text)
|
119 |
+
if draw_bboxes:
|
120 |
+
page.draw_rect(rect, color=(0, 1, 0), width=1.5) # green boxes
|
121 |
+
schedule1_pdf_path = None
|
122 |
+
if draw_bboxes and schedule1_file:
|
123 |
+
schedule1_pdf_path = "schedule1_bbox.pdf"
|
124 |
+
doc.save(schedule1_pdf_path)
|
125 |
+
|
126 |
output += [f"Schedule 1 Line {i*2+1 if i < 2 else 8}: {val}" for i, val in enumerate(schedule1_values)]
|
127 |
+
doc.close()
|
128 |
|
129 |
output_dir = "./Clients_Output_Data_Form_1040.csv"
|
130 |
save_to_csv_flat(all_extracted_values, schedule1_values, client_name=client_name, csv_path=output_dir)
|
131 |
|
132 |
+
return "\n".join(output), output_dir, output_pdf_path, schedule1_pdf_path
|
133 |
|
134 |
except Exception as e:
|
135 |
return f"Error occurred:\n{str(e)}", None, None
|
|
|
221 |
gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
|
222 |
gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
|
223 |
gr.Textbox(label="Client Name", placeholder="Enter client name"),
|
224 |
+
gr.Checkbox(label="Draw Bounding Boxes on Form 1040 and Schedule 1?")
|
225 |
],
|
226 |
outputs=[
|
227 |
+
gr.Textbox(label="Extracted Numeric Values", lines=20),
|
228 |
+
gr.File(label="Download Excel Output"),
|
229 |
+
gr.File(label="1040 Bounding Boxes PDF"),
|
230 |
+
gr.File(label="Schedule 1 Bounding Boxes PDF")
|
231 |
],
|
232 |
+
|
233 |
title="Form 1040 & Schedule 1 Extractor",
|
234 |
description="Upload a 2-page 1040 Form and (optionally) Schedule 1 to extract numeric fields. Optionally draw bounding boxes."
|
235 |
)
|