Ayesha352 committed on
Commit
df70c76
·
verified ·
1 Parent(s): 18cb4d6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +240 -0
app.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import fitz
3
+ from PIL import Image
4
+ import pytesseract
5
+ import gradio as gr
6
+ import pandas as pd
7
+ import os
8
+
9
# Tesseract options: page-segmentation mode 6 plus a character whitelist so
# only digits and the punctuation that appears in amounts can be recognised.
config_val = "--psm 6 -c tessedit_char_whitelist=0123456789,.-+"

# Each entry below is a quadrilateral (four (x, y) corner points) marking one
# value box. Only the min/max of the corners is used later (see
# get_bounding_rect), so the corners do not need to be perfectly square.
# The list order matters: it is the index order consumed by line_mapping in
# save_to_csv_flat.

# Form 1040, page 1 (ten boxes).
page1_rects = [
    [(464, 399), (576, 399), (575, 409), (462, 410)],
    [(462, 519), (577, 518), (577, 531), (463, 529)],
    [(225, 517), (340, 518), (339, 530), (224, 530)],
    [(225, 530), (339, 532), (340, 541), (225, 542)],
    [(464, 531), (576, 531), (576, 542), (464, 542)],
    [(464, 589), (578, 589), (577, 602), (464, 602)],
    [(463, 624), (578, 626), (576, 639), (464, 637)],
    [(462, 652), (576, 651), (577, 661), (464, 663)],
    [(463, 661), (578, 664), (578, 676), (462, 674)],
    [(464, 699), (578, 684), (578, 699), (464, 699)],
]

# Form 1040, page 2 (four boxes).
page2_rects = [
    [(462, 15), (575, 15), (576, 26), (463, 26)],
    [(462, 62), (577, 63), (579, 75), (462, 73)],
    [(463, 98), (576, 98), (578, 110), (462, 110)],
    [(461, 111), (576, 111), (578, 123), (459, 122)],
]

# Schedule 1, first page.
schedule1_rects = [
    [(470, 204), (579, 203), (577, 216), (471, 216)],  # Schedule 1 Line 3
    [(470, 228), (577, 229), (576, 240), (470, 240)],  # Schedule 1 Line 5
    [(362, 274), (466, 274), (468, 288), (360, 288)],  # Schedule 1 Line 8
]

# The Form 1040 boxes are shifted down by 23 units before use; the Schedule 1
# boxes are used as-is.
# NOTE(review): presumably this compensates for a vertical offset between the
# page the boxes were measured on and the PDFs actually processed - confirm.
adjusted_page1_rects = [
    [(px, py + 23) for px, py in quad] for quad in page1_rects
]
adjusted_page2_rects = [
    [(px, py + 23) for px, py in quad] for quad in page2_rects
]
39
+
40
def get_bounding_rect(points):
    """Return the axis-aligned ``fitz.Rect`` that encloses every point.

    ``points`` is a sequence of ``(x, y)`` pairs; the result spans from the
    smallest x/y to the largest x/y found among them.
    """
    left = min(corner[0] for corner in points)
    top = min(corner[1] for corner in points)
    right = max(corner[0] for corner in points)
    bottom = max(corner[1] for corner in points)
    return fitz.Rect(left, top, right, bottom)
44
+
45
def _open_pdf(source):
    """Open ``source`` (a filesystem path or a seekable binary file object) with PyMuPDF."""
    if isinstance(source, (str, os.PathLike)):
        return fitz.open(os.fspath(source))
    # File-like upload: rewind first in case it has already been read.
    source.seek(0)
    return fitz.open(stream=source.read(), filetype="pdf")


def _ocr_rect_values(page, rects, zoom):
    """OCR every box in ``rects`` on ``page`` and return the cleaned numeric strings."""
    values = []
    for rect_points in rects:
        clip = get_bounding_rect(rect_points)
        pix = page.get_pixmap(matrix=zoom, clip=clip)
        cropped_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        width, height = cropped_img.size
        # Only the right-hand 60% of each box is OCR'd; the left 40% is cut off.
        val_img = cropped_img.crop((int(0.4 * width), 0, width, height))
        raw = pytesseract.image_to_string(val_img, config=config_val).strip()
        # Drop anything that is not a digit, comma, dot or sign.
        values.append(re.sub(r"[^\d,.\-+]", "", raw))
    return values


def extract_numeric_values(pdf_file, schedule1_file=None, client_name="Unknown Client"):
    """OCR the numeric boxes of a Form 1040 (and optionally Schedule 1) PDF.

    Args:
        pdf_file: Path to, or seekable binary file object of, the main
            Form 1040 PDF. Must have at least two pages.
        schedule1_file: Optional path/file object for the Schedule 1 PDF;
            only its first page is read.
        client_name: Name written to the CSV row. ``None`` or a blank string
            falls back to ``"Unknown Client"``.

    Returns:
        A ``(text, csv_path)`` tuple. ``text`` lists the extracted values one
        per line and ``csv_path`` is the CSV the row was appended to. On any
        failure ``text`` is an error message and ``csv_path`` is ``None``;
        the function never raises, so it can be used directly as a UI callback.
    """
    try:
        if pdf_file is None:
            return "Error: Main Form 1040 PDF is required.", None

        # An empty UI textbox arrives as "", which would bypass the default.
        client_name = "" if client_name is None else str(client_name).strip()
        client_name = client_name or "Unknown Client"

        zoom = fitz.Matrix(2, 2)  # render at 2x scale before OCR

        doc = _open_pdf(pdf_file)
        try:
            if len(doc) < 2:
                return "Error: Main PDF must have at least 2 pages.", None
            page1_values = _ocr_rect_values(doc[0], adjusted_page1_rects, zoom)
            page2_values = _ocr_rect_values(doc[1], adjusted_page2_rects, zoom)
        finally:
            # Close on every path, including the early return above.
            doc.close()

        all_extracted_values = page1_values + page2_values
        output = [
            f"1040 Value {i+1}: {val}"
            for i, val in enumerate(all_extracted_values)
        ]

        schedule1_values = []
        if schedule1_file:
            schedule1_doc = _open_pdf(schedule1_file)
            try:
                if len(schedule1_doc) >= 1:
                    schedule1_values = _ocr_rect_values(
                        schedule1_doc[0], schedule1_rects, zoom
                    )
            finally:
                schedule1_doc.close()
            # schedule1_rects cover Schedule 1 lines 3, 5 and 8, in that order.
            output += [
                f"Schedule 1 Line {line_no}: {val}"
                for line_no, val in zip((3, 5, 8), schedule1_values)
            ]

        save_to_csv_flat(all_extracted_values, schedule1_values, client_name=client_name)
        return "\n".join(output), "Client_Output_Data_Form_1040.csv"

    except Exception as e:
        # Top-level UI boundary: report the failure as text instead of raising.
        return f"Error occurred:\n{str(e)}", None
118
+
119
+
120
def save_to_csv_flat(all_extracted_values, schedule1_values, client_name="Unknown Client", csv_path="Client_Output_Data_Form_1040.csv"):
    """Append one client's extracted values as a row of the output CSV.

    Args:
        all_extracted_values: Form 1040 values in extraction order (page 1
            boxes followed by page 2 boxes). Positions beyond the end of the
            list are left empty in the row.
        schedule1_values: Schedule 1 values in the order line 3, line 5,
            line 8. May be empty/``None`` or shorter than three entries;
            missing entries are left empty.
        client_name: Value for the "Client Name" column.
        csv_path: CSV file to create or append to.

    Empty-string values are stored as ``'0'``. Existing rows are read back as
    plain text so that previously stored values are written out unchanged.
    """
    # Two header levels that are flattened into single column names below.
    header_level_1 = [
        "Client Name", "Gross Comp", "Taxable Wages", "Taxable Interest Income: Sch. B", "Tax- Exempt Interest",
        "Qualified Dividends", "Ordinary Dividends", "Long Term Capital Gain or Loss",
        "Other Adjustments (from Schedule 1)", "Business Income or Loss (Schedule C)",
        "Rent/ Royalty (Schedule E)", "Other Income", "Standard Deduction", "Qualified Business Income Deduction",
        "Taxable Income", "Tax", "", "", "Total Tax"
    ]
    header_level_2 = [
        "", "W2 Box 5", "Line 1", "Line 2b", "Line 2a", "Line 3a", "Line 3b", "Line 7",
        "Line 10", "Schedule 1, Line 3", "Schedule 1, Line 5", "Schedule 1, Line 8",
        "Line 12", "Line 13", "Line 15", "Line 16", "Line 20, Schedule 3", "Line 23, Schedule 2", "Line 24"
    ]

    # "<level 1> - <level 2>" when both parts exist, otherwise whichever is set.
    flat_columns = []
    for h1, h2 in zip(header_level_1, header_level_2):
        h1, h2 = h1.strip(), h2.strip()
        flat_columns.append(f"{h1} - {h2}" if h1 and h2 else h1 + h2)

    # Load existing rows if there are any; otherwise start with just headers.
    # dtype=str / keep_default_na=False stop pandas from re-interpreting the
    # stored text (e.g. "007" -> 7, "+5" -> 5, "5" -> "5.0", "NA" -> empty).
    if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
        df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    else:
        df = pd.DataFrame(columns=flat_columns)

    # New row, every column empty until filled in below.
    new_row = pd.Series([None] * len(flat_columns), index=flat_columns, dtype=object)
    new_row.iloc[0] = client_name

    # Column name -> index into all_extracted_values.
    line_mapping = {
        "Taxable Wages - Line 1": 0,
        "Taxable Interest Income: Sch. B - Line 2b": 1,
        "Tax- Exempt Interest - Line 2a": 2,
        "Qualified Dividends - Line 3a": 3,
        "Ordinary Dividends - Line 3b": 4,
        "Long Term Capital Gain or Loss - Line 7": 5,
        "Other Adjustments (from Schedule 1) - Line 10": 6,
        "Standard Deduction - Line 12": 7,
        "Qualified Business Income Deduction - Line 13": 8,
        "Taxable Income - Line 15": 9,
        "Tax - Line 16": 10,
        "Line 20, Schedule 3": 11,
        "Line 23, Schedule 2": 12,
        "Total Tax - Line 24": 13
    }

    for key, idx in line_mapping.items():
        if idx < len(all_extracted_values):
            value = all_extracted_values[idx]
            new_row[key] = '0' if value == '' else value

    # Schedule 1 values; zip() tolerates a list shorter than three entries.
    schedule1_columns = (
        "Business Income or Loss (Schedule C) - Schedule 1, Line 3",
        "Rent/ Royalty (Schedule E) - Schedule 1, Line 5",
        "Other Income - Schedule 1, Line 8",
    )
    for column, value in zip(schedule1_columns, schedule1_values or []):
        new_row[column] = '0' if value == '' else value

    # Append and save
    df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    df.to_csv(csv_path, index=False)
    print(f" Data saved to CSV: {csv_path}")
184
+
185
+ # Gradio UI
186
# Single-function Gradio interface: two PDF uploads plus a client name in,
# the extracted values (as text) and the accumulated CSV file out.
iface = gr.Interface(
    fn=extract_numeric_values,
    inputs=[
        gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"]),
        gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"]),
        gr.Textbox(label="Client Name", placeholder="Enter client name"),
    ],
    outputs=[
        gr.Textbox(label="Extracted Numeric Values", lines=20),
        # The file handed back by extract_numeric_values is a .csv, so the
        # label says CSV rather than Excel.
        gr.File(label="Download CSV Output"),
    ],
    title="Tax PDF Extractor",
    description="Upload Form 1040 (at least 2 pages). Optionally upload Schedule 1 for extra fields.",
)
200
+
201
+ # with gr.Blocks(title="Tax PDF Extractor") as demo:
202
+ # gr.Markdown("## Tax PDF Extractor")
203
+ # gr.Markdown("Upload Form 1040 (at least 2 pages). Optionally upload Schedule 1 for extra fields.")
204
+
205
+ # client_name = gr.Textbox(label="Client Name (Required)", placeholder="Enter your full name")
206
+
207
+ # form_1040 = gr.File(label="Upload Main Form 1040 PDF (Required)", file_types=[".pdf"])
208
+
209
+ # has_schedule1 = gr.Radio(
210
+ # choices=["Yes", "No"],
211
+ # label="Do you have Schedule 1?",
212
+ # value="No"
213
+ # )
214
+
215
+ # schedule1 = gr.File(label="Upload Schedule 1 PDF (Optional)", file_types=[".pdf"], visible=False)
216
+
217
+ # # Show/hide schedule1 upload box
218
+ # def toggle_schedule1(choice):
219
+ # return gr.update(visible=choice == "Yes")
220
+
221
+ # has_schedule1.change(fn=toggle_schedule1, inputs=has_schedule1, outputs=schedule1)
222
+
223
+ # output_text = gr.Textbox(label="Extracted Numeric Values", lines=20)
224
+ # output_file = gr.File(label="Download Excel Output")
225
+
226
+ # def wrapper_extract(main_pdf, schedule1_pdf, client_name):
227
+ # if not client_name:
228
+ # return "Error: Client name is required.", None
229
+ # return extract_numeric_values(main_pdf, schedule1_pdf)
230
+
231
+ # submit_btn = gr.Button("Extract Data")
232
+
233
+ # submit_btn.click(
234
+ # fn=wrapper_extract,
235
+ # inputs=[form_1040, schedule1, client_name],
236
+ # outputs=[output_text, output_file]
237
+ # )
238
+
239
+
240
# Start the web server only when this file is executed as a script, so that
# importing the module (e.g. to reuse the extraction functions) does not
# launch the UI as a side effect.
if __name__ == "__main__":
    iface.launch(share=True)