Commit 78227e9
Parent(s): f45d315
added few changes and folder
app.py CHANGED
@@ -14,6 +14,7 @@ import gc
 import requests
 from datetime import datetime
 import pandas as pd
+from pathlib import Path
 
 # Configuration
 JSON_SAVE_FOLDER = "processed_json"
@@ -68,16 +69,12 @@ def process_page_safely(page, page_num, attempt=1):
             return process_page_safely(page, page_num, attempt+1)
         return {"error": f"Page {page_num} error after {attempt} attempts: {str(e)}"}
 
-def process_pdf(pdf_file, progress=gr.Progress()):
+def process_pdf(pdf_path, progress=gr.Progress()):
     all_json = []
     errors = []
 
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tf:
-        tf.write(pdf_file)
-        temp_pdf_path = tf.name
-
     try:
-        with fitz.open(temp_pdf_path) as doc:
+        with fitz.open(pdf_path) as doc:
             total_pages = len(doc)
 
             for i in range(total_pages):
@@ -93,33 +90,78 @@ def process_pdf(pdf_file, progress=gr.Progress()):
             time.sleep(0.5)
             gc.collect()
 
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        json_filename = f"processed_{timestamp}.json"
-        json_path = os.path.join(JSON_SAVE_FOLDER, json_filename)
-    except Exception as e:
-        return (
-            None,
-            None,
-            f"Processing error: {str(e)}"
-        )
-    finally:
+        return all_json, errors
+
+    except Exception as e:
+        return None, [f"Processing error: {str(e)}"]
+
+def process_folder(folder_path, progress=gr.Progress()):
+    folder_name = os.path.basename(folder_path)
+    all_pdfs_json = []
+    all_errors = []
+
+    # Get all PDF files in the folder
+    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]
+    if not pdf_files:
+        return None, None, f"No PDF files found in folder: {folder_name}"
+
+    # Process each PDF in the folder
+    for i, pdf_file in enumerate(pdf_files):
+        progress(i/len(pdf_files), desc=f"Processing {pdf_file} in {folder_name}")
+        pdf_path = os.path.join(folder_path, pdf_file)
+
+        # Create temp file (needed for fitz)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tf:
+            with open(pdf_path, 'rb') as f:
+                tf.write(f.read())
+            temp_pdf_path = tf.name
+
         try:
+            pdf_json, errors = process_pdf(temp_pdf_path, progress)
+            if pdf_json:
+                all_pdfs_json.extend(pdf_json)
+            if errors:
+                all_errors.extend(errors)
+        finally:
+            try:
+                if os.path.exists(temp_pdf_path):
+                    os.unlink(temp_pdf_path)
+            except:
+                pass
+
+    if not all_pdfs_json:
+        return None, None, "\n".join(all_errors) if all_errors else "No data extracted from any PDF"
+
+    # Save combined JSON for the folder
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    json_filename = f"{folder_name}_processed_{timestamp}.json"
+    json_path = os.path.join(JSON_SAVE_FOLDER, json_filename)
+
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(all_pdfs_json, f, indent=2, ensure_ascii=False)
+
+    return all_pdfs_json, json_path, "\n".join(all_errors) if all_errors else "No errors"
+
+def process_folders(folder_paths, progress=gr.Progress()):
+    all_results = []
+    all_json_paths = []
+    all_errors = []
+
+    for i, folder_path in enumerate(folder_paths):
+        progress(i/len(folder_paths), desc=f"Processing folder {i+1}/{len(folder_paths)}")
+        json_data, json_path, errors = process_folder(folder_path, progress)
+
+        if json_data:
+            all_results.append({
+                "folder": os.path.basename(folder_path),
+                "data": json_data
+            })
+        if json_path:
+            all_json_paths.append(json_path)
+        if errors and errors != "No errors":
+            all_errors.append(f"Folder {os.path.basename(folder_path)}: {errors}")
+
+    return all_results, all_json_paths, "\n".join(all_errors) if all_errors else "No errors"
 
 def chunk_json_by_char_limit(data, char_limit=3500):
     chunks = []
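For orientation, a sketch of the tuple process_folders hands to the Gradio outputs, assuming one hypothetical folder named booth_001 (the per-page dict layout comes from process_page_safely and is not visible in this diff):

    # Illustrative values only; "booth_001" and the timestamp are hypothetical.
    all_results = [
        {"folder": "booth_001", "data": [...]},  # "data": page dicts from process_page_safely
    ]
    all_json_paths = ["processed_json/booth_001_processed_20250101_120000.json"]
    errors = "No errors"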
@@ -143,24 +185,31 @@ def chunk_json_by_char_limit(data, char_limit=3500):
 
     return chunks
 
-def call_llm_api(api_key,
+def call_llm_api(api_key, json_file_paths, repeated_info, debug_mode):
+    all_csv_data = {}
+    all_debug_info = ""
+    api_status = True
+
+    for json_path in json_file_paths:
+        try:
+            with open(json_path, 'r', encoding='utf-8') as f:
+                full_data = json.load(f)
+
+            # Extract folder name from the JSON filename (format: foldername_processed_timestamp.json)
+            folder_name = os.path.basename(json_path).split('_processed_')[0]
+
+            json_chunks = chunk_json_by_char_limit(full_data, char_limit=3500)
+            all_csv_chunks = []
+            header_preserved = False
+            debug_info = f"Processing folder: {folder_name}\n"
+
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json"
+            }
 
+            for idx, chunk in enumerate(json_chunks):
+                prompt = f"""
 {repeated_info}
 
 Below is a portion of the voter data in JSON format. Please extract all entries into a CSV format with the following columns:
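The body of chunk_json_by_char_limit sits outside the diff context, so only its signature and call sites are visible. A minimal sketch that would satisfy those call sites (greedily packing entries so each serialized chunk stays under char_limit; an assumption, not the committed implementation):

    import json

    def chunk_json_by_char_limit(data, char_limit=3500):
        # Sketch: start a new chunk whenever adding an entry would push the
        # serialized JSON text past char_limit.
        chunks, current = [], []
        for entry in data:
            candidate = current + [entry]
            if current and len(json.dumps(candidate, ensure_ascii=False)) > char_limit:
                chunks.append(json.dumps(current, ensure_ascii=False))
                current = [entry]
            else:
                current = candidate
        if current:
            chunks.append(json.dumps(current, ensure_ascii=False))
        return chunks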
@@ -178,120 +227,139 @@ JSON Data:
 Respond with ONLY the CSV data (including header ONLY in the first chunk).
 """.strip()
 
+                payload = {
+                    "model": "google/gemma-3n-e4b-it:free",
+                    "messages": [
+                        {"role": "user", "content": prompt}
+                    ],
+                    "temperature": 0.1,
+                    "max_tokens": 2048
+                }
+
+                try:
+                    response = requests.post(
-            "https://openrouter.ai/api/v1/chat/completions",
-            headers=headers,
-            json=payload,
-            timeout=120
-        )
-    except Exception as e:
-        return (
-            pd.DataFrame({"Error": [f"Network error: {str(e)}"]}),
-            None,
-            debug_info,
-            False
-        )
+                        "https://openrouter.ai/api/v1/chat/completions",
+                        headers=headers,
+                        json=payload,
+                        timeout=120
+                    )
+                except Exception as e:
+                    all_csv_data[folder_name] = pd.DataFrame({"Error": [f"Network error: {str(e)}"]})
+                    debug_info += f"\nError in chunk {idx+1}: {str(e)}\n"
+                    api_status = False
+                    continue
 
+                if debug_mode:
+                    debug_info += f"\n--- Chunk {idx+1} ---\nStatus: {response.status_code}\n{response.text}\n"
 
-            pd.DataFrame({"Error": [f"API Error on chunk {idx+1}: {response.text}"]}),
-            None,
-            debug_info,
-            False
-        )
+                if response.status_code != 200:
+                    all_csv_data[folder_name] = pd.DataFrame({"Error": [f"API Error on chunk {idx+1}: {response.text}"]})
+                    debug_info += f"\nAPI Error in chunk {idx+1}: {response.text}\n"
+                    api_status = False
+                    continue
 
+                chunk_csv = response.json()["choices"][0]["message"]["content"].strip()
 
+                # Keep header for first chunk only
+                lines = chunk_csv.splitlines()
+                if not header_preserved:
+                    all_csv_chunks.append(chunk_csv)
+                    header_preserved = True
-        else:
-            if len(lines) > 1:
-                all_csv_chunks.append("\n".join(lines[1:]))
                 else:
+                    if len(lines) > 1:
+                        all_csv_chunks.append("\n".join(lines[1:]))
+                    else:
+                        all_csv_chunks.append("")  # if empty or malformed
 
+                time.sleep(1.5)
 
+            # Combine CSV results for this folder
+            combined_csv = "\n".join(all_csv_chunks)
+            csv_filename = f"{folder_name}_output_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+            csv_path = os.path.join(JSON_SAVE_FOLDER, csv_filename)
 
+            with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f:
+                f.write(combined_csv)
 
+            # Attempt to parse CSV into DataFrame
+            try:
+                df = pd.read_csv(io.StringIO(combined_csv))
+                all_csv_data[folder_name] = df
+            except Exception as e:
+                all_csv_data[folder_name] = pd.DataFrame({"Error": [f"CSV Parsing Error: {str(e)}", combined_csv]})
+                api_status = False
 
+            if debug_mode:
+                all_debug_info += debug_info + "\n"
 
-        csv_path,
-        debug_info if debug_mode else "",
-        True
-    )
+        except Exception as e:
+            all_csv_data[folder_name] = pd.DataFrame({"Error": [str(e)]})
+            all_debug_info += f"\nError processing {folder_name}: {str(e)}\n"
+            api_status = False
+
+    # Prepare download files
+    download_files = []
+    for folder_name in all_csv_data:
+        csv_filename = f"{folder_name}_output.csv"
+        csv_path = os.path.join(JSON_SAVE_FOLDER, csv_filename)
+        all_csv_data[folder_name].to_csv(csv_path, index=False, encoding='utf-8-sig')
+        download_files.append(csv_path)
+
+    # If only one folder, return its DataFrame directly, otherwise return a dict of DataFrames
+    if len(all_csv_data) == 1:
+        df_output = list(all_csv_data.values())[0]
+    else:
+        df_output = pd.concat(all_csv_data.values(), keys=all_csv_data.keys())
+
+    return (
+        df_output,
+        download_files[0] if len(download_files) == 1 else download_files,
+        all_debug_info if debug_mode else "",
+        api_status
+    )
 
 # Gradio interface
-with gr.Blocks(title="Hindi PDF Processor with LLM API") as demo:
-    gr.Markdown("## 📄 Hindi PDF Processor
-    gr.Markdown("Process PDFs to extract text and convert to structured CSV using LLM")
+with gr.Blocks(title="Hindi Electrol Processing") as demo:
+    gr.Markdown("## 📄 Hindi Electrol PDF Folder Processor")
+    gr.Markdown("Process folders of PDFs to extract text and convert to structured CSV using LLM")
 
     with gr.Tab("PDF Processing"):
         with gr.Row():
            with gr.Column():
+                folder_input = gr.File(
+                    label="Upload Folder(s) (Up to 5)",
+                    file_count="directory",
+                    file_types=["folder"],
+                    max_files=5
+                )
+                pdf_submit = gr.Button("Process PDF Folders")
 
            with gr.Column():
                json_display = gr.JSON(label="Extracted JSON Data")
                pdf_errors = gr.Textbox(label="Processing Errors")
-                json_download = gr.File(label="Download JSON
+                json_download = gr.File(label="Download JSON Files", visible=False)
 
     with gr.Tab("LLM API Processing"):
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(label="OpenRouter API Key", type="password")
-                repeated_info = gr.Textbox(
+                repeated_info = gr.Textbox(
+                    label="Additional Instructions",
+                    value="Extract voter information from the following text:"
+                )
                debug_mode = gr.Checkbox(label="Enable Debug Mode")
                api_submit = gr.Button("Call LLM API")
 
            with gr.Column():
                dataframe_output = gr.Dataframe(label="CSV Output", wrap=True)
-                csv_download = gr.File(label="Download CSV
+                csv_download = gr.File(label="Download CSV Files")
                api_debug = gr.Textbox(label="Debug Information", visible=False)
                api_status = gr.Textbox(label="API Status", visible=False)
 
    # PDF Processing
    pdf_submit.click(
-        inputs=[
+        process_folders,
+        inputs=[folder_input],
        outputs=[json_display, json_download, pdf_errors]
    )
 
@@ -317,4 +385,4 @@ with gr.Blocks(title="Hindi PDF Processor with LLM API") as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
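Taken together, the commit wires folder upload -> process_folders -> JSON on disk -> call_llm_api. A minimal headless sketch of that flow, under stated assumptions (samples/booth_001 and the API key are placeholders, a no-op lambda stands in for gr.Progress, and the processed_json folder must already exist):

    from app import process_folders, call_llm_api  # assumes this Space's app.py is importable

    no_progress = lambda *args, **kwargs: None  # stand-in for gr.Progress()

    results, json_paths, errors = process_folders(["samples/booth_001"], progress=no_progress)
    print(errors)

    df, csv_files, debug_text, ok = call_llm_api(
        api_key="sk-or-...",  # placeholder OpenRouter key
        json_file_paths=json_paths,
        repeated_info="Extract voter information from the following text:",
        debug_mode=True,
    )
    print(df.head())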