shivam0109 committed
Commit 78227e9 · Parent(s): f45d315

added a few changes and folder processing

Files changed (1):
  1. app.py +193 -125
app.py CHANGED
@@ -14,6 +14,7 @@ import gc
 import requests
 from datetime import datetime
 import pandas as pd
+from pathlib import Path
 
 # Configuration
 JSON_SAVE_FOLDER = "processed_json"
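Note: from pathlib import Path is added here, but none of the hunks below reference Path yet. If the folder scan introduced later in this diff moved to it, the shape would be roughly as follows (a sketch with a hypothetical folder name, not code from this commit):

    from pathlib import Path

    folder = Path("electoral_rolls")  # hypothetical folder name
    pdf_files = sorted(p for p in folder.iterdir() if p.suffix.lower() == ".pdf")
    print([p.name for p in pdf_files])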
@@ -68,16 +69,12 @@ def process_page_safely(page, page_num, attempt=1):
         return process_page_safely(page, page_num, attempt+1)
     return {"error": f"Page {page_num} error after {attempt} attempts: {str(e)}"}
 
-def process_pdf(pdf_file, progress=gr.Progress()):
+def process_pdf(pdf_path, progress=gr.Progress()):
     all_json = []
     errors = []
 
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tf:
-        tf.write(pdf_file)
-        temp_pdf_path = tf.name
-
     try:
-        with fitz.open(temp_pdf_path) as doc:
+        with fitz.open(pdf_path) as doc:
             total_pages = len(doc)
 
             for i in range(total_pages):
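Note: the signature change above works because PyMuPDF's fitz.open() accepts a filesystem path directly, so callers can hand process_pdf() a path instead of raw bytes. A minimal sketch of the new call pattern (the filename is hypothetical):

    import fitz  # PyMuPDF

    with fitz.open("sample.pdf") as doc:  # hypothetical file; fitz.open() takes a path
        total_pages = len(doc)            # the same page count process_pdf() iterates over
    print(total_pages)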
@@ -93,33 +90,78 @@ def process_pdf(pdf_file, progress=gr.Progress()):
                 time.sleep(0.5)
                 gc.collect()
 
-        # Generate timestamp for filename
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        json_filename = f"processed_{timestamp}.json"
-        json_path = os.path.join(JSON_SAVE_FOLDER, json_filename)
+        return all_json, errors
 
-        # Save JSON to file with UTF-8 encoding
-        with open(json_path, 'w', encoding='utf-8') as f:
-            json.dump(all_json, f, indent=2, ensure_ascii=False)
+    except Exception as e:
+        return None, [f"Processing error: {str(e)}"]
+
+def process_folder(folder_path, progress=gr.Progress()):
+    folder_name = os.path.basename(folder_path)
+    all_pdfs_json = []
+    all_errors = []
+
+    # Get all PDF files in the folder
+    pdf_files = [f for f in os.listdir(folder_path) if f.lower().endswith('.pdf')]
+    if not pdf_files:
+        return None, None, f"No PDF files found in folder: {folder_name}"
+
+    # Process each PDF in the folder
+    for i, pdf_file in enumerate(pdf_files):
+        progress(i/len(pdf_files), desc=f"Processing {pdf_file} in {folder_name}")
+        pdf_path = os.path.join(folder_path, pdf_file)
 
-        return (
-            all_json,  # For JSON display
-            json_path,  # For file download
-            "\n".join(errors) if errors else "No errors"  # For error display
-        )
+        # Create temp file (needed for fitz)
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tf:
+            with open(pdf_path, 'rb') as f:
+                tf.write(f.read())
+            temp_pdf_path = tf.name
 
-    except Exception as e:
-        return (
-            None,
-            None,
-            f"Processing error: {str(e)}"
-        )
-    finally:
         try:
-            if os.path.exists(temp_pdf_path):
-                os.unlink(temp_pdf_path)
-        except:
-            pass
+            pdf_json, errors = process_pdf(temp_pdf_path, progress)
+            if pdf_json:
+                all_pdfs_json.extend(pdf_json)
+            if errors:
+                all_errors.extend(errors)
+        finally:
+            try:
+                if os.path.exists(temp_pdf_path):
+                    os.unlink(temp_pdf_path)
+            except:
+                pass
+
+    if not all_pdfs_json:
+        return None, None, "\n".join(all_errors) if all_errors else "No data extracted from any PDF"
+
+    # Save combined JSON for the folder
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    json_filename = f"{folder_name}_processed_{timestamp}.json"
+    json_path = os.path.join(JSON_SAVE_FOLDER, json_filename)
+
+    with open(json_path, 'w', encoding='utf-8') as f:
+        json.dump(all_pdfs_json, f, indent=2, ensure_ascii=False)
+
+    return all_pdfs_json, json_path, "\n".join(all_errors) if all_errors else "No errors"
+
+def process_folders(folder_paths, progress=gr.Progress()):
+    all_results = []
+    all_json_paths = []
+    all_errors = []
+
+    for i, folder_path in enumerate(folder_paths):
+        progress(i/len(folder_paths), desc=f"Processing folder {i+1}/{len(folder_paths)}")
+        json_data, json_path, errors = process_folder(folder_path, progress)
+
+        if json_data:
+            all_results.append({
+                "folder": os.path.basename(folder_path),
+                "data": json_data
+            })
+        if json_path:
+            all_json_paths.append(json_path)
+        if errors and errors != "No errors":
+            all_errors.append(f"Folder {os.path.basename(folder_path)}: {errors}")
+
+    return all_results, all_json_paths, "\n".join(all_errors) if all_errors else "No errors"
 
 def chunk_json_by_char_limit(data, char_limit=3500):
     chunks = []
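Note: since process_pdf() now accepts a path and fitz.open() reads files in place, the per-PDF temporary copy above is likely avoidable. A sketch of the leaner loop, assuming a local folder (name hypothetical):

    import os
    import fitz  # PyMuPDF

    folder_path = "electoral_rolls"  # hypothetical folder name
    for name in sorted(os.listdir(folder_path)):
        if name.lower().endswith(".pdf"):
            with fitz.open(os.path.join(folder_path, name)) as doc:
                print(name, len(doc), "pages")  # no temp file or unlink bookkeeping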
@@ -143,24 +185,31 @@ def chunk_json_by_char_limit(data, char_limit=3500):
 
     return chunks
 
-def call_llm_api(api_key, json_file_path, repeated_info, debug_mode):
-    try:
-        with open(json_file_path, 'r', encoding='utf-8') as f:
-            full_data = json.load(f)
-
-        # NEW: chunk by char limit
-        json_chunks = chunk_json_by_char_limit(full_data, char_limit=3500)
-        all_csv_chunks = []
-        header_preserved = False
-        debug_info = ""
-
-        headers = {
-            "Authorization": f"Bearer {api_key}",
-            "Content-Type": "application/json"
-        }
+def call_llm_api(api_key, json_file_paths, repeated_info, debug_mode):
+    all_csv_data = {}
+    all_debug_info = ""
+    api_status = True
+
+    for json_path in json_file_paths:
+        try:
+            with open(json_path, 'r', encoding='utf-8') as f:
+                full_data = json.load(f)
+
+            # Extract folder name from the JSON filename (format: foldername_processed_timestamp.json)
+            folder_name = os.path.basename(json_path).split('_processed_')[0]
+
+            json_chunks = chunk_json_by_char_limit(full_data, char_limit=3500)
+            all_csv_chunks = []
+            header_preserved = False
+            debug_info = f"Processing folder: {folder_name}\n"
+
+            headers = {
+                "Authorization": f"Bearer {api_key}",
+                "Content-Type": "application/json"
+            }
 
-        for idx, chunk in enumerate(json_chunks):
-            prompt = f"""
+            for idx, chunk in enumerate(json_chunks):
+                prompt = f"""
 {repeated_info}
 
 Below is a portion of the voter data in JSON format. Please extract all entries into a CSV format with the following columns:
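Note: chunk_json_by_char_limit() itself is unchanged and its body is not shown in this diff. For orientation, a chunker of this shape typically packs list entries until the serialized chunk would exceed the limit; an illustrative stand-in, not the app's exact implementation:

    import json

    def chunk_by_char_limit(data, char_limit=3500):
        chunks, current = [], []
        for entry in data:
            candidate = current + [entry]
            if current and len(json.dumps(candidate, ensure_ascii=False)) > char_limit:
                chunks.append(current)   # close the filled chunk
                current = [entry]        # start a new one with this entry
            else:
                current = candidate
        if current:
            chunks.append(current)
        return chunks

    print(len(chunk_by_char_limit([{"serial": i} for i in range(500)])))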
@@ -178,120 +227,139 @@ JSON Data:
 Respond with ONLY the CSV data (including header ONLY in the first chunk).
 """.strip()
 
-            payload = {
-                "model": "google/gemma-3n-e4b-it:free",
-                "messages": [
-                    {"role": "user", "content": prompt}
-                ],
-                "temperature": 0.1,
-                "max_tokens": 2048
-            }
+                payload = {
+                    "model": "google/gemma-3n-e4b-it:free",
+                    "messages": [
+                        {"role": "user", "content": prompt}
+                    ],
+                    "temperature": 0.1,
+                    "max_tokens": 2048
+                }
+
+                try:
+                    response = requests.post(
+                        "https://openrouter.ai/api/v1/chat/completions",
+                        headers=headers,
+                        json=payload,
+                        timeout=120
+                    )
+                except Exception as e:
+                    all_csv_data[folder_name] = pd.DataFrame({"Error": [f"Network error: {str(e)}"]})
+                    debug_info += f"\nError in chunk {idx+1}: {str(e)}\n"
+                    api_status = False
+                    continue
 
-            try:
-                response = requests.post(
-                    "https://openrouter.ai/api/v1/chat/completions",
-                    headers=headers,
-                    json=payload,
-                    timeout=120
-                )
-            except Exception as e:
-                return (
-                    pd.DataFrame({"Error": [f"Network error: {str(e)}"]}),
-                    None,
-                    debug_info,
-                    False
-                )
+                if debug_mode:
+                    debug_info += f"\n--- Chunk {idx+1} ---\nStatus: {response.status_code}\n{response.text}\n"
 
-            if debug_mode:
-                debug_info += f"\n--- Chunk {idx+1} ---\nStatus: {response.status_code}\n{response.text}\n"
-
-            if response.status_code != 200:
-                return (
-                    pd.DataFrame({"Error": [f"API Error on chunk {idx+1}: {response.text}"]}),
-                    None,
-                    debug_info,
-                    False
-                )
+                if response.status_code != 200:
+                    all_csv_data[folder_name] = pd.DataFrame({"Error": [f"API Error on chunk {idx+1}: {response.text}"]})
+                    debug_info += f"\nAPI Error in chunk {idx+1}: {response.text}\n"
+                    api_status = False
+                    continue
 
-            chunk_csv = response.json()["choices"][0]["message"]["content"].strip()
+                chunk_csv = response.json()["choices"][0]["message"]["content"].strip()
 
-            # Keep header for first chunk only
-            lines = chunk_csv.splitlines()
-            if not header_preserved:
-                all_csv_chunks.append(chunk_csv)
-                header_preserved = True
-            else:
-                if len(lines) > 1:
-                    all_csv_chunks.append("\n".join(lines[1:]))
+                # Keep header for first chunk only
+                lines = chunk_csv.splitlines()
+                if not header_preserved:
+                    all_csv_chunks.append(chunk_csv)
+                    header_preserved = True
                 else:
-                    all_csv_chunks.append("")  # if empty or malformed
+                    if len(lines) > 1:
+                        all_csv_chunks.append("\n".join(lines[1:]))
+                    else:
+                        all_csv_chunks.append("")  # if empty or malformed
 
-            time.sleep(1.5)
+                time.sleep(1.5)
 
-        # Combine CSV results
-        combined_csv = "\n".join(all_csv_chunks)
-        csv_filename = f"output_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
-        csv_path = os.path.join(JSON_SAVE_FOLDER, csv_filename)
+            # Combine CSV results for this folder
+            combined_csv = "\n".join(all_csv_chunks)
+            csv_filename = f"{folder_name}_output_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
+            csv_path = os.path.join(JSON_SAVE_FOLDER, csv_filename)
 
-        with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f:
-            f.write(combined_csv)
+            with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f:
+                f.write(combined_csv)
 
-        # Attempt to parse CSV into DataFrame
-        try:
-            df = pd.read_csv(io.StringIO(combined_csv))
-        except Exception as e:
-            df = pd.DataFrame({"Error": [f"CSV Parsing Error: {str(e)}", combined_csv]})
+            # Attempt to parse CSV into DataFrame
+            try:
+                df = pd.read_csv(io.StringIO(combined_csv))
+                all_csv_data[folder_name] = df
+            except Exception as e:
+                all_csv_data[folder_name] = pd.DataFrame({"Error": [f"CSV Parsing Error: {str(e)}", combined_csv]})
+                api_status = False
 
-        return (
-            df,
-            csv_path,
-            debug_info if debug_mode else "",
-            True
-        )
+            if debug_mode:
+                all_debug_info += debug_info + "\n"
 
-    except Exception as e:
-        return (
-            pd.DataFrame({"Error": [str(e)]}),
-            None,
-            f"Unexpected error: {str(e)}",
-            False
-        )
+        except Exception as e:
+            all_csv_data[folder_name] = pd.DataFrame({"Error": [str(e)]})
+            all_debug_info += f"\nError processing {folder_name}: {str(e)}\n"
+            api_status = False
+
+    # Prepare download files
+    download_files = []
+    for folder_name in all_csv_data:
+        csv_filename = f"{folder_name}_output.csv"
+        csv_path = os.path.join(JSON_SAVE_FOLDER, csv_filename)
+        all_csv_data[folder_name].to_csv(csv_path, index=False, encoding='utf-8-sig')
+        download_files.append(csv_path)
+
+    # If only one folder, return its DataFrame directly, otherwise return a dict of DataFrames
+    if len(all_csv_data) == 1:
+        df_output = list(all_csv_data.values())[0]
+    else:
+        df_output = pd.concat(all_csv_data.values(), keys=all_csv_data.keys())
+
+    return (
+        df_output,
+        download_files[0] if len(download_files) == 1 else download_files,
+        all_debug_info if debug_mode else "",
+        api_status
+    )
 
 # Gradio interface
-with gr.Blocks(title="Hindi PDF Processor with LLM API") as demo:
-    gr.Markdown("## 📄 Hindi PDF Processor with LLM API")
-    gr.Markdown("Process PDFs to extract text and convert to structured CSV using LLM")
+with gr.Blocks(title="Hindi Electrol Processing") as demo:
+    gr.Markdown("## 📄 Hindi Electrol PDF Folder Processor")
+    gr.Markdown("Process folders of PDFs to extract text and convert to structured CSV using LLM")
 
     with gr.Tab("PDF Processing"):
         with gr.Row():
             with gr.Column():
-                pdf_input = gr.File(label="Upload PDF File", type="binary")
-                pdf_submit = gr.Button("Process PDF")
+                folder_input = gr.File(
+                    label="Upload Folder(s) (Up to 5)",
+                    file_count="directory",
+                    file_types=["folder"],
+                    max_files=5
+                )
+                pdf_submit = gr.Button("Process PDF Folders")
 
         with gr.Column():
             json_display = gr.JSON(label="Extracted JSON Data")
             pdf_errors = gr.Textbox(label="Processing Errors")
-            json_download = gr.File(label="Download JSON File", visible=False)
+            json_download = gr.File(label="Download JSON Files", visible=False)
 
     with gr.Tab("LLM API Processing"):
         with gr.Row():
             with gr.Column():
                 api_key = gr.Textbox(label="OpenRouter API Key", type="password")
-                repeated_info = gr.Textbox(label="Additional Instructions",
-                    value="Extract voter information from the following text:")
+                repeated_info = gr.Textbox(
+                    label="Additional Instructions",
+                    value="Extract voter information from the following text:"
+                )
                 debug_mode = gr.Checkbox(label="Enable Debug Mode")
                 api_submit = gr.Button("Call LLM API")
 
             with gr.Column():
                 dataframe_output = gr.Dataframe(label="CSV Output", wrap=True)
-                csv_download = gr.File(label="Download CSV File")
+                csv_download = gr.File(label="Download CSV Files")
                 api_debug = gr.Textbox(label="Debug Information", visible=False)
                 api_status = gr.Textbox(label="API Status", visible=False)
 
     # PDF Processing
     pdf_submit.click(
-        process_pdf,
-        inputs=[pdf_input],
+        process_folders,
+        inputs=[folder_input],
         outputs=[json_display, json_download, pdf_errors]
     )
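Note: in the rewritten call_llm_api(), folder_name is bound only after the JSON file opens and parses, so if open() or json.load() raises, the outer except handler would itself fail with a NameError. A defensive pattern (a sketch using the foldername_processed_timestamp.json naming convention from this diff; the path is hypothetical):

    import os

    json_file_paths = ["booth_001_processed_20250101_120000.json"]  # hypothetical
    for json_path in json_file_paths:
        # Bind folder_name before anything that can raise, so the except
        # handler below can always reference it.
        folder_name = os.path.basename(json_path).split('_processed_')[0]
        try:
            ...  # open the JSON, chunk it, call the API (elided)
        except Exception as e:
            print(f"Error processing {folder_name}: {e}")

Separately, gr.File's documented parameters include file_count and file_types but, to my knowledge, not max_files or a "folder" file type, and with file_count="directory" Gradio appears to pass the callback a flat list of uploaded file paths rather than folder paths; the upload widget and process_folders() may need checking against the installed Gradio version.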
 
@@ -317,4 +385,4 @@ with gr.Blocks(title="Hindi PDF Processor with LLM API") as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(share=True)
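Note: share=True asks Gradio to open a temporary public *.gradio.live tunnel when the app runs locally; on Hugging Face Spaces it is unnecessary and, as far as I know, Gradio warns and skips it. A hypothetical guard keyed on the SPACE_ID environment variable that Spaces sets, mirroring the file's final lines:

    import os

    if __name__ == "__main__":
        # Request a public share link only outside Spaces (SPACE_ID is set
        # in the Spaces runtime; this guard is a sketch, not committed code).
        demo.launch(share=os.environ.get("SPACE_ID") is None)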