shivam0109 commited on
Commit
f1a0c7b
·
1 Parent(s): ee1b44b

Added files as per requirements

Browse files
Files changed (2) hide show
  1. README.md +59 -11
  2. app.py +320 -0
README.md CHANGED
@@ -1,14 +1,62 @@
 
 
 
 
1
  ---
2
- title: Electoral Roll
3
- emoji: 🌍
4
- colorFrom: purple
5
- colorTo: gray
6
- sdk: gradio
7
- sdk_version: 5.33.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- short_description: Gradio app for the Electoral roll CSV
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🗳️ Hindi Voter PDF Processor with LLM API (OCR + OpenRouter + Gradio)
2
+
3
+ This app extracts voter information from scanned PDFs using OCR and formats it into a structured CSV using an LLM API (via [OpenRouter](https://openrouter.ai/)).
4
+
5
  ---
6
+
7
+ ## 📦 Features
8
+
9
+ - Extracts text from Hindi/English PDFs using EasyOCR
10
+ - Splits content to avoid LLM token limits
11
+ - Sends chunked JSON to LLM for conversion to clean CSV
12
+ - Uses the OpenRouter LLM API (default model: Google Gemma 3n, `google/gemma-3n-e4b-it:free`)
13
+ - Interactive UI with Gradio
14
+ - Supports download of extracted JSON and final CSV
15
+
16
  ---
17
 
18
+
19
+ ## 🌐 Get Your OpenRouter API Key
20
+ - Go to https://openrouter.ai
21
+
22
+ - Click Login (use Google/GitHub/Email)
23
+
24
+ - Navigate to the Models page
25
+
26
+ - Click on a model like gemma-3b, mistral, etc.
27
+
28
+ - On the model page, click "Create API Key"
29
+
30
+ - Copy the API key
31
+
32
+
33
+ ## 🧪 How to Use the Gradio App
34
+ ### 🔹 Tab 1: PDF Processing
35
+ - Upload a Hindi/English scanned PDF
36
+
37
+ - Click "Process PDF"
38
+
39
+ - View extracted text in JSON format
40
+
41
+ - Download JSON file if needed
42
+
43
+ ### 🔹 Tab 2: LLM API Processing
44
+ - Paste your OpenRouter API key
45
+
46
+ - (Optional) Customize the prompt or add instructions
47
+
48
+ - Click "Call LLM API"
49
+
50
+ - View structured voter data in CSV format
51
+
52
+ - Download the CSV file
53
+
54
+ - Enable Debug Mode to see raw API responses for troubleshooting.
55
+
56
+
57
+ ### 📁 Output Files
58
+ Extracted JSON and CSV files are saved in the processed_json/ folder.
59
+
60
+
61
+
62
+
app.py ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import easyocr
4
+ import os
5
+ import tempfile
6
+ import numpy as np
7
+ import json
8
+ import cv2
9
+ import re
10
+ import csv
11
+ import io
12
+ import time
13
+ import gc
14
+ import requests
15
+ from datetime import datetime
16
+ import pandas as pd
17
+
18
# Configuration: all generated JSON/CSV artifacts are written to this folder.
JSON_SAVE_FOLDER = "processed_json"
os.makedirs(JSON_SAVE_FOLDER, exist_ok=True)

# Initialize EasyOCR reader with CPU only
def init_ocr():
    """Create an EasyOCR reader for Hindi and English text (CPU only)."""
    return easyocr.Reader(['hi', 'en'], gpu=False)  # Force CPU usage

# Module-level reader shared by all page-processing calls; the OCR model is
# loaded once at startup because initialization is expensive.
reader = init_ocr()
27
+
28
def _clean_ocr_digit_confusions(text):
    """Fix common OCR digit confusions (o/O -> 0, l/L -> 1), but only when the
    character sits directly next to a digit, so ordinary words stay intact."""
    text = re.sub(r'(?<=\d)[oO]|[oO](?=\d)', '0', text)
    text = re.sub(r'(?<=\d)[lL]|[lL](?=\d)', '1', text)
    return text


def process_page_safely(page, page_num, attempt=1):
    """OCR a single PDF page, retrying up to 3 extra times on failure.

    Renders the page at 200 DPI, runs EasyOCR (splitting very large renders
    in half to bound peak memory) and returns a dict with the page number,
    extracted text, mean OCR confidence and render dimensions. After the
    retries are exhausted a dict with a single "error" key is returned.

    Args:
        page: A PyMuPDF page object.
        page_num: 1-based page number recorded in the result/error payload.
        attempt: Current attempt number (internal, used by the retry recursion).
    """
    try:
        pix = page.get_pixmap(dpi=200)
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

        # Drop the alpha channel when present; EasyOCR expects 3-channel RGB.
        if pix.n == 4:
            img_data = cv2.cvtColor(img_data, cv2.COLOR_RGBA2RGB)

        # Very large renders are OCR'd in two horizontal halves to limit memory.
        # NOTE(review): a text line straddling the split may be cut in two —
        # acceptable trade-off inherited from the original implementation.
        max_pixels = 2000 * 2000
        if img_data.shape[0] * img_data.shape[1] > max_pixels:
            half = img_data.shape[0] // 2
            results = (reader.readtext(img_data[:half, :], detail=1, batch_size=1)
                       + reader.readtext(img_data[half:, :], detail=1, batch_size=1))
        else:
            results = reader.readtext(img_data, detail=1, batch_size=1)

        full_text = []
        confidence_scores = []
        for (bbox, text, confidence) in results:
            # Bug fix: the original replaced EVERY o/O with 0 and l/L with 1,
            # corrupting ordinary words and names (e.g. "Lal" -> "1a1").
            # Only characters adjacent to digits are now treated as misread digits.
            full_text.append(_clean_ocr_digit_confusions(text))
            confidence_scores.append(confidence)

        avg_confidence = sum(confidence_scores) / len(confidence_scores) if confidence_scores else 0

        return {
            "page": page_num,
            "text": "\n".join(full_text),
            "confidence": avg_confidence,
            "dimensions": {"width": pix.width, "height": pix.height}
        }
    except Exception as e:
        if attempt <= 3:
            time.sleep(1)   # brief back-off before retrying
            gc.collect()    # release the failed render before the next try
            return process_page_safely(page, page_num, attempt + 1)
        return {"error": f"Page {page_num} error after {attempt} attempts: {str(e)}"}
70
+
71
def process_pdf(pdf_file, progress=gr.Progress()):
    """Run OCR over every page of an uploaded PDF and save the results as JSON.

    Args:
        pdf_file: Raw PDF bytes (payload from ``gr.File(type="binary")``).
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        Tuple of (per-page result list, saved JSON file path, error summary).
        On a fatal failure returns (None, None, "Processing error: ...").
    """
    all_json = []
    errors = []

    # Persist the uploaded bytes so PyMuPDF can open them from disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tf:
        tf.write(pdf_file)
        temp_pdf_path = tf.name

    try:
        with fitz.open(temp_pdf_path) as doc:
            total_pages = len(doc)

            for i in range(total_pages):
                progress(i / total_pages, desc=f"Processing page {i+1}/{total_pages}")
                page = doc.load_page(i)
                page_result = process_page_safely(page, i + 1)

                # Collect per-page errors but keep processing the rest of the PDF.
                if "error" in page_result:
                    errors.append(page_result["error"])
                    continue

                all_json.append(page_result)
                time.sleep(0.5)  # small pause to keep CPU/memory pressure down
                gc.collect()

        # Timestamped filename so repeated runs never overwrite each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        json_filename = f"processed_{timestamp}.json"
        json_path = os.path.join(JSON_SAVE_FOLDER, json_filename)

        # Save JSON with UTF-8 and ensure_ascii=False so Hindi text survives.
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(all_json, f, indent=2, ensure_ascii=False)

        return (
            all_json,                                      # For JSON display
            json_path,                                     # For file download
            "\n".join(errors) if errors else "No errors"   # For error display
        )

    except Exception as e:
        return (
            None,
            None,
            f"Processing error: {str(e)}"
        )
    finally:
        # Best-effort cleanup of the temp file. Previously a bare `except:`
        # swallowed everything (including KeyboardInterrupt); only filesystem
        # errors are expected here, so catch OSError specifically.
        try:
            if os.path.exists(temp_pdf_path):
                os.unlink(temp_pdf_path)
        except OSError:
            pass
123
+
124
def chunk_json_by_char_limit(data, char_limit=3500):
    """Split a list of JSON-serializable entries into ordered chunks whose
    serialized length stays under ``char_limit`` characters (to keep each LLM
    request within token limits).

    Entries are never split: a single entry longer than ``char_limit`` still
    becomes its own one-element chunk. Bug fix over the original: an oversized
    leading entry no longer produces a spurious empty chunk (the old code
    flushed ``current_chunk`` even when it was empty).

    Args:
        data: List of JSON-serializable entries (e.g. per-page OCR dicts).
        char_limit: Approximate maximum serialized length per chunk.

    Returns:
        List of non-empty lists; concatenating them reproduces ``data`` in order.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for entry in data:
        entry_length = len(json.dumps(entry, ensure_ascii=False))

        # Flush the current chunk only if it is non-empty and this entry
        # would push it over the limit.
        if current_chunk and current_length + entry_length > char_limit:
            chunks.append(current_chunk)
            current_chunk = [entry]
            current_length = entry_length
        else:
            current_chunk.append(entry)
            current_length += entry_length

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
145
+
146
def call_llm_api(api_key, json_file_path, repeated_info, debug_mode):
    """Convert previously extracted OCR JSON into a voter CSV via OpenRouter.

    Loads the JSON file produced by the PDF-processing tab, splits it into
    character-limited chunks, sends each chunk to the LLM with a fixed
    CSV-extraction prompt, stitches the per-chunk CSV fragments together
    (keeping only the first chunk's header row), writes the combined CSV to
    JSON_SAVE_FOLDER and parses it into a DataFrame for display.

    Args:
        api_key: OpenRouter API key (sent as a Bearer token).
        json_file_path: Path to the JSON file saved by process_pdf.
        repeated_info: Extra instructions prepended to every chunk's prompt.
        debug_mode: When True, raw API status/response text is accumulated
            and returned for troubleshooting.

    Returns:
        Tuple of (DataFrame, csv_path or None, debug string, success bool).
        All error paths return a one-column "Error" DataFrame and False.
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            full_data = json.load(f)

        # NEW: chunk by char limit (keeps each request under the model's token budget)
        json_chunks = chunk_json_by_char_limit(full_data, char_limit=3500)
        all_csv_chunks = []
        header_preserved = False  # header row is kept only from the first chunk
        debug_info = ""

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        for idx, chunk in enumerate(json_chunks):
            # Prompt asks for raw CSV only, with fixed Hindi column headers.
            prompt = f"""
{repeated_info}

Below is a portion of the voter data in JSON format. Please extract all entries into a CSV format with the following columns:
विधानसभा, सेक्शन, मतदाता ID, मतदाता का नाम, अभिभावक का नाम, घर संख्या, आयु, लिंग, फोटो उपलब्ध?

Rules:
1. Use exactly these column headers in Hindi as shown above
2. Separate values with COMMAS (,)
3. For photo availability, use "हाँ" or "नहीं"
4. Do NOT include any extra explanation — only CSV

JSON Data:
{json.dumps(chunk, ensure_ascii=False)}

Respond with ONLY the CSV data (including header ONLY in the first chunk).
""".strip()

            payload = {
                "model": "google/gemma-3n-e4b-it:free",
                "messages": [
                    {"role": "user", "content": prompt}
                ],
                "temperature": 0.1,  # low temperature for deterministic extraction
                "max_tokens": 2048
            }

            try:
                response = requests.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers=headers,
                    json=payload,
                    timeout=120
                )
            except Exception as e:
                # Network failure aborts the whole run; partial CSV is discarded.
                return (
                    pd.DataFrame({"Error": [f"Network error: {str(e)}"]}),
                    None,
                    debug_info,
                    False
                )

            if debug_mode:
                debug_info += f"\n--- Chunk {idx+1} ---\nStatus: {response.status_code}\n{response.text}\n"

            if response.status_code != 200:
                # Non-2xx from the API also aborts; the raw body is surfaced.
                return (
                    pd.DataFrame({"Error": [f"API Error on chunk {idx+1}: {response.text}"]}),
                    None,
                    debug_info,
                    False
                )

            # A missing key here raises and is caught by the outer handler below.
            chunk_csv = response.json()["choices"][0]["message"]["content"].strip()

            # Keep header for first chunk only
            lines = chunk_csv.splitlines()
            if not header_preserved:
                all_csv_chunks.append(chunk_csv)
                header_preserved = True
            else:
                if len(lines) > 1:
                    # Drop the duplicated header row from follow-up chunks.
                    all_csv_chunks.append("\n".join(lines[1:]))
                else:
                    all_csv_chunks.append("")  # if empty or malformed

            time.sleep(1.5)  # simple rate limiting between API calls

        # Combine CSV results
        combined_csv = "\n".join(all_csv_chunks)
        csv_filename = f"output_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        csv_path = os.path.join(JSON_SAVE_FOLDER, csv_filename)

        # utf-8-sig adds a BOM so Excel opens the Hindi CSV correctly.
        with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f:
            f.write(combined_csv)

        # Attempt to parse CSV into DataFrame; on failure the raw text is
        # returned inside the DataFrame so the user can inspect it.
        try:
            df = pd.read_csv(io.StringIO(combined_csv))
        except Exception as e:
            df = pd.DataFrame({"Error": [f"CSV Parsing Error: {str(e)}", combined_csv]})

        return (
            df,
            csv_path,
            debug_info if debug_mode else "",
            True
        )

    except Exception as e:
        # Catch-all boundary for file I/O, JSON decoding and response-shape errors.
        return (
            pd.DataFrame({"Error": [str(e)]}),
            None,
            f"Unexpected error: {str(e)}",
            False
        )
259
+
260
# Gradio interface: two tabs wired together through the hidden json_download
# component (Tab 1 writes the JSON file path into it; Tab 2 reads it).
with gr.Blocks(title="Hindi PDF Processor with LLM API") as demo:
    gr.Markdown("## 📄 Hindi PDF Processor with LLM API")
    gr.Markdown("Process PDFs to extract text and convert to structured CSV using LLM")

    # Tab 1: upload a PDF, OCR it, and expose the extracted JSON.
    with gr.Tab("PDF Processing"):
        with gr.Row():
            with gr.Column():
                pdf_input = gr.File(label="Upload PDF File", type="binary")
                pdf_submit = gr.Button("Process PDF")

            with gr.Column():
                json_display = gr.JSON(label="Extracted JSON Data")
                pdf_errors = gr.Textbox(label="Processing Errors")
                # Hidden: holds the saved JSON file path consumed by Tab 2.
                json_download = gr.File(label="Download JSON File", visible=False)

    # Tab 2: send the saved JSON to the OpenRouter LLM and show the CSV result.
    with gr.Tab("LLM API Processing"):
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(label="OpenRouter API Key", type="password")
                repeated_info = gr.Textbox(label="Additional Instructions",
                                           value="Extract voter information from the following text:")
                debug_mode = gr.Checkbox(label="Enable Debug Mode")
                api_submit = gr.Button("Call LLM API")

            with gr.Column():
                dataframe_output = gr.Dataframe(label="CSV Output", wrap=True)
                csv_download = gr.File(label="Download CSV File")
                api_debug = gr.Textbox(label="Debug Information", visible=False)
                api_status = gr.Textbox(label="API Status", visible=False)

    # PDF Processing
    pdf_submit.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[json_display, json_download, pdf_errors]
    )

    # API Processing — json_download supplies the JSON file path as input.
    api_submit.click(
        call_llm_api,
        inputs=[api_key, json_download, repeated_info, debug_mode],
        outputs=[dataframe_output, csv_download, api_debug, api_status]
    )

    # Show/hide debug based on checkbox
    debug_mode.change(
        lambda x: gr.update(visible=x),
        inputs=[debug_mode],
        outputs=[api_debug]
    )

    # Update API status visibility (second click handler on the same button;
    # both handlers fire on each click).
    api_submit.click(
        lambda: gr.update(visible=True),
        inputs=None,
        outputs=[api_status]
    )

if __name__ == "__main__":
    demo.launch()