Spaces:
Sleeping
Sleeping
Commit
·
f1a0c7b
1
Parent(s):
ee1b44b
Added file as per requirements
Browse files
README.md
CHANGED
@@ -1,14 +1,62 @@
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
---
|
13 |
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 🗳️ Hindi Voter PDF Processor with LLM API (OCR + OpenRouter + Gradio)
|
2 |
+
|
3 |
+
This app extracts voter information from scanned PDFs using OCR and formats it into a structured CSV using an LLM API (via [OpenRouter](https://openrouter.ai/)).
|
4 |
+
|
5 |
---
|
6 |
+
|
7 |
+
## 📦 Features
|
8 |
+
|
9 |
+
- Extracts text from Hindi/English PDFs using EasyOCR
|
10 |
+
- Splits content to avoid LLM token limits
|
11 |
+
- Sends chunked JSON to LLM for conversion to clean CSV
|
12 |
+
- Uses OpenRouter LLM API (default model: google/gemma-3n-e4b-it)
|
13 |
+
- Interactive UI with Gradio
|
14 |
+
- Supports download of extracted JSON and final CSV
|
15 |
+
|
16 |
---
|
17 |
|
18 |
+
|
19 |
+
## 🌐 Get Your OpenRouter API Key
|
20 |
+
- Go to https://openrouter.ai
|
21 |
+
|
22 |
+
- Click Login (use Google/GitHub/Email)
|
23 |
+
|
24 |
+
- Navigate to the Models page
|
25 |
+
|
26 |
+
- Click on a model like gemma-3b, mistral, etc.
|
27 |
+
|
28 |
+
- On the model page, click "Create API Key"
|
29 |
+
|
30 |
+
- Copy the API key
|
31 |
+
|
32 |
+
|
33 |
+
## 🧪 How to Use the Gradio App
|
34 |
+
### 🔹 Tab 1: PDF Processing
|
35 |
+
- Upload a Hindi/English scanned PDF
|
36 |
+
|
37 |
+
- Click "Process PDF"
|
38 |
+
|
39 |
+
- View extracted text in JSON format
|
40 |
+
|
41 |
+
- Download JSON file if needed
|
42 |
+
|
43 |
+
### 🔹 Tab 2: LLM API Processing
|
44 |
+
- Paste your OpenRouter API key
|
45 |
+
|
46 |
+
- (Optional) Customize the prompt or add instructions
|
47 |
+
|
48 |
+
- Click "Call LLM API"
|
49 |
+
|
50 |
+
- View structured voter data in CSV format
|
51 |
+
|
52 |
+
- Download the CSV file
|
53 |
+
|
54 |
+
- Enable Debug Mode to see raw API responses for troubleshooting.
|
55 |
+
|
56 |
+
|
57 |
+
### 📁 Output Files
|
58 |
+
Extracted JSON and CSV files are saved in the processed_json/ folder.
|
59 |
+
|
60 |
+
|
61 |
+
|
62 |
+
|
app.py
ADDED
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
import easyocr
|
4 |
+
import os
|
5 |
+
import tempfile
|
6 |
+
import numpy as np
|
7 |
+
import json
|
8 |
+
import cv2
|
9 |
+
import re
|
10 |
+
import csv
|
11 |
+
import io
|
12 |
+
import time
|
13 |
+
import gc
|
14 |
+
import requests
|
15 |
+
from datetime import datetime
|
16 |
+
import pandas as pd
|
17 |
+
|
18 |
+
# Configuration: all generated JSON and CSV artifacts are written here.
JSON_SAVE_FOLDER = "processed_json"
os.makedirs(JSON_SAVE_FOLDER, exist_ok=True)

# Initialize EasyOCR reader with CPU only
def init_ocr():
    """Create an EasyOCR reader for Hindi ('hi') and English ('en').

    GPU is disabled explicitly so the app runs on CPU-only hosts
    (e.g. free Hugging Face Spaces hardware).
    """
    return easyocr.Reader(['hi', 'en'], gpu=False)  # Force CPU usage

# Module-level reader shared by process_page_safely(); constructed once at
# import time because loading the OCR models is expensive.
reader = init_ocr()
|
27 |
+
|
28 |
+
def _fix_digit_confusions(text):
    """Fix common OCR digit confusions, but only inside digit-bearing tokens.

    EasyOCR frequently misreads 0 as o/O and 1 as l/L in numeric fields such
    as voter IDs, house numbers and ages.  The substitution is applied only to
    whitespace-separated tokens that already contain at least one digit, so
    ordinary Hindi/English words (names, headings) are left untouched.
    """
    def _fix(token):
        if any(ch.isdigit() for ch in token):
            token = re.sub(r'[oO]', '0', token)
            token = re.sub(r'[lL]', '1', token)
        return token

    return " ".join(_fix(tok) for tok in text.split(" "))


def process_page_safely(page, page_num, attempt=1):
    """Render one PDF page to an image, OCR it, and return the result.

    Args:
        page: a PyMuPDF page object.
        page_num: 1-based page number, echoed in the result.
        attempt: internal retry counter; the page is retried up to 3 extra
            times (with a short sleep and a gc pass) before giving up.

    Returns:
        dict with keys "page", "text", "confidence", "dimensions" on success,
        or {"error": ...} after all retries fail.
    """
    try:
        pix = page.get_pixmap(dpi=200)
        img_data = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)

        # Normalize channel count: EasyOCR expects a 3-channel RGB image.
        if pix.n == 4:
            img_data = cv2.cvtColor(img_data, cv2.COLOR_RGBA2RGB)
        elif pix.n == 1:
            # Grayscale source page (previously unhandled).
            img_data = cv2.cvtColor(img_data, cv2.COLOR_GRAY2RGB)

        # Very large renders are split horizontally to bound OCR memory use.
        max_pixels = 2000 * 2000
        if img_data.shape[0] * img_data.shape[1] > max_pixels:
            half = img_data.shape[0] // 2
            top_part = img_data[:half, :]
            bottom_part = img_data[half:, :]

            results_top = reader.readtext(top_part, detail=1, batch_size=1)
            results_bottom = reader.readtext(bottom_part, detail=1, batch_size=1)
            results = results_top + results_bottom
        else:
            results = reader.readtext(img_data, detail=1, batch_size=1)

        full_text = []
        confidence_scores = []
        for (bbox, text, confidence) in results:
            # BUGFIX: the previous code replaced EVERY o/O with 0 and l/L
            # with 1, corrupting all alphabetic text (e.g. "Lal" -> "1a1").
            # Now the substitution is limited to digit-bearing tokens.
            full_text.append(_fix_digit_confusions(text))
            confidence_scores.append(confidence)

        avg_confidence = sum(confidence_scores)/len(confidence_scores) if confidence_scores else 0

        return {
            "page": page_num,
            "text": "\n".join(full_text),
            "confidence": avg_confidence,
            "dimensions": {"width": pix.width, "height": pix.height}
        }
    except Exception as e:
        if attempt <= 3:
            time.sleep(1)
            gc.collect()
            return process_page_safely(page, page_num, attempt+1)
        return {"error": f"Page {page_num} error after {attempt} attempts: {str(e)}"}
|
70 |
+
|
71 |
+
def process_pdf(pdf_file, progress=gr.Progress()):
    """OCR every page of an uploaded PDF and persist the result as JSON.

    Args:
        pdf_file: raw PDF bytes (Gradio gr.File with type="binary").
        progress: Gradio progress tracker; the default-instance pattern is
            Gradio's documented way to get progress reporting.

    Returns:
        (all_json, json_path, error_text) on success — matching the Gradio
        outputs [json_display, json_download, pdf_errors] — or
        (None, None, message) on failure.
    """
    all_json = []
    errors = []

    # Write the upload to a temp file because fitz.open wants a path;
    # delete=False so it survives the `with` and is removed in `finally`.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tf:
        tf.write(pdf_file)
        temp_pdf_path = tf.name

    try:
        with fitz.open(temp_pdf_path) as doc:
            total_pages = len(doc)

            for i in range(total_pages):
                progress(i/total_pages, desc=f"Processing page {i+1}/{total_pages}")
                page = doc.load_page(i)
                page_result = process_page_safely(page, i+1)

                # Collect per-page errors but keep going with the other pages.
                if "error" in page_result:
                    errors.append(page_result["error"])
                    continue

                all_json.append(page_result)
                time.sleep(0.5)   # brief pause to keep CPU/memory pressure down
                gc.collect()

        # Generate timestamp for filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        json_filename = f"processed_{timestamp}.json"
        json_path = os.path.join(JSON_SAVE_FOLDER, json_filename)

        # Save JSON to file with UTF-8 encoding (Hindi text must survive).
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(all_json, f, indent=2, ensure_ascii=False)

        return (
            all_json,                                      # For JSON display
            json_path,                                     # For file download
            "\n".join(errors) if errors else "No errors"   # For error display
        )

    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the message
        # in the UI instead of crashing the app.
        return (
            None,
            None,
            f"Processing error: {str(e)}"
        )
    finally:
        # Best-effort cleanup of the temp PDF.  BUGFIX: was a bare `except:`
        # which also swallowed KeyboardInterrupt/SystemExit; only filesystem
        # errors are expected here.
        try:
            if os.path.exists(temp_pdf_path):
                os.unlink(temp_pdf_path)
        except OSError:
            pass
|
123 |
+
|
124 |
+
def chunk_json_by_char_limit(data, char_limit=3500):
    """Split a list of JSON-serializable entries into chunks by serialized size.

    Each chunk's combined json.dumps length stays at or under ``char_limit``
    so individual LLM requests fit within token limits.  A single entry
    larger than the limit still gets a chunk of its own.

    Args:
        data: list of JSON-serializable entries (here: per-page OCR dicts).
        char_limit: approximate maximum serialized characters per chunk.

    Returns:
        List of non-empty lists of entries; empty input yields [].
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for entry in data:
        entry_length = len(json.dumps(entry, ensure_ascii=False))

        # BUGFIX: only flush when the running chunk is non-empty.  Previously
        # a first entry larger than char_limit caused an empty [] chunk to be
        # emitted, which produced a pointless empty LLM request downstream.
        if current_chunk and current_length + entry_length > char_limit:
            chunks.append(current_chunk)
            current_chunk = [entry]
            current_length = entry_length
        else:
            current_chunk.append(entry)
            current_length += entry_length

    if current_chunk:
        chunks.append(current_chunk)

    return chunks
|
145 |
+
|
146 |
+
def call_llm_api(api_key, json_file_path, repeated_info, debug_mode):
    """Convert the extracted OCR JSON into CSV via the OpenRouter chat API.

    The JSON file is split into ~3500-character chunks, each chunk is sent as
    one chat-completion request, and the CSV replies are stitched together
    (keeping the header row only from the first chunk).

    Args:
        api_key: OpenRouter API key (sent as a Bearer token).
        json_file_path: path to the JSON produced by process_pdf().
        repeated_info: user-supplied instruction text prepended to every prompt.
        debug_mode: when truthy, raw status/response text is accumulated and
            returned for display.

    Returns:
        4-tuple (DataFrame, csv_path_or_None, debug_text, success_flag)
        matching the Gradio outputs [dataframe_output, csv_download,
        api_debug, api_status].
    """
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            full_data = json.load(f)

        # NEW: chunk by char limit
        json_chunks = chunk_json_by_char_limit(full_data, char_limit=3500)
        all_csv_chunks = []
        header_preserved = False  # True once the first chunk's header row is kept
        debug_info = ""

        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        for idx, chunk in enumerate(json_chunks):
            # NOTE: the prompt body is deliberately flush-left — it is user-
            # visible LLM input, and .strip() only trims the outer newlines.
            prompt = f"""
{repeated_info}

Below is a portion of the voter data in JSON format. Please extract all entries into a CSV format with the following columns:
विधानसभा, सेक्शन, मतदाता ID, मतदाता का नाम, अभिभावक का नाम, घर संख्या, आयु, लिंग, फोटो उपलब्ध?

Rules:
1. Use exactly these column headers in Hindi as shown above
2. Separate values with COMMAS (,)
3. For photo availability, use "हाँ" or "नहीं"
4. Do NOT include any extra explanation — only CSV

JSON Data:
{json.dumps(chunk, ensure_ascii=False)}

Respond with ONLY the CSV data (including header ONLY in the first chunk).
""".strip()

            payload = {
                "model": "google/gemma-3n-e4b-it:free",
                "messages": [
                    {"role": "user", "content": prompt}
                ],
                "temperature": 0.1,   # low temperature: we want deterministic extraction
                "max_tokens": 2048
            }

            try:
                response = requests.post(
                    "https://openrouter.ai/api/v1/chat/completions",
                    headers=headers,
                    json=payload,
                    timeout=120
                )
            except Exception as e:
                # Network-level failure: abort the whole run and surface it.
                return (
                    pd.DataFrame({"Error": [f"Network error: {str(e)}"]}),
                    None,
                    debug_info,
                    False
                )

            if debug_mode:
                debug_info += f"\n--- Chunk {idx+1} ---\nStatus: {response.status_code}\n{response.text}\n"

            # Any non-200 (rate limit, bad key, model error) aborts the run.
            if response.status_code != 200:
                return (
                    pd.DataFrame({"Error": [f"API Error on chunk {idx+1}: {response.text}"]}),
                    None,
                    debug_info,
                    False
                )

            chunk_csv = response.json()["choices"][0]["message"]["content"].strip()

            # Keep header for first chunk only
            lines = chunk_csv.splitlines()
            if not header_preserved:
                all_csv_chunks.append(chunk_csv)
                header_preserved = True
            else:
                # Later chunks: drop their first line, assumed to repeat the
                # header (the prompt asks the model to omit it, but models
                # don't always comply).
                if len(lines) > 1:
                    all_csv_chunks.append("\n".join(lines[1:]))
                else:
                    all_csv_chunks.append("")  # if empty or malformed

            time.sleep(1.5)  # simple rate limiting between OpenRouter calls

        # Combine CSV results
        combined_csv = "\n".join(all_csv_chunks)
        csv_filename = f"output_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        csv_path = os.path.join(JSON_SAVE_FOLDER, csv_filename)

        # utf-8-sig so Excel opens the Hindi CSV with the right encoding.
        with open(csv_path, 'w', encoding='utf-8-sig', newline='') as f:
            f.write(combined_csv)

        # Attempt to parse CSV into DataFrame
        try:
            df = pd.read_csv(io.StringIO(combined_csv))
        except Exception as e:
            # Show the raw text alongside the parse error so the user can
            # inspect what the model actually returned.
            df = pd.DataFrame({"Error": [f"CSV Parsing Error: {str(e)}", combined_csv]})

        return (
            df,
            csv_path,
            debug_info if debug_mode else "",
            True
        )

    except Exception as e:
        # Top-level boundary for the Gradio callback.
        return (
            pd.DataFrame({"Error": [str(e)]}),
            None,
            f"Unexpected error: {str(e)}",
            False
        )
|
259 |
+
|
260 |
+
# Gradio interface: two tabs — (1) OCR the PDF into JSON, (2) send that JSON
# to the LLM and get a CSV back.
with gr.Blocks(title="Hindi PDF Processor with LLM API") as demo:
    gr.Markdown("## 📄 Hindi PDF Processor with LLM API")
    gr.Markdown("Process PDFs to extract text and convert to structured CSV using LLM")

    with gr.Tab("PDF Processing"):
        with gr.Row():
            with gr.Column():
                # type="binary" delivers the upload as raw bytes to process_pdf.
                pdf_input = gr.File(label="Upload PDF File", type="binary")
                pdf_submit = gr.Button("Process PDF")

            with gr.Column():
                json_display = gr.JSON(label="Extracted JSON Data")
                pdf_errors = gr.Textbox(label="Processing Errors")
                # Hidden component; it also carries the JSON path into the
                # LLM tab (used as an input of call_llm_api below).
                json_download = gr.File(label="Download JSON File", visible=False)

    with gr.Tab("LLM API Processing"):
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(label="OpenRouter API Key", type="password")
                repeated_info = gr.Textbox(label="Additional Instructions",
                                           value="Extract voter information from the following text:")
                debug_mode = gr.Checkbox(label="Enable Debug Mode")
                api_submit = gr.Button("Call LLM API")

            with gr.Column():
                dataframe_output = gr.Dataframe(label="CSV Output", wrap=True)
                csv_download = gr.File(label="Download CSV File")
                api_debug = gr.Textbox(label="Debug Information", visible=False)
                api_status = gr.Textbox(label="API Status", visible=False)

    # PDF Processing: outputs mirror process_pdf()'s 3-tuple.
    pdf_submit.click(
        process_pdf,
        inputs=[pdf_input],
        outputs=[json_display, json_download, pdf_errors]
    )

    # API Processing: json_download (the saved JSON path) feeds call_llm_api;
    # outputs mirror its 4-tuple.
    api_submit.click(
        call_llm_api,
        inputs=[api_key, json_download, repeated_info, debug_mode],
        outputs=[dataframe_output, csv_download, api_debug, api_status]
    )

    # Show/hide debug based on checkbox
    debug_mode.change(
        lambda x: gr.update(visible=x),
        inputs=[debug_mode],
        outputs=[api_debug]
    )

    # Update API status visibility (second click handler on the same button;
    # Gradio runs both when it is pressed).
    api_submit.click(
        lambda: gr.update(visible=True),
        inputs=None,
        outputs=[api_status]
    )

if __name__ == "__main__":
    demo.launch()