Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
"""
|
2 |
-
DOLPHIN PDF Document AI -
|
3 |
Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
|
|
|
4 |
"""
|
5 |
|
6 |
import gradio as gr
|
@@ -219,6 +220,9 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
|
|
219 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
220 |
pil_crop = crop_margin(pil_crop)
|
221 |
|
|
|
|
|
|
|
222 |
buffered = io.BytesIO()
|
223 |
pil_crop.save(buffered, format="PNG")
|
224 |
img_base64 = base64.b64encode(buffered.getvalue()).decode()
|
@@ -226,9 +230,10 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
|
|
226 |
|
227 |
figure_results.append({
|
228 |
"label": label,
|
229 |
-
"text": f"![
|
230 |
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
|
231 |
"reading_order": reading_order,
|
|
|
232 |
})
|
233 |
else:
|
234 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
@@ -305,6 +310,7 @@ def generate_fallback_markdown(recognition_results):
|
|
305 |
elif element["label"] in ["para", "title", "sec", "sub_sec"]:
|
306 |
markdown_content += f"{element['text']}\n\n"
|
307 |
elif element["label"] == "fig":
|
|
|
308 |
markdown_content += f"{element['text']}\n\n"
|
309 |
return markdown_content
|
310 |
|
@@ -407,6 +413,45 @@ def initialize_gemini_model():
|
|
407 |
return None
|
408 |
|
409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
# Global state for managing tabs
|
411 |
processed_markdown = ""
|
412 |
show_results_tab = False
|
@@ -588,11 +633,12 @@ with gr.Blocks(
|
|
588 |
gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
|
589 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
590 |
gr.Markdown(
|
591 |
-
"# Scholar Express\n"
|
592 |
-
"### Upload a research paper to get a web-friendly version
|
593 |
f"**System:** {model_status}\n"
|
594 |
f"**RAG System:** {embedding_status}\n"
|
595 |
f"**Gemini API:** {gemini_status}\n"
|
|
|
596 |
f"**Status:** {current_status}"
|
597 |
)
|
598 |
|
|
|
1 |
"""
|
2 |
+
DOLPHIN PDF Document AI - Alt Text Enhanced Version
|
3 |
Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
|
4 |
+
Features: AI-generated alt text for accessibility using Gemma 3n
|
5 |
"""
|
6 |
|
7 |
import gradio as gr
|
|
|
220 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
221 |
pil_crop = crop_margin(pil_crop)
|
222 |
|
223 |
+
# Generate alt text for accessibility
|
224 |
+
alt_text = generate_alt_text_for_image(pil_crop)
|
225 |
+
|
226 |
buffered = io.BytesIO()
|
227 |
pil_crop.save(buffered, format="PNG")
|
228 |
img_base64 = base64.b64encode(buffered.getvalue()).decode()
|
|
|
230 |
|
231 |
figure_results.append({
|
232 |
"label": label,
|
233 |
+
"text": f"\n\n*{alt_text}*",
|
234 |
"bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
|
235 |
"reading_order": reading_order,
|
236 |
+
"alt_text": alt_text,
|
237 |
})
|
238 |
else:
|
239 |
pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
|
|
|
310 |
elif element["label"] in ["para", "title", "sec", "sub_sec"]:
|
311 |
markdown_content += f"{element['text']}\n\n"
|
312 |
elif element["label"] == "fig":
|
313 |
+
# Image should already have alt text from processing
|
314 |
markdown_content += f"{element['text']}\n\n"
|
315 |
return markdown_content
|
316 |
|
|
|
413 |
return None
|
414 |
|
415 |
|
416 |
+
# Returned whenever no usable description can be produced.
_ALT_TEXT_FALLBACK = "Image description unavailable"

# Lower-cased lead-ins the prompt asks the model to avoid; stripped if the
# model emits them anyway.
_ALT_TEXT_LEAD_INS = (
    "this image shows",
    "the image shows",
    "this shows",
    "the figure shows",
)

_ALT_TEXT_PROMPT = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.

Focus on:
- Main subject or content of the image
- Important details, text, or data shown
- Layout and structure if relevant (charts, diagrams, tables)
- Context that would help someone understand the image's purpose

Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""


def _clean_alt_text(raw_text):
    """Normalise a raw model reply into single-line alt text."""
    # split()/join collapses newlines, tabs and runs of spaces in one pass and
    # also trims both ends; `or ""` tolerates a missing/None reply.
    text = " ".join((raw_text or "").split())

    for lead_in in _ALT_TEXT_LEAD_INS:
        if text[:len(lead_in)].lower() != lead_in:
            continue
        remainder = text[len(lead_in):]
        if remainder[:1].isalnum():
            # The match ended mid-word (e.g. "This showstopper ..."), so it is
            # not actually one of the lead-ins.
            continue
        remainder = remainder.lstrip(" :,-")
        # Re-capitalise so the description still reads as a sentence.
        text = remainder[:1].upper() + remainder[1:]
        break

    return text or _ALT_TEXT_FALLBACK


def generate_alt_text_for_image(pil_image):
    """Generate alt text for an image using the model from initialize_gemini_model().

    NOTE(review): the original docstring names "Gemma 3n" while the code
    obtains its model from initialize_gemini_model() - confirm which model is
    actually served.

    Args:
        pil_image: The image to describe; it is passed unchanged to
            ``model.generate_content`` together with the text prompt.

    Returns:
        A single-line description with common lead-ins ("This image shows",
        ...) removed, or "Image description unavailable" when the model is not
        available, the reply is empty, or any error occurs.
    """
    try:
        model = initialize_gemini_model()
        if model is None:
            return _ALT_TEXT_FALLBACK

        response = model.generate_content([_ALT_TEXT_PROMPT, pil_image])
        return _clean_alt_text(getattr(response, "text", None))

    except Exception as e:
        # Deliberately broad: alt text is best-effort and a failure here must
        # not abort processing of the surrounding document.
        print(f"Error generating alt text: {e}")
        return _ALT_TEXT_FALLBACK
|
453 |
+
|
454 |
+
|
455 |
# Global state for managing tabs
|
456 |
processed_markdown = ""
|
457 |
show_results_tab = False
|
|
|
633 |
gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
|
634 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
635 |
gr.Markdown(
|
636 |
+
"# Scholar Express - Alt Text Enhanced\n"
|
637 |
+
"### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by Gemini API.\n"
|
638 |
f"**System:** {model_status}\n"
|
639 |
f"**RAG System:** {embedding_status}\n"
|
640 |
f"**Gemini API:** {gemini_status}\n"
|
641 |
+
f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
|
642 |
f"**Status:** {current_status}"
|
643 |
)
|
644 |
|