raksama19 committed on
Commit
5a9132b
·
verified ·
1 Parent(s): 1e2434f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -4
app.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
- DOLPHIN PDF Document AI - Final Version
3
  Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
 
4
  """
5
 
6
  import gradio as gr
@@ -219,6 +220,9 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
219
  pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
220
  pil_crop = crop_margin(pil_crop)
221
 
 
 
 
222
  buffered = io.BytesIO()
223
  pil_crop.save(buffered, format="PNG")
224
  img_base64 = base64.b64encode(buffered.getvalue()).decode()
@@ -226,9 +230,10 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
226
 
227
  figure_results.append({
228
  "label": label,
229
- "text": f"![Figure {reading_order}]({data_uri})",
230
  "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
231
  "reading_order": reading_order,
 
232
  })
233
  else:
234
  pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
@@ -305,6 +310,7 @@ def generate_fallback_markdown(recognition_results):
305
  elif element["label"] in ["para", "title", "sec", "sub_sec"]:
306
  markdown_content += f"{element['text']}\n\n"
307
  elif element["label"] == "fig":
 
308
  markdown_content += f"{element['text']}\n\n"
309
  return markdown_content
310
 
@@ -407,6 +413,45 @@ def initialize_gemini_model():
407
  return None
408
 
409
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  # Global state for managing tabs
411
  processed_markdown = ""
412
  show_results_tab = False
@@ -588,11 +633,12 @@ with gr.Blocks(
588
  gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
589
  current_status = f"Currently loaded: {current_model or 'None'}"
590
  gr.Markdown(
591
- "# Scholar Express\n"
592
- "### Upload a research paper to get a web-friendly version and an AI chatbot powered by Gemini API. DOLPHIN model runs on GPU for optimal performance.\n"
593
  f"**System:** {model_status}\n"
594
  f"**RAG System:** {embedding_status}\n"
595
  f"**Gemini API:** {gemini_status}\n"
 
596
  f"**Status:** {current_status}"
597
  )
598
 
 
1
  """
2
+ DOLPHIN PDF Document AI - Alt Text Enhanced Version
3
  Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
4
+ Features: AI-generated alt text for accessibility using Gemma 3n
5
  """
6
 
7
  import gradio as gr
 
220
  pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
221
  pil_crop = crop_margin(pil_crop)
222
 
223
+ # Generate alt text for accessibility
224
+ alt_text = generate_alt_text_for_image(pil_crop)
225
+
226
  buffered = io.BytesIO()
227
  pil_crop.save(buffered, format="PNG")
228
  img_base64 = base64.b64encode(buffered.getvalue()).decode()
 
230
 
231
  figure_results.append({
232
  "label": label,
233
+ "text": f"![{alt_text}]({data_uri})\n\n*{alt_text}*",
234
  "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
235
  "reading_order": reading_order,
236
+ "alt_text": alt_text,
237
  })
238
  else:
239
  pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
 
310
  elif element["label"] in ["para", "title", "sec", "sub_sec"]:
311
  markdown_content += f"{element['text']}\n\n"
312
  elif element["label"] == "fig":
313
+ # Image should already have alt text from processing
314
  markdown_content += f"{element['text']}\n\n"
315
  return markdown_content
316
 
 
413
  return None
414
 
415
 
416
# Returned whenever no usable description can be produced.
_ALT_TEXT_FALLBACK = "Image description unavailable"

# Lead-ins the prompt asks the model to avoid; stripped if they appear anyway.
_ALT_TEXT_PREFIXES = (
    "This image shows",
    "The image shows",
    "This shows",
    "The figure shows",
)

# Instruction sent alongside the image.
_ALT_TEXT_PROMPT = (
    "You are an accessibility expert creating alt text for images to help "
    "visually impaired users understand visual content. Analyze this image "
    "and provide a clear, concise description that captures the essential "
    "visual information.\n"
    "\n"
    "Focus on:\n"
    "- Main subject or content of the image\n"
    "- Important details, text, or data shown\n"
    "- Layout and structure if relevant (charts, diagrams, tables)\n"
    "- Context that would help someone understand the image's purpose\n"
    "\n"
    "Provide a descriptive alt text in 1-2 sentences that is informative but "
    "not overly verbose. Start directly with the description without saying "
    '"This image shows" or similar phrases.'
)


def _clean_alt_text(raw_text):
    """Normalise a raw model reply into single-line alt text.

    Returns the fallback string when nothing is left after cleaning.
    """
    # Collapse every whitespace run (newlines included) into a single space so
    # the text stays on one line without doubled spaces.
    text = " ".join((raw_text or "").split())

    lowered = text.lower()
    for prefix in _ALT_TEXT_PREFIXES:
        if not lowered.startswith(prefix.lower()):
            continue
        remainder = text[len(prefix):]
        # Only strip whole-word matches: "This shows ..." but not
        # "This showstopper ...".
        if remainder[:1].isalnum():
            continue
        remainder = remainder.lstrip(" :,-")
        # The description now starts mid-sentence; restore the capital letter.
        text = remainder[:1].upper() + remainder[1:]
        break

    # The result is embedded as ![alt](uri) by the figure-processing code, so
    # square brackets in the description could terminate the alt span early.
    text = text.replace("[", "(").replace("]", ")")

    return text or _ALT_TEXT_FALLBACK


def generate_alt_text_for_image(pil_image, model=None):
    """Generate a short accessibility description (alt text) for an image.

    Args:
        pil_image: The image to describe; it is passed unchanged to
            ``model.generate_content`` together with the prompt.
        model: Optional, already-initialised model object exposing
            ``generate_content``. When omitted, ``initialize_gemini_model()``
            is called, as before. Passing a model lets a caller that handles
            many figures reuse one client instead of initialising per image.

    Returns:
        A single-line description, or ``"Image description unavailable"`` when
        no model is available, the reply is empty, or any error occurs.

    This is a best-effort helper: it never raises, so a failed description
    cannot abort document processing.
    """
    try:
        if model is None:
            model = initialize_gemini_model()
        if model is None:
            return _ALT_TEXT_FALLBACK

        response = model.generate_content([_ALT_TEXT_PROMPT, pil_image])
        # NOTE(review): with the Gemini SDK, reading ``.text`` on a blocked or
        # empty response presumably raises rather than returning None; the
        # surrounding handler covers that case - confirm against the SDK docs.
        return _clean_alt_text(getattr(response, "text", None))

    except Exception as e:
        # Deliberately broad: this wraps a remote API call and must degrade to
        # the fallback text instead of propagating.
        print(f"Error generating alt text: {e}")
        return _ALT_TEXT_FALLBACK
453
+
454
+
455
  # Global state for managing tabs
456
  processed_markdown = ""
457
  show_results_tab = False
 
633
  gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
634
  current_status = f"Currently loaded: {current_model or 'None'}"
635
  gr.Markdown(
636
+ "# Scholar Express - Alt Text Enhanced\n"
637
+ "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by Gemini API.\n"
638
  f"**System:** {model_status}\n"
639
  f"**RAG System:** {embedding_status}\n"
640
  f"**Gemini API:** {gemini_status}\n"
641
+ f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
642
  f"**Status:** {current_status}"
643
  )
644