Spaces:

raksama19
/

Test-Dolphin-PDF

Runtime error

App Files Files Community

raksama19 committed on Jul 16

Commit

5a9132b

verified ·

1 Parent(s): 1e2434f

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -4

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
-DOLPHIN PDF Document AI - Final Version
 Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
 """
 import gradio as gr
@@ -219,6 +220,9 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
                     pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
                     pil_crop = crop_margin(pil_crop)
                     buffered = io.BytesIO()
                     pil_crop.save(buffered, format="PNG")
                     img_base64 = base64.b64encode(buffered.getvalue()).decode()
@@ -226,9 +230,10 @@ def process_elements_optimized(layout_results, padded_image, dims, model, max_ba
                     figure_results.append({
                         "label": label,
-                        "text": f"![Figure {reading_order}]({data_uri})",
                         "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
                         "reading_order": reading_order,
                     })
                 else:
                     pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
@@ -305,6 +310,7 @@ def generate_fallback_markdown(recognition_results):
         elif element["label"] in ["para", "title", "sec", "sub_sec"]:
             markdown_content += f"{element['text']}\n\n"
         elif element["label"] == "fig":
             markdown_content += f"{element['text']}\n\n"
     return markdown_content
@@ -407,6 +413,45 @@ def initialize_gemini_model():
         return None
 # Global state for managing tabs
 processed_markdown = ""
 show_results_tab = False
@@ -588,11 +633,12 @@ with gr.Blocks(
             gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
             current_status = f"Currently loaded: {current_model or 'None'}"
             gr.Markdown(
-                "# Scholar Express\n"
-                "### Upload a research paper to get a web-friendly version and an AI chatbot powered by Gemini API. DOLPHIN model runs on GPU for optimal performance.\n"
                 f"**System:** {model_status}\n"
                 f"**RAG System:** {embedding_status}\n"
                 f"**Gemini API:** {gemini_status}\n"
                 f"**Status:** {current_status}"
             )

 """
+DOLPHIN PDF Document AI - Alt Text Enhanced Version
 Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
+Features: AI-generated alt text for accessibility using Gemma 3n
 """
 import gradio as gr
                     pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
                     pil_crop = crop_margin(pil_crop)
+                    # Generate alt text for accessibility
+                    alt_text = generate_alt_text_for_image(pil_crop)
                     buffered = io.BytesIO()
                     pil_crop.save(buffered, format="PNG")
                     img_base64 = base64.b64encode(buffered.getvalue()).decode()
                     figure_results.append({
                         "label": label,
+                        "text": f"![{alt_text}]({data_uri})\n\n*{alt_text}*",
                         "bbox": [orig_x1, orig_y1, orig_x2, orig_y2],
                         "reading_order": reading_order,
+                        "alt_text": alt_text,
                     })
                 else:
                     pil_crop = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
         elif element["label"] in ["para", "title", "sec", "sub_sec"]:
             markdown_content += f"{element['text']}\n\n"
         elif element["label"] == "fig":
+            # Image should already have alt text from processing
             markdown_content += f"{element['text']}\n\n"
     return markdown_content
         return None
# Lead-in phrases the prompt asks the model to avoid; stripped if they appear anyway.
_ALT_TEXT_PREFIXES = (
    "This image shows",
    "The image shows",
    "This shows",
    "The figure shows",
)


def _clean_alt_text(raw_text):
    """Collapse whitespace and drop a leading "This image shows"-style phrase."""
    if not raw_text:
        return ""
    # split()/join collapses newlines, carriage returns and runs of spaces
    # into single spaces, so "\r\n" no longer leaves a double space behind.
    text = " ".join(raw_text.split())
    for prefix in _ALT_TEXT_PREFIXES:
        # Compare case-insensitively: models often answer "this image shows ...".
        if text[:len(prefix)].lower() == prefix.lower():
            text = text[len(prefix):].lstrip(" :,")
            # The remainder usually starts mid-sentence; restore a capital.
            text = text[:1].upper() + text[1:]
            break
    return text


def generate_alt_text_for_image(pil_image):
    """Generate short alt text for an image using the configured generative model.

    The model is whatever ``initialize_gemini_model()`` returns; it is sent an
    accessibility-focused prompt together with the image.  The call is
    best-effort: this function never raises and falls back to a fixed
    placeholder string instead.

    NOTE(review): callers embed the result in Markdown image syntax
    (``![alt](uri)``); square brackets in the model output are not escaped
    here -- confirm whether that needs handling at the call site.

    Args:
        pil_image: Image to describe, passed unchanged to
            ``model.generate_content`` (presumably a PIL image -- verify
            against the caller).

    Returns:
        str: A single-line description, or ``"Image description unavailable"``
        when the image is ``None``, the model is not configured, the response
        is empty, or any error occurs.
    """
    fallback = "Image description unavailable"

    # Nothing to describe: skip model initialisation and the API round trip.
    if pil_image is None:
        return fallback

    try:
        model = initialize_gemini_model()
        if model is None:
            return fallback

        # Create a detailed prompt for alt text generation
        prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
Focus on:
- Main subject or content of the image
- Important details, text, or data shown
- Layout and structure if relevant (charts, diagrams, tables)
- Context that would help someone understand the image's purpose
Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""

        response = model.generate_content([prompt, pil_image])
        # A missing or empty ``text`` cleans to "" and so yields the fallback.
        return _clean_alt_text(getattr(response, "text", None)) or fallback
    except Exception as e:
        # Deliberately broad: a failed description must not abort document processing.
        print(f"Error generating alt text: {e}")
        return fallback
 # Global state for managing tabs
 processed_markdown = ""
 show_results_tab = False
             gemini_status = "✅ Gemini API ready" if gemini_model else "❌ Gemini API not configured"
             current_status = f"Currently loaded: {current_model or 'None'}"
             gr.Markdown(
+                "# Scholar Express - Alt Text Enhanced\n"
+                "### Upload a research paper to get a web-friendly version with AI-generated alt text for accessibility. Includes an AI chatbot powered by Gemini API.\n"
                 f"**System:** {model_status}\n"
                 f"**RAG System:** {embedding_status}\n"
                 f"**Gemini API:** {gemini_status}\n"
+                f"**Alt Text:** Gemma 3n generates descriptive alt text for images\n"
                 f"**Status:** {current_status}"
             )