Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -17,6 +17,7 @@ try:
|
|
17 |
import numpy as np
|
18 |
from sklearn.metrics.pairwise import cosine_similarity
|
19 |
import google.generativeai as genai
|
|
|
20 |
RAG_DEPENDENCIES_AVAILABLE = True
|
21 |
except ImportError as e:
|
22 |
print(f"RAG dependencies not available: {e}")
|
@@ -338,21 +339,21 @@ if RAG_DEPENDENCIES_AVAILABLE:
|
|
338 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
339 |
if gemini_api_key:
|
340 |
genai.configure(api_key=gemini_api_key)
|
341 |
-
|
342 |
print("✅ Gemini API configured successfully")
|
343 |
else:
|
344 |
print("❌ GEMINI_API_KEY not found in environment")
|
345 |
-
|
346 |
except Exception as e:
|
347 |
print(f"❌ Error loading models: {e}")
|
348 |
import traceback
|
349 |
traceback.print_exc()
|
350 |
embedding_model = None
|
351 |
-
|
352 |
else:
|
353 |
print("❌ RAG dependencies not available")
|
354 |
embedding_model = None
|
355 |
-
|
356 |
|
357 |
# Model management functions
|
358 |
def load_dolphin_model():
|
@@ -388,12 +389,12 @@ def unload_dolphin_model():
|
|
388 |
torch.cuda.empty_cache()
|
389 |
print("✅ DOLPHIN model unloaded")
|
390 |
|
391 |
-
def
|
392 |
-
"""Initialize Gemini API
|
393 |
-
global
|
394 |
|
395 |
-
if
|
396 |
-
return
|
397 |
|
398 |
try:
|
399 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
@@ -401,35 +402,41 @@ def initialize_gemini_model():
|
|
401 |
print("❌ GEMINI_API_KEY not found in environment")
|
402 |
return None
|
403 |
|
404 |
-
print("Initializing Gemini API...")
|
405 |
-
genai.configure(api_key=gemini_api_key)
|
406 |
-
|
407 |
-
|
408 |
-
return gemini_model
|
409 |
except Exception as e:
|
410 |
-
print(f"❌ Error initializing Gemini
|
411 |
import traceback
|
412 |
traceback.print_exc()
|
413 |
return None
|
414 |
|
415 |
|
416 |
def generate_alt_text_for_image(pil_image):
|
417 |
-
"""Generate alt text for an image using Gemma 3n model"""
|
418 |
try:
|
419 |
-
# Initialize Gemini
|
420 |
-
|
421 |
-
if
|
422 |
-
print("❌ Gemini
|
423 |
return "Image description unavailable"
|
424 |
|
425 |
# Debug: Check image format and properties
|
426 |
print(f"π Image format: {pil_image.format}, mode: {pil_image.mode}, size: {pil_image.size}")
|
427 |
|
428 |
-
# Ensure image is in RGB mode
|
429 |
if pil_image.mode != 'RGB':
|
430 |
print(f"Converting image from {pil_image.mode} to RGB")
|
431 |
pil_image = pil_image.convert('RGB')
|
432 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
433 |
# Create a detailed prompt for alt text generation
|
434 |
prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
|
435 |
|
@@ -441,36 +448,23 @@ Focus on:
|
|
441 |
|
442 |
Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
|
443 |
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
|
|
|
|
|
|
449 |
|
450 |
print(f"📡 API response received: {type(response)}")
|
451 |
-
print(f"📡 Response attributes: {dir(response)}")
|
452 |
|
453 |
if hasattr(response, 'text') and response.text:
|
454 |
alt_text = response.text.strip()
|
455 |
print(f"✅ Alt text generated: {alt_text[:100]}...")
|
456 |
else:
|
457 |
print(f"❌ No text in response. Response: {response}")
|
458 |
-
|
459 |
-
if hasattr(response, 'candidates') and response.candidates:
|
460 |
-
candidate = response.candidates[0]
|
461 |
-
if hasattr(candidate, 'content') and candidate.content:
|
462 |
-
if hasattr(candidate.content, 'parts') and candidate.content.parts:
|
463 |
-
alt_text = candidate.content.parts[0].text.strip()
|
464 |
-
print(f"✅ Alt text from candidates: {alt_text[:100]}...")
|
465 |
-
else:
|
466 |
-
print(f"❌ No parts in content")
|
467 |
-
return "Image description unavailable"
|
468 |
-
else:
|
469 |
-
print(f"❌ No content in candidate")
|
470 |
-
return "Image description unavailable"
|
471 |
-
else:
|
472 |
-
print(f"❌ No candidates in response")
|
473 |
-
return "Image description unavailable"
|
474 |
|
475 |
# Clean up the alt text
|
476 |
alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
|
@@ -498,7 +492,7 @@ document_embeddings = None
|
|
498 |
|
499 |
# Global model state
|
500 |
dolphin_model = None
|
501 |
-
|
502 |
current_model = None # Track which model is currently loaded
|
503 |
|
504 |
|
@@ -668,7 +662,7 @@ with gr.Blocks(
|
|
668 |
# Home Tab
|
669 |
with gr.TabItem("π Home", id="home"):
|
670 |
embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
|
671 |
-
gemini_status = "✅ Gemini API ready" if
|
672 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
673 |
gr.Markdown(
|
674 |
"# Scholar Express - Alt Text Enhanced\n"
|
@@ -786,11 +780,11 @@ with gr.Blocks(
|
|
786 |
return history + [[message, "❌ Please process a PDF document first before asking questions."]]
|
787 |
|
788 |
try:
|
789 |
-
# Initialize Gemini
|
790 |
-
|
791 |
|
792 |
-
if
|
793 |
-
return history + [[message, "❌ Failed to initialize Gemini
|
794 |
|
795 |
# Use RAG to get relevant chunks from markdown (balanced for performance vs quota)
|
796 |
if document_chunks and len(document_chunks) > 0:
|
@@ -821,7 +815,7 @@ Please provide a clear and helpful answer based on the context provided."""
|
|
821 |
|
822 |
for attempt in range(max_retries):
|
823 |
try:
|
824 |
-
response =
|
825 |
response_text = response.text if hasattr(response, 'text') else str(response)
|
826 |
return history + [[message, response_text]]
|
827 |
except Exception as api_error:
|
|
|
17 |
import numpy as np
|
18 |
from sklearn.metrics.pairwise import cosine_similarity
|
19 |
import google.generativeai as genai
|
20 |
+
from google.generativeai import types
|
21 |
RAG_DEPENDENCIES_AVAILABLE = True
|
22 |
except ImportError as e:
|
23 |
print(f"RAG dependencies not available: {e}")
|
|
|
339 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
340 |
if gemini_api_key:
|
341 |
genai.configure(api_key=gemini_api_key)
|
342 |
+
gemini_client = True # Just mark as configured
|
343 |
print("✅ Gemini API configured successfully")
|
344 |
else:
|
345 |
print("❌ GEMINI_API_KEY not found in environment")
|
346 |
+
gemini_client = None
|
347 |
except Exception as e:
|
348 |
print(f"❌ Error loading models: {e}")
|
349 |
import traceback
|
350 |
traceback.print_exc()
|
351 |
embedding_model = None
|
352 |
+
gemini_client = None
|
353 |
else:
|
354 |
print("❌ RAG dependencies not available")
|
355 |
embedding_model = None
|
356 |
+
gemini_client = None
|
357 |
|
358 |
# Model management functions
|
359 |
def load_dolphin_model():
|
|
|
389 |
torch.cuda.empty_cache()
|
390 |
print("✅ DOLPHIN model unloaded")
|
391 |
|
392 |
+
def initialize_gemini_client():
|
393 |
+
"""Initialize Gemini API client"""
|
394 |
+
global gemini_client
|
395 |
|
396 |
+
if gemini_client is not None:
|
397 |
+
return gemini_client
|
398 |
|
399 |
try:
|
400 |
gemini_api_key = os.getenv('GEMINI_API_KEY')
|
|
|
402 |
print("❌ GEMINI_API_KEY not found in environment")
|
403 |
return None
|
404 |
|
405 |
+
print("Initializing Gemini API client...")
|
406 |
+
gemini_client = genai.configure(api_key=gemini_api_key)
|
407 |
+
print("✅ Gemini API client ready for gemma-3n-e4b-it")
|
408 |
+
return gemini_client
|
|
|
409 |
except Exception as e:
|
410 |
+
print(f"❌ Error initializing Gemini client: {e}")
|
411 |
import traceback
|
412 |
traceback.print_exc()
|
413 |
return None
|
414 |
|
415 |
|
416 |
def generate_alt_text_for_image(pil_image):
|
417 |
+
"""Generate alt text for an image using Gemma 3n model via Google AI API"""
|
418 |
try:
|
419 |
+
# Initialize Gemini client
|
420 |
+
client = initialize_gemini_client()
|
421 |
+
if client is None:
|
422 |
+
print("❌ Gemini client not initialized for alt text generation")
|
423 |
return "Image description unavailable"
|
424 |
|
425 |
# Debug: Check image format and properties
|
426 |
print(f"π Image format: {pil_image.format}, mode: {pil_image.mode}, size: {pil_image.size}")
|
427 |
|
428 |
+
# Ensure image is in RGB mode
|
429 |
if pil_image.mode != 'RGB':
|
430 |
print(f"Converting image from {pil_image.mode} to RGB")
|
431 |
pil_image = pil_image.convert('RGB')
|
432 |
|
433 |
+
# Convert PIL image to bytes
|
434 |
+
buffered = io.BytesIO()
|
435 |
+
pil_image.save(buffered, format="JPEG")
|
436 |
+
image_bytes = buffered.getvalue()
|
437 |
+
|
438 |
+
print(f"π Generating alt text for image with Gemma 3n...")
|
439 |
+
|
440 |
# Create a detailed prompt for alt text generation
|
441 |
prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
|
442 |
|
|
|
448 |
|
449 |
Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
|
450 |
|
451 |
+
# Use the Google AI API client with proper format
|
452 |
+
response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content([
|
453 |
+
types.Part.from_bytes(
|
454 |
+
data=image_bytes,
|
455 |
+
mime_type='image/jpeg',
|
456 |
+
),
|
457 |
+
prompt
|
458 |
+
])
|
459 |
|
460 |
print(f"📡 API response received: {type(response)}")
|
|
|
461 |
|
462 |
if hasattr(response, 'text') and response.text:
|
463 |
alt_text = response.text.strip()
|
464 |
print(f"✅ Alt text generated: {alt_text[:100]}...")
|
465 |
else:
|
466 |
print(f"❌ No text in response. Response: {response}")
|
467 |
+
return "Image description unavailable"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
468 |
|
469 |
# Clean up the alt text
|
470 |
alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
|
|
|
492 |
|
493 |
# Global model state
|
494 |
dolphin_model = None
|
495 |
+
gemini_client = None
|
496 |
current_model = None # Track which model is currently loaded
|
497 |
|
498 |
|
|
|
662 |
# Home Tab
|
663 |
with gr.TabItem("π Home", id="home"):
|
664 |
embedding_status = "✅ RAG ready" if embedding_model else "❌ RAG not loaded"
|
665 |
+
gemini_status = "✅ Gemini API ready" if gemini_client else "❌ Gemini API not configured"
|
666 |
current_status = f"Currently loaded: {current_model or 'None'}"
|
667 |
gr.Markdown(
|
668 |
"# Scholar Express - Alt Text Enhanced\n"
|
|
|
780 |
return history + [[message, "❌ Please process a PDF document first before asking questions."]]
|
781 |
|
782 |
try:
|
783 |
+
# Initialize Gemini client
|
784 |
+
client = initialize_gemini_client()
|
785 |
|
786 |
+
if client is None:
|
787 |
+
return history + [[message, "❌ Failed to initialize Gemini client. Please check your GEMINI_API_KEY."]]
|
788 |
|
789 |
# Use RAG to get relevant chunks from markdown (balanced for performance vs quota)
|
790 |
if document_chunks and len(document_chunks) > 0:
|
|
|
815 |
|
816 |
for attempt in range(max_retries):
|
817 |
try:
|
818 |
+
response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content(prompt)
|
819 |
response_text = response.text if hasattr(response, 'text') else str(response)
|
820 |
return history + [[message, response_text]]
|
821 |
except Exception as api_error:
|