raksama19 committed on
Commit
4789fc5
·
verified ·
1 Parent(s): dcbaa35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -50
app.py CHANGED
@@ -17,6 +17,7 @@ try:
17
  import numpy as np
18
  from sklearn.metrics.pairwise import cosine_similarity
19
  import google.generativeai as genai
 
20
  RAG_DEPENDENCIES_AVAILABLE = True
21
  except ImportError as e:
22
  print(f"RAG dependencies not available: {e}")
@@ -338,21 +339,21 @@ if RAG_DEPENDENCIES_AVAILABLE:
338
  gemini_api_key = os.getenv('GEMINI_API_KEY')
339
  if gemini_api_key:
340
  genai.configure(api_key=gemini_api_key)
341
- gemini_model = genai.GenerativeModel('gemma-3n-e4b-it')
342
  print("βœ… Gemini API configured successfully")
343
  else:
344
  print("❌ GEMINI_API_KEY not found in environment")
345
- gemini_model = None
346
  except Exception as e:
347
  print(f"❌ Error loading models: {e}")
348
  import traceback
349
  traceback.print_exc()
350
  embedding_model = None
351
- gemini_model = None
352
  else:
353
  print("❌ RAG dependencies not available")
354
  embedding_model = None
355
- gemini_model = None
356
 
357
  # Model management functions
358
  def load_dolphin_model():
@@ -388,12 +389,12 @@ def unload_dolphin_model():
388
  torch.cuda.empty_cache()
389
  print("βœ… DOLPHIN model unloaded")
390
 
391
- def initialize_gemini_model():
392
- """Initialize Gemini API model"""
393
- global gemini_model
394
 
395
- if gemini_model is not None:
396
- return gemini_model
397
 
398
  try:
399
  gemini_api_key = os.getenv('GEMINI_API_KEY')
@@ -401,35 +402,41 @@ def initialize_gemini_model():
401
  print("❌ GEMINI_API_KEY not found in environment")
402
  return None
403
 
404
- print("Initializing Gemini API...")
405
- genai.configure(api_key=gemini_api_key)
406
- gemini_model = genai.GenerativeModel('gemma-3n-e4b-it')
407
- print("βœ… Gemini API model ready (gemma-3n-e4b-it)")
408
- return gemini_model
409
  except Exception as e:
410
- print(f"❌ Error initializing Gemini model: {e}")
411
  import traceback
412
  traceback.print_exc()
413
  return None
414
 
415
 
416
  def generate_alt_text_for_image(pil_image):
417
- """Generate alt text for an image using Gemma 3n model"""
418
  try:
419
- # Initialize Gemini model
420
- model = initialize_gemini_model()
421
- if model is None:
422
- print("❌ Gemini model not initialized for alt text generation")
423
  return "Image description unavailable"
424
 
425
  # Debug: Check image format and properties
426
  print(f"πŸ” Image format: {pil_image.format}, mode: {pil_image.mode}, size: {pil_image.size}")
427
 
428
- # Ensure image is in RGB mode (required for Gemini API)
429
  if pil_image.mode != 'RGB':
430
  print(f"Converting image from {pil_image.mode} to RGB")
431
  pil_image = pil_image.convert('RGB')
432
 
 
 
 
 
 
 
 
433
  # Create a detailed prompt for alt text generation
434
  prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
435
 
@@ -441,36 +448,23 @@ Focus on:
441
 
442
  Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
443
 
444
- print(f"πŸ” Generating alt text for image with Gemma 3n...")
445
-
446
- # Generate alt text using Gemini API with proper multimodal input
447
- # Pass the PIL image directly - Gemini API handles PIL Image objects
448
- response = model.generate_content([prompt, pil_image])
 
 
 
449
 
450
  print(f"πŸ“‘ API response received: {type(response)}")
451
- print(f"πŸ“‘ Response attributes: {dir(response)}")
452
 
453
  if hasattr(response, 'text') and response.text:
454
  alt_text = response.text.strip()
455
  print(f"βœ… Alt text generated: {alt_text[:100]}...")
456
  else:
457
  print(f"❌ No text in response. Response: {response}")
458
- # Try to access response differently
459
- if hasattr(response, 'candidates') and response.candidates:
460
- candidate = response.candidates[0]
461
- if hasattr(candidate, 'content') and candidate.content:
462
- if hasattr(candidate.content, 'parts') and candidate.content.parts:
463
- alt_text = candidate.content.parts[0].text.strip()
464
- print(f"βœ… Alt text from candidates: {alt_text[:100]}...")
465
- else:
466
- print(f"❌ No parts in content")
467
- return "Image description unavailable"
468
- else:
469
- print(f"❌ No content in candidate")
470
- return "Image description unavailable"
471
- else:
472
- print(f"❌ No candidates in response")
473
- return "Image description unavailable"
474
 
475
  # Clean up the alt text
476
  alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
@@ -498,7 +492,7 @@ document_embeddings = None
498
 
499
  # Global model state
500
  dolphin_model = None
501
- gemini_model = None
502
  current_model = None # Track which model is currently loaded
503
 
504
 
@@ -668,7 +662,7 @@ with gr.Blocks(
668
  # Home Tab
669
  with gr.TabItem("🏠 Home", id="home"):
670
  embedding_status = "βœ… RAG ready" if embedding_model else "❌ RAG not loaded"
671
- gemini_status = "βœ… Gemini API ready" if gemini_model else "❌ Gemini API not configured"
672
  current_status = f"Currently loaded: {current_model or 'None'}"
673
  gr.Markdown(
674
  "# Scholar Express - Alt Text Enhanced\n"
@@ -786,11 +780,11 @@ with gr.Blocks(
786
  return history + [[message, "❌ Please process a PDF document first before asking questions."]]
787
 
788
  try:
789
- # Initialize Gemini model
790
- model = initialize_gemini_model()
791
 
792
- if model is None:
793
- return history + [[message, "❌ Failed to initialize Gemini model. Please check your GEMINI_API_KEY."]]
794
 
795
  # Use RAG to get relevant chunks from markdown (balanced for performance vs quota)
796
  if document_chunks and len(document_chunks) > 0:
@@ -821,7 +815,7 @@ Please provide a clear and helpful answer based on the context provided."""
821
 
822
  for attempt in range(max_retries):
823
  try:
824
- response = model.generate_content(prompt)
825
  response_text = response.text if hasattr(response, 'text') else str(response)
826
  return history + [[message, response_text]]
827
  except Exception as api_error:
 
17
  import numpy as np
18
  from sklearn.metrics.pairwise import cosine_similarity
19
  import google.generativeai as genai
20
+ from google.generativeai import types
21
  RAG_DEPENDENCIES_AVAILABLE = True
22
  except ImportError as e:
23
  print(f"RAG dependencies not available: {e}")
 
339
  gemini_api_key = os.getenv('GEMINI_API_KEY')
340
  if gemini_api_key:
341
  genai.configure(api_key=gemini_api_key)
342
+ gemini_client = True # Just mark as configured
343
  print("βœ… Gemini API configured successfully")
344
  else:
345
  print("❌ GEMINI_API_KEY not found in environment")
346
+ gemini_client = None
347
  except Exception as e:
348
  print(f"❌ Error loading models: {e}")
349
  import traceback
350
  traceback.print_exc()
351
  embedding_model = None
352
+ gemini_client = None
353
  else:
354
  print("❌ RAG dependencies not available")
355
  embedding_model = None
356
+ gemini_client = None
357
 
358
  # Model management functions
359
  def load_dolphin_model():
 
389
  torch.cuda.empty_cache()
390
  print("βœ… DOLPHIN model unloaded")
391
 
392
+ def initialize_gemini_client():
393
+ """Initialize Gemini API client"""
394
+ global gemini_client
395
 
396
+ if gemini_client is not None:
397
+ return gemini_client
398
 
399
  try:
400
  gemini_api_key = os.getenv('GEMINI_API_KEY')
 
402
  print("❌ GEMINI_API_KEY not found in environment")
403
  return None
404
 
405
+ print("Initializing Gemini API client...")
406
+ gemini_client = genai.configure(api_key=gemini_api_key)
407
+ print("βœ… Gemini API client ready for gemma-3n-e4b-it")
408
+ return gemini_client
 
409
  except Exception as e:
410
+ print(f"❌ Error initializing Gemini client: {e}")
411
  import traceback
412
  traceback.print_exc()
413
  return None
414
 
415
 
416
  def generate_alt_text_for_image(pil_image):
417
+ """Generate alt text for an image using Gemma 3n model via Google AI API"""
418
  try:
419
+ # Initialize Gemini client
420
+ client = initialize_gemini_client()
421
+ if client is None:
422
+ print("❌ Gemini client not initialized for alt text generation")
423
  return "Image description unavailable"
424
 
425
  # Debug: Check image format and properties
426
  print(f"πŸ” Image format: {pil_image.format}, mode: {pil_image.mode}, size: {pil_image.size}")
427
 
428
+ # Ensure image is in RGB mode
429
  if pil_image.mode != 'RGB':
430
  print(f"Converting image from {pil_image.mode} to RGB")
431
  pil_image = pil_image.convert('RGB')
432
 
433
+ # Convert PIL image to bytes
434
+ buffered = io.BytesIO()
435
+ pil_image.save(buffered, format="JPEG")
436
+ image_bytes = buffered.getvalue()
437
+
438
+ print(f"πŸ” Generating alt text for image with Gemma 3n...")
439
+
440
  # Create a detailed prompt for alt text generation
441
  prompt = """You are an accessibility expert creating alt text for images to help visually impaired users understand visual content. Analyze this image and provide a clear, concise description that captures the essential visual information.
442
 
 
448
 
449
  Provide a descriptive alt text in 1-2 sentences that is informative but not overly verbose. Start directly with the description without saying "This image shows" or similar phrases."""
450
 
451
+ # Use the Google AI API client with proper format
452
+ response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content([
453
+ types.Part.from_bytes(
454
+ data=image_bytes,
455
+ mime_type='image/jpeg',
456
+ ),
457
+ prompt
458
+ ])
459
 
460
  print(f"πŸ“‘ API response received: {type(response)}")
 
461
 
462
  if hasattr(response, 'text') and response.text:
463
  alt_text = response.text.strip()
464
  print(f"βœ… Alt text generated: {alt_text[:100]}...")
465
  else:
466
  print(f"❌ No text in response. Response: {response}")
467
+ return "Image description unavailable"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
 
469
  # Clean up the alt text
470
  alt_text = alt_text.replace('\n', ' ').replace('\r', ' ')
 
492
 
493
  # Global model state
494
  dolphin_model = None
495
+ gemini_client = None
496
  current_model = None # Track which model is currently loaded
497
 
498
 
 
662
  # Home Tab
663
  with gr.TabItem("🏠 Home", id="home"):
664
  embedding_status = "βœ… RAG ready" if embedding_model else "❌ RAG not loaded"
665
+ gemini_status = "βœ… Gemini API ready" if gemini_client else "❌ Gemini API not configured"
666
  current_status = f"Currently loaded: {current_model or 'None'}"
667
  gr.Markdown(
668
  "# Scholar Express - Alt Text Enhanced\n"
 
780
  return history + [[message, "❌ Please process a PDF document first before asking questions."]]
781
 
782
  try:
783
+ # Initialize Gemini client
784
+ client = initialize_gemini_client()
785
 
786
+ if client is None:
787
+ return history + [[message, "❌ Failed to initialize Gemini client. Please check your GEMINI_API_KEY."]]
788
 
789
  # Use RAG to get relevant chunks from markdown (balanced for performance vs quota)
790
  if document_chunks and len(document_chunks) > 0:
 
815
 
816
  for attempt in range(max_retries):
817
  try:
818
+ response = genai.GenerativeModel('gemma-3n-e4b-it').generate_content(prompt)
819
  response_text = response.text if hasattr(response, 'text') else str(response)
820
  return history + [[message, response_text]]
821
  except Exception as api_error: