import streamlit as st
from openai import OpenAI
import os
import uuid
import base64
from gtts import gTTS

# --- Configuration ---
# Read the OpenRouter API key from the environment instead of hard-coding it;
# export OPENROUTER_API_KEY before launching the app.
API_KEY = os.getenv("OPENROUTER_API_KEY")

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=API_KEY,
)

# --- Helper Functions ---
def describe_image(image_bytes):
    """Send the captured frame to the vision model and return its description."""
    # Encode the raw JPEG bytes as base64 so they can be embedded in a data URL.
    base64_image = base64.b64encode(image_bytes).decode("utf-8")
    response = client.chat.completions.create(
        model="opengvlab/internvl3-14b:free",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": (
                            "Describe this image clearly, including objects, scene, "
                            "and any visible text. Also warn about potential hazards."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                    },
                ],
            }
        ],
    )
    return response.choices[0].message.content


def speak(text, filename=None):
    """Convert text to speech with gTTS and return the path of the saved MP3."""
    if not filename:
        filename = f"audio_{uuid.uuid4()}.mp3"
    tts = gTTS(text=text, lang="en")
    tts.save(filename)
    return filename


# --- Streamlit UI ---
st.set_page_config(page_title="AI Visual Assistant for the Blind", layout="centered")
st.title("👁️ AI Visual Assistant for the Blind")
st.markdown("Use your **camera** to capture the world around you.")

st.subheader("📸 Take a Picture")
camera_image = st.camera_input("Capture a frame from your camera")

if camera_image is not None:
    st.image(camera_image, caption="Captured Frame", use_column_width=True)

    with st.spinner("Analyzing the scene..."):
        # Read the captured frame's bytes directly from the upload buffer.
        image_bytes = camera_image.getvalue()
        description = describe_image(image_bytes)

    st.subheader("📝 Description")
    st.write(description)

    st.subheader("🔊 Audio Narration")
    audio_file = speak(description)
    with open(audio_file, "rb") as f:
        audio_bytes = f.read()
    st.audio(audio_bytes, format="audio/mp3")

    # Remove the temporary MP3 once it has been handed to the audio player.
    os.remove(audio_file)

st.markdown("---")
st.markdown("*Built with 💡 using Streamlit, OpenRouter, and gTTS.*")
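
# --- Running the app ---
# Streamlit apps are started from the command line rather than with `python`.
# Assuming this file is saved as app.py (the filename is illustrative):
#
#   streamlit run app.py
#
# The app expects OPENROUTER_API_KEY to be set in the environment, and the
# imports above assume the streamlit, openai, and gTTS packages are installed.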