import os
import subprocess
import sys

import torch
import librosa
from transformers import AutoModelForCausalLM, AutoProcessor, pipeline, WhisperProcessor, WhisperForConditionalGeneration
from gtts import gTTS
import gradio as gr
import spaces
from PIL import Image
from langdetect import detect

# Install flash-attn at startup.
# The extra variable is merged into the current environment instead of
# replacing it, so PATH/HOME stay available to pip, and pip is invoked through
# the running interpreter so the package lands in the environment this script
# actually uses. An argument list avoids going through a shell.
subprocess.run(
    [sys.executable, '-m', 'pip', 'install', 'flash-attn', '--no-build-isolation'],
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
)

print("Loading models...")

# Vision model
vision_model_id = "microsoft/Phi-3.5-vision-instruct"
vision_model = AutoModelForCausalLM.from_pretrained(
    vision_model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    use_flash_attention_2=False
)
vision_processor = AutoProcessor.from_pretrained(vision_model_id, trust_remote_code=True, num_crops=16)

# Whisper model
whisper_model_id = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(whisper_model_id)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_id)

# Sarvam model
# `pipeline`'s first positional parameter is the task name, so the checkpoint
# id has to be passed as `model=`; passing it positionally is rejected as an
# unknown task.
sarvam_pipe = pipeline('text-generation', model='sarvamai/sarvam-2b-v0.5')

print("All models loaded successfully")


@spaces.GPU
def process_audio_input(audio):
    """Transcribe an audio file to text with the Whisper model.

    Args:
        audio: Audio source handed to ``librosa.load`` (e.g. a file path).

    Returns:
        The decoded transcription of the first (only) sequence, or a
        human-readable error string starting with "Error processing audio:"
        if any step fails. Errors are reported through the return value,
        not raised.
    """
    try:
        whisper_model.to('cuda')
        # Load and resample to 16 kHz, the rate passed on to the processor.
        waveform, sr = librosa.load(audio, sr=16000)
        input_features = whisper_processor(
            waveform, sampling_rate=sr, return_tensors="pt"
        ).input_features.to('cuda')
        predicted_ids = whisper_model.generate(input_features)
        return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Error processing audio: {str(e)}. Please type your message instead."
    finally:
        # Always hand the model back to the CPU, including after a failure,
        # so it does not stay resident on the GPU.
        whisper_model.to('cpu')
@spaces.GPU def process_image_input(image, text_prompt): try: vision_model.to('cuda') messages = [ {"role": "user", "content": f"{text_prompt}\n<|image_1|>"}, ] prompt = vision_processor.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda") generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True) generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] vision_model.to('cpu') return response except Exception as e: return f"Error processing image: {str(e)}" def generate_response(transcription): try: response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text'] return response except Exception as e: return f"Error generating response: {str(e)}" def text_to_speech(text, lang='hi'): try: tts = gTTS(text=text, lang=lang, tld='co.in') tts.save("response.mp3") return "response.mp3" except Exception as e: print(f"Error in text-to-speech: {str(e)}") return None @spaces.GPU def indic_vision_assistant(input_type, audio_input, text_input, image_input): try: if input_type == "audio" and audio_input is not None: transcription = process_audio_input(audio_input) elif input_type == "text" and text_input: transcription = text_input elif input_type == "image" and image_input is not None: text_prompt = text_input if text_input else "Describe this image in detail." 
transcription = process_image_input(image_input, text_prompt) else: return "Please provide either audio, text, or image input.", "No input provided.", None response = generate_response(transcription) lang = detect(response) audio_response = text_to_speech(response, lang) return transcription, response, audio_response except Exception as e: error_message = f"An error occurred: {str(e)}" return error_message, error_message, None # Custom CSS custom_css = """ body { background-color: #0b0f19; color: #e2e8f0; font-family: 'Arial', sans-serif; } #custom-header { text-align: center; padding: 20px 0; background-color: #1a202c; margin-bottom: 20px; border-radius: 10px; } #custom-header h1 { font-size: 2.5rem; margin-bottom: 0.5rem; } #custom-header h1 .blue { color: #60a5fa; } #custom-header h1 .pink { color: #f472b6; } #custom-header h2 { font-size: 1.5rem; color: #94a3b8; } .suggestions { display: flex; justify-content: center; flex-wrap: wrap; gap: 1rem; margin: 20px 0; } .suggestion { background-color: #1e293b; border-radius: 0.5rem; padding: 1rem; display: flex; align-items: center; transition: transform 0.3s ease; width: 200px; } .suggestion:hover { transform: translateY(-5px); } .suggestion-icon { font-size: 1.5rem; margin-right: 1rem; background-color: #2d3748; padding: 0.5rem; border-radius: 50%; } .gradio-container { max-width: 100% !important; } #component-0, #component-1, #component-2 { max-width: 100% !important; } footer { text-align: center; margin-top: 2rem; color: #64748b; } """ # Custom HTML for the header custom_header = """
Speak in any Indic language
Type in any Indic language
Upload an image for analysis
Get AI-generated responses
Listen to audio responses