import torch
import librosa
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    pipeline,
    WhisperProcessor,
    WhisperForConditionalGeneration,
)
from gtts import gTTS
import gradio as gr
import spaces
from PIL import Image
import os
from langdetect import detect
import subprocess

# Install flash-attn; merge with the current environment so pip stays on PATH,
# and skip the CUDA build (the wheel only needs to be importable by the model code)
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

print("Loading models...")

# Vision model
vision_model_id = "microsoft/Phi-3.5-vision-instruct"
vision_model = AutoModelForCausalLM.from_pretrained(
    vision_model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    _attn_implementation="eager",  # flash attention stays disabled, as in the original config
)
vision_processor = AutoProcessor.from_pretrained(vision_model_id, trust_remote_code=True, num_crops=16)

# Whisper model (speech-to-text)
whisper_model_id = "openai/whisper-small"
whisper_processor = WhisperProcessor.from_pretrained(whisper_model_id)
whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_id)

# Sarvam model (text generation); pipeline() takes the task first and the model as a keyword
sarvam_pipe = pipeline('text-generation', model='sarvamai/sarvam-2b-v0.5')

print("All models loaded successfully")


@spaces.GPU
def process_audio_input(audio):
    """Transcribe an audio file to text with Whisper."""
    try:
        whisper_model.to('cuda')
        # Whisper expects 16 kHz mono input
        audio, sr = librosa.load(audio, sr=16000)
        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to('cuda')
        predicted_ids = whisper_model.generate(input_features)
        transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        whisper_model.to('cpu')
        return transcription
    except Exception as e:
        return f"Error processing audio: {str(e)}. Please type your message instead."


@spaces.GPU
def process_image_input(image, text_prompt):
    """Answer a text prompt about an image with Phi-3.5-vision."""
    try:
        vision_model.to('cuda')
        messages = [
            {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
        ]
        prompt = vision_processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda")
        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
        # Drop the prompt tokens so only the newly generated text is decoded
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = vision_processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        vision_model.to('cpu')
        return response
    except Exception as e:
        return f"Error processing image: {str(e)}"


def generate_response(transcription):
    """Generate a reply with the Sarvam language model."""
    try:
        response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"


def text_to_speech(text, lang='hi'):
    """Convert text to speech with gTTS and return the saved file path."""
    try:
        tts = gTTS(text=text, lang=lang, tld='co.in')
        tts.save("response.mp3")
        return "response.mp3"
    except Exception as e:
        print(f"Error in text-to-speech: {str(e)}")
        return None
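
# langdetect can return a code gTTS does not support, and it raises on very
# short or ambiguous strings. A minimal optional sketch of a guard, assuming
# gtts.lang.tts_langs() as the list of supported codes; `safe_tts_lang` is an
# illustrative helper (not part of the original app) that could stand in for
# the raw detect() call in indic_vision_assistant below:
from gtts.lang import tts_langs


def safe_tts_lang(text, default='hi'):
    try:
        code = detect(text)
    except Exception:
        return default  # fall back to Hindi when detection fails
    return code if code in tts_langs() else default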
@spaces.GPU
def indic_vision_assistant(input_type, audio_input, text_input, image_input):
    """Dispatch the request to the right model and return (input text, reply, audio path)."""
    try:
        if input_type == "audio" and audio_input is not None:
            transcription = process_audio_input(audio_input)
        elif input_type == "text" and text_input:
            transcription = text_input
        elif input_type == "image" and image_input is not None:
            text_prompt = text_input if text_input else "Describe this image in detail."
            transcription = process_image_input(image_input, text_prompt)
        else:
            return "Please provide either audio, text, or image input.", "No input provided.", None

        response = generate_response(transcription)
        lang = detect(response)
        audio_response = text_to_speech(response, lang)

        return transcription, response, audio_response
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message, None
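
# The handlers above shuttle each model between CPU and GPU per request, a
# common pattern on shared/ZeroGPU Spaces. A minimal optional sketch of a
# cleanup helper that also releases cached GPU memory; `release_gpu` is an
# illustrative name, not part of the original app:
def release_gpu(model):
    model.to('cpu')               # move the weights off the GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()  # return cached blocks to the driver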

# Custom CSS
custom_css = """
body {
    background-color: #0b0f19;
    color: #e2e8f0;
    font-family: 'Arial', sans-serif;
}
#custom-header {
    text-align: center;
    padding: 20px 0;
    background-color: #1a202c;
    margin-bottom: 20px;
    border-radius: 10px;
}
#custom-header h1 {
    font-size: 2.5rem;
    margin-bottom: 0.5rem;
}
#custom-header h1 .blue { color: #60a5fa; }
#custom-header h1 .pink { color: #f472b6; }
#custom-header h2 {
    font-size: 1.5rem;
    color: #94a3b8;
}
.suggestions {
    display: flex;
    justify-content: center;
    flex-wrap: wrap;
    gap: 1rem;
    margin: 20px 0;
}
.suggestion {
    background-color: #1e293b;
    border-radius: 0.5rem;
    padding: 1rem;
    display: flex;
    align-items: center;
    transition: transform 0.3s ease;
    width: 200px;
}
.suggestion:hover {
    transform: translateY(-5px);
}
.suggestion-icon {
    font-size: 1.5rem;
    margin-right: 1rem;
    background-color: #2d3748;
    padding: 0.5rem;
    border-radius: 50%;
}
.gradio-container {
    max-width: 100% !important;
}
#component-0, #component-1, #component-2 {
    max-width: 100% !important;
}
footer {
    text-align: center;
    margin-top: 2rem;
    color: #64748b;
}
"""

# Custom HTML for the header (markup reconstructed to match the classes in custom_css)
custom_header = """
<div id="custom-header">
    <h1><span class="blue">Hello,</span> <span class="pink">User</span></h1>
    <h2>How can I help you today?</h2>
</div>
"""
# Custom HTML for suggestions (markup reconstructed to match the classes in custom_css)
custom_suggestions = """
<div class="suggestions">
    <div class="suggestion">
        <span class="suggestion-icon">🎤</span>
        <span>Speak in any Indic language</span>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">⌨️</span>
        <span>Type in any Indic language</span>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">🖼️</span>
        <span>Upload an image for analysis</span>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">🤖</span>
        <span>Get AI-generated responses</span>
    </div>
    <div class="suggestion">
        <span class="suggestion-icon">🔊</span>
        <span>Listen to audio responses</span>
    </div>
</div>
"""

# Gradio interface
with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
    body_background_fill="#0b0f19",
    body_text_color="#e2e8f0",
    button_primary_background_fill="#3b82f6",
    button_primary_background_fill_hover="#2563eb",
    button_primary_text_color="white",
    block_title_text_color="#94a3b8",
    block_label_text_color="#94a3b8",
)) as iface:
    gr.HTML(custom_header)
    gr.HTML(custom_suggestions)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### Indic Vision Assistant")
            input_type = gr.Radio(["audio", "text", "image"], label="Input Type", value="audio")
            audio_input = gr.Audio(type="filepath", label="Speak (if audio input selected)")
            text_input = gr.Textbox(label="Type your message or image prompt")
            image_input = gr.Image(type="pil", label="Upload an image (if image input selected)")
            submit_btn = gr.Button("Submit")
            output_transcription = gr.Textbox(label="Transcription/Input")
            output_response = gr.Textbox(label="Generated Response")
            output_audio = gr.Audio(label="Audio Response")

    submit_btn.click(
        fn=indic_vision_assistant,
        inputs=[input_type, audio_input, text_input, image_input],
        outputs=[output_transcription, output_response, output_audio]
    )

    gr.HTML("")

# Launch the app
iface.launch()