# app.py
"""ARK-AI multi-modal Gradio app.

Loads a collection of Hugging Face pipelines (text generation, TTS, ASR,
image, video, music), fans a single user prompt out to them, and serves
the merged results through one Gradio interface.
"""

import gradio as gr
from transformers import pipeline

# =============================
# LOAD YOUR MODELS
# =============================


def _load_pipeline(task, model_id):
    """Build one pipeline; return None instead of crashing the whole app.

    Several of the task/model combinations below may be unavailable or
    invalid for ``transformers.pipeline`` — without this guard a single
    bad model would raise at import time and prevent startup entirely.
    """
    try:
        return pipeline(task, model=model_id)
    except Exception as e:  # noqa: BLE001 — boundary: log and skip the model
        print(f"[ARK-AI] Failed to load '{model_id}' for task '{task}': {e}")
        return None


def _load_pipelines(task, model_ids):
    """Load many pipelines for one task, keeping only the successful ones."""
    loaded = (_load_pipeline(task, m) for m in model_ids)
    return [p for p in loaded if p is not None]


# ----- Text models -----
text_models = _load_pipelines("text-generation", [
    "openai/gpt-oss-120b",
    "deepseek-ai/DeepSeek-V3.1-Base",
    "zai-org/GLM-4.5",
    "zai-org/chatglm3-6b",
    "openai/gpt-oss-20b",
])

# ----- Audio models -----
tts_models = _load_pipelines("text-to-speech", [
    "sk0032/coqui-tts-model",
    "sysf/Edge-TTS",
])

# ----- Whisper ASR -----
# NOTE(review): loaded but never called anywhere in this file — kept for
# interface compatibility; confirm whether it is used elsewhere.
speech_to_text_model = _load_pipeline(
    "automatic-speech-recognition", "openai/whisper-large-v3-turbo"
)

# ----- Image models -----
# NOTE(review): this list mixes text->image and image->text pipelines, yet
# generate_images() calls every entry with a text prompt — the captioning
# models will fail there and are simply skipped at runtime.
image_models = _load_pipelines("text-to-image", [
    "GD-ML/USP-Image_Generation",
    "Qwen/Qwen-Image",
    "Comfy-Org/Qwen-Image-DiffSynth-ControlNets",
]) + _load_pipelines("image-to-text", [
    "Salesforce/blip2-opt-2.7b",
]) + _load_pipelines("vision-to-text", [
    "Vision-CAIR/MiniGPT-4",
])

# ----- Video models -----
video_models = _load_pipelines("text-to-video", [
    "sudip1987/Generate_videos_with_Veo3",
    "ali-vilab/text-to-video-ms-1.7b",
])

# ----- Music model -----
music_model = _load_pipeline("text-to-music", "sander-wood/text-to-music")

# =============================
# HELPER FUNCTIONS
# =============================


def merge_text_models(prompt):
    """Run *prompt* through every loaded text model and join the replies.

    A model that raises contributes an inline failure marker instead of
    aborting the whole merge.
    """
    responses = []
    for model in text_models:
        try:
            responses.append(model(prompt)[0]["generated_text"])
        except Exception as e:
            responses.append(f"[Model failed: {str(e)}]")
    return "\n---\n".join(responses)


def generate_audio(text):
    """Return the first successful TTS result for *text*, or None.

    Bug fix vs. the original: failures no longer append None to the result
    list (which made the function return None even when a later model
    succeeded) — we simply try the next model.
    """
    for model in tts_models:
        try:
            out = model(text)
            # transformers TTS pipelines typically return
            # {"audio": ndarray, "sampling_rate": int}; gr.Audio accepts a
            # (sample_rate, ndarray) tuple. TODO confirm per model.
            if isinstance(out, dict) and "audio" in out:
                return (out.get("sampling_rate"), out["audio"])
            return out
        except Exception as e:
            print(f"[ARK-AI] TTS model failed: {e}")
    return None


def generate_images(text):
    """Collect up to three images from the image models; skip failures."""
    imgs = []
    for model in image_models:
        try:
            # Assumes the pipeline result exposes the image under the
            # 'image' key of the first result — TODO confirm per model.
            imgs.append(model(text)[0]["image"])
        except Exception:
            continue
    return imgs[:3]  # Show top 3 images


def generate_videos(text):
    """Return the first video produced for *text*, or None.

    Bug fix vs. the original: it returned ``vids[:1]`` (a list), but the
    gr.Video output component expects a single video value.
    """
    for model in video_models:
        try:
            return model(text)
        except Exception:
            continue
    return None


# =============================
# MAIN ARK-AI FUNCTION
# =============================


def ark_ai_main(prompt):
    """Fan *prompt* out to all modalities and return the Gradio outputs.

    Returns:
        (text, images, video, audio) matching the interface's output
        components: str, list of images, single video or None,
        audio value or None.
    """
    # Text
    text_output = merge_text_models(prompt)

    # Inject personality
    personality = "ARK-AI (fun, savage, chaotic-good) says:\n"
    full_text = personality + text_output

    # Media
    image_output = generate_images(prompt)
    video_output = generate_videos(prompt)
    audio_output = generate_audio(prompt)

    return full_text, image_output, video_output, audio_output


# =============================
# GRADIO INTERFACE
# =============================

iface = gr.Interface(
    fn=ark_ai_main,
    inputs=gr.Textbox(lines=3, placeholder="Ask ARK-AI anything..."),
    outputs=[
        gr.Textbox(label="ARK-AI Text Response"),
        gr.Gallery(label="Images Generated"),
        gr.Video(label="Video Generated"),
        gr.Audio(label="Audio Response"),
    ],
    title="ARK-AI Multi-Modal Assistant",
    description=(
        "ARK-AI: Savage, funny, chaotic-good AI assistant merging text, "
        "image, audio, and video models."
    ),
    css="styles.css",  # Optional: liquid-glass UI
)

if __name__ == "__main__":
    iface.launch()