import gradio as gr
from transformers import pipeline
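

# load_pipeline is a small local helper, not a transformers API: any of the
# checkpoints below can fail to load (unsupported task string, missing
# dependencies, or not enough memory), and skipping failures keeps one bad
# model id from crashing the whole app at import time.
def load_pipeline(task, model_name):
    try:
        return pipeline(task, model=model_name)
    except Exception as e:
        print(f"[ARK-AI] Skipping {model_name}: {e}")
        return None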


# Candidate text-generation models. These checkpoints are extremely large;
# loading them all eagerly assumes far more memory than a single machine
# normally has, so expect most of them to be skipped.
text_models = [p for p in [
    load_pipeline("text-generation", "openai/gpt-oss-120b"),
    load_pipeline("text-generation", "deepseek-ai/DeepSeek-V3.1-Base"),
    load_pipeline("text-generation", "zai-org/GLM-4.5"),
    load_pipeline("text-generation", "zai-org/chatglm3-6b"),
    load_pipeline("text-generation", "openai/gpt-oss-20b"),
] if p is not None]

# Text-to-speech models. "text-to-speech" is a valid transformers task (an
# alias of "text-to-audio"), but it only works for repos that ship
# transformers-compatible weights; anything else is skipped.
tts_models = [p for p in [
    load_pipeline("text-to-speech", "sk0032/coqui-tts-model"),
    load_pipeline("text-to-speech", "sysf/Edge-TTS"),
] if p is not None]

# Speech recognition (loaded but not wired into the Gradio UI below).
speech_to_text_model = load_pipeline(
    "automatic-speech-recognition", "openai/whisper-large-v3-turbo"
)
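
# A sketch of how the ASR model could feed the assistant, e.g. for voice
# prompts ("clip.wav" is a placeholder path, not a file this app provides):
#
#   prompt = speech_to_text_model("clip.wav")["text"]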

# Image models. "text-to-image" is not a transformers pipeline task (such
# checkpoints are normally loaded with diffusers; see the sketch below).
# The valid transformers task for captioning models is "image-to-text";
# "vision-to-text" is not a recognized task name.
image_models = [p for p in [
    load_pipeline("text-to-image", "GD-ML/USP-Image_Generation"),
    load_pipeline("text-to-image", "Qwen/Qwen-Image"),
    load_pipeline("text-to-image", "Comfy-Org/Qwen-Image-DiffSynth-ControlNets"),
    load_pipeline("image-to-text", "Salesforce/blip2-opt-2.7b"),
    load_pipeline("image-to-text", "Vision-CAIR/MiniGPT-4"),
] if p is not None]
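
# For actual text-to-image generation, diffusers is the usual route. A
# minimal sketch, assuming diffusers is installed and the repo provides
# diffusers-format weights:
#
#   from diffusers import DiffusionPipeline
#   t2i = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")
#   image = t2i("a lighthouse at dawn").images[0]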

# Video models. Like text-to-image, "text-to-video" is not a transformers
# pipeline task, so expect these loads to be skipped (see the diffusers
# sketch below).
video_models = [p for p in [
    load_pipeline("text-to-video", "sudip1987/Generate_videos_with_Veo3"),
    load_pipeline("text-to-video", "ali-vilab/text-to-video-ms-1.7b"),
] if p is not None]
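
# Text-to-video also goes through diffusers. A minimal sketch, assuming
# diffusers is installed (export_to_video writes the frames to an .mp4
# path that gr.Video can display):
#
#   from diffusers import DiffusionPipeline
#   from diffusers.utils import export_to_video
#   t2v = DiffusionPipeline.from_pretrained("ali-vilab/text-to-video-ms-1.7b")
#   frames = t2v("a corgi surfing").frames[0]
#   video_path = export_to_video(frames)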

# "text-to-music" is not a transformers task either, so this load is also
# expected to be skipped; note the model is never used by the UI below.
music_model = load_pipeline("text-to-music", "sander-wood/text-to-music")
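
# The closest built-in task is "text-to-audio". A minimal sketch with a
# checkpoint known to support it, assuming the weights fit in memory:
#
#   music = pipeline("text-to-audio", "facebook/musicgen-small")
#   out = music("lo-fi beat with soft rain")  # {"audio": ..., "sampling_rate": ...}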


def merge_text_models(prompt):
    # Run the prompt through every loaded text model and join the answers.
    responses = []
    for model in text_models:
        try:
            out = model(prompt)[0]["generated_text"]
            responses.append(out)
        except Exception as e:
            responses.append(f"[Model failed: {e}]")
    return "\n---\n".join(responses)
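
# e.g. merge_text_models("tell me a joke") returns each model's completion,
# separated by "---" lines.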


def generate_audio(text):
    # Return the first synthesis that succeeds. transformers TTS pipelines
    # return {"audio": ndarray, "sampling_rate": int}, and gr.Audio accepts
    # a (sample_rate, waveform) tuple.
    for model in tts_models:
        try:
            out = model(text)
            return (out["sampling_rate"], out["audio"].squeeze())
        except Exception:
            continue
    return None


def generate_images(text):
    # Collect up to three images. The image-to-text entries in image_models
    # expect an image as input, not a prompt, so they fail here and are
    # skipped by the except.
    imgs = []
    for model in image_models:
        try:
            img = model(text)[0]["image"]
            imgs.append(img)
        except Exception:
            continue
    return imgs[:3]


def generate_videos(text):
    # Return the first video a model produces; gr.Video expects a single
    # file path rather than a list.
    for model in video_models:
        try:
            return model(text)
        except Exception:
            continue
    return None


def ark_ai_main(prompt):
    text_output = merge_text_models(prompt)

    # Prepend the assistant's persona to the merged model output.
    personality = "ARK-AI (fun, savage, chaotic-good) says:\n"
    full_text = personality + text_output

    image_output = generate_images(prompt)
    video_output = generate_videos(prompt)
    audio_output = generate_audio(prompt)

    return full_text, image_output, video_output, audio_output


iface = gr.Interface(
    fn=ark_ai_main,
    inputs=gr.Textbox(lines=3, placeholder="Ask ARK-AI anything..."),
    outputs=[
        gr.Textbox(label="ARK-AI Text Response"),
        gr.Gallery(label="Images Generated"),
        gr.Video(label="Video Generated"),
        gr.Audio(label="Audio Response"),
    ],
    title="ARK-AI Multi-Modal Assistant",
    description="ARK-AI: Savage, funny, chaotic-good AI assistant merging text, image, audio, and video models.",
    css="styles.css",
)

if __name__ == "__main__":
    iface.launch()