Arphd4 committed on
Commit
9f6025c
·
1 Parent(s): aa85d13

Full ARK-AI multi-modal setup

Browse files
Files changed (1) hide show
  1. app.py +118 -8
app.py CHANGED
@@ -1,16 +1,126 @@
1
  # app.py
2
  import gradio as gr
 
3
 
4
- # Load your models and functions here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- # Define your Gradio interface
7
  iface = gr.Interface(
8
- fn=your_function,
9
- inputs=gr.Textbox(placeholder="Type your prompt here..."),
10
- outputs="text",
11
- title="ARK-AI Chat",
12
- description="ARK-AI: A savage, funny, chaotic-good AI assistant powered by multiple models.",
13
- css="styles.css" # Link to your custom CSS
 
 
 
 
 
14
  )
15
 
16
  iface.launch()
 
1
  # app.py
2
  import gradio as gr
3
+ from transformers import pipeline
4
 
5
# =============================
# LOAD YOUR MODELS
# =============================

def _safe_pipeline(task, model):
    """Build a transformers pipeline, returning None instead of raising.

    Several task strings below ("text-to-image", "text-to-video",
    "text-to-music", "vision-to-text") are NOT valid transformers pipeline
    tasks, and some checkpoints (e.g. 120B-parameter LLMs) are far too
    large to load in a typical Space; without this guard a single bad
    entry would crash the whole app at import time.
    """
    try:
        return pipeline(task, model=model)
    except Exception as exc:  # best-effort: report and skip the broken model
        print(f"[ARK-AI] Skipping {model!r} ({task}): {exc}")
        return None


def _safe_pipelines(specs):
    """Load a list of (task, model) specs, dropping any that fail to load."""
    loaded = [_safe_pipeline(task, model) for task, model in specs]
    return [p for p in loaded if p is not None]


# ----- Text models -----
text_models = _safe_pipelines([
    ("text-generation", "openai/gpt-oss-120b"),
    ("text-generation", "deepseek-ai/DeepSeek-V3.1-Base"),
    ("text-generation", "zai-org/GLM-4.5"),
    ("text-generation", "zai-org/chatglm3-6b"),
    ("text-generation", "openai/gpt-oss-20b"),
])

# ----- Audio (TTS) models -----
tts_models = _safe_pipelines([
    ("text-to-speech", "sk0032/coqui-tts-model"),
    ("text-to-speech", "sysf/Edge-TTS"),
])

# ----- Whisper ASR -----
# NOTE(review): loaded but never used by the interface below — confirm intent.
speech_to_text_model = _safe_pipeline(
    "automatic-speech-recognition", "openai/whisper-large-v3-turbo"
)

# ----- Image models -----
# NOTE(review): "text-to-image" / "vision-to-text" are not transformers
# pipeline tasks (diffusers would be needed); these entries are skipped.
image_models = _safe_pipelines([
    ("text-to-image", "GD-ML/USP-Image_Generation"),
    ("text-to-image", "Qwen/Qwen-Image"),
    ("text-to-image", "Comfy-Org/Qwen-Image-DiffSynth-ControlNets"),
    ("image-to-text", "Salesforce/blip2-opt-2.7b"),
    ("vision-to-text", "Vision-CAIR/MiniGPT-4"),
])

# ----- Video models -----
# NOTE(review): "text-to-video" is not a transformers pipeline task either.
video_models = _safe_pipelines([
    ("text-to-video", "sudip1987/Generate_videos_with_Veo3"),
    ("text-to-video", "ali-vilab/text-to-video-ms-1.7b"),
])

# ----- Music model -----
music_model = _safe_pipeline("text-to-music", "sander-wood/text-to-music")
45
+ # =============================
46
+ # HELPER FUNCTIONS
47
+ # =============================
48
+
49
def merge_text_models(prompt, models=None):
    """Run *prompt* through every text model and join their outputs.

    Args:
        prompt: User prompt forwarded to each text-generation pipeline.
        models: Optional iterable of pipelines to query. Defaults to the
            module-level ``text_models`` (keyword added for testability;
            backward compatible — existing callers are unaffected).

    Returns:
        All generated texts joined with "\\n---\\n". A failing model
        contributes an inline "[Model failed: ...]" marker instead of
        aborting the whole merge (deliberate best-effort aggregation).
    """
    if models is None:
        models = text_models
    responses = []
    for model in models:
        try:
            responses.append(model(prompt)[0]['generated_text'])
        except Exception as e:
            responses.append(f"[Model failed: {e}]")
    return "\n---\n".join(responses)
58
+
59
def generate_audio(text, models=None):
    """Synthesize *text* with the first TTS model that succeeds.

    Bug fix: the original appended ``None`` for each failed model and then
    returned ``audio_files[0]``, so a failure of the FIRST model returned
    None even when a later model succeeded. Failures are now skipped and
    the first successful result is returned.

    Args:
        text: Text to synthesize.
        models: Optional iterable of TTS pipelines; defaults to the
            module-level ``tts_models`` (backward-compatible keyword).

    Returns:
        The first successful TTS output, or None if every model fails
        (or no models are available).
    """
    if models is None:
        models = tts_models
    for model in models:
        try:
            return model(text)
        except Exception:
            continue  # best-effort: try the next TTS model
    return None
68
+
69
def generate_images(text, models=None):
    """Generate up to three images for *text*, skipping failing models.

    Args:
        text: Prompt forwarded to each image pipeline.
        models: Optional iterable of pipelines; defaults to the
            module-level ``image_models`` (backward-compatible keyword).

    Returns:
        A list of at most 3 images (the Gallery shows the top 3). Models
        that raise are silently skipped; stops calling further (expensive)
        models once 3 images are collected — same result, fewer calls.
    """
    if models is None:
        models = image_models
    imgs = []
    for model in models:
        try:
            imgs.append(model(text)[0]['image'])
        except Exception:
            continue  # best-effort: skip broken image models
        if len(imgs) == 3:
            break
    return imgs
78
+
79
def generate_videos(text, models=None):
    """Generate at most one video for *text*, skipping failing models.

    Args:
        text: Prompt forwarded to each video pipeline.
        models: Optional iterable of pipelines; defaults to the
            module-level ``video_models`` (backward-compatible keyword).

    Returns:
        A list containing the first successful video, or an empty list if
        every model fails. (Same shape as the original ``vids[:1]``, but
        returns on the first success instead of running every remaining
        expensive model only to discard its output.)
    """
    if models is None:
        models = video_models
    for model in models:
        try:
            return [model(text)]
        except Exception:
            continue  # best-effort: try the next video model
    return []
88
+
89
+ # =============================
90
+ # MAIN ARK-AI FUNCTION
91
+ # =============================
92
+
93
def ark_ai_main(prompt):
    """Main ARK-AI entry point: fan the prompt out to every modality.

    Args:
        prompt: The user's text prompt.

    Returns:
        A 4-tuple ``(text, images, video, audio)`` matching the four
        Gradio output components (Textbox, Gallery, Video, Audio).
    """
    # Text: merge every text model's answer into one response.
    text_output = merge_text_models(prompt)

    # Inject the persona prefix.
    personality = "ARK-AI (fun, savage, chaotic-good) says:\n"
    full_text = personality + text_output

    # Media: each helper is best-effort and yields empty/None on failure.
    image_output = generate_images(prompt)
    video_list = generate_videos(prompt)
    # Bug fix: gr.Video expects a single video, not a one-element list.
    video_output = video_list[0] if video_list else None
    # NOTE(review): audio is synthesized from the raw prompt, as in the
    # original; speaking full_text instead may be what was intended — confirm.
    audio_output = generate_audio(prompt)

    return full_text, image_output, video_output, audio_output
107
+
108
+ # =============================
109
+ # GRADIO INTERFACE
110
+ # =============================
111
 
 
112
iface = gr.Interface(
    fn=ark_ai_main,
    inputs=gr.Textbox(lines=3, placeholder="Ask ARK-AI anything..."),
    # One component per element of ark_ai_main's 4-tuple return.
    outputs=[
        gr.Textbox(label="ARK-AI Text Response"),
        gr.Gallery(label="Images Generated"),
        gr.Video(label="Video Generated"),
        gr.Audio(label="Audio Response"),
    ],
    title="ARK-AI Multi-Modal Assistant",
    description="ARK-AI: Savage, funny, chaotic-good AI assistant merging text, image, audio, and video models.",
    css="styles.css",  # Optional: liquid-glass UI
)

if __name__ == "__main__":
    # Guard the launch so importing app.py (e.g. from tests or other
    # modules) has no side effects; running it as a script still serves.
    iface.launch()