AZILS committed
Commit c2552dd · verified · 1 Parent(s): be4746f

Upload 3 files

Files changed (3)
  1. .env +6 -0
  2. app.py +1839 -0
  3. requirements.txt +27 -0
.env ADDED
@@ -0,0 +1,6 @@
ASSEMBLYAI_API_KEY=e9c253a938184370becdf77f2a9e6a45
OPENAI_API_KEY=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
GEMINI_API_KEY=AIzaSyA8SpThRntFroYYDrQRuO6f1F2dkiteSYE
ELEVENLABS_API_KEY=545bf254469ea5782233ae872eaa8809
STABILITY_API_KEY=abfd724a75fef2b01b2347d3dcfe10079f816976a32121
SEGMIND_API_KEY=SG_56e300a003a9a2d4

app.py ADDED
@@ -0,0 +1,1839 @@
import os
import re
import json
import time
import random
import tempfile
import requests
import numpy as np
import uuid
from PIL import Image, ImageDraw, ImageFont
from io import BytesIO
from datetime import datetime
import gradio as gr
from dotenv import load_dotenv
import moviepy.editor as mpy
from moviepy.editor import *
from moviepy.audio.fx.all import volumex
from moviepy.video.fx.all import crop

# Suppress the asyncio "Event loop is closed" warning on Windows
import sys
if sys.platform.startswith('win'):
    import asyncio
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Load environment variables from a .env file if present
load_dotenv()

# Directory structure constants
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
STATIC_DIR = os.path.join(BASE_DIR, "static")
MUSIC_DIR = os.path.join(STATIC_DIR, "music")
FONTS_DIR = os.path.join(STATIC_DIR, "fonts")
STORAGE_DIR = os.path.join(BASE_DIR, "storage")

# Create the required directories if they do not exist
os.makedirs(STATIC_DIR, exist_ok=True)
os.makedirs(MUSIC_DIR, exist_ok=True)
os.makedirs(FONTS_DIR, exist_ok=True)
os.makedirs(STORAGE_DIR, exist_ok=True)
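
# Note on keys: every provider below reads its key from os.environ, so the
# bundled .env supplies ASSEMBLYAI_API_KEY, OPENAI_API_KEY, GEMINI_API_KEY,
# ELEVENLABS_API_KEY and SEGMIND_API_KEY. STABILITY_API_KEY is defined in
# .env but does not appear to be read anywhere in the portion of app.py
# shown in this diff.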

# Helper functions for logging
def info(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    formatted_message = f"[{timestamp}] [INFO] {message}"
    print(formatted_message)
    return formatted_message

def success(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    formatted_message = f"[{timestamp}] [SUCCESS] {message}"
    print(formatted_message)
    return formatted_message

def warning(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    formatted_message = f"[{timestamp}] [WARNING] {message}"
    print(formatted_message)
    return formatted_message

def error(message):
    timestamp = datetime.now().strftime("%H:%M:%S")
    formatted_message = f"[{timestamp}] [ERROR] {message}"
    print(formatted_message)
    return formatted_message

def get_music_files():
    """Get the list of available music files in the music directory."""
    if not os.path.exists(MUSIC_DIR):
        return ["none"]

    music_files = [f for f in os.listdir(MUSIC_DIR) if f.endswith(('.mp3', '.wav'))]
    if not music_files:
        return ["none"]

    return ["random"] + music_files

def get_font_files():
    """Get the list of available font files in the fonts directory."""
    if not os.path.exists(FONTS_DIR):
        return ["default"]

    font_files = [f.split('.')[0] for f in os.listdir(FONTS_DIR) if f.endswith(('.ttf', '.otf'))]
    if not font_files:
        return ["default"]

    return ["random"] + font_files

def choose_random_music():
    """Select a random music file from the music directory."""
    if not os.path.exists(MUSIC_DIR):
        error(f"Music directory {MUSIC_DIR} does not exist")
        return None

    music_files = [f for f in os.listdir(MUSIC_DIR) if f.endswith(('.mp3', '.wav'))]
    if not music_files:
        warning(f"No music files found in {MUSIC_DIR}")
        return None

    return os.path.join(MUSIC_DIR, random.choice(music_files))

def choose_random_font():
    """Select a random font file from the fonts directory."""
    if not os.path.exists(FONTS_DIR):
        error(f"Fonts directory {FONTS_DIR} does not exist")
        return "default"

    font_files = [f for f in os.listdir(FONTS_DIR) if f.endswith(('.ttf', '.otf'))]
    if not font_files:
        warning(f"No font files found in {FONTS_DIR}")
        return "default"  # fall back to the default font instead of None

    return font_files[0].split('.')[0] if len(font_files) == 1 else random.choice([f.split('.')[0] for f in font_files])

class YouTube:
    def __init__(self, niche: str, language: str,
                 text_gen="g4f", text_model="gpt-4",
                 image_gen="g4f", image_model="flux",
                 tts_engine="edge", tts_voice="en-US-AriaNeural",
                 subtitle_font="default", font_size=80,
                 text_color="white", highlight_color="blue",
                 subtitles_enabled=True, highlighting_enabled=True,
                 subtitle_position="bottom", music_file="random",
                 enable_music=True, music_volume=0.1,
                 api_keys=None, progress=gr.Progress()) -> None:
        """Initialize the YouTube Shorts generator."""
        self.progress = progress
        self.progress(0, desc="Initializing")

        # Store basic parameters
        info("Initializing YouTube class")
        self._niche = niche
        self._language = language
        self.text_gen = text_gen
        self.text_model = text_model
        self.image_gen = image_gen
        self.image_model = image_model
        self.tts_engine = tts_engine
        self.tts_voice = tts_voice
        self.subtitle_font = subtitle_font
        self.font_size = font_size
        self.text_color = text_color
        self.highlight_color = highlight_color
        self.subtitles_enabled = subtitles_enabled
        self.highlighting_enabled = highlighting_enabled
        self.subtitle_position = subtitle_position
        self.music_file = music_file
        self.enable_music = enable_music
        self.music_volume = music_volume
        self.api_keys = api_keys or {}
        self.images = []
        self.logs = []

        # Set API keys from parameters, exporting them as environment variables
        if 'gemini' in self.api_keys and self.api_keys['gemini']:
            os.environ["GEMINI_API_KEY"] = self.api_keys['gemini']

        if 'assemblyai' in self.api_keys and self.api_keys['assemblyai']:
            os.environ["ASSEMBLYAI_API_KEY"] = self.api_keys['assemblyai']

        if 'elevenlabs' in self.api_keys and self.api_keys['elevenlabs']:
            os.environ["ELEVENLABS_API_KEY"] = self.api_keys['elevenlabs']

        if 'segmind' in self.api_keys and self.api_keys['segmind']:
            os.environ["SEGMIND_API_KEY"] = self.api_keys['segmind']

        if 'openai' in self.api_keys and self.api_keys['openai']:
            os.environ["OPENAI_API_KEY"] = self.api_keys['openai']

        info(f"Niche: {niche}, Language: {language}")
        self.log(f"Initialized with niche: {niche}, language: {language}")
        self.log(f"Text generator: {text_gen} - Model: {text_model}")
        self.log(f"Image generator: {image_gen} - Model: {image_model}")
        self.log(f"TTS engine: {tts_engine} - Voice: {tts_voice}")
        self.log(f"Subtitles: {'Enabled' if subtitles_enabled else 'Disabled'} - Highlighting: {'Enabled' if highlighting_enabled else 'Disabled'}")
        self.log(f"Music: {music_file}")

    def log(self, message):
        """Append a timestamped message to the in-memory log."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        log_entry = f"[{timestamp}] {message}"
        self.logs.append(log_entry)
        return log_entry

    @property
    def niche(self) -> str:
        return self._niche

    @property
    def language(self) -> str:
        return self._language
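
    # Usage sketch (hypothetical niche and settings, not defaults of this
    # codebase):
    #   yt = YouTube(niche="space facts", language="English",
    #                tts_engine="edge", tts_voice="en-US-AriaNeural")
    #   result = yt.generate_video()
    #   # result is a dict with video_path, generation_folder, title,
    #   # description, subject, script and logs (see generate_video below).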

    def generate_response(self, prompt: str, model: str = None) -> str:
        """Generate a response using the selected text generation model."""
        self.log(f"Generating response for prompt: {prompt[:50]}...")

        try:
            if self.text_gen == "gemini":
                self.log("Using Google's Gemini model")

                # Check if the API key is set
                gemini_api_key = os.environ.get("GEMINI_API_KEY", "")
                if not gemini_api_key:
                    raise ValueError("Gemini API key is not set. Please provide a valid API key.")

                import google.generativeai as genai
                genai.configure(api_key=gemini_api_key)
                model_to_use = model if model else self.text_model
                genai_model = genai.GenerativeModel(model_to_use)
                response = genai_model.generate_content(prompt).text

            elif self.text_gen == "g4f":
                self.log("Using G4F for text generation")
                import g4f
                model_to_use = model if model else self.text_model
                self.log(f"Using G4F model: {model_to_use}")
                response = g4f.ChatCompletion.create(
                    model=model_to_use,
                    messages=[{"role": "user", "content": prompt}]
                )

            elif self.text_gen == "openai":
                self.log("Using OpenAI for text generation")
                openai_api_key = os.environ.get("OPENAI_API_KEY", "")
                if not openai_api_key:
                    raise ValueError("OpenAI API key is not set. Please provide a valid API key.")

                from openai import OpenAI
                client = OpenAI(api_key=openai_api_key)
                model_to_use = model if model else "gpt-3.5-turbo"

                response = client.chat.completions.create(
                    model=model_to_use,
                    messages=[{"role": "user", "content": prompt}]
                ).choices[0].message.content

            else:
                # No fallback: raise an exception for an unsupported text generator
                error_msg = f"Unsupported text generator: {self.text_gen}"
                self.log(error(error_msg))
                raise ValueError(error_msg)

            self.log(f"Response generated successfully, length: {len(response)} characters")
            return response

        except Exception as e:
            error_msg = f"Error generating response: {str(e)}"
            self.log(error(error_msg))
            raise Exception(error_msg)
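
    # Illustrative call (hypothetical prompt):
    #   text = yt.generate_response("Name one fun fact about octopuses.")
    # The return value is the raw completion string from whichever backend
    # (gemini / g4f / openai) is configured as self.text_gen.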

    def generate_topic(self) -> str:
        """Generate a topic based on the YouTube channel niche."""
        self.progress(0.05, desc="Generating topic")
        self.log("Generating topic based on niche")

        completion = self.generate_response(
            f"Please generate a specific video idea about the following topic: {self.niche}. "
            f"Make it exactly one sentence. Only return the topic, nothing else."
        )

        if not completion:
            self.log(error("Failed to generate topic."))
            raise Exception("Failed to generate a topic. Please try again with a different niche.")

        self.subject = completion
        self.log(success(f"Generated topic: {completion}"))
        return completion

    def generate_script(self) -> str:
        """Generate a script for the video, based on the subject and language."""
        self.progress(0.1, desc="Creating script")
        self.log("Generating script for video")

        prompt = f"""
        Generate a script for a YouTube Shorts video, depending on the subject of the video.

        The script is to be returned as a string with the specified number of paragraphs.

        Here is an example of a string:
        "This is an example string."

        Do not under any circumstance reference this prompt in your response.

        Get straight to the point; don't start with unnecessary things like "welcome to this video".

        Obviously, the script should be related to the subject of the video.

        YOU MUST NOT INCLUDE ANY TYPE OF MARKDOWN OR FORMATTING IN THE SCRIPT. NEVER USE A TITLE.
        YOU MUST WRITE THE SCRIPT IN THE LANGUAGE SPECIFIED IN [LANGUAGE].
        ONLY RETURN THE RAW CONTENT OF THE SCRIPT. DO NOT INCLUDE "VOICEOVER", "NARRATOR" OR SIMILAR INDICATORS.

        Subject: {self.subject}
        Language: {self.language}
        """
        completion = self.generate_response(prompt)

        # Strip any stray Markdown asterisks
        completion = re.sub(r"\*", "", completion)

        if not completion:
            self.log(error("The generated script is empty."))
            raise Exception("Failed to generate a script. Please try again.")

        if len(completion) > 5000:
            self.log(warning("Generated script is too long."))
            raise ValueError("Generated script exceeds 5000 characters. Please try again.")

        self.script = completion
        self.log(success(f"Generated script ({len(completion)} chars)"))
        return completion

    def generate_metadata(self) -> dict:
        """Generate video metadata (title and description)."""
        self.progress(0.15, desc="Creating title and description")
        self.log("Generating metadata (title and description)")

        title = self.generate_response(
            f"Please generate a YouTube video title for the following subject, including hashtags: "
            f"{self.subject}. Only return the title, nothing else. Keep the title under 100 characters."
        )

        if len(title) > 100:
            self.log(warning("Generated title exceeds 100 characters."))
            raise ValueError("Generated title exceeds 100 characters. Please try again.")

        description = self.generate_response(
            f"Please generate a YouTube video description for the following script: {self.script}. "
            f"Only return the description, nothing else."
        )

        self.metadata = {
            "title": title,
            "description": description
        }

        self.log(success(f"Generated title: {title}"))
        self.log(success(f"Generated description: {description[:50]}..."))
        return self.metadata

    def generate_prompts(self, count=5) -> list:
        """Generate AI image prompts based on the generated video script."""
        self.progress(0.2, desc="Creating image prompts")
        self.log(f"Generating {count} image prompts")

        prompt = f"""
        Generate {count} image prompts for AI image generation,
        depending on the subject of a video.
        Subject: {self.subject}

        The image prompts are to be returned as
        a JSON array of strings.

        Each search term should consist of a full sentence;
        always add the main subject of the video.

        Be emotional and use interesting adjectives to make the
        image prompt as detailed as possible.

        YOU MUST ONLY RETURN THE JSON ARRAY OF STRINGS.
        YOU MUST NOT RETURN ANYTHING ELSE.
        YOU MUST NOT RETURN THE SCRIPT.

        The search terms must be related to the subject of the video.
        Here is an example of a JSON array of strings:
        ["image prompt 1", "image prompt 2", "image prompt 3"]

        For context, here is the full text:
        {self.script}
        """

        completion = (
            str(self.generate_response(prompt))
            .replace("```json", "")
            .replace("```", "")
        )

        image_prompts = []

        if "image_prompts" in completion:
            try:
                image_prompts = json.loads(completion)["image_prompts"]
            except Exception:
                self.log(warning("Failed to parse 'image_prompts' from JSON response."))

        if not image_prompts:
            try:
                image_prompts = json.loads(completion)
                self.log("Parsed image prompts from JSON response.")
            except Exception:
                self.log(warning("JSON parsing failed. Attempting to extract array using regex..."))

                # Grab everything between [ and ] and parse it as a list
                r = re.compile(r"\[.*\]", re.DOTALL)
                matches = r.findall(completion)
                if len(matches) == 0:
                    self.log(warning("Failed to extract array. Unable to create image prompts."))
                    raise ValueError("Failed to generate valid image prompts. Please try again.")
                else:
                    try:
                        image_prompts = json.loads(matches[0])
                    except Exception:
                        self.log(error("Failed to parse array from regex match."))
                        # Fall back to extracting the individual quoted strings
                        string_pattern = r'"([^"]*)"'
                        strings = re.findall(string_pattern, matches[0])
                        if strings:
                            image_prompts = strings
                        else:
                            self.log(error("Failed to extract strings from regex match."))
                            raise ValueError("Failed to parse image prompts. Please try again.")

        # Ensure we received at least the requested number of prompts
        if len(image_prompts) < count:
            self.log(warning(f"Received fewer prompts ({len(image_prompts)}) than requested ({count})."))
            raise ValueError(f"Received only {len(image_prompts)} prompts instead of {count}. Please try again.")

        # Limit to the requested count
        image_prompts = image_prompts[:count]

        self.image_prompts = image_prompts
        self.log(success(f"Generated {len(self.image_prompts)} image prompts"))
        for i, prompt in enumerate(self.image_prompts):
            self.log(f"Image prompt {i+1}: {prompt}")

        return image_prompts
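
    # The parsing cascade above accepts any of these response shapes
    # (illustrative examples, not captured model output):
    #   {"image_prompts": ["a misty forest at dawn, golden light", ...]}
    #   ["a misty forest at dawn, golden light", "a red fox mid-leap", ...]
    #   free text that contains a [...] JSON array somewhere inside it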

    def generate_image(self, prompt) -> str:
        """Generate an image using the selected image generation provider."""
        self.log(f"Generating image for prompt: {prompt[:50]}...")

        # Save images directly into the generation folder when it exists
        if hasattr(self, 'generation_folder') and os.path.exists(self.generation_folder):
            image_path = os.path.join(self.generation_folder, f"img_{uuid.uuid4()}_{int(time.time())}.png")
        else:
            # Fall back to STORAGE_DIR if there is no generation folder
            image_path = os.path.join(STORAGE_DIR, f"img_{uuid.uuid4()}_{int(time.time())}.png")

        if self.image_gen == "prodia":
            self.log("Using Prodia provider for image generation")
            s = requests.Session()
            headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }

            # Submit the generation job
            self.log("Sending generation request to Prodia API")
            resp = s.get(
                "https://api.prodia.com/generate",
                params={
                    "new": "true",
                    "prompt": prompt,
                    "model": self.image_model,
                    "negative_prompt": "verybadimagenegative_v1.3",
                    "steps": "20",
                    "cfg": "7",
                    "seed": random.randint(1, 10000),
                    "sample": "DPM++ 2M Karras",
                    "aspect_ratio": "square"
                },
                headers=headers
            )

            if resp.status_code != 200:
                raise Exception(f"Prodia API error: {resp.text}")

            job_id = resp.json()['job']
            self.log(f"Job created with ID: {job_id}")

            # Poll until the job completes
            max_attempts = 30
            attempts = 0
            while attempts < max_attempts:
                attempts += 1
                time.sleep(2)
                status = s.get(f"https://api.prodia.com/job/{job_id}", headers=headers).json()

                if status["status"] == "succeeded":
                    self.log("Image generation successful, downloading result")
                    img_data = s.get(f"https://images.prodia.xyz/{job_id}.png?download=1", headers=headers).content
                    with open(image_path, "wb") as f:
                        f.write(img_data)
                    self.images.append(image_path)
                    self.log(success(f"Image saved to: {image_path}"))
                    return image_path

                elif status["status"] == "failed":
                    raise Exception(f"Prodia job failed: {status.get('error', 'Unknown error')}")

                # Still processing
                self.log(f"Still processing, attempt {attempts}/{max_attempts}...")

            raise Exception("Prodia job timed out")

        elif self.image_gen == "hercai":
            self.log("Using Hercai provider for image generation")
            url = f"https://hercai.onrender.com/{self.image_model}/text2image?prompt={prompt}"
            r = requests.get(url)

            if r.status_code != 200:
                raise Exception(f"Hercai API error: {r.text}")

            parsed = r.json()
            if "url" in parsed and parsed["url"]:
                self.log("Image URL received from Hercai")
                image_url = parsed["url"]
                img_data = requests.get(image_url).content
                with open(image_path, "wb") as f:
                    f.write(img_data)
                self.images.append(image_path)
                self.log(success(f"Image saved to: {image_path}"))
                return image_path
            else:
                raise Exception("No image URL in Hercai response")

        elif self.image_gen == "g4f":
            self.log("Using G4F provider for image generation")
            from g4f.client import Client
            client = Client()
            response = client.images.generate(
                model=self.image_model,
                prompt=prompt,
                response_format="url"
            )

            if response and response.data and len(response.data) > 0:
                image_url = response.data[0].url
                image_response = requests.get(image_url)

                if image_response.status_code == 200:
                    with open(image_path, "wb") as f:
                        f.write(image_response.content)
                    self.images.append(image_path)
                    self.log(success(f"Image saved to: {image_path}"))
                    return image_path
                else:
                    raise Exception(f"Failed to download image from {image_url}")
            else:
                raise Exception("No image URL received from G4F")

        elif self.image_gen == "segmind":
            self.log("Using Segmind provider for image generation")
            api_key = os.environ.get("SEGMIND_API_KEY", "")
            if not api_key:
                raise ValueError("Segmind API key is not set. Please provide a valid API key.")

            headers = {
                "x-api-key": api_key,
                "Content-Type": "application/json"
            }

            response = requests.post(
                "https://api.segmind.com/v1/sdxl-turbo",
                json={
                    "prompt": prompt,
                    "negative_prompt": "blurry, low quality, distorted face, text, watermark",
                    "samples": 1,
                    "size": "1024x1024",
                    "guidance_scale": 1.0
                },
                headers=headers
            )

            if response.status_code == 200:
                with open(image_path, "wb") as f:
                    f.write(response.content)
                self.images.append(image_path)
                self.log(success(f"Image saved to: {image_path}"))
                return image_path
            else:
                raise Exception(f"Segmind request failed: {response.status_code} {response.text}")

        elif self.image_gen == "pollinations":
            self.log("Using Pollinations provider for image generation")
            # The random integer is appended to the prompt as a cache-buster
            response = requests.get(f"https://image.pollinations.ai/prompt/{prompt}{random.randint(1, 10000)}")

            if response.status_code == 200:
                self.log("Image received from Pollinations")
                with open(image_path, "wb") as f:
                    f.write(response.content)
                self.images.append(image_path)
                self.log(success(f"Image saved to: {image_path}"))
                return image_path
            else:
                raise Exception(f"Pollinations request failed with status code: {response.status_code}")

        else:
            # No fallback: raise an exception for an unsupported image generator
            error_msg = f"Unsupported image generator: {self.image_gen}"
            self.log(error(error_msg))
            raise ValueError(error_msg)
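
    # Illustrative call (assumes the configured provider is reachable and any
    # required key is valid):
    #   path = yt.generate_image("A dramatic lighthouse in a storm, cinematic")
    #   # -> "<generation_folder>/img_<uuid>_<unix-time>.png"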

    def generate_speech(self, text, output_format='mp3') -> str:
        """Generate speech from text using the selected TTS engine."""
        self.progress(0.6, desc="Creating voiceover")
        self.log("Generating speech from text")

        # Strip characters that tend to trip up TTS engines
        text = re.sub(r'[^\w\s.?!,;:\'"-]', '', text)

        self.log(f"Using TTS engine: {self.tts_engine}, Voice: {self.tts_voice}")

        # Save to the generation folder when available
        if hasattr(self, 'generation_folder') and os.path.exists(self.generation_folder):
            audio_path = os.path.join(self.generation_folder, f"speech_{uuid.uuid4()}_{int(time.time())}.{output_format}")
        else:
            # Fall back to STORAGE_DIR if there is no generation folder
            audio_path = os.path.join(STORAGE_DIR, f"speech_{uuid.uuid4()}_{int(time.time())}.{output_format}")

        if self.tts_engine == "elevenlabs":
            self.log("Using ElevenLabs provider for speech generation")
            elevenlabs_api_key = os.environ.get("ELEVENLABS_API_KEY", "")
            if not elevenlabs_api_key:
                raise ValueError("ElevenLabs API key is not set. Please provide a valid API key.")

            headers = {
                "Accept": "audio/mpeg",
                "Content-Type": "application/json",
                "xi-api-key": elevenlabs_api_key
            }

            payload = {
                "text": text,
                "model_id": "eleven_turbo_v2",  # latest turbo model
                "voice_settings": {
                    "stability": 0.5,
                    "similarity_boost": 0.5,
                    "style": 0.0,
                    "use_speaker_boost": True
                },
                "output_format": "mp3_44100_128",  # 44.1 kHz, 128 kbps
                "optimize_streaming_latency": 0  # optimize for quality over latency
            }

            # Map friendly voice names to ElevenLabs voice IDs
            voice_id_mapping = {
                "Sarah": "21m00Tcm4TlvDq8ikWAM",
                "Brian": "hxppwzoRmvxK7YkDrjhQ",
                "Lily": "p7TAj7L6QVq1fE6XGyjR",
                "Monika Sogam": "Fc3XhIu9tfgOPOsU1hMr",
                "George": "o7lPjDgzlF8ZAeSpqmaN",
                "River": "f0k5evLkhJxrIRJXQJvy",
                "Matilda": "XrExE9yKIg1WjnnlVkGX",
                "Will": "pvKWM1B1sNRNTlEYYAEZ",
                "Jessica": "A5EAMYWMCSsLNL1wYxOv",
                "default": "21m00Tcm4TlvDq8ikWAM"  # default to Sarah
            }

            # Use the mapped ID, or treat the configured voice as an ID if unmapped
            voice_id = voice_id_mapping.get(self.tts_voice, self.tts_voice)

            self.log(f"Using ElevenLabs voice: {self.tts_voice} (ID: {voice_id})")

            response = requests.post(
                url=f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}",
                json=payload,
                headers=headers
            )

            if response.status_code == 200:
                with open(audio_path, 'wb') as f:
                    f.write(response.content)
                self.log(success(f"Speech generated successfully using ElevenLabs at {audio_path}"))
            else:
                try:
                    error_data = response.json()
                    error_message = error_data.get('detail', {}).get('message', response.text)
                    error_status = error_data.get('status', 'error')
                    raise Exception(f"ElevenLabs API error ({response.status_code}, {error_status}): {error_message}")
                except ValueError:
                    # If JSON parsing fails, use the raw response text
                    raise Exception(f"ElevenLabs API error ({response.status_code}): {response.text}")

        elif self.tts_engine == "gtts":
            self.log("Using Google TTS provider for speech generation")
            from gtts import gTTS
            # Slicing the language name only works for names like "English" -> "en";
            # map the common names to their ISO 639-1 codes explicitly.
            lang_map = {"english": "en", "spanish": "es", "french": "fr",
                        "german": "de", "italian": "it", "portuguese": "pt",
                        "russian": "ru", "japanese": "ja", "chinese": "zh", "hindi": "hi"}
            lang_code = lang_map.get(self.language.lower(), self.language[:2].lower())
            tts = gTTS(text=text, lang=lang_code, slow=False)
            tts.save(audio_path)

        elif self.tts_engine == "openai":
            self.log("Using OpenAI provider for speech generation")
            openai_api_key = os.environ.get("OPENAI_API_KEY", "")
            if not openai_api_key:
                raise ValueError("OpenAI API key is not set. Please provide a valid API key.")

            from openai import OpenAI
            client = OpenAI(api_key=openai_api_key)

            voice = self.tts_voice if self.tts_voice else "alloy"
            response = client.audio.speech.create(
                model="tts-1",
                voice=voice,
                input=text
            )
            response.stream_to_file(audio_path)

        elif self.tts_engine == "edge":
            self.log("Using Edge TTS provider for speech generation")
            import edge_tts
            import asyncio

            voice = self.tts_voice if self.tts_voice else "en-US-AriaNeural"

            async def generate():
                communicate = edge_tts.Communicate(text, voice)
                await communicate.save(audio_path)

            asyncio.run(generate())

        else:
            # No fallback: raise an exception for an unsupported TTS engine
            error_msg = f"Unsupported TTS engine: {self.tts_engine}"
            self.log(error(error_msg))
            raise ValueError(error_msg)

        self.log(success(f"Speech generated and saved to: {audio_path}"))
        self.tts_path = audio_path
        return audio_path
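
    # Note: the "edge" branch drives edge_tts through asyncio.run(), which
    # requires that no event loop is already running in the calling thread;
    # Gradio runs synchronous handlers in worker threads, so that holds in
    # this app's normal flow.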

    def generate_subtitles(self, audio_path: str) -> dict:
        """Generate subtitles from audio using AssemblyAI."""
        # If subtitles are disabled, return empty data along with the settings
        if not self.subtitles_enabled:
            self.log("Subtitles are disabled, skipping generation")
            return {
                "wordlevel": [],
                "linelevel": [],
                "settings": {
                    "font": self.subtitle_font,
                    "fontsize": self.font_size,
                    "color": self.text_color,
                    "bg_color": self.highlight_color if self.highlighting_enabled else None,
                    "position": self.subtitle_position,
                    "highlighting_enabled": self.highlighting_enabled,
                    "subtitles_enabled": self.subtitles_enabled
                }
            }

        self.log("Generating subtitles from audio")
        try:
            import assemblyai as aai

            # Check if the API key is set
            aai_api_key = os.environ.get("ASSEMBLYAI_API_KEY", "")
            if not aai_api_key:
                raise ValueError("AssemblyAI API key is not set. Please provide a valid API key.")

            aai.settings.api_key = aai_api_key

            config = aai.TranscriptionConfig(speaker_labels=False, word_boost=[], format_text=True)
            transcriber = aai.Transcriber(config=config)

            self.log("Submitting audio for transcription")
            transcript = transcriber.transcribe(audio_path)

            if not transcript or not transcript.words:
                raise ValueError("Transcription returned no words.")

            # Collect word-level timing information
            wordlevel_info = []
            for word in transcript.words:
                word_data = {
                    "word": word.text.strip(),
                    "start": word.start / 1000.0,  # convert ms to seconds
                    "end": word.end / 1000.0       # convert ms to seconds
                }
                wordlevel_info.append(word_data)

            self.log(success(f"Transcription successful. Got {len(wordlevel_info)} words."))

            # Constants for subtitle generation.
            # Handle random font selection if configured.
            if self.subtitle_font == "random":
                FONT = choose_random_font()
                self.log(f"Using random font: {FONT}")
            else:
                FONT = self.subtitle_font

            FONTSIZE = self.font_size
            COLOR = self.text_color
            BG_COLOR = self.highlight_color if self.highlighting_enabled else None
            FRAME_SIZE = (1080, 1920)  # vertical video format

            # Constants for line splitting
            MAX_CHARS = 30      # maximum characters per line for vertical video
            MAX_DURATION = 3.0  # maximum duration of a single line, in seconds
            MAX_GAP = 1.5       # split if nothing is spoken for this many seconds

            # Split the words into subtitle lines
            subtitles = []
            line = []
            line_duration = 0

            for idx, word_data in enumerate(wordlevel_info):
                word = word_data["word"]
                start = word_data["start"]
                end = word_data["end"]

                line.append(word_data)
                line_duration += end - start

                temp = " ".join(item["word"] for item in line)
                new_line_chars = len(temp)

                duration_exceeded = line_duration > MAX_DURATION
                chars_exceeded = new_line_chars > MAX_CHARS

                if idx > 0:
                    gap = word_data['start'] - wordlevel_info[idx - 1]['end']
                    maxgap_exceeded = gap > MAX_GAP
                else:
                    maxgap_exceeded = False

                if duration_exceeded or chars_exceeded or maxgap_exceeded:
                    if line:
                        subtitle_line = {
                            "text": " ".join(item["word"] for item in line),
                            "start": line[0]["start"],
                            "end": line[-1]["end"],
                            "words": line
                        }
                        subtitles.append(subtitle_line)
                        line = []
                        line_duration = 0

            # Add any remaining words as the last line
            if line:
                subtitle_line = {
                    "text": " ".join(item["word"] for item in line),
                    "start": line[0]["start"],
                    "end": line[-1]["end"],
                    "words": line
                }
                subtitles.append(subtitle_line)

            self.log(success(f"Generated {len(subtitles)} subtitle lines"))

            # Return the subtitle data and settings
            return {
                "wordlevel": wordlevel_info,
                "linelevel": subtitles,
                "settings": {
                    "font": FONT,
                    "fontsize": FONTSIZE,
                    "color": COLOR,
                    "bg_color": BG_COLOR,
                    "position": self.subtitle_position,
                    "highlighting_enabled": self.highlighting_enabled,
                    "subtitles_enabled": self.subtitles_enabled
                }
            }

        except Exception as e:
            error_msg = f"Error generating subtitles: {str(e)}"
            self.log(error(error_msg))
            raise Exception(error_msg)
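
    # Return shape (illustrative values):
    #   {"wordlevel": [{"word": "Hello", "start": 0.0, "end": 0.32}, ...],
    #    "linelevel": [{"text": "Hello world", "start": 0.0, "end": 0.61,
    #                   "words": [...]}, ...],
    #    "settings": {"font": ..., "fontsize": ..., "color": ...,
    #                 "bg_color": ..., "position": ...,
    #                 "highlighting_enabled": ..., "subtitles_enabled": ...}}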

    def create_subtitle_clip(self, subtitle_data, frame_size):
        """Create subtitle clips for a line of text with word-level highlighting."""
        # Early return if subtitles are disabled
        if not subtitle_data.get("settings", {}).get("subtitles_enabled", True):
            self.log("Subtitles are disabled, skipping subtitle clip creation")
            return []

        settings = subtitle_data["settings"]
        font_name = settings["font"]
        fontsize = settings["fontsize"]
        color = settings["color"]
        bg_color = settings["bg_color"]
        highlighting_enabled = settings["highlighting_enabled"]

        # Pre-load the font and compute color values once
        try:
            font_path = os.path.join(FONTS_DIR, f"{font_name}.ttf")
            if os.path.exists(font_path):
                pil_font = ImageFont.truetype(font_path, fontsize)
            else:
                self.log(warning(f"Font {font_name} not found, using default"))
                pil_font = ImageFont.load_default()
        except Exception as e:
            self.log(warning(f"Error loading font: {str(e)}"))
            pil_font = ImageFont.load_default()

        # Parse colors once
        if color.startswith('#'):
            text_color_rgb = tuple(int(color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
        else:
            text_color_rgb = (255, 255, 255)  # default to white

        if bg_color and bg_color.startswith('#'):
            bg_color_rgb = tuple(int(bg_color.lstrip('#')[i:i+2], 16) for i in (0, 2, 4))
        else:
            bg_color_rgb = (0, 0, 255)  # default to blue

        # Cache rendered text clips for reuse
        clip_cache = {}

        def create_text_clip(text, bg_color=None, cache_key=None):
            # Use the cache when possible for better performance
            if cache_key and cache_key in clip_cache:
                return clip_cache[cache_key]

            try:
                # Measure the text
                text_width, text_height = pil_font.getbbox(text)[2:4]

                # Add padding
                padding = 10
                img_width = text_width + padding * 2
                img_height = text_height + padding * 2

                # Create an image with a background color, or transparent
                if bg_color:
                    img = Image.new('RGB', (img_width, img_height), color=bg_color_rgb)
                else:
                    img = Image.new('RGBA', (img_width, img_height), color=(0, 0, 0, 0))

                # Draw the text
                draw = ImageDraw.Draw(img)
                draw.text((padding, padding), text, font=pil_font, fill=text_color_rgb)

                # Convert to a numpy array for MoviePy
                img_array = np.array(img)
                clip = ImageClip(img_array)

                # Cache the result for reuse
                if cache_key:
                    clip_cache[cache_key] = (clip, img_width, img_height)

                return clip, img_width, img_height

            except Exception as e:
                self.log(warning(f"Error creating text clip: {str(e)}"))
                # Create a simple colored rectangle as a fallback
                img = Image.new('RGB', (100, 50), color=(100, 100, 100))
                img_array = np.array(img)
                clip = ImageClip(img_array)
                return clip, 100, 50

        subtitle_clips = []

        # Compute the vertical position once
        if settings["position"] == "top":
            y_buffer = frame_size[1] * 0.1  # 10% from the top
        elif settings["position"] == "middle":
            y_buffer = frame_size[1] * 0.4  # 40% from the top
        else:  # bottom
            y_buffer = frame_size[1] * 0.7  # 70% from the top

        max_width = frame_size[0] * 0.8  # 80% of the frame width

        # Process each subtitle line
        for line_idx, line in enumerate(subtitle_data["linelevel"]):
            line_text = line["text"]
            line_start = line["start"]
            line_end = line["end"]
            line_duration = line_end - line_start

            # First pass: measure words and break the text into display lines
            lines_data = []  # data for each display line (words, widths)
            current_line = []
            current_x = 0

            for word_data in line["words"]:
                word = word_data["word"]
                # Measure without rendering an image yet
                word_width = pil_font.getbbox(word)[2] + 20  # add padding
                word_height = pil_font.getbbox(word)[3] + 20

                # Check whether the word fits on the current display line
                if current_x + word_width > max_width and current_line:
                    # Complete the current display line
                    lines_data.append({
                        "words": current_line.copy(),
                        "total_width": current_x,
                        "height": max(w["height"] for w in current_line) if current_line else word_height
                    })
                    current_line = []
                    current_x = 0

                # Add the word to the current display line
                word_info = {
                    "word": word,
                    "width": word_width,
                    "height": word_height,
                    "start": word_data["start"],
                    "end": word_data["end"]
                }
                current_line.append(word_info)
                current_x += word_width

            # Add the last display line if needed
            if current_line:
                lines_data.append({
                    "words": current_line,
                    "total_width": current_x,
                    "height": max(w["height"] for w in current_line)
                })

            # Second pass: create a clip for each display line (batch processing)
            current_y = y_buffer

            for line_data in lines_data:
                # Center the whole display line horizontally
                line_width = line_data["total_width"]
                x_center = (frame_size[0] - line_width) / 2

                # Base (non-highlighted) clip for the complete display line
                line_text = " ".join(w["word"] for w in line_data["words"])
                cache_key = f"line_{line_idx}_{line_text}"
                line_clip, measured_width, _ = create_text_clip(line_text, None, cache_key)

                # Position the line and give it the full line timing
                line_clip = line_clip.set_position((x_center, current_y))
                line_clip = line_clip.set_start(line["start"]).set_duration(line_duration)
                subtitle_clips.append(line_clip)

                # Add highlighted words if enabled
                if highlighting_enabled and bg_color:
                    current_x = x_center

                    # Group words with the same timing to reduce the clip count
                    timing_groups = {}

                    for word_info in line_data["words"]:
                        timing_key = f"{word_info['start']:.3f}_{word_info['end']:.3f}"
                        if timing_key not in timing_groups:
                            timing_groups[timing_key] = []
                        timing_groups[timing_key].append((word_info, current_x))
                        current_x += word_info["width"]

                    # Create one clip per timing group instead of per word
                    for timing_key, word_group in timing_groups.items():
                        start_time, end_time = map(float, timing_key.split('_'))

                        # A single word in this timing group: one highlight clip
                        if len(word_group) == 1:
                            word_info, x_pos = word_group[0]
                            word = word_info["word"]

                            cache_key = f"word_{word}"
                            highlight_clip, _, _ = create_text_clip(word, bg_color, cache_key)
                            highlight_clip = highlight_clip.set_position((x_pos, current_y))
                            highlight_clip = highlight_clip.set_start(start_time).set_duration(end_time - start_time)
                            subtitle_clips.append(highlight_clip)
                        else:
                            # Multiple words share this timing: batch adjacent ones
                            # (an optimization for words that appear together)
                            continue_batch = True
                            batch_start_idx = 0

                            while continue_batch and batch_start_idx < len(word_group):
                                # Start a new batch
                                batch = [word_group[batch_start_idx]]
                                batch_x = word_group[batch_start_idx][1]
                                current_batch_end = batch_start_idx

                                # Try to extend the batch with adjacent words
                                for i in range(batch_start_idx + 1, len(word_group)):
                                    prev_word, prev_x = word_group[i - 1]
                                    curr_word, curr_x = word_group[i]

                                    # Check whether the words are adjacent
                                    if abs(prev_x + prev_word["width"] - curr_x) < 5:  # small tolerance
                                        batch.append(word_group[i])
                                        current_batch_end = i
                                    else:
                                        break

                                # Create a clip for this batch
                                if len(batch) > 1:
                                    # Multiple adjacent words: a single highlight clip
                                    batch_text = " ".join(info[0]["word"] for info in batch)
                                    batch_width = batch[-1][1] + batch[-1][0]["width"] - batch[0][1]

                                    cache_key = f"batch_{batch_text}"
                                    highlight_clip, _, _ = create_text_clip(batch_text, bg_color, cache_key)
                                    highlight_clip = highlight_clip.set_position((batch_x, current_y))
                                    highlight_clip = highlight_clip.set_start(start_time).set_duration(end_time - start_time)
                                    subtitle_clips.append(highlight_clip)
                                else:
                                    # A single word in the batch
                                    word_info, x_pos = batch[0]
                                    word = word_info["word"]

                                    cache_key = f"word_{word}"
                                    highlight_clip, _, _ = create_text_clip(word, bg_color, cache_key)
                                    highlight_clip = highlight_clip.set_position((x_pos, current_y))
                                    highlight_clip = highlight_clip.set_start(start_time).set_duration(end_time - start_time)
                                    subtitle_clips.append(highlight_clip)

                                # Move to the next batch
                                batch_start_idx = current_batch_end + 1
                                if batch_start_idx >= len(word_group):
                                    continue_batch = False

                # Move to the next display line
                current_y += line_data["height"] + 10

        # Cap the number of subtitle clips to avoid memory issues
        if len(subtitle_clips) > 200:
            self.log(warning(f"Too many subtitle clips ({len(subtitle_clips)}), limiting to 200 for performance"))
            subtitle_clips = subtitle_clips[:200]

        self.log(f"Created {len(subtitle_clips)} subtitle clips (optimized)")
        return subtitle_clips

    def combine(self) -> str:
        """Combine images, audio, and subtitles into the final video."""
        self.progress(0.8, desc="Creating final video")
        self.log("Combining images and audio into final video")
        try:
            # Use a temporary directory for intermediate files
            temp_dir = tempfile.mkdtemp()

            # Save to the generation folder when available
            if hasattr(self, 'generation_folder') and os.path.exists(self.generation_folder):
                output_path = os.path.join(self.generation_folder, f"output_{int(time.time())}.mp4")
            else:
                output_path = os.path.join(STORAGE_DIR, f"output_{int(time.time())}.mp4")

            # Check for required inputs
            if not self.images:
                raise ValueError("No images available for video creation")

            if not hasattr(self, 'tts_path') or not self.tts_path or not os.path.exists(self.tts_path):
                raise ValueError("No TTS audio file available")

            # Load the narration audio
            tts_clip = AudioFileClip(self.tts_path)
            max_duration = tts_clip.duration

            # Calculate the display duration for each image
            num_images = len(self.images)
            req_dur = max_duration / num_images

            # Process each image once to create reusable base clips
            self.log("Processing images (optimized)")
            processed_clips = []

            for image_path in self.images:
                if not os.path.exists(image_path):
                    self.log(warning(f"Image not found: {image_path}, skipping"))
                    continue

                try:
                    # Load and process the image once
                    clip = ImageClip(image_path)

                    # Use a lower FPS for slideshow-style videos
                    clip = clip.set_fps(15)

                    # Crop to the vertical aspect ratio used by Shorts
                    aspect_ratio = 9 / 16
                    if clip.w / clip.h < aspect_ratio:
                        # Image is too tall: crop the height
                        clip = crop(
                            clip,
                            width=clip.w,
                            height=round(clip.w / aspect_ratio),
                            x_center=clip.w / 2,
                            y_center=clip.h / 2
                        )
                    else:
                        # Image is too wide: crop the width
                        clip = crop(
                            clip,
                            width=round(aspect_ratio * clip.h),
                            height=clip.h,
                            x_center=clip.w / 2,
                            y_center=clip.h / 2
                        )

                    # Use a more efficient resolution (still good for mobile)
                    clip = clip.resize((720, 1280))

                    processed_clips.append(clip)
                except Exception as e:
                    self.log(warning(f"Error processing image {image_path}: {str(e)}"))

            if not processed_clips:
                raise ValueError("No valid images could be processed")

            # Build the sequence from the processed clips, repeating as needed
            self.log(f"Creating video sequence from {len(processed_clips)} clips")
            final_clips = []
            tot_dur = 0

            while tot_dur < max_duration:
                for base_clip in processed_clips:
                    duration = min(req_dur, max_duration - tot_dur)
                    if duration <= 0:
                        break

                    # Reuse the pre-processed clip with a new duration
                    duration_clip = base_clip.set_duration(duration)
                    final_clips.append(duration_clip)
                    tot_dur += duration

                    if tot_dur >= max_duration:
                        break

            # Concatenate the sequence into one video
            self.log(f"Concatenating {len(final_clips)} clips")
            final_clip = concatenate_videoclips(final_clips)
            final_clip = final_clip.set_fps(15)  # lower FPS for slideshow-style video

            # Process the audio
            final_audio = tts_clip

            # Add background music if available and enabled
            if hasattr(self, 'enable_music') and self.enable_music and self.music_file != "none":
                music_path = None
                if self.music_file == "random":
                    music_path = choose_random_music()
                elif os.path.exists(os.path.join(MUSIC_DIR, self.music_file)):
                    music_path = os.path.join(MUSIC_DIR, self.music_file)

                if music_path and os.path.exists(music_path):
                    self.log(f"Adding background music: {music_path}")
                    try:
                        music_clip = AudioFileClip(music_path)
                        # Loop the music if it is shorter than the video
                        if music_clip.duration < max_duration:
                            num_loops = int(np.ceil(max_duration / music_clip.duration))
                            music_clip = concatenate_audioclips([music_clip] * num_loops)
                        # Trim the music if it is longer than the video
                        music_clip = music_clip.subclip(0, max_duration)
                        # Set the music volume
                        music_volume = getattr(self, 'music_volume', 0.1)
                        music_clip = music_clip.volumex(music_volume)
                        # Mix with the TTS audio
                        final_audio = CompositeAudioClip([tts_clip, music_clip])
                    except Exception as e:
                        self.log(warning(f"Error processing music: {str(e)}"))

            # Set the final audio track
            final_clip = final_clip.set_audio(final_audio)

            # Add subtitles if enabled
            if self.subtitles_enabled and hasattr(self, 'subtitle_data'):
                self.log("Adding subtitles (optimized)")
                subtitle_clips = self.create_subtitle_clip(self.subtitle_data, (720, 1280))  # match the render resolution
                if subtitle_clips:
                    final_clip = CompositeVideoClip([final_clip] + subtitle_clips)

            # Write the final video with speed-oriented encoder settings
            self.log("Writing final video file (optimized encoding)")
            final_clip.write_videofile(
                output_path,
                fps=15,                       # lower FPS for slideshow-style video
                codec="libx264",
                audio_codec="aac",
                threads=8,                    # more threads for faster encoding
                preset="ultrafast",           # fastest x264 preset
                ffmpeg_params=["-crf", "28"]  # trade quality for speed
            )

            # Clean up the temporary directory
            import shutil
            shutil.rmtree(temp_dir, ignore_errors=True)

            self.log(success(f"Video saved to: {output_path}"))
            return output_path

        except Exception as e:
            error_msg = f"Error combining video: {str(e)}"
            self.log(error(error_msg))
            raise Exception(error_msg)
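
    # Encoding trade-off: preset="ultrafast" with CRF 28 favors render speed
    # over visual quality. For a final upload, a slower x264 preset (e.g.
    # "medium") with CRF around 20-23 is the usual choice; it only costs
    # encode time.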

    def generate_video(self) -> dict:
        """Generate the complete video with all components."""
        try:
            self.log("Starting video generation process")

            # Create a unique, sequentially numbered folder for this generation
            folder_num = 1
            # Check existing folders to find the latest number
            if os.path.exists(STORAGE_DIR):
                existing_folders = [d for d in os.listdir(STORAGE_DIR) if os.path.isdir(os.path.join(STORAGE_DIR, d))]
                numbered_folders = []
                for folder in existing_folders:
                    try:
                        # Extract the folder number from the "N_UUID" format
                        if "_" in folder:
                            num = int(folder.split("_")[0])
                            numbered_folders.append(num)
                    except (ValueError, IndexError):
                        continue

                if numbered_folders:
                    folder_num = max(numbered_folders) + 1

            folder_id = f"{folder_num}_{str(uuid.uuid4())}"
            self.generation_folder = os.path.join(STORAGE_DIR, folder_id)
            os.makedirs(self.generation_folder, exist_ok=True)
            self.log(f"Created generation folder: {self.generation_folder}")

            try:
                # Step 1: Generate the topic
                self.log("Generating topic")
                self.generate_topic()

                # Step 2: Generate the script
                self.progress(0.1, desc="Creating script")
                self.log("Generating script")
                self.generate_script()

                # Step 3: Generate the metadata
                self.progress(0.2, desc="Creating metadata")
                self.log("Generating metadata")
                self.generate_metadata()

                # Step 4: Generate the image prompts
                self.progress(0.3, desc="Creating image prompts")
                self.log("Generating image prompts")
                self.generate_prompts()

                # Step 5: Generate the images
                self.progress(0.4, desc="Generating images")
                self.log("Generating images")
                for i, prompt in enumerate(self.image_prompts, 1):
                    self.progress(0.4 + 0.2 * (i / len(self.image_prompts)),
                                  desc=f"Generating image {i}/{len(self.image_prompts)}")
                    self.log(f"Generating image {i}/{len(self.image_prompts)}")
                    self.generate_image(prompt)

                # Step 6: Generate the speech
                self.progress(0.6, desc="Creating speech")
                self.log("Generating speech")
                self.generate_speech(self.script)

                # Step 7: Generate the subtitles
                self.progress(0.7, desc="Generating subtitles")
                if self.subtitles_enabled and hasattr(self, 'tts_path') and os.path.exists(self.tts_path):
                    self.subtitle_data = self.generate_subtitles(self.tts_path)
                    # Save the subtitles to the generation folder
                    if self.subtitle_data:
                        try:
                            # Save the word-level subtitles
                            if 'wordlevel' in self.subtitle_data:
                                word_subtitles_path = os.path.join(self.generation_folder, "word_subtitles.json")
                                with open(word_subtitles_path, 'w') as f:
                                    json.dump(self.subtitle_data['wordlevel'], f, indent=2)
                                self.log(f"Saved word-level subtitles to: {word_subtitles_path}")

                            # Save the line-level subtitles
                            if 'linelevel' in self.subtitle_data:
                                line_subtitles_path = os.path.join(self.generation_folder, "line_subtitles.json")
                                with open(line_subtitles_path, 'w') as f:
                                    json.dump(self.subtitle_data['linelevel'], f, indent=2)
                                self.log(f"Saved line-level subtitles to: {line_subtitles_path}")
                        except Exception as e:
                            self.log(warning(f"Error saving subtitles to generation folder: {str(e)}"))

                # Step 8: Save content.txt with all metadata and generation info
                self.progress(0.75, desc="Saving generation data")
                try:
                    content_path = os.path.join(self.generation_folder, "content.txt")
                    with open(content_path, 'w', encoding='utf-8') as f:
                        f.write(f"NICHE: {self.niche}\n\n")
                        f.write(f"LANGUAGE: {self.language}\n\n")
                        f.write(f"GENERATED TOPIC: {self.subject}\n\n")
                        f.write(f"GENERATED SCRIPT:\n{self.script}\n\n")
                        f.write("GENERATED PROMPTS:\n")
                        for i, prompt in enumerate(self.image_prompts, 1):
                            f.write(f"{i}. {prompt}\n")
                        f.write("\n")
                        f.write("GENERATED METADATA:\n")
                        for key, value in self.metadata.items():
                            f.write(f"{key}: {value}\n")
                    self.log(f"Saved content.txt to: {content_path}")
                except Exception as e:
                    self.log(warning(f"Error saving content.txt: {str(e)}"))

                # Step 9: Combine all elements into the final video
                self.progress(0.8, desc="Creating final video")
                self.log("Combining all elements into final video (optimized rendering)")

                # Clear memory before video rendering
                import gc
                gc.collect()

                path = self.combine()

                self.progress(0.95, desc="Finalizing")
                self.log(f"Video generation complete. Files saved in: {self.generation_folder}")

                # Return the result
                return {
                    'video_path': path,
                    'generation_folder': self.generation_folder,
                    'title': self.metadata['title'],
                    'description': self.metadata['description'],
                    'subject': self.subject,
                    'script': self.script,
                    'logs': self.logs
                }
            except Exception as e:
                error_msg = f"Error during video generation step: {str(e)}"
                self.log(error(error_msg))
                # Try to clean up any resources
                self.cleanup_resources()
                raise Exception(error_msg)

        except Exception as e:
            error_msg = f"Error during video generation: {str(e)}"
            self.log(error(error_msg))
            raise Exception(error_msg)
1417
+     def cleanup_resources(self):
+         """Clean up any resources to prevent memory leaks."""
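+         # NOTE: this kills *every* process whose name contains "magick" or
+         # "ffmpeg", not just our own children. Acceptable on a single-tenant
+         # Space, but too aggressive on a shared machine.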
+         try:
+             # Force close any remaining ImageMagick processes
+             import psutil
+             for proc in psutil.process_iter():
+                 try:
+                     # Check if process name contains ImageMagick or ffmpeg
+                     if 'magick' in proc.name().lower() or 'ffmpeg' in proc.name().lower():
+                         proc.kill()
+                 except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+                     pass
+
+             # Force garbage collection
+             import gc
+             gc.collect()
+         except Exception as e:
+             self.log(warning(f"Error during resource cleanup: {str(e)}"))
+
+ # Data for dynamic dropdowns
+ def get_text_generator_models(generator):
+     """Get available models for the selected text generator."""
+     models = {
+         "gemini": [
+             "gemini-2.0-flash",
+             "gemini-2.0-flash-lite",
+             "gemini-1.5-flash",
+             "gemini-1.5-flash-8b",
+             "gemini-1.5-pro"
+         ],
+         "g4f": [
+             "gpt-4",
+             "gpt-4o",
+             "gpt-3.5-turbo",
+             "llama-3-70b-chat",
+             "claude-3-opus-20240229",
+             "claude-3-sonnet-20240229",
+             "claude-3-haiku-20240307"
+         ],
+         "openai": [
+             "gpt-4o",
+             "gpt-4-turbo",
+             "gpt-3.5-turbo"
+         ]
+     }
+     return models.get(generator, ["default"])
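+ # Illustrative usage (not called at import time):
+ #   get_text_generator_models("gemini") -> ["gemini-2.0-flash", ...]
+ #   get_text_generator_models("nope")   -> ["default"]  (dict.get fallback)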
+
+ def get_image_generator_models(generator):
+     """Get available models for the selected image generator."""
+     models = {
+         "prodia": [
+             "sdxl",
+             "realvisxl",
+             "juggernaut",
+             "dreamshaper",
+             "dalle"
+         ],
+         "hercai": [
+             "v1",
+             "v2",
+             "v3",
+             "lexica"
+         ],
+         "g4f": [
+             "flux",
+             "dall-e-3",
+             "dall-e-2",
+             "midjourney"
+         ],
+         "segmind": [
+             "sdxl-turbo",
+             "realistic-vision",
+             "sd3"
+         ],
+         "pollinations": [
+             "default"
+         ]
+     }
+     return models.get(generator, ["default"])
+
+ def get_tts_voices(engine):
+     """Get available voices for the selected TTS engine."""
+     voices = {
+         "elevenlabs": [
+             "Sarah",          # Female, American accent
+             "Brian",          # Male, British accent
+             "Lily",           # Female, British accent
+             "Monika Sogam",   # Female, Indian accent
+             "George",         # Male, American accent
+             "River",          # Female, American accent
+             "Matilda",        # Female, British accent
+             "Will",           # Male, American accent
+             "Jessica"         # Female, American accent
+         ],
+         "openai": [
+             "alloy",
+             "echo",
+             "fable",
+             "onyx",
+             "nova",
+             "shimmer"
+         ],
+         "edge": [
+             "en-US-AriaNeural",
+             "en-US-GuyNeural",
+             "en-GB-SoniaNeural",
+             "en-AU-NatashaNeural"
+         ],
+         "gtts": [
+             "en",
+             "es",
+             "fr",
+             "de",
+             "it",
+             "pt",
+             "ru",
+             "ja",
+             "zh",
+             "hi"
+         ]
+     }
+     return voices.get(engine, ["default"])
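+ # All three lookup helpers share the same dict.get fallback pattern, e.g.:
+ #   get_tts_voices("edge")    -> ["en-US-AriaNeural", "en-US-GuyNeural", ...]
+ #   get_tts_voices("unknown") -> ["default"]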
+
+ # Create the Gradio interface
+ def create_interface():
+     with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", radius_size="lg"), title="YouTube Shorts Generator") as demo:
+         with gr.Row():
+             gr.Markdown(
+                 """
+                 # 📱 YouTube Shorts Generator
+                 Generate engaging YouTube Shorts videos with AI. Just provide a niche and language to get started!
+                 """
+             )
+
+         with gr.Row(equal_height=True):
+             # Left panel: Content Settings
+             with gr.Column(scale=2, min_width=500):
+                 with gr.Group():
+                     gr.Markdown("### 📝 Content")
+                     niche = gr.Textbox(
+                         label="Niche/Topic",
+                         placeholder="What's your video about?",
+                         value="Historical Facts"
+                     )
+                     language = gr.Dropdown(
+                         choices=["English", "Spanish", "French", "German", "Italian", "Portuguese",
+                                  "Russian", "Japanese", "Chinese", "Hindi"],
+                         label="Language",
+                         value="English"
+                     )
+
+                 # Generator Settings
+                 with gr.Group():
+                     gr.Markdown("### 🔧 Generator Settings")
+                     with gr.Tabs():
+                         with gr.TabItem("Text"):
+                             text_gen = gr.Dropdown(
+                                 choices=["g4f", "gemini", "openai"],
+                                 label="Text Generator",
+                                 value="g4f"
+                             )
+                             text_model = gr.Dropdown(
+                                 choices=get_text_generator_models("g4f"),
+                                 label="Text Model",
+                                 value="gpt-4"
+                             )
+
+                         with gr.TabItem("Image"):
+                             image_gen = gr.Dropdown(
+                                 choices=["g4f", "prodia", "hercai", "segmind", "pollinations"],
+                                 label="Image Generator",
+                                 value="g4f"
+                             )
+                             image_model = gr.Dropdown(
+                                 choices=get_image_generator_models("g4f"),
+                                 label="Image Model",
+                                 value="flux"
+                             )
+
+                         with gr.TabItem("Speech"):
+                             tts_engine = gr.Dropdown(
+                                 choices=["edge", "elevenlabs", "gtts", "openai"],
+                                 label="Speech Generator",
+                                 value="edge"
+                             )
+                             tts_voice = gr.Dropdown(
+                                 choices=get_tts_voices("edge"),
+                                 label="Voice",
+                                 value="en-US-AriaNeural"
+                             )
+
+                         with gr.TabItem("Audio"):
+                             enable_music = gr.Checkbox(label="Enable Background Music", value=True)
+                             # Default to "random" when it is available, otherwise "none"
+                             music_choices = get_music_files()
+                             default_music = "random" if "random" in music_choices else "none"
+                             music_file = gr.Dropdown(
+                                 choices=music_choices,
+                                 label="Background Music",
+                                 value=default_music,
+                                 interactive=True
+                             )
+                             music_volume = gr.Slider(
+                                 minimum=0.0,
+                                 maximum=1.0,
+                                 value=0.1,
+                                 step=0.05,
+                                 label="Background Music Volume"
+                             )
+
+                         with gr.TabItem("Subtitles"):
+                             subtitles_enabled = gr.Checkbox(label="Enable Subtitles", value=True)
+                             highlighting_enabled = gr.Checkbox(label="Enable Word Highlighting", value=True)
+                             subtitle_font = gr.Dropdown(
+                                 choices=get_font_files(),
+                                 label="Font",
+                                 value="random"
+                             )
+                             with gr.Row():
+                                 font_size = gr.Slider(
+                                     minimum=40,
+                                     maximum=120,
+                                     value=80,
+                                     step=5,
+                                     label="Font Size"
+                                 )
+                                 subtitle_position = gr.Dropdown(
+                                     choices=["bottom", "middle", "top"],
+                                     label="Position",
+                                     value="bottom"
+                                 )
+                             with gr.Row():
+                                 text_color = gr.ColorPicker(label="Text Color", value="#FFFFFF")
+                                 highlight_color = gr.ColorPicker(label="Highlight Color", value="#0000FF")
+
+                 # Generate button
+                 generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
+
+             # Right panel: Output display
+             with gr.Column(scale=1, min_width=300):
+                 with gr.Tabs():
+                     with gr.TabItem("Video"):
+                         # Larger video preview with proper mobile proportions
+                         video_output = gr.Video(label="Generated Video", height=580, width=330)
+
+                     with gr.TabItem("Metadata"):
+                         title_output = gr.Textbox(label="Title", lines=2)
+                         description_output = gr.Textbox(label="Description", lines=4)
+                         script_output = gr.Textbox(label="Script", lines=8)
+
+                     # API Keys section as a tab
+                     with gr.TabItem("🔑 API Keys"):
+                         gemini_api_key = gr.Textbox(
+                             label="Gemini API Key",
+                             type="password",
+                             value=os.environ.get("GEMINI_API_KEY", "")
+                         )
+                         assemblyai_api_key = gr.Textbox(
+                             label="AssemblyAI API Key",
+                             type="password",
+                             value=os.environ.get("ASSEMBLYAI_API_KEY", "")
+                         )
+                         elevenlabs_api_key = gr.Textbox(
+                             label="ElevenLabs API Key",
+                             type="password",
+                             value=os.environ.get("ELEVENLABS_API_KEY", "")
+                         )
+                         segmind_api_key = gr.Textbox(
+                             label="Segmind API Key",
+                             type="password",
+                             value=os.environ.get("SEGMIND_API_KEY", "")
+                         )
+                         openai_api_key = gr.Textbox(
+                             label="OpenAI API Key",
+                             type="password",
+                             value=os.environ.get("OPENAI_API_KEY", "")
+                         )
+
+                     with gr.TabItem("Log"):
+                         log_output = gr.Textbox(label="Process Log", lines=15, max_lines=100)
+
+         # Dynamic dropdown updates
+         def update_text_models(generator):
+             return gr.Dropdown(choices=get_text_generator_models(generator))
+
+         def update_image_models(generator):
+             return gr.Dropdown(choices=get_image_generator_models(generator))
+
+         def update_tts_voices(engine):
+             return gr.Dropdown(choices=get_tts_voices(engine))
+
+         # Connect the change events
+         text_gen.change(fn=update_text_models, inputs=text_gen, outputs=text_model)
+         image_gen.change(fn=update_image_models, inputs=image_gen, outputs=image_model)
+         tts_engine.change(fn=update_tts_voices, inputs=tts_engine, outputs=tts_voice)
+
+         # Main generation function
+         def generate_youtube_short(niche, language, text_gen, text_model, image_gen, image_model,
+                                    tts_engine, tts_voice, subtitles_enabled, highlighting_enabled,
+                                    subtitle_font, font_size, subtitle_position,
+                                    text_color, highlight_color, music_file,
+                                    enable_music, music_volume,
+                                    gemini_api_key, assemblyai_api_key,
+                                    elevenlabs_api_key, segmind_api_key, openai_api_key,
+                                    progress=gr.Progress()):
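+             # Returning a dict keyed by output components lets each branch update
+             # only the fields it needs; Gradio matches the keys against outputs=[...].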
+
+             if not niche.strip():
+                 return {
+                     video_output: None,
+                     title_output: "ERROR: Please enter a niche/topic",
+                     description_output: "",
+                     script_output: "",
+                     log_output: "Error: Niche/Topic is required. Please enter a valid topic and try again."
+                 }
+
+             # Create API keys dictionary
+             api_keys = {
+                 'gemini': gemini_api_key,
+                 'assemblyai': assemblyai_api_key,
+                 'elevenlabs': elevenlabs_api_key,
+                 'segmind': segmind_api_key,
+                 'openai': openai_api_key
+             }
+
+             try:
+                 # Initialize YouTube class
+                 yt = YouTube(
+                     niche=niche,
+                     language=language,
+                     text_gen=text_gen,
+                     text_model=text_model,
+                     image_gen=image_gen,
+                     image_model=image_model,
+                     tts_engine=tts_engine,
+                     tts_voice=tts_voice,
+                     subtitle_font=subtitle_font,
+                     font_size=font_size,
+                     text_color=text_color,
+                     highlight_color=highlight_color,
+                     subtitles_enabled=subtitles_enabled,
+                     highlighting_enabled=highlighting_enabled,
+                     subtitle_position=subtitle_position,
+                     music_file=music_file,
+                     enable_music=enable_music,
+                     music_volume=music_volume,
+                     api_keys=api_keys,
+                     progress=progress
+                 )
+
+                 # Generate video
+                 result = yt.generate_video()
+
+                 # Check if video was successfully created
+                 if not result or not result.get('video_path') or not os.path.exists(result.get('video_path', '')):
+                     return {
+                         video_output: None,
+                         title_output: "ERROR: Video generation failed",
+                         description_output: "",
+                         script_output: "",
+                         log_output: "\n".join(yt.logs)
+                     }
+
+                 return {
+                     video_output: result['video_path'],
+                     title_output: result['title'],
+                     description_output: result['description'],
+                     script_output: result['script'],
+                     log_output: "\n".join(result['logs'])
+                 }
+
+             except Exception as e:
+                 import traceback
+                 error_details = f"Error: {str(e)}\n\n{traceback.format_exc()}"
+                 return {
+                     video_output: None,
+                     title_output: f"ERROR: {str(e)}",
+                     description_output: "",
+                     script_output: "",
+                     log_output: error_details
+                 }
+
+         # Connect the button click event
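+         # The order of inputs=[...] must match the positional parameters of
+         # generate_youtube_short above; Gradio passes them by position.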
+         generate_btn.click(
+             fn=generate_youtube_short,
+             inputs=[
+                 niche, language, text_gen, text_model, image_gen, image_model,
+                 tts_engine, tts_voice, subtitles_enabled, highlighting_enabled,
+                 subtitle_font, font_size, subtitle_position, text_color, highlight_color, music_file,
+                 enable_music, music_volume, gemini_api_key, assemblyai_api_key, elevenlabs_api_key, segmind_api_key, openai_api_key
+             ],
+             outputs=[video_output, title_output, description_output, script_output, log_output]
+         )
+
+         # Add examples
+         music_choices = get_music_files()
+         default_music = "random" if "random" in music_choices else "none"
+
+         gr.Examples(
+             [
+                 ["Historical Facts", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-AriaNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#0000FF", default_music, True, 0.1],
+                 ["Cooking Tips", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-AriaNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#FF0000", default_music, True, 0.1],
+                 ["Technology News", "English", "g4f", "gpt-4", "g4f", "flux", "edge", "en-US-GuyNeural", True, True, "default", 80, "bottom", "#FFFFFF", "#00FF00", default_music, True, 0.1],
+             ],
+             [niche, language, text_gen, text_model, image_gen, image_model, tts_engine, tts_voice,
+              subtitles_enabled, highlighting_enabled, subtitle_font, font_size,
+              subtitle_position, text_color, highlight_color, music_file, enable_music, music_volume],
+             label="Quick Start Templates"
+         )
+
+     return demo
+
+ # Create and launch the interface
+ if __name__ == "__main__":
+     # Create necessary directories
+     os.makedirs(STATIC_DIR, exist_ok=True)
+     os.makedirs(MUSIC_DIR, exist_ok=True)
+     os.makedirs(FONTS_DIR, exist_ok=True)
+     os.makedirs(STORAGE_DIR, exist_ok=True)
+
+     # Launch the app
+     demo = create_interface()
+     demo.launch()
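+     # Assumption: default launch() settings are enough on Hugging Face Spaces;
+     # elsewhere, demo.launch(share=True) exposes a temporary public URL.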
requirements.txt ADDED
@@ -0,0 +1,27 @@
+ gradio
+ python-dotenv
+ wheel
+ setuptools
+ termcolor
+ schedule
+ prettytable
+ webdriver_manager
+ selenium_firefox
+ selenium
+ g4f[all]
+ moviepy==1.0.3
+ Pillow==9.5.0
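+ # note: moviepy 1.x still uses Image.ANTIALIAS, which Pillow 10 removed,
+ # so the two pins above must move together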
+ yagmail
+ assemblyai
+ srt_equalizer
+ undetected_chromedriver
+ platformdirs
+ google-generativeai
+ gtts
+ Brotli
+ edge-tts
+ playsound
+ telethon
+ PyExecJS
+ psutil
+ #TTS