sheikhed committed on
Commit
9ba68c3
·
verified ·
1 Parent(s): 3e4eb0e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -25
app.py CHANGED
@@ -11,14 +11,16 @@ from dotenv import load_dotenv
11
  load_dotenv()
12
 
13
  # API Keys
14
- A_KEY = os.getenv("A_KEY")
15
- B_KEY = os.getenv("B_KEY")
 
16
 
17
  # URLs
18
  API_URL = os.getenv("API_URL")
19
  UPLOAD_URL = os.getenv("UPLOAD_URL")
 
20
 
21
- def get_voices():
22
  url = "https://api.elevenlabs.io/v1/voices"
23
  headers = {
24
  "Accept": "application/json",
@@ -30,7 +32,18 @@ def get_voices():
30
  return []
31
  return [(voice['name'], voice['voice_id']) for voice in response.json().get('voices', [])]
32
 
33
- def text_to_speech(voice_id, text, session_id):
 
 
 
 
 
 
 
 
 
 
 
34
  url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
35
 
36
  headers = {
@@ -52,7 +65,27 @@ def text_to_speech(voice_id, text, session_id):
52
  if response.status_code != 200:
53
  return None
54
 
55
- # Save temporary audio file with session ID
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  audio_file_path = f'temp_voice_{session_id}.mp3'
57
  with open(audio_file_path, 'wb') as audio_file:
58
  audio_file.write(response.content)
@@ -89,7 +122,7 @@ def lipsync_api_call(video_url, audio_url):
89
 
90
  def check_job_status(job_id):
91
  headers = {"x-api-key": B_KEY}
92
- max_attempts = 30 # Limit the number of attempts
93
 
94
  for _ in range(max_attempts):
95
  response = requests.get(f"{API_URL}/{job_id}", headers=headers)
@@ -104,31 +137,27 @@ def check_job_status(job_id):
104
  return None
105
 
106
  def get_media_duration(file_path):
107
- # Fetch media duration using ffprobe
108
  cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_path]
109
  result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
110
  return float(result.stdout.strip())
111
 
112
  def combine_audio_video(video_path, audio_path, output_path):
113
- # Get durations of both video and audio
114
  video_duration = get_media_duration(video_path)
115
  audio_duration = get_media_duration(audio_path)
116
 
117
  if video_duration > audio_duration:
118
- # Trim video to match the audio length
119
  cmd = [
120
  'ffmpeg', '-i', video_path, '-i', audio_path,
121
- '-t', str(audio_duration), # Trim video to audio duration
122
  '-map', '0:v', '-map', '1:a',
123
  '-c:v', 'copy', '-c:a', 'aac',
124
  '-y', output_path
125
  ]
126
  else:
127
- # Loop video if it's shorter than audio
128
- loop_count = int(audio_duration // video_duration) + 1 # Calculate how many times to loop
129
  cmd = [
130
  'ffmpeg', '-stream_loop', str(loop_count), '-i', video_path, '-i', audio_path,
131
- '-t', str(audio_duration), # Match the duration of the final video with the audio
132
  '-map', '0:v', '-map', '1:a',
133
  '-c:v', 'copy', '-c:a', 'aac',
134
  '-shortest', '-y', output_path
@@ -136,10 +165,15 @@ def combine_audio_video(video_path, audio_path, output_path):
136
 
137
  subprocess.run(cmd, check=True)
138
 
139
- def process_video(voice, video_url, text, progress=gr.Progress()):
140
- session_id = str(uuid.uuid4()) # Generate a unique session ID
141
  progress(0, desc="Generating speech...")
142
- audio_path = text_to_speech(voice, text, session_id)
 
 
 
 
 
143
  if not audio_path:
144
  return None, "Failed to generate speech audio."
145
 
@@ -177,7 +211,6 @@ def process_video(voice, video_url, text, progress=gr.Progress()):
177
  except Exception as e:
178
  progress(0.8, desc="Falling back to simple combination...")
179
  try:
180
- # Download the video from the URL
181
  video_response = requests.get(video_url)
182
  video_path = f"temp_video_{session_id}.mp4"
183
  with open(video_path, "wb") as f:
@@ -190,20 +223,21 @@ def process_video(voice, video_url, text, progress=gr.Progress()):
190
  except Exception as fallback_error:
191
  return None, f"All methods failed. Error: {str(fallback_error)}"
192
  finally:
193
- # Cleanup
194
  if os.path.exists(audio_path):
195
  os.remove(audio_path)
196
  if os.path.exists(f"temp_video_{session_id}.mp4"):
197
  os.remove(f"temp_video_{session_id}.mp4")
198
 
199
  def create_interface():
200
- voices = get_voices()
 
201
 
202
  with gr.Blocks() as app:
203
- gr.Markdown("# JSON Train")
204
  with gr.Row():
205
  with gr.Column():
206
- voice_dropdown = gr.Dropdown(choices=[v[0] for v in voices], label="Select Voice", value=voices[0][0] if voices else None)
 
207
  video_url_input = gr.Textbox(label="Enter Video URL")
208
  text_input = gr.Textbox(label="Enter text", lines=3)
209
  generate_btn = gr.Button("Generate Video")
@@ -211,15 +245,27 @@ def create_interface():
211
  video_output = gr.Video(label="Generated Video")
212
  status_output = gr.Textbox(label="Status", interactive=False)
213
 
214
- def on_generate(voice_name, video_url, text):
215
- voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
 
 
 
 
 
 
 
 
 
 
 
 
216
  if not voice_id:
217
  return None, "Invalid voice selected."
218
- return process_video(voice_id, video_url, text)
219
 
220
  generate_btn.click(
221
  fn=on_generate,
222
- inputs=[voice_dropdown, video_url_input, text_input],
223
  outputs=[video_output, status_output]
224
  )
225
 
 
11
  load_dotenv()
12
 
13
  # API Keys
14
+ A_KEY = os.getenv("A_KEY") # ElevenLabs API key
15
+ B_KEY = os.getenv("B_KEY") # Lipsync API key
16
+ OPENAI_KEY = os.getenv("OPENAI_KEY") # OpenAI API key
17
 
18
  # URLs
19
  API_URL = os.getenv("API_URL")
20
  UPLOAD_URL = os.getenv("UPLOAD_URL")
21
+ OPENAI_API_URL = "https://api.openai.com/v1/audio/speech"
22
 
23
+ def get_elevenlabs_voices():
24
  url = "https://api.elevenlabs.io/v1/voices"
25
  headers = {
26
  "Accept": "application/json",
 
32
  return []
33
  return [(voice['name'], voice['voice_id']) for voice in response.json().get('voices', [])]
34
 
35
+ def get_openai_voices():
36
+ # OpenAI voices are predefined
37
+ return [
38
+ ("alloy", "alloy"),
39
+ ("echo", "echo"),
40
+ ("fable", "fable"),
41
+ ("onyx", "onyx"),
42
+ ("nova", "nova"),
43
+ ("shimmer", "shimmer")
44
+ ]
45
+
46
+ def text_to_speech_elevenlabs(voice_id, text, session_id):
47
  url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
48
 
49
  headers = {
 
65
  if response.status_code != 200:
66
  return None
67
 
68
+ audio_file_path = f'temp_voice_{session_id}.mp3'
69
+ with open(audio_file_path, 'wb') as audio_file:
70
+ audio_file.write(response.content)
71
+ return audio_file_path
72
+
73
+ def text_to_speech_openai(voice, text, session_id):
74
+ headers = {
75
+ "Authorization": f"Bearer {OPENAI_KEY}",
76
+ "Content-Type": "application/json"
77
+ }
78
+
79
+ data = {
80
+ "model": "tts-1",
81
+ "input": text,
82
+ "voice": voice
83
+ }
84
+
85
+ response = requests.post(OPENAI_API_URL, headers=headers, json=data)
86
+ if response.status_code != 200:
87
+ return None
88
+
89
  audio_file_path = f'temp_voice_{session_id}.mp3'
90
  with open(audio_file_path, 'wb') as audio_file:
91
  audio_file.write(response.content)
 
122
 
123
  def check_job_status(job_id):
124
  headers = {"x-api-key": B_KEY}
125
+ max_attempts = 30
126
 
127
  for _ in range(max_attempts):
128
  response = requests.get(f"{API_URL}/{job_id}", headers=headers)
 
137
  return None
138
 
139
  def get_media_duration(file_path):
 
140
  cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_path]
141
  result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
142
  return float(result.stdout.strip())
143
 
144
  def combine_audio_video(video_path, audio_path, output_path):
 
145
  video_duration = get_media_duration(video_path)
146
  audio_duration = get_media_duration(audio_path)
147
 
148
  if video_duration > audio_duration:
 
149
  cmd = [
150
  'ffmpeg', '-i', video_path, '-i', audio_path,
151
+ '-t', str(audio_duration),
152
  '-map', '0:v', '-map', '1:a',
153
  '-c:v', 'copy', '-c:a', 'aac',
154
  '-y', output_path
155
  ]
156
  else:
157
+ loop_count = int(audio_duration // video_duration) + 1
 
158
  cmd = [
159
  'ffmpeg', '-stream_loop', str(loop_count), '-i', video_path, '-i', audio_path,
160
+ '-t', str(audio_duration),
161
  '-map', '0:v', '-map', '1:a',
162
  '-c:v', 'copy', '-c:a', 'aac',
163
  '-shortest', '-y', output_path
 
165
 
166
  subprocess.run(cmd, check=True)
167
 
168
+ def process_video(provider, voice, video_url, text, progress=gr.Progress()):
169
+ session_id = str(uuid.uuid4())
170
  progress(0, desc="Generating speech...")
171
+
172
+ if provider == "ElevenLabs":
173
+ audio_path = text_to_speech_elevenlabs(voice, text, session_id)
174
+ else: # OpenAI
175
+ audio_path = text_to_speech_openai(voice, text, session_id)
176
+
177
  if not audio_path:
178
  return None, "Failed to generate speech audio."
179
 
 
211
  except Exception as e:
212
  progress(0.8, desc="Falling back to simple combination...")
213
  try:
 
214
  video_response = requests.get(video_url)
215
  video_path = f"temp_video_{session_id}.mp4"
216
  with open(video_path, "wb") as f:
 
223
  except Exception as fallback_error:
224
  return None, f"All methods failed. Error: {str(fallback_error)}"
225
  finally:
 
226
  if os.path.exists(audio_path):
227
  os.remove(audio_path)
228
  if os.path.exists(f"temp_video_{session_id}.mp4"):
229
  os.remove(f"temp_video_{session_id}.mp4")
230
 
231
  def create_interface():
232
+ elevenlabs_voices = get_elevenlabs_voices()
233
+ openai_voices = get_openai_voices()
234
 
235
  with gr.Blocks() as app:
236
+ gr.Markdown("# Voice Synthesis Application")
237
  with gr.Row():
238
  with gr.Column():
239
+ provider_dropdown = gr.Dropdown(choices=["ElevenLabs", "OpenAI"], label="Select Provider", value="ElevenLabs")
240
+ voice_dropdown = gr.Dropdown(choices=[v[0] for v in elevenlabs_voices], label="Select Voice", value=elevenlabs_voices[0][0] if elevenlabs_voices else None)
241
  video_url_input = gr.Textbox(label="Enter Video URL")
242
  text_input = gr.Textbox(label="Enter text", lines=3)
243
  generate_btn = gr.Button("Generate Video")
 
245
  video_output = gr.Video(label="Generated Video")
246
  status_output = gr.Textbox(label="Status", interactive=False)
247
 
248
+ def update_voices(provider):
249
+ if provider == "ElevenLabs":
250
+ return gr.Dropdown.update(choices=[v[0] for v in elevenlabs_voices], value=elevenlabs_voices[0][0] if elevenlabs_voices else None)
251
+ else: # OpenAI
252
+ return gr.Dropdown.update(choices=[v[0] for v in openai_voices], value=openai_voices[0][0])
253
+
254
+ provider_dropdown.change(fn=update_voices, inputs=[provider_dropdown], outputs=[voice_dropdown])
255
+
256
+ def on_generate(provider, voice_name, video_url, text):
257
+ if provider == "ElevenLabs":
258
+ voice_id = next((v[1] for v in elevenlabs_voices if v[0] == voice_name), None)
259
+ else: # OpenAI
260
+ voice_id = next((v[1] for v in openai_voices if v[0] == voice_name), None)
261
+
262
  if not voice_id:
263
  return None, "Invalid voice selected."
264
+ return process_video(provider, voice_id, video_url, text)
265
 
266
  generate_btn.click(
267
  fn=on_generate,
268
+ inputs=[provider_dropdown, voice_dropdown, video_url_input, text_input],
269
  outputs=[video_output, status_output]
270
  )
271