sheikhed committed
Commit a23ccb7 · verified · 1 Parent(s): 07d1639

Update app.py

Files changed (1)
  1. app.py +27 -78
app.py CHANGED
@@ -11,16 +11,14 @@ from dotenv import load_dotenv
 load_dotenv()
 
 # API Keys
-A_KEY = os.getenv("A_KEY") # ElevenLabs API key
-B_KEY = os.getenv("B_KEY") # Lipsync API key
-OPENAI_KEY = os.getenv("OPENAI_KEY") # OpenAI API key
+A_KEY = os.getenv("A_KEY")
+B_KEY = os.getenv("B_KEY")
 
 # URLs
 API_URL = os.getenv("API_URL")
 UPLOAD_URL = os.getenv("UPLOAD_URL")
-OPENAI_API_URL = "https://api.openai.com/v1/audio/speech"
 
-def get_elevenlabs_voices():
+def get_voices():
     url = "https://api.elevenlabs.io/v1/voices"
     headers = {
         "Accept": "application/json",
@@ -32,18 +30,7 @@ def get_elevenlabs_voices():
         return []
     return [(voice['name'], voice['voice_id']) for voice in response.json().get('voices', [])]
 
-def get_openai_voices():
-    # OpenAI voices are predefined
-    return [
-        ("alloy", "alloy"),
-        ("echo", "echo"),
-        ("fable", "fable"),
-        ("onyx", "onyx"),
-        ("nova", "nova"),
-        ("shimmer", "shimmer")
-    ]
-
-def text_to_speech_elevenlabs(voice_id, text, session_id):
+def text_to_speech(voice_id, text, session_id):
     url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
 
     headers = {
@@ -65,27 +52,7 @@ def text_to_speech_elevenlabs(voice_id, text, session_id):
     if response.status_code != 200:
         return None
 
-    audio_file_path = f'temp_voice_{session_id}.mp3'
-    with open(audio_file_path, 'wb') as audio_file:
-        audio_file.write(response.content)
-    return audio_file_path
-
-def text_to_speech_openai(voice, text, session_id):
-    headers = {
-        "Authorization": f"Bearer {OPENAI_KEY}",
-        "Content-Type": "application/json"
-    }
-
-    data = {
-        "model": "tts-1",
-        "input": text,
-        "voice": voice
-    }
-
-    response = requests.post(OPENAI_API_URL, headers=headers, json=data)
-    if response.status_code != 200:
-        return None
-
+    # Save temporary audio file with session ID
     audio_file_path = f'temp_voice_{session_id}.mp3'
     with open(audio_file_path, 'wb') as audio_file:
         audio_file.write(response.content)
@@ -122,7 +89,7 @@ def lipsync_api_call(video_url, audio_url):
 
 def check_job_status(job_id):
     headers = {"x-api-key": B_KEY}
-    max_attempts = 30
+    max_attempts = 30 # Limit the number of attempts
 
     for _ in range(max_attempts):
         response = requests.get(f"{API_URL}/{job_id}", headers=headers)
@@ -137,27 +104,31 @@ def check_job_status(job_id):
     return None
 
 def get_media_duration(file_path):
+    # Fetch media duration using ffprobe
     cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_path]
     result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     return float(result.stdout.strip())
 
 def combine_audio_video(video_path, audio_path, output_path):
+    # Get durations of both video and audio
     video_duration = get_media_duration(video_path)
     audio_duration = get_media_duration(audio_path)
 
     if video_duration > audio_duration:
+        # Trim video to match the audio length
         cmd = [
             'ffmpeg', '-i', video_path, '-i', audio_path,
-            '-t', str(audio_duration),
+            '-t', str(audio_duration), # Trim video to audio duration
             '-map', '0:v', '-map', '1:a',
             '-c:v', 'copy', '-c:a', 'aac',
             '-y', output_path
         ]
     else:
-        loop_count = int(audio_duration // video_duration) + 1
+        # Loop video if it's shorter than audio
+        loop_count = int(audio_duration // video_duration) + 1 # Calculate how many times to loop
         cmd = [
             'ffmpeg', '-stream_loop', str(loop_count), '-i', video_path, '-i', audio_path,
-            '-t', str(audio_duration),
+            '-t', str(audio_duration), # Match the duration of the final video with the audio
             '-map', '0:v', '-map', '1:a',
             '-c:v', 'copy', '-c:a', 'aac',
             '-shortest', '-y', output_path
@@ -165,15 +136,10 @@ def combine_audio_video(video_path, audio_path, output_path):
 
     subprocess.run(cmd, check=True)
 
-def process_video(provider, voice, video_url, text, progress=gr.Progress()):
-    session_id = str(uuid.uuid4())
+def process_video(voice, video_url, text, progress=gr.Progress()):
+    session_id = str(uuid.uuid4()) # Generate a unique session ID
     progress(0, desc="Generating speech...")
-
-    if provider == "ElevenLabs":
-        audio_path = text_to_speech_elevenlabs(voice, text, session_id)
-    else: # OpenAI
-        audio_path = text_to_speech_openai(voice, text, session_id)
-
+    audio_path = text_to_speech(voice, text, session_id)
     if not audio_path:
         return None, "Failed to generate speech audio."
 
@@ -211,6 +177,7 @@ def process_video(provider, voice, video_url, text, progress=gr.Progress()):
     except Exception as e:
         progress(0.8, desc="Falling back to simple combination...")
         try:
+            # Download the video from the URL
            video_response = requests.get(video_url)
            video_path = f"temp_video_{session_id}.mp4"
            with open(video_path, "wb") as f:
@@ -223,21 +190,20 @@ def process_video(provider, voice, video_url, text, progress=gr.Progress()):
        except Exception as fallback_error:
            return None, f"All methods failed. Error: {str(fallback_error)}"
    finally:
+        # Cleanup
        if os.path.exists(audio_path):
            os.remove(audio_path)
        if os.path.exists(f"temp_video_{session_id}.mp4"):
            os.remove(f"temp_video_{session_id}.mp4")
 
 def create_interface():
-    elevenlabs_voices = get_elevenlabs_voices()
-    openai_voices = get_openai_voices()
+    voices = get_voices()
 
     with gr.Blocks() as app:
-        gr.Markdown("# Voice Synthesis Application")
+        gr.Markdown("# JSON Train")
         with gr.Row():
             with gr.Column():
-                provider_dropdown = gr.Dropdown(choices=["ElevenLabs", "OpenAI"], label="Select Provider", value="ElevenLabs")
-                voice_dropdown = gr.Dropdown(choices=[v[0] for v in elevenlabs_voices], label="Select Voice", value=elevenlabs_voices[0][0] if elevenlabs_voices else None)
+                voice_dropdown = gr.Dropdown(choices=[v[0] for v in voices], label="Select Voice", value=voices[0][0] if voices else None)
                 video_url_input = gr.Textbox(label="Enter Video URL")
                 text_input = gr.Textbox(label="Enter text", lines=3)
                 generate_btn = gr.Button("Generate Video")
@@ -245,32 +211,15 @@ def create_interface():
                 video_output = gr.Video(label="Generated Video")
                 status_output = gr.Textbox(label="Status", interactive=False)
 
-        def update_voices(provider):
-            if provider == "ElevenLabs":
-                return gr.Dropdown(choices=[v[0] for v in elevenlabs_voices], value=elevenlabs_voices[0][0] if elevenlabs_voices else None)
-            else: # OpenAI
-                return gr.Dropdown(choices=[v[0] for v in openai_voices], value=openai_voices[0][0])
-
-        provider_dropdown.change(fn=update_voices, inputs=[provider_dropdown], outputs=[voice_dropdown])
-
-        def on_generate(provider, voice_name, video_url, text):
-            try:
-                if provider == "ElevenLabs":
-                    voices = elevenlabs_voices
-                else: # OpenAI
-                    voices = openai_voices
-
-                voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
-                if not voice_id:
-                    raise ValueError(f"Invalid voice selected for {provider}: {voice_name}")
-
-                return process_video(provider, voice_id, video_url, text)
-            except Exception as e:
-                return None, f"Error: {str(e)}"
+        def on_generate(voice_name, video_url, text):
+            voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
+            if not voice_id:
+                return None, "Invalid voice selected."
+            return process_video(voice_id, video_url, text)
 
         generate_btn.click(
             fn=on_generate,
-            inputs=[provider_dropdown, voice_dropdown, video_url_input, text_input],
+            inputs=[voice_dropdown, video_url_input, text_input],
             outputs=[video_output, status_output]
         )
 