jsonop

Sleeping

App Files Community

sheikhed committed on Oct 14, 2024

Commit

9ba68c3

verified ·

1 Parent(s): 3e4eb0e

Update app.py

Browse files

Files changed (1) hide show

app.py +71 -25

app.py CHANGED Viewed

@@ -11,14 +11,16 @@ from dotenv import load_dotenv
 load_dotenv()
 # API Keys
-A_KEY = os.getenv("A_KEY")
-B_KEY = os.getenv("B_KEY")
 # URLs
 API_URL = os.getenv("API_URL")
 UPLOAD_URL = os.getenv("UPLOAD_URL")
-def get_voices():
     url = "https://api.elevenlabs.io/v1/voices"
     headers = {
         "Accept": "application/json",
@@ -30,7 +32,18 @@ def get_voices():
         return []
     return [(voice['name'], voice['voice_id']) for voice in response.json().get('voices', [])]
-def text_to_speech(voice_id, text, session_id):
     url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
     headers = {
@@ -52,7 +65,27 @@ def text_to_speech(voice_id, text, session_id):
     if response.status_code != 200:
         return None
-    # Save temporary audio file with session ID
     audio_file_path = f'temp_voice_{session_id}.mp3'
     with open(audio_file_path, 'wb') as audio_file:
         audio_file.write(response.content)
@@ -89,7 +122,7 @@ def lipsync_api_call(video_url, audio_url):
 def check_job_status(job_id):
     headers = {"x-api-key": B_KEY}
-    max_attempts = 30  # Limit the number of attempts
     for _ in range(max_attempts):
         response = requests.get(f"{API_URL}/{job_id}", headers=headers)
@@ -104,31 +137,27 @@ def check_job_status(job_id):
     return None
 def get_media_duration(file_path):
-    # Fetch media duration using ffprobe
     cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_path]
     result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     return float(result.stdout.strip())
 def combine_audio_video(video_path, audio_path, output_path):
-    # Get durations of both video and audio
     video_duration = get_media_duration(video_path)
     audio_duration = get_media_duration(audio_path)
     if video_duration > audio_duration:
-        # Trim video to match the audio length
         cmd = [
             'ffmpeg', '-i', video_path, '-i', audio_path,
-            '-t', str(audio_duration),  # Trim video to audio duration
             '-map', '0:v', '-map', '1:a',
             '-c:v', 'copy', '-c:a', 'aac',
             '-y', output_path
         ]
     else:
-        # Loop video if it's shorter than audio
-        loop_count = int(audio_duration // video_duration) + 1  # Calculate how many times to loop
         cmd = [
             'ffmpeg', '-stream_loop', str(loop_count), '-i', video_path, '-i', audio_path,
-            '-t', str(audio_duration),  # Match the duration of the final video with the audio
             '-map', '0:v', '-map', '1:a',
             '-c:v', 'copy', '-c:a', 'aac',
             '-shortest', '-y', output_path
@@ -136,10 +165,15 @@ def combine_audio_video(video_path, audio_path, output_path):
     subprocess.run(cmd, check=True)
-def process_video(voice, video_url, text, progress=gr.Progress()):
-    session_id = str(uuid.uuid4())  # Generate a unique session ID
     progress(0, desc="Generating speech...")
-    audio_path = text_to_speech(voice, text, session_id)
     if not audio_path:
         return None, "Failed to generate speech audio."
@@ -177,7 +211,6 @@ def process_video(voice, video_url, text, progress=gr.Progress()):
     except Exception as e:
         progress(0.8, desc="Falling back to simple combination...")
         try:
-            # Download the video from the URL
             video_response = requests.get(video_url)
             video_path = f"temp_video_{session_id}.mp4"
             with open(video_path, "wb") as f:
@@ -190,20 +223,21 @@ def process_video(voice, video_url, text, progress=gr.Progress()):
         except Exception as fallback_error:
             return None, f"All methods failed. Error: {str(fallback_error)}"
     finally:
-        # Cleanup
         if os.path.exists(audio_path):
             os.remove(audio_path)
         if os.path.exists(f"temp_video_{session_id}.mp4"):
             os.remove(f"temp_video_{session_id}.mp4")
 def create_interface():
-    voices = get_voices()
     with gr.Blocks() as app:
-        gr.Markdown("# JSON Train")
         with gr.Row():
             with gr.Column():
-                voice_dropdown = gr.Dropdown(choices=[v[0] for v in voices], label="Select Voice", value=voices[0][0] if voices else None)
                 video_url_input = gr.Textbox(label="Enter Video URL")
                 text_input = gr.Textbox(label="Enter text", lines=3)
                 generate_btn = gr.Button("Generate Video")
@@ -211,15 +245,27 @@ def create_interface():
                 video_output = gr.Video(label="Generated Video")
                 status_output = gr.Textbox(label="Status", interactive=False)
-        def on_generate(voice_name, video_url, text):
-            voice_id = next((v[1] for v in voices if v[0] == voice_name), None)
             if not voice_id:
                 return None, "Invalid voice selected."
-            return process_video(voice_id, video_url, text)
         generate_btn.click(
             fn=on_generate,
-            inputs=[voice_dropdown, video_url_input, text_input],
             outputs=[video_output, status_output]
         )

 load_dotenv()
 # API Keys
+# Each key is read from the environment after load_dotenv(); os.getenv
+# returns None for any variable that is not set.
+A_KEY = os.getenv("A_KEY")  # ElevenLabs API key
+B_KEY = os.getenv("B_KEY")  # Lipsync API key
+OPENAI_KEY = os.getenv("OPENAI_KEY")  # OpenAI API key
 # URLs
+# API_URL is the base address polled for job status ("{API_URL}/{job_id}").
 API_URL = os.getenv("API_URL")
 UPLOAD_URL = os.getenv("UPLOAD_URL")
+# The OpenAI speech endpoint is a fixed literal, not configurable via the environment.
+OPENAI_API_URL = "https://api.openai.com/v1/audio/speech"
+def get_elevenlabs_voices():
     url = "https://api.elevenlabs.io/v1/voices"
     headers = {
         "Accept": "application/json",
         return []
     return [(voice['name'], voice['voice_id']) for voice in response.json().get('voices', [])]
+def get_openai_voices():
+    # OpenAI voices are predefined
+    return [
+        ("alloy", "alloy"),
+        ("echo", "echo"),
+        ("fable", "fable"),
+        ("onyx", "onyx"),
+        ("nova", "nova"),
+        ("shimmer", "shimmer")
+    ]
+def text_to_speech_elevenlabs(voice_id, text, session_id):
     url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
     headers = {
     if response.status_code != 200:
         return None
+    audio_file_path = f'temp_voice_{session_id}.mp3'
+    with open(audio_file_path, 'wb') as audio_file:
+        audio_file.write(response.content)
+    return audio_file_path
+def text_to_speech_openai(voice, text, session_id):
+    headers = {
+        "Authorization": f"Bearer {OPENAI_KEY}",
+        "Content-Type": "application/json"
+    }
+    data = {
+        "model": "tts-1",
+        "input": text,
+        "voice": voice
+    }
+    response = requests.post(OPENAI_API_URL, headers=headers, json=data)
+    if response.status_code != 200:
+        return None
     audio_file_path = f'temp_voice_{session_id}.mp3'
     with open(audio_file_path, 'wb') as audio_file:
         audio_file.write(response.content)
 def check_job_status(job_id):
     headers = {"x-api-key": B_KEY}
+    max_attempts = 30
     for _ in range(max_attempts):
         response = requests.get(f"{API_URL}/{job_id}", headers=headers)
     return None
 def get_media_duration(file_path):
     """Return the duration of a media file in seconds, as reported by ffprobe.
 
     Args:
         file_path: Path of the audio or video file to inspect.
 
     Raises:
         ValueError: If ffprobe's output cannot be parsed as a number (for
             example when the file is missing or unreadable). The message
             includes the file path and ffprobe's stderr output.
     """
     cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_path]
     result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     raw_duration = result.stdout.strip()
     try:
         return float(raw_duration)
     except ValueError as err:
         # ffprobe writes its diagnostics to stderr; surface them instead of
         # letting float() fail with an opaque message about empty input.
         reason = result.stderr.decode(errors="replace").strip() or f"unexpected ffprobe output {raw_duration!r}"
         raise ValueError(f"Could not determine duration of {file_path!r}: {reason}") from err
 def combine_audio_video(video_path, audio_path, output_path):
     video_duration = get_media_duration(video_path)
     audio_duration = get_media_duration(audio_path)
     if video_duration > audio_duration:
         cmd = [
             'ffmpeg', '-i', video_path, '-i', audio_path,
+            '-t', str(audio_duration),
             '-map', '0:v', '-map', '1:a',
             '-c:v', 'copy', '-c:a', 'aac',
             '-y', output_path
         ]
     else:
+        loop_count = int(audio_duration // video_duration) + 1
         cmd = [
             'ffmpeg', '-stream_loop', str(loop_count), '-i', video_path, '-i', audio_path,
+            '-t', str(audio_duration),
             '-map', '0:v', '-map', '1:a',
             '-c:v', 'copy', '-c:a', 'aac',
             '-shortest', '-y', output_path
     subprocess.run(cmd, check=True)
+def process_video(provider, voice, video_url, text, progress=gr.Progress()):
+    session_id = str(uuid.uuid4())
     progress(0, desc="Generating speech...")
+    if provider == "ElevenLabs":
+        audio_path = text_to_speech_elevenlabs(voice, text, session_id)
+    else:  # OpenAI
+        audio_path = text_to_speech_openai(voice, text, session_id)
     if not audio_path:
         return None, "Failed to generate speech audio."
     except Exception as e:
         progress(0.8, desc="Falling back to simple combination...")
         try:
             video_response = requests.get(video_url)
             video_path = f"temp_video_{session_id}.mp4"
             with open(video_path, "wb") as f:
         except Exception as fallback_error:
             return None, f"All methods failed. Error: {str(fallback_error)}"
     finally:
         if os.path.exists(audio_path):
             os.remove(audio_path)
         if os.path.exists(f"temp_video_{session_id}.mp4"):
             os.remove(f"temp_video_{session_id}.mp4")
 def create_interface():
+    elevenlabs_voices = get_elevenlabs_voices()
+    openai_voices = get_openai_voices()
     with gr.Blocks() as app:
+        gr.Markdown("# Voice Synthesis Application")
         with gr.Row():
             with gr.Column():
+                provider_dropdown = gr.Dropdown(choices=["ElevenLabs", "OpenAI"], label="Select Provider", value="ElevenLabs")
+                voice_dropdown = gr.Dropdown(choices=[v[0] for v in elevenlabs_voices], label="Select Voice", value=elevenlabs_voices[0][0] if elevenlabs_voices else None)
                 video_url_input = gr.Textbox(label="Enter Video URL")
                 text_input = gr.Textbox(label="Enter text", lines=3)
                 generate_btn = gr.Button("Generate Video")
                 video_output = gr.Video(label="Generated Video")
                 status_output = gr.Textbox(label="Status", interactive=False)
+        def update_voices(provider):
+            if provider == "ElevenLabs":
+                return gr.Dropdown.update(choices=[v[0] for v in elevenlabs_voices], value=elevenlabs_voices[0][0] if elevenlabs_voices else None)
+            else:  # OpenAI
+                return gr.Dropdown.update(choices=[v[0] for v in openai_voices], value=openai_voices[0][0])
+        provider_dropdown.change(fn=update_voices, inputs=[provider_dropdown], outputs=[voice_dropdown])
+        def on_generate(provider, voice_name, video_url, text):
+            if provider == "ElevenLabs":
+                voice_id = next((v[1] for v in elevenlabs_voices if v[0] == voice_name), None)
+            else:  # OpenAI
+                voice_id = next((v[1] for v in openai_voices if v[0] == voice_name), None)
             if not voice_id:
                 return None, "Invalid voice selected."
+            return process_video(provider, voice_id, video_url, text)
         generate_btn.click(
             fn=on_generate,
+            inputs=[provider_dropdown, voice_dropdown, video_url_input, text_input],
             outputs=[video_output, status_output]
         )