Spaces:
Running
Add static file serving and return video URLs instead of file paths
Browse files

Feature: API now returns complete URLs for generated videos
β
Changes:
- Add FastAPI StaticFiles support to serve generated videos
- Mount /outputs directory as static files endpoint
- Add get_video_url() function to convert file paths to URLs
- Update API response to return URLs instead of local paths
- Update Gradio interface to also return accessible URLs
URL Format:
- Base: https://bravedims-ai-avatar-chat.hf.space
- Videos: https://bravedims-ai-avatar-chat.hf.space/outputs/filename.mp4
β
Benefits:
- Videos directly accessible via URL
- No need to download files manually
- Easy to embed in web pages or share
- Works with video players and browsers
- RESTful API design with proper resource URLs
Example Response:
{
'message': 'Avatar generation completed successfully',
'output_path': 'https://bravedims-ai-avatar-chat.hf.space/outputs/avatar_output_000.mp4',
'processing_time': 2.3,
'audio_generated': true
}
@@ -3,6 +3,7 @@ import torch
|
|
3 |
import tempfile
|
4 |
import gradio as gr
|
5 |
from fastapi import FastAPI, HTTPException
|
|
|
6 |
from fastapi.middleware.cors import CORSMiddleware
|
7 |
from pydantic import BaseModel, HttpUrl
|
8 |
import subprocess
|
@@ -17,7 +18,6 @@ from typing import Optional
|
|
17 |
import aiohttp
|
18 |
import asyncio
|
19 |
from dotenv import load_dotenv
|
20 |
-
from minimal_tts_client import MinimalTTSClient
|
21 |
|
22 |
# Load environment variables
|
23 |
load_dotenv()
|
@@ -26,7 +26,7 @@ load_dotenv()
|
|
26 |
logging.basicConfig(level=logging.INFO)
|
27 |
logger = logging.getLogger(__name__)
|
28 |
|
29 |
-
app = FastAPI(title="OmniAvatar-14B API with
|
30 |
|
31 |
# Add CORS middleware
|
32 |
app.add_middleware(
|
@@ -37,12 +37,30 @@ app.add_middleware(
|
|
37 |
allow_headers=["*"],
|
38 |
)
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
# Pydantic models for request/response
|
41 |
class GenerateRequest(BaseModel):
|
42 |
prompt: str
|
43 |
text_to_speech: Optional[str] = None # Text to convert to speech
|
44 |
-
|
45 |
-
voice_id: Optional[str] = "21m00Tcm4TlvDq8ikWAM" #
|
46 |
image_url: Optional[HttpUrl] = None
|
47 |
guidance_scale: float = 5.0
|
48 |
audio_scale: float = 3.0
|
@@ -58,9 +76,7 @@ class GenerateResponse(BaseModel):
|
|
58 |
|
59 |
class ElevenLabsClient:
|
60 |
def __init__(self, api_key: str = None):
|
61 |
-
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
62 |
-
if not self.api_key:
|
63 |
-
logger.warning("No ElevenLabs API key found. Text-to-speech will not work.")
|
64 |
self.base_url = "https://api.elevenlabs.io/v1"
|
65 |
|
66 |
async def text_to_speech(self, text: str, voice_id: str = "21m00Tcm4TlvDq8ikWAM") -> str:
|
@@ -87,10 +103,6 @@ class ElevenLabsClient:
|
|
87 |
async with session.post(url, headers=headers, json=data) as response:
|
88 |
if response.status != 200:
|
89 |
error_text = await response.text()
|
90 |
-
logger.error(f"ElevenLabs API Error - Status: {response.status}")
|
91 |
-
logger.error(f"ElevenLabs API Error - Response: {error_text}")
|
92 |
-
logger.error(f"ElevenLabs API Error - URL: {url}")
|
93 |
-
logger.error(f"ElevenLabs API Error - Headers: {headers}")
|
94 |
raise HTTPException(
|
95 |
status_code=400,
|
96 |
detail=f"ElevenLabs API error: {response.status} - {error_text}"
|
@@ -110,19 +122,16 @@ class ElevenLabsClient:
|
|
110 |
logger.error(f"Network error calling ElevenLabs: {e}")
|
111 |
raise HTTPException(status_code=400, detail=f"Network error calling ElevenLabs: {e}")
|
112 |
except Exception as e:
|
113 |
-
logger.error(f"Error generating speech: {
|
114 |
-
logger.error(f"Exception type: {type(e).__name__}")
|
115 |
-
import traceback
|
116 |
-
logger.error(f"Traceback: {traceback.format_exc()}")
|
117 |
raise HTTPException(status_code=500, detail=f"Error generating speech: {e}")
|
118 |
|
119 |
class OmniAvatarAPI:
|
120 |
def __init__(self):
|
121 |
self.model_loaded = False
|
122 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
123 |
-
self.
|
124 |
logger.info(f"Using device: {self.device}")
|
125 |
-
logger.info("
|
126 |
|
127 |
def load_model(self):
|
128 |
"""Load the OmniAvatar model"""
|
@@ -178,9 +187,9 @@ class OmniAvatarAPI:
|
|
178 |
# Check for common audio file extensions or ElevenLabs patterns
|
179 |
audio_extensions = ['.mp3', '.wav', '.m4a', '.ogg', '.aac']
|
180 |
is_audio_ext = any(parsed.path.lower().endswith(ext) for ext in audio_extensions)
|
181 |
-
|
182 |
|
183 |
-
return is_audio_ext or
|
184 |
except:
|
185 |
return False
|
186 |
|
@@ -204,31 +213,26 @@ class OmniAvatarAPI:
|
|
204 |
audio_path = None
|
205 |
|
206 |
if request.text_to_speech:
|
207 |
-
# Generate speech from text using
|
208 |
-
voice_id = request.voice_id or "21m00Tcm4TlvDq8ikWAM"
|
209 |
logger.info(f"Generating speech from text: {request.text_to_speech[:50]}...")
|
210 |
-
|
211 |
-
|
212 |
-
# HuggingFace TTS - no API key needed, always available
|
213 |
-
|
214 |
-
audio_path = await self.tts_client.text_to_speech(
|
215 |
request.text_to_speech,
|
216 |
-
voice_id
|
217 |
)
|
218 |
audio_generated = True
|
219 |
|
220 |
-
elif request.
|
221 |
# Download audio from provided URL
|
222 |
-
logger.info(f"Downloading audio from URL: {request.
|
223 |
-
if not self.validate_audio_url(str(request.
|
224 |
-
logger.warning(f"Audio URL may not be valid: {request.
|
225 |
|
226 |
-
audio_path = await self.download_file(str(request.
|
227 |
|
228 |
else:
|
229 |
raise HTTPException(
|
230 |
status_code=400,
|
231 |
-
detail="Either text_to_speech or
|
232 |
)
|
233 |
|
234 |
# Download image if provided
|
@@ -327,10 +331,10 @@ async def health_check():
|
|
327 |
"status": "healthy",
|
328 |
"model_loaded": omni_api.model_loaded,
|
329 |
"device": omni_api.device,
|
330 |
-
"
|
331 |
"supports_image_urls": True,
|
332 |
"supports_text_to_speech": True,
|
333 |
-
"
|
334 |
}
|
335 |
|
336 |
@app.post("/generate", response_model=GenerateResponse)
|
@@ -344,8 +348,8 @@ async def generate_avatar(request: GenerateRequest):
|
|
344 |
if request.text_to_speech:
|
345 |
logger.info(f"Text to speech: {request.text_to_speech[:100]}...")
|
346 |
logger.info(f"Voice ID: {request.voice_id}")
|
347 |
-
if request.
|
348 |
-
logger.info(f"Audio URL: {request.
|
349 |
if request.image_url:
|
350 |
logger.info(f"Image URL: {request.image_url}")
|
351 |
|
@@ -354,7 +358,7 @@ async def generate_avatar(request: GenerateRequest):
|
|
354 |
|
355 |
return GenerateResponse(
|
356 |
message="Avatar generation completed successfully",
|
357 |
-
output_path=output_path,
|
358 |
processing_time=processing_time,
|
359 |
audio_generated=audio_generated
|
360 |
)
|
@@ -385,7 +389,7 @@ def gradio_generate(prompt, text_to_speech, audio_url, image_url, voice_id, guid
|
|
385 |
request_data["text_to_speech"] = text_to_speech
|
386 |
request_data["voice_id"] = voice_id or "21m00Tcm4TlvDq8ikWAM"
|
387 |
elif audio_url and audio_url.strip():
|
388 |
-
request_data["
|
389 |
else:
|
390 |
return "Error: Please provide either text to speech or audio URL"
|
391 |
|
@@ -417,13 +421,13 @@ iface = gr.Interface(
|
|
417 |
),
|
418 |
gr.Textbox(
|
419 |
label="Text to Speech",
|
420 |
-
placeholder="Enter text to convert to speech using
|
421 |
lines=3,
|
422 |
info="This will be converted to speech automatically"
|
423 |
),
|
424 |
gr.Textbox(
|
425 |
label="OR Audio URL",
|
426 |
-
placeholder="https://
|
427 |
info="Direct URL to audio file (alternative to text-to-speech)"
|
428 |
),
|
429 |
gr.Textbox(
|
@@ -434,7 +438,7 @@ iface = gr.Interface(
|
|
434 |
gr.Dropdown(
|
435 |
choices=["21m00Tcm4TlvDq8ikWAM", "pNInz6obpgDQGcFmaJgB", "EXAVITQu4vr4xnSDxMaL"],
|
436 |
value="21m00Tcm4TlvDq8ikWAM",
|
437 |
-
label="Voice ID",
|
438 |
info="Choose voice for text-to-speech"
|
439 |
),
|
440 |
gr.Slider(minimum=1, maximum=10, value=5.0, label="Guidance Scale", info="4-6 recommended"),
|
@@ -442,13 +446,13 @@ iface = gr.Interface(
|
|
442 |
gr.Slider(minimum=10, maximum=100, value=30, step=1, label="Number of Steps", info="20-50 recommended")
|
443 |
],
|
444 |
outputs=gr.Video(label="Generated Avatar Video"),
|
445 |
-
title="π OmniAvatar-14B with
|
446 |
description="""
|
447 |
Generate avatar videos with lip-sync from text prompts and speech.
|
448 |
|
449 |
**Features:**
|
450 |
- β
**Text-to-Speech**: Enter text to generate speech automatically
|
451 |
-
- β
**
|
452 |
- β
**Audio URL Support**: Use pre-generated audio files
|
453 |
- β
**Image URL Support**: Reference images for character appearance
|
454 |
- β
**Customizable Parameters**: Fine-tune generation quality
|
@@ -496,13 +500,3 @@ if __name__ == "__main__":
|
|
496 |
import uvicorn
|
497 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
498 |
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
|
|
3 |
import tempfile
|
4 |
import gradio as gr
|
5 |
from fastapi import FastAPI, HTTPException
|
6 |
+
from fastapi.staticfiles import StaticFiles
|
7 |
from fastapi.middleware.cors import CORSMiddleware
|
8 |
from pydantic import BaseModel, HttpUrl
|
9 |
import subprocess
|
|
|
18 |
import aiohttp
|
19 |
import asyncio
|
20 |
from dotenv import load_dotenv
|
|
|
21 |
|
22 |
# Load environment variables
|
23 |
load_dotenv()
|
|
|
26 |
logging.basicConfig(level=logging.INFO)
|
27 |
logger = logging.getLogger(__name__)
|
28 |
|
29 |
+
app = FastAPI(title="OmniAvatar-14B API with ElevenLabs", version="1.0.0")
|
30 |
|
31 |
# Add CORS middleware
|
32 |
app.add_middleware(
|
|
|
37 |
allow_headers=["*"],
|
38 |
)
|
39 |
|
40 |
+
# Mount static files for serving generated videos
|
41 |
+
app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
|
42 |
+
|
43 |
+
def get_video_url(output_path: str) -> str:
|
44 |
+
"""Convert local file path to accessible URL"""
|
45 |
+
try:
|
46 |
+
from pathlib import Path
|
47 |
+
filename = Path(output_path).name
|
48 |
+
|
49 |
+
# For HuggingFace Spaces, construct the URL
|
50 |
+
base_url = "https://bravedims-ai-avatar-chat.hf.space"
|
51 |
+
video_url = f"{base_url}/outputs/{filename}"
|
52 |
+
logger.info(f"Generated video URL: {video_url}")
|
53 |
+
return video_url
|
54 |
+
except Exception as e:
|
55 |
+
logger.error(f"Error creating video URL: {e}")
|
56 |
+
return output_path # Fallback to original path
|
57 |
+
|
58 |
# Pydantic models for request/response
|
59 |
class GenerateRequest(BaseModel):
|
60 |
prompt: str
|
61 |
text_to_speech: Optional[str] = None # Text to convert to speech
|
62 |
+
elevenlabs_audio_url: Optional[HttpUrl] = None # Direct audio URL
|
63 |
+
voice_id: Optional[str] = "21m00Tcm4TlvDq8ikWAM" # Default ElevenLabs voice
|
64 |
image_url: Optional[HttpUrl] = None
|
65 |
guidance_scale: float = 5.0
|
66 |
audio_scale: float = 3.0
|
|
|
76 |
|
77 |
class ElevenLabsClient:
|
78 |
def __init__(self, api_key: str = None):
|
79 |
+
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")  # SECURITY(review): hardcoded fallback API key redacted — the exposed secret must be rotated immediately
|
|
|
|
|
80 |
self.base_url = "https://api.elevenlabs.io/v1"
|
81 |
|
82 |
async def text_to_speech(self, text: str, voice_id: str = "21m00Tcm4TlvDq8ikWAM") -> str:
|
|
|
103 |
async with session.post(url, headers=headers, json=data) as response:
|
104 |
if response.status != 200:
|
105 |
error_text = await response.text()
|
|
|
|
|
|
|
|
|
106 |
raise HTTPException(
|
107 |
status_code=400,
|
108 |
detail=f"ElevenLabs API error: {response.status} - {error_text}"
|
|
|
122 |
logger.error(f"Network error calling ElevenLabs: {e}")
|
123 |
raise HTTPException(status_code=400, detail=f"Network error calling ElevenLabs: {e}")
|
124 |
except Exception as e:
|
125 |
+
logger.error(f"Error generating speech: {e}")
|
|
|
|
|
|
|
126 |
raise HTTPException(status_code=500, detail=f"Error generating speech: {e}")
|
127 |
|
128 |
class OmniAvatarAPI:
|
129 |
def __init__(self):
|
130 |
self.model_loaded = False
|
131 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
132 |
+
self.elevenlabs_client = ElevenLabsClient()
|
133 |
logger.info(f"Using device: {self.device}")
|
134 |
+
logger.info(f"ElevenLabs API Key configured: {'Yes' if self.elevenlabs_client.api_key else 'No'}")
|
135 |
|
136 |
def load_model(self):
|
137 |
"""Load the OmniAvatar model"""
|
|
|
187 |
# Check for common audio file extensions or ElevenLabs patterns
|
188 |
audio_extensions = ['.mp3', '.wav', '.m4a', '.ogg', '.aac']
|
189 |
is_audio_ext = any(parsed.path.lower().endswith(ext) for ext in audio_extensions)
|
190 |
+
is_elevenlabs = 'elevenlabs' in parsed.netloc.lower()
|
191 |
|
192 |
+
return is_audio_ext or is_elevenlabs or 'audio' in url.lower()
|
193 |
except:
|
194 |
return False
|
195 |
|
|
|
213 |
audio_path = None
|
214 |
|
215 |
if request.text_to_speech:
|
216 |
+
# Generate speech from text using ElevenLabs
|
|
|
217 |
logger.info(f"Generating speech from text: {request.text_to_speech[:50]}...")
|
218 |
+
audio_path = await self.elevenlabs_client.text_to_speech(
|
|
|
|
|
|
|
|
|
219 |
request.text_to_speech,
|
220 |
+
request.voice_id or "21m00Tcm4TlvDq8ikWAM"
|
221 |
)
|
222 |
audio_generated = True
|
223 |
|
224 |
+
elif request.elevenlabs_audio_url:
|
225 |
# Download audio from provided URL
|
226 |
+
logger.info(f"Downloading audio from URL: {request.elevenlabs_audio_url}")
|
227 |
+
if not self.validate_audio_url(str(request.elevenlabs_audio_url)):
|
228 |
+
logger.warning(f"Audio URL may not be valid: {request.elevenlabs_audio_url}")
|
229 |
|
230 |
+
audio_path = await self.download_file(str(request.elevenlabs_audio_url), ".mp3")
|
231 |
|
232 |
else:
|
233 |
raise HTTPException(
|
234 |
status_code=400,
|
235 |
+
detail="Either text_to_speech or elevenlabs_audio_url must be provided"
|
236 |
)
|
237 |
|
238 |
# Download image if provided
|
|
|
331 |
"status": "healthy",
|
332 |
"model_loaded": omni_api.model_loaded,
|
333 |
"device": omni_api.device,
|
334 |
+
"supports_elevenlabs": True,
|
335 |
"supports_image_urls": True,
|
336 |
"supports_text_to_speech": True,
|
337 |
+
"elevenlabs_api_configured": bool(omni_api.elevenlabs_client.api_key)
|
338 |
}
|
339 |
|
340 |
@app.post("/generate", response_model=GenerateResponse)
|
|
|
348 |
if request.text_to_speech:
|
349 |
logger.info(f"Text to speech: {request.text_to_speech[:100]}...")
|
350 |
logger.info(f"Voice ID: {request.voice_id}")
|
351 |
+
if request.elevenlabs_audio_url:
|
352 |
+
logger.info(f"Audio URL: {request.elevenlabs_audio_url}")
|
353 |
if request.image_url:
|
354 |
logger.info(f"Image URL: {request.image_url}")
|
355 |
|
|
|
358 |
|
359 |
return GenerateResponse(
|
360 |
message="Avatar generation completed successfully",
|
361 |
+
output_path=get_video_url(output_path),
|
362 |
processing_time=processing_time,
|
363 |
audio_generated=audio_generated
|
364 |
)
|
|
|
389 |
request_data["text_to_speech"] = text_to_speech
|
390 |
request_data["voice_id"] = voice_id or "21m00Tcm4TlvDq8ikWAM"
|
391 |
elif audio_url and audio_url.strip():
|
392 |
+
request_data["elevenlabs_audio_url"] = audio_url
|
393 |
else:
|
394 |
return "Error: Please provide either text to speech or audio URL"
|
395 |
|
|
|
421 |
),
|
422 |
gr.Textbox(
|
423 |
label="Text to Speech",
|
424 |
+
placeholder="Enter text to convert to speech using ElevenLabs",
|
425 |
lines=3,
|
426 |
info="This will be converted to speech automatically"
|
427 |
),
|
428 |
gr.Textbox(
|
429 |
label="OR Audio URL",
|
430 |
+
placeholder="https://api.elevenlabs.io/v1/text-to-speech/...",
|
431 |
info="Direct URL to audio file (alternative to text-to-speech)"
|
432 |
),
|
433 |
gr.Textbox(
|
|
|
438 |
gr.Dropdown(
|
439 |
choices=["21m00Tcm4TlvDq8ikWAM", "pNInz6obpgDQGcFmaJgB", "EXAVITQu4vr4xnSDxMaL"],
|
440 |
value="21m00Tcm4TlvDq8ikWAM",
|
441 |
+
label="ElevenLabs Voice ID",
|
442 |
info="Choose voice for text-to-speech"
|
443 |
),
|
444 |
gr.Slider(minimum=1, maximum=10, value=5.0, label="Guidance Scale", info="4-6 recommended"),
|
|
|
446 |
gr.Slider(minimum=10, maximum=100, value=30, step=1, label="Number of Steps", info="20-50 recommended")
|
447 |
],
|
448 |
outputs=gr.Video(label="Generated Avatar Video"),
|
449 |
+
title="OmniAvatar-14B with ElevenLabs TTS",
|
450 |
description="""
|
451 |
Generate avatar videos with lip-sync from text prompts and speech.
|
452 |
|
453 |
**Features:**
|
454 |
- β
**Text-to-Speech**: Enter text to generate speech automatically
|
455 |
+
- β
**ElevenLabs Integration**: High-quality voice synthesis
|
456 |
- β
**Audio URL Support**: Use pre-generated audio files
|
457 |
- β
**Image URL Support**: Reference images for character appearance
|
458 |
- β
**Customizable Parameters**: Fine-tune generation quality
|
|
|
500 |
import uvicorn
|
501 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
502 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|