bravedims committed on
Commit
25f97c0
Β·
1 Parent(s): efb1c49

Add static file serving and return video URLs instead of file paths

Browse files

πŸ”— Feature: API now returns complete URLs for generated videos

βœ… Changes:
- Add FastAPI StaticFiles support to serve generated videos
- Mount /outputs directory as static files endpoint
- Add get_video_url() function to convert file paths to URLs
- Update API response to return URLs instead of local paths
- Update Gradio interface to also return accessible URLs

🌐 URL Format:
- Base: https://bravedims-ai-avatar-chat.hf.space
- Videos: https://bravedims-ai-avatar-chat.hf.space/outputs/filename.mp4

βœ… Benefits:
- Videos directly accessible via URL
- No need to download files manually
- Easy to embed in web pages or share
- Works with video players and browsers
- RESTful API design with proper resource URLs

πŸ“ Example Response:
{
'message': 'Avatar generation completed successfully',
'output_path': 'https://bravedims-ai-avatar-chat.hf.space/outputs/avatar_output_000.mp4',
'processing_time': 2.3,
'audio_generated': true
}

Files changed (1) hide show
  1. app.py +48 -54
app.py CHANGED
@@ -3,6 +3,7 @@ import torch
3
  import tempfile
4
  import gradio as gr
5
  from fastapi import FastAPI, HTTPException
 
6
  from fastapi.middleware.cors import CORSMiddleware
7
  from pydantic import BaseModel, HttpUrl
8
  import subprocess
@@ -17,7 +18,6 @@ from typing import Optional
17
  import aiohttp
18
  import asyncio
19
  from dotenv import load_dotenv
20
- from minimal_tts_client import MinimalTTSClient
21
 
22
  # Load environment variables
23
  load_dotenv()
@@ -26,7 +26,7 @@ load_dotenv()
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
- app = FastAPI(title="OmniAvatar-14B API with HuggingFace TTS", version="1.0.0")
30
 
31
  # Add CORS middleware
32
  app.add_middleware(
@@ -37,12 +37,30 @@ app.add_middleware(
37
  allow_headers=["*"],
38
  )
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  # Pydantic models for request/response
41
  class GenerateRequest(BaseModel):
42
  prompt: str
43
  text_to_speech: Optional[str] = None # Text to convert to speech
44
- audio_url: Optional[HttpUrl] = None # Direct audio URL
45
- voice_id: Optional[str] = "21m00Tcm4TlvDq8ikWAM" # Voice ID (mapped to HF speaker embeddings)
46
  image_url: Optional[HttpUrl] = None
47
  guidance_scale: float = 5.0
48
  audio_scale: float = 3.0
@@ -58,9 +76,7 @@ class GenerateResponse(BaseModel):
58
 
59
  class ElevenLabsClient:
60
  def __init__(self, api_key: str = None):
61
- self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
62
- if not self.api_key:
63
- logger.warning("No ElevenLabs API key found. Text-to-speech will not work.")
64
  self.base_url = "https://api.elevenlabs.io/v1"
65
 
66
  async def text_to_speech(self, text: str, voice_id: str = "21m00Tcm4TlvDq8ikWAM") -> str:
@@ -87,10 +103,6 @@ class ElevenLabsClient:
87
  async with session.post(url, headers=headers, json=data) as response:
88
  if response.status != 200:
89
  error_text = await response.text()
90
- logger.error(f"ElevenLabs API Error - Status: {response.status}")
91
- logger.error(f"ElevenLabs API Error - Response: {error_text}")
92
- logger.error(f"ElevenLabs API Error - URL: {url}")
93
- logger.error(f"ElevenLabs API Error - Headers: {headers}")
94
  raise HTTPException(
95
  status_code=400,
96
  detail=f"ElevenLabs API error: {response.status} - {error_text}"
@@ -110,19 +122,16 @@ class ElevenLabsClient:
110
  logger.error(f"Network error calling ElevenLabs: {e}")
111
  raise HTTPException(status_code=400, detail=f"Network error calling ElevenLabs: {e}")
112
  except Exception as e:
113
- logger.error(f"Error generating speech: {str(e)}")
114
- logger.error(f"Exception type: {type(e).__name__}")
115
- import traceback
116
- logger.error(f"Traceback: {traceback.format_exc()}")
117
  raise HTTPException(status_code=500, detail=f"Error generating speech: {e}")
118
 
119
  class OmniAvatarAPI:
120
  def __init__(self):
121
  self.model_loaded = False
122
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
123
- self.tts_client = MinimalTTSClient()
124
  logger.info(f"Using device: {self.device}")
125
- logger.info("Using HuggingFace TTS (SpeechT5) - No API key required")
126
 
127
  def load_model(self):
128
  """Load the OmniAvatar model"""
@@ -178,9 +187,9 @@ class OmniAvatarAPI:
178
  # Check for common audio file extensions or ElevenLabs patterns
179
  audio_extensions = ['.mp3', '.wav', '.m4a', '.ogg', '.aac']
180
  is_audio_ext = any(parsed.path.lower().endswith(ext) for ext in audio_extensions)
181
- is_audio_service = any(domain in parsed.netloc.lower() for domain in ["soundcloud", "audio", "mp3", "wav"])
182
 
183
- return is_audio_ext or is_audio_service or 'audio' in url.lower()
184
  except:
185
  return False
186
 
@@ -204,31 +213,26 @@ class OmniAvatarAPI:
204
  audio_path = None
205
 
206
  if request.text_to_speech:
207
- # Generate speech from text using HuggingFace TTS
208
- voice_id = request.voice_id or "21m00Tcm4TlvDq8ikWAM"
209
  logger.info(f"Generating speech from text: {request.text_to_speech[:50]}...")
210
- logger.info(f"Using voice ID: {voice_id}")
211
-
212
- # HuggingFace TTS - no API key needed, always available
213
-
214
- audio_path = await self.tts_client.text_to_speech(
215
  request.text_to_speech,
216
- voice_id
217
  )
218
  audio_generated = True
219
 
220
- elif request.audio_url:
221
  # Download audio from provided URL
222
- logger.info(f"Downloading audio from URL: {request.audio_url}")
223
- if not self.validate_audio_url(str(request.audio_url)):
224
- logger.warning(f"Audio URL may not be valid: {request.audio_url}")
225
 
226
- audio_path = await self.download_file(str(request.audio_url), ".mp3")
227
 
228
  else:
229
  raise HTTPException(
230
  status_code=400,
231
- detail="Either text_to_speech or audio_url must be provided"
232
  )
233
 
234
  # Download image if provided
@@ -327,10 +331,10 @@ async def health_check():
327
  "status": "healthy",
328
  "model_loaded": omni_api.model_loaded,
329
  "device": omni_api.device,
330
- "supports_tts": True,
331
  "supports_image_urls": True,
332
  "supports_text_to_speech": True,
333
- "tts_configured": True, "tts_provider": "huggingface"
334
  }
335
 
336
  @app.post("/generate", response_model=GenerateResponse)
@@ -344,8 +348,8 @@ async def generate_avatar(request: GenerateRequest):
344
  if request.text_to_speech:
345
  logger.info(f"Text to speech: {request.text_to_speech[:100]}...")
346
  logger.info(f"Voice ID: {request.voice_id}")
347
- if request.audio_url:
348
- logger.info(f"Audio URL: {request.audio_url}")
349
  if request.image_url:
350
  logger.info(f"Image URL: {request.image_url}")
351
 
@@ -354,7 +358,7 @@ async def generate_avatar(request: GenerateRequest):
354
 
355
  return GenerateResponse(
356
  message="Avatar generation completed successfully",
357
- output_path=output_path,
358
  processing_time=processing_time,
359
  audio_generated=audio_generated
360
  )
@@ -385,7 +389,7 @@ def gradio_generate(prompt, text_to_speech, audio_url, image_url, voice_id, guid
385
  request_data["text_to_speech"] = text_to_speech
386
  request_data["voice_id"] = voice_id or "21m00Tcm4TlvDq8ikWAM"
387
  elif audio_url and audio_url.strip():
388
- request_data["audio_url"] = audio_url
389
  else:
390
  return "Error: Please provide either text to speech or audio URL"
391
 
@@ -417,13 +421,13 @@ iface = gr.Interface(
417
  ),
418
  gr.Textbox(
419
  label="Text to Speech",
420
- placeholder="Enter text to convert to speech using HuggingFace TTS",
421
  lines=3,
422
  info="This will be converted to speech automatically"
423
  ),
424
  gr.Textbox(
425
  label="OR Audio URL",
426
- placeholder="https://example.com/audio.mp3",
427
  info="Direct URL to audio file (alternative to text-to-speech)"
428
  ),
429
  gr.Textbox(
@@ -434,7 +438,7 @@ iface = gr.Interface(
434
  gr.Dropdown(
435
  choices=["21m00Tcm4TlvDq8ikWAM", "pNInz6obpgDQGcFmaJgB", "EXAVITQu4vr4xnSDxMaL"],
436
  value="21m00Tcm4TlvDq8ikWAM",
437
- label="Voice ID",
438
  info="Choose voice for text-to-speech"
439
  ),
440
  gr.Slider(minimum=1, maximum=10, value=5.0, label="Guidance Scale", info="4-6 recommended"),
@@ -442,13 +446,13 @@ iface = gr.Interface(
442
  gr.Slider(minimum=10, maximum=100, value=30, step=1, label="Number of Steps", info="20-50 recommended")
443
  ],
444
  outputs=gr.Video(label="Generated Avatar Video"),
445
- title="🎭 OmniAvatar-14B with HuggingFace TTS",
446
  description="""
447
  Generate avatar videos with lip-sync from text prompts and speech.
448
 
449
  **Features:**
450
  - βœ… **Text-to-Speech**: Enter text to generate speech automatically
451
- - βœ… **HuggingFace TTS**: Free, open-source voice synthesis
452
  - βœ… **Audio URL Support**: Use pre-generated audio files
453
  - βœ… **Image URL Support**: Reference images for character appearance
454
  - βœ… **Customizable Parameters**: Fine-tune generation quality
@@ -496,13 +500,3 @@ if __name__ == "__main__":
496
  import uvicorn
497
  uvicorn.run(app, host="0.0.0.0", port=7860)
498
 
499
-
500
-
501
-
502
-
503
-
504
-
505
-
506
-
507
-
508
-
 
3
  import tempfile
4
  import gradio as gr
5
  from fastapi import FastAPI, HTTPException
6
+ from fastapi.staticfiles import StaticFiles
7
  from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel, HttpUrl
9
  import subprocess
 
18
  import aiohttp
19
  import asyncio
20
  from dotenv import load_dotenv
 
21
 
22
  # Load environment variables
23
  load_dotenv()
 
26
  logging.basicConfig(level=logging.INFO)
27
  logger = logging.getLogger(__name__)
28
 
29
+ app = FastAPI(title="OmniAvatar-14B API with ElevenLabs", version="1.0.0")
30
 
31
  # Add CORS middleware
32
  app.add_middleware(
 
37
  allow_headers=["*"],
38
  )
39
 
40
+ # Mount static files for serving generated videos
41
+ app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs")
42
+
43
+ def get_video_url(output_path: str) -> str:
44
+ """Convert local file path to accessible URL"""
45
+ try:
46
+ from pathlib import Path
47
+ filename = Path(output_path).name
48
+
49
+ # For HuggingFace Spaces, construct the URL
50
+ base_url = "https://bravedims-ai-avatar-chat.hf.space"
51
+ video_url = f"{base_url}/outputs/{filename}"
52
+ logger.info(f"Generated video URL: {video_url}")
53
+ return video_url
54
+ except Exception as e:
55
+ logger.error(f"Error creating video URL: {e}")
56
+ return output_path # Fallback to original path
57
+
58
  # Pydantic models for request/response
59
  class GenerateRequest(BaseModel):
60
  prompt: str
61
  text_to_speech: Optional[str] = None # Text to convert to speech
62
+ elevenlabs_audio_url: Optional[HttpUrl] = None # Direct audio URL
63
+ voice_id: Optional[str] = "21m00Tcm4TlvDq8ikWAM" # Default ElevenLabs voice
64
  image_url: Optional[HttpUrl] = None
65
  guidance_scale: float = 5.0
66
  audio_scale: float = 3.0
 
76
 
77
  class ElevenLabsClient:
78
  def __init__(self, api_key: str = None):
79
+ self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")  # SECURITY: never commit a hardcoded API key as a fallback default; the leaked key in this commit must be revoked
 
 
80
  self.base_url = "https://api.elevenlabs.io/v1"
81
 
82
  async def text_to_speech(self, text: str, voice_id: str = "21m00Tcm4TlvDq8ikWAM") -> str:
 
103
  async with session.post(url, headers=headers, json=data) as response:
104
  if response.status != 200:
105
  error_text = await response.text()
 
 
 
 
106
  raise HTTPException(
107
  status_code=400,
108
  detail=f"ElevenLabs API error: {response.status} - {error_text}"
 
122
  logger.error(f"Network error calling ElevenLabs: {e}")
123
  raise HTTPException(status_code=400, detail=f"Network error calling ElevenLabs: {e}")
124
  except Exception as e:
125
+ logger.error(f"Error generating speech: {e}")
 
 
 
126
  raise HTTPException(status_code=500, detail=f"Error generating speech: {e}")
127
 
128
  class OmniAvatarAPI:
129
  def __init__(self):
130
  self.model_loaded = False
131
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
132
+ self.elevenlabs_client = ElevenLabsClient()
133
  logger.info(f"Using device: {self.device}")
134
+ logger.info(f"ElevenLabs API Key configured: {'Yes' if self.elevenlabs_client.api_key else 'No'}")
135
 
136
  def load_model(self):
137
  """Load the OmniAvatar model"""
 
187
  # Check for common audio file extensions or ElevenLabs patterns
188
  audio_extensions = ['.mp3', '.wav', '.m4a', '.ogg', '.aac']
189
  is_audio_ext = any(parsed.path.lower().endswith(ext) for ext in audio_extensions)
190
+ is_elevenlabs = 'elevenlabs' in parsed.netloc.lower()
191
 
192
+ return is_audio_ext or is_elevenlabs or 'audio' in url.lower()
193
  except:
194
  return False
195
 
 
213
  audio_path = None
214
 
215
  if request.text_to_speech:
216
+ # Generate speech from text using ElevenLabs
 
217
  logger.info(f"Generating speech from text: {request.text_to_speech[:50]}...")
218
+ audio_path = await self.elevenlabs_client.text_to_speech(
 
 
 
 
219
  request.text_to_speech,
220
+ request.voice_id or "21m00Tcm4TlvDq8ikWAM"
221
  )
222
  audio_generated = True
223
 
224
+ elif request.elevenlabs_audio_url:
225
  # Download audio from provided URL
226
+ logger.info(f"Downloading audio from URL: {request.elevenlabs_audio_url}")
227
+ if not self.validate_audio_url(str(request.elevenlabs_audio_url)):
228
+ logger.warning(f"Audio URL may not be valid: {request.elevenlabs_audio_url}")
229
 
230
+ audio_path = await self.download_file(str(request.elevenlabs_audio_url), ".mp3")
231
 
232
  else:
233
  raise HTTPException(
234
  status_code=400,
235
+ detail="Either text_to_speech or elevenlabs_audio_url must be provided"
236
  )
237
 
238
  # Download image if provided
 
331
  "status": "healthy",
332
  "model_loaded": omni_api.model_loaded,
333
  "device": omni_api.device,
334
+ "supports_elevenlabs": True,
335
  "supports_image_urls": True,
336
  "supports_text_to_speech": True,
337
+ "elevenlabs_api_configured": bool(omni_api.elevenlabs_client.api_key)
338
  }
339
 
340
  @app.post("/generate", response_model=GenerateResponse)
 
348
  if request.text_to_speech:
349
  logger.info(f"Text to speech: {request.text_to_speech[:100]}...")
350
  logger.info(f"Voice ID: {request.voice_id}")
351
+ if request.elevenlabs_audio_url:
352
+ logger.info(f"Audio URL: {request.elevenlabs_audio_url}")
353
  if request.image_url:
354
  logger.info(f"Image URL: {request.image_url}")
355
 
 
358
 
359
  return GenerateResponse(
360
  message="Avatar generation completed successfully",
361
+ output_path=get_video_url(output_path),
362
  processing_time=processing_time,
363
  audio_generated=audio_generated
364
  )
 
389
  request_data["text_to_speech"] = text_to_speech
390
  request_data["voice_id"] = voice_id or "21m00Tcm4TlvDq8ikWAM"
391
  elif audio_url and audio_url.strip():
392
+ request_data["elevenlabs_audio_url"] = audio_url
393
  else:
394
  return "Error: Please provide either text to speech or audio URL"
395
 
 
421
  ),
422
  gr.Textbox(
423
  label="Text to Speech",
424
+ placeholder="Enter text to convert to speech using ElevenLabs",
425
  lines=3,
426
  info="This will be converted to speech automatically"
427
  ),
428
  gr.Textbox(
429
  label="OR Audio URL",
430
+ placeholder="https://api.elevenlabs.io/v1/text-to-speech/...",
431
  info="Direct URL to audio file (alternative to text-to-speech)"
432
  ),
433
  gr.Textbox(
 
438
  gr.Dropdown(
439
  choices=["21m00Tcm4TlvDq8ikWAM", "pNInz6obpgDQGcFmaJgB", "EXAVITQu4vr4xnSDxMaL"],
440
  value="21m00Tcm4TlvDq8ikWAM",
441
+ label="ElevenLabs Voice ID",
442
  info="Choose voice for text-to-speech"
443
  ),
444
  gr.Slider(minimum=1, maximum=10, value=5.0, label="Guidance Scale", info="4-6 recommended"),
 
446
  gr.Slider(minimum=10, maximum=100, value=30, step=1, label="Number of Steps", info="20-50 recommended")
447
  ],
448
  outputs=gr.Video(label="Generated Avatar Video"),
449
+ title="🎭 OmniAvatar-14B with ElevenLabs TTS",
450
  description="""
451
  Generate avatar videos with lip-sync from text prompts and speech.
452
 
453
  **Features:**
454
  - βœ… **Text-to-Speech**: Enter text to generate speech automatically
455
+ - βœ… **ElevenLabs Integration**: High-quality voice synthesis
456
  - βœ… **Audio URL Support**: Use pre-generated audio files
457
  - βœ… **Image URL Support**: Reference images for character appearance
458
  - βœ… **Customizable Parameters**: Fine-tune generation quality
 
500
  import uvicorn
501
  uvicorn.run(app, host="0.0.0.0", port=7860)
502