openfree committed on
Commit 9cdb5d5 · verified · 1 Parent(s): 6e12ecc

Create app-backup.py

Files changed (1)
app-backup.py +655 -0
app-backup.py ADDED
@@ -0,0 +1,655 @@
+ import gradio as gr
+ import os
+ import asyncio
+ import torch
+ import io
+ import json
+ import re
+ import httpx
+ import tempfile
+ import wave
+ import base64
+ import numpy as np
+ import soundfile as sf
+ import subprocess
+ import shutil
+ from dataclasses import dataclass
+ from typing import List, Tuple, Dict, Optional
+ from pathlib import Path
+ from threading import Thread
+ from dotenv import load_dotenv
+
+ # Edge TTS imports
+ import edge_tts
+ from pydub import AudioSegment
+
+ # OpenAI imports
+ from openai import OpenAI
+
+ # Transformers imports (for local mode)
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TextIteratorStreamer,
+     BitsAndBytesConfig,
+ )
+
+ # Spark TTS imports
+ try:
+     from huggingface_hub import snapshot_download
+     SPARK_AVAILABLE = True
+ except ImportError:
+     SPARK_AVAILABLE = False
+
+ # MeloTTS imports (for local mode)
+ try:
+     os.system("python -m unidic download")
+     from melo.api import TTS as MeloTTS
+     MELO_AVAILABLE = True
+ except Exception:
+     MELO_AVAILABLE = False
+
+ load_dotenv()
+
+
+ @dataclass
+ class ConversationConfig:
+     max_words: int = 6000
+     prefix_url: str = "https://r.jina.ai/"
+     model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
+     local_model_name: str = "NousResearch/Hermes-2-Pro-Llama-3-8B"
+
+
+ class UnifiedAudioConverter:
+     def __init__(self, config: ConversationConfig):
+         self.config = config
+         self.llm_client = None
+         self.local_model = None
+         self.tokenizer = None
+         self.melo_models = None
+         self.spark_model_dir = None
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     def initialize_api_mode(self, api_key: str):
+         """Initialize API mode with the Together API"""
+         self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")
+
+     def initialize_local_mode(self):
+         """Initialize local mode with a Hugging Face model"""
+         if self.local_model is None:
+             quantization_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=torch.float16
+             )
+             self.local_model = AutoModelForCausalLM.from_pretrained(
+                 self.config.local_model_name,
+                 quantization_config=quantization_config
+             )
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 self.config.local_model_name,
+                 revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
+             )
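+     # Note: loading in 4-bit via bitsandbytes keeps the 8B-parameter model to
+     # very roughly 5-6 GB of VRAM (a ballpark, not a measured figure), with
+     # float16 used as the compute dtype for the matmuls.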
+
+     def initialize_spark_tts(self):
+         """Initialize Spark TTS model, downloading it if needed"""
+         if not SPARK_AVAILABLE:
+             raise RuntimeError("Spark TTS dependencies not available")
+
+         model_dir = "pretrained_models/Spark-TTS-0.5B"
+
+         # Check if the model exists; if not, download it
+         if not os.path.exists(model_dir):
+             print("Downloading Spark-TTS model...")
+             try:
+                 os.makedirs("pretrained_models", exist_ok=True)
+                 snapshot_download(
+                     "SparkAudio/Spark-TTS-0.5B",
+                     local_dir=model_dir
+                 )
+                 print("Spark-TTS model downloaded successfully")
+             except Exception as e:
+                 raise RuntimeError(f"Failed to download Spark-TTS model: {e}")
+
+         self.spark_model_dir = model_dir
+
+         # Check that the CLI inference script is present
+         if not os.path.exists("cli/inference.py"):
+             print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")
+
+     def initialize_melo_tts(self):
+         """Initialize MeloTTS models"""
+         if MELO_AVAILABLE and self.melo_models is None:
+             self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}
+
+     def fetch_text(self, url: str) -> str:
+         """Fetch text content from a URL"""
+         if not url:
+             raise ValueError("URL cannot be empty")
+
+         if not url.startswith("http://") and not url.startswith("https://"):
+             raise ValueError("URL must start with 'http://' or 'https://'")
+
+         full_url = f"{self.config.prefix_url}{url}"
+         try:
+             response = httpx.get(full_url, timeout=60.0)
+             response.raise_for_status()
+             return response.text
+         except httpx.HTTPError as e:
+             raise RuntimeError(f"Failed to fetch URL: {e}")
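+     # Note: prepending https://r.jina.ai/ turns any page URL into a request to
+     # Jina AI's "Reader" service, which returns an LLM-friendly plain-text
+     # rendition of the page instead of raw HTML, e.g.
+     #     https://r.jina.ai/https://example.com/article
+     # (example.com used here purely for illustration).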
+
+     def _build_prompt(self, text: str) -> str:
+         """Build the prompt for conversation generation"""
+         template = """
+         {
+             "conversation": [
+                 {"speaker": "", "text": ""},
+                 {"speaker": "", "text": ""}
+             ]
+         }
+         """
+         return (
+             f"{text}\n\nConvert the provided text into a short, informative and crisp "
+             f"podcast conversation between two experts. The tone should be "
+             f"professional and engaging. Please adhere to the following "
+             f"format and return ONLY the JSON:\n{template}"
+         )
+
+     def extract_conversation_api(self, text: str) -> Dict:
+         """Extract conversation using the API"""
+         if not self.llm_client:
+             raise RuntimeError("API mode not initialized")
+
+         try:
+             chat_completion = self.llm_client.chat.completions.create(
+                 messages=[{"role": "user", "content": self._build_prompt(text)}],
+                 model=self.config.model_name,
+             )
+
+             pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
+             json_match = re.search(pattern, chat_completion.choices[0].message.content)
+
+             if not json_match:
+                 raise ValueError("No valid JSON found in response")
+
+             return json.loads(json_match.group())
+         except Exception as e:
+             raise RuntimeError(f"Failed to extract conversation: {e}")
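+     # Note on the regex above: it matches a brace-delimited block that may
+     # contain one level of nested {...} objects, which is exactly the shape
+     # of the expected {"conversation": [{...}, {...}]} payload (the array's
+     # square brackets do not add brace nesting).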
+
+     def extract_conversation_local(self, text: str, progress=None) -> Dict:
+         """Extract conversation using the local model"""
+         if not self.local_model or not self.tokenizer:
+             raise RuntimeError("Local mode not initialized")
+
+         chat = [{
+             "role": "user",
+             "content": self._build_prompt(text)
+         }]
+
+         terminators = [
+             self.tokenizer.eos_token_id,
+             self.tokenizer.convert_tokens_to_ids("<|eot_id|>")
+         ]
+
+         messages = self.tokenizer.apply_chat_template(
+             chat, tokenize=False, add_generation_prompt=True
+         )
+         model_inputs = self.tokenizer([messages], return_tensors="pt").to(self.device)
+
+         streamer = TextIteratorStreamer(
+             self.tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
+         )
+
+         generate_kwargs = dict(
+             model_inputs,
+             streamer=streamer,
+             max_new_tokens=4000,
+             do_sample=True,
+             temperature=0.9,
+             eos_token_id=terminators,
+         )
+
+         t = Thread(target=self.local_model.generate, kwargs=generate_kwargs)
+         t.start()
+
+         partial_text = ""
+         for new_text in streamer:
+             partial_text += new_text
+
+         pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
+         json_match = re.search(pattern, partial_text)
+
+         if json_match:
+             return json.loads(json_match.group())
+         else:
+             # Return a default template if no valid JSON is found
+             return {
+                 "conversation": [
+                     {"speaker": "Host", "text": "Welcome to our podcast."},
+                     {"speaker": "Guest", "text": "Thank you for having me."}
+                 ]
+             }
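+     # Note: generation runs on a background Thread while this thread drains
+     # the TextIteratorStreamer; the streaming setup is used here simply to
+     # collect the full output, not for incremental UI updates.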
+
+     def parse_conversation_text(self, conversation_text: str) -> Dict:
+         """Parse conversation text back into the JSON format"""
+         lines = conversation_text.strip().split('\n')
+         conversation_data = {"conversation": []}
+
+         for line in lines:
+             if ':' in line:
+                 speaker, text = line.split(':', 1)
+                 conversation_data["conversation"].append({
+                     "speaker": speaker.strip(),
+                     "text": text.strip()
+                 })
+
+         return conversation_data
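+     # For example, the lines
+     #     Host: Welcome to the show.
+     #     Guest: Glad to be here.
+     # parse to {"conversation": [{"speaker": "Host", "text": "Welcome to the show."},
+     #                            {"speaker": "Guest", "text": "Glad to be here."}]}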
+
+     async def text_to_speech_edge(self, conversation_json: Dict, voice_1: str, voice_2: str) -> Tuple[str, str]:
+         """Convert text to speech using Edge TTS"""
+         output_dir = Path(self._create_output_directory())
+         filenames = []
+
+         try:
+             for i, turn in enumerate(conversation_json["conversation"]):
+                 filename = output_dir / f"output_{i}.wav"
+                 voice = voice_1 if i % 2 == 0 else voice_2
+
+                 tmp_path = await self._generate_audio_edge(turn["text"], voice)
+                 os.rename(tmp_path, filename)
+                 filenames.append(str(filename))
+
+             # Combine audio files
+             final_output = os.path.join(output_dir, "combined_output.wav")
+             self._combine_audio_files(filenames, final_output)
+
+             # Generate conversation text
+             conversation_text = "\n".join(
+                 f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
+                 for i, turn in enumerate(conversation_json["conversation"])
+             )
+
+             return final_output, conversation_text
+         except Exception as e:
+             raise RuntimeError(f"Failed to convert text to speech: {e}")
+
+     async def _generate_audio_edge(self, text: str, voice: str) -> str:
+         """Generate audio for one turn using Edge TTS"""
+         if not text.strip():
+             raise ValueError("Text cannot be empty")
+
+         # Voices may arrive as "short-name - description"; keep only the short name
+         voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
+         communicate = edge_tts.Communicate(text, voice_short_name)
+
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+             tmp_path = tmp_file.name
+             await communicate.save(tmp_path)
+
+         return tmp_path
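+     # Note: edge_tts emits MP3-encoded audio by default even though the file
+     # is named .wav; this still works downstream because pydub's
+     # AudioSegment.from_file() sniffs the actual codec via ffmpeg.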
+
+     def text_to_speech_spark(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
+         """Convert text to speech using the Spark TTS CLI"""
+         if not SPARK_AVAILABLE or not self.spark_model_dir:
+             raise RuntimeError("Spark TTS not available")
+
+         try:
+             output_dir = self._create_output_directory()
+             audio_files = []
+
+             # Create different voice characteristics for the two speakers
+             voice_configs = [
+                 {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
+                 {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
+             ]
+
+             for i, turn in enumerate(conversation_json["conversation"]):
+                 text = turn["text"]
+                 if not text.strip():
+                     continue
+
+                 # Alternate the voice config between speakers
+                 voice_config = voice_configs[i % len(voice_configs)]
+
+                 output_file = os.path.join(output_dir, f"spark_output_{i}.wav")
+
+                 # Run Spark TTS CLI inference
+                 cmd = [
+                     "python", "-m", "cli.inference",
+                     "--text", text,
+                     "--device", "0" if torch.cuda.is_available() else "cpu",
+                     "--save_dir", output_dir,
+                     "--model_dir", self.spark_model_dir,
+                     "--prompt_text", voice_config["prompt_text"],
+                     "--output_name", f"spark_output_{i}.wav"
+                 ]
+
+                 try:
+                     # Run the command
+                     result = subprocess.run(
+                         cmd,
+                         capture_output=True,
+                         text=True,
+                         timeout=60,
+                         cwd="."  # Make sure we're in the right directory
+                     )
+
+                     if result.returncode == 0:
+                         audio_files.append(output_file)
+                     else:
+                         print(f"Spark TTS error for turn {i}: {result.stderr}")
+                         # Create a short silence as a fallback
+                         silence = np.zeros(int(22050 * 1.0))  # 1 second of silence
+                         sf.write(output_file, silence, 22050)
+                         audio_files.append(output_file)
+
+                 except subprocess.TimeoutExpired:
+                     print(f"Spark TTS timeout for turn {i}")
+                     # Create silence as a fallback
+                     silence = np.zeros(int(22050 * 1.0))
+                     sf.write(output_file, silence, 22050)
+                     audio_files.append(output_file)
+                 except Exception as e:
+                     print(f"Error running Spark TTS for turn {i}: {e}")
+                     # Create silence as a fallback
+                     silence = np.zeros(int(22050 * 1.0))
+                     sf.write(output_file, silence, 22050)
+                     audio_files.append(output_file)
+
+             # Combine all audio files
+             if audio_files:
+                 final_output = os.path.join(output_dir, "spark_combined.wav")
+                 self._combine_audio_files(audio_files, final_output)
+             else:
+                 raise RuntimeError("No audio files generated")
+
+             # Generate conversation text
+             conversation_text = "\n".join(
+                 f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
+                 for i, turn in enumerate(conversation_json["conversation"])
+             )
+
+             return final_output, conversation_text
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")
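+     # Design note: any failed or timed-out CLI call degrades to one second of
+     # 22.05 kHz silence instead of aborting, so the combined track keeps one
+     # segment per conversation turn.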
+
+     def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
+         """Convert text to speech using MeloTTS"""
+         if not MELO_AVAILABLE or not self.melo_models:
+             raise RuntimeError("MeloTTS not available")
+
+         speakers = ["EN-Default", "EN-US"]
+         combined_audio = AudioSegment.empty()
+
+         for i, turn in enumerate(conversation_json["conversation"]):
+             bio = io.BytesIO()
+             text = turn["text"]
+             speaker = speakers[i % 2]
+             speaker_id = self.melo_models["EN"].hps.data.spk2id[speaker]
+
+             # Generate audio
+             self.melo_models["EN"].tts_to_file(
+                 text, speaker_id, bio, speed=1.0,
+                 pbar=progress.tqdm if progress else None,
+                 format="wav"
+             )
+
+             bio.seek(0)
+             audio_segment = AudioSegment.from_file(bio, format="wav")
+             combined_audio += audio_segment
+
+         # Save the final audio
+         final_audio_path = "melo_podcast.mp3"
+         combined_audio.export(final_audio_path, format="mp3")
+
+         # Generate conversation text
+         conversation_text = "\n".join(
+             f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
+             for i, turn in enumerate(conversation_json["conversation"])
+         )
+
+         return final_audio_path, conversation_text
+
+     def _create_output_directory(self) -> str:
+         """Create a unique output directory"""
+         random_bytes = os.urandom(8)
+         folder_name = base64.urlsafe_b64encode(random_bytes).decode("utf-8")
+         os.makedirs(folder_name, exist_ok=True)
+         return folder_name
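+     # 8 random bytes encode to a 12-character URL-safe name (11 characters
+     # plus one "=" of padding), e.g. "9mJ4XqLw2Ao=" (a made-up example),
+     # which is effectively collision-proof here.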
+
+     def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
+         """Combine multiple audio files into one"""
+         if not filenames:
+             raise ValueError("No input files provided")
+
+         try:
+             audio_segments = []
+             for filename in filenames:
+                 if os.path.exists(filename):
+                     audio_segment = AudioSegment.from_file(filename)
+                     audio_segments.append(audio_segment)
+
+             if audio_segments:
+                 combined = sum(audio_segments)
+                 combined.export(output_file, format="wav")
+
+             # Clean up temporary files
+             for filename in filenames:
+                 if os.path.exists(filename):
+                     os.remove(filename)
+
+         except Exception as e:
+             raise RuntimeError(f"Failed to combine audio files: {e}")
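+     # Note: sum() works on pydub AudioSegments because AudioSegment defines
+     # __radd__ precisely so that sum()'s integer start value of 0 is accepted,
+     # and segment + segment concatenates.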
+
+
+ # Global converter instance
+ converter = UnifiedAudioConverter(ConversationConfig())
+
+
+ async def synthesize(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
+     """Fetch the article and generate the conversation text (audio is generated separately)"""
+     if not article_url:
+         return "Please provide a valid URL.", None
+
+     try:
+         # Fetch text from the URL
+         text = converter.fetch_text(article_url)
+
+         # Limit the text to max_words
+         words = text.split()
+         if len(words) > converter.config.max_words:
+             text = " ".join(words[:converter.config.max_words])
+
+         # Extract the conversation based on the mode
+         if mode == "API":
+             api_key = os.environ.get("TOGETHER_API_KEY")
+             if not api_key:
+                 return "API key not found. Please set TOGETHER_API_KEY environment variable.", None
+             converter.initialize_api_mode(api_key)
+             conversation_json = converter.extract_conversation_api(text)
+         else:  # Local mode
+             converter.initialize_local_mode()
+             conversation_json = converter.extract_conversation_local(text)
+
+         # Generate the conversation text
+         conversation_text = "\n".join(
+             f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
+             for i, turn in enumerate(conversation_json["conversation"])
+         )
+
+         return conversation_text, None
+
+     except Exception as e:
+         return f"Error: {str(e)}", None
+
+
+ async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS"):
+     """Regenerate audio from the edited conversation text"""
+     if not conversation_text.strip():
+         return "Please provide conversation text.", None
+
+     try:
+         # Parse the conversation text back into JSON format
+         conversation_json = converter.parse_conversation_text(conversation_text)
+
+         if not conversation_json["conversation"]:
+             return "No valid conversation found in the text.", None
+
+         # Generate audio with the selected TTS engine
+         if tts_engine == "Edge-TTS":
+             output_file, _ = await converter.text_to_speech_edge(
+                 conversation_json,
+                 "en-US-AvaMultilingualNeural",
+                 "en-US-AndrewMultilingualNeural"
+             )
+         elif tts_engine == "Spark-TTS":
+             if not SPARK_AVAILABLE:
+                 return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
+             converter.initialize_spark_tts()
+             output_file, _ = converter.text_to_speech_spark(conversation_json)
+         else:  # MeloTTS
+             if not MELO_AVAILABLE:
+                 return "MeloTTS not available. Please install required dependencies.", None
+             converter.initialize_melo_tts()
+             output_file, _ = converter.text_to_speech_melo(conversation_json)
+
+         return "Audio generated successfully!", output_file
+
+     except Exception as e:
+         return f"Error generating audio: {str(e)}", None
+
+
+ def synthesize_sync(article_url: str, mode: str = "API", tts_engine: str = "Edge-TTS"):
+     """Synchronous wrapper for the async synthesis function"""
+     return asyncio.run(synthesize(article_url, mode, tts_engine))
+
+
+ def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS"):
+     """Synchronous wrapper for the async audio regeneration function"""
+     return asyncio.run(regenerate_audio(conversation_text, tts_engine))
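+ # Note: asyncio.run() spins up a fresh event loop per call. Gradio can also
+ # call async functions directly, so these sync wrappers are a stylistic
+ # choice that keeps the click handlers uniform.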
+
+
+ # Gradio Interface
+ with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
+     gr.Markdown("# 🎙️ URL to Podcast Converter")
+     gr.Markdown("Convert any article, blog, or news into an engaging podcast conversation!")
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             url_input = gr.Textbox(
+                 label="Article URL",
+                 placeholder="Enter the article URL here...",
+                 value=""
+             )
+         with gr.Column(scale=1):
+             mode_selector = gr.Radio(
+                 choices=["API", "Local"],
+                 value="API",
+                 label="Processing Mode",
+                 info="API: Faster, requires API key | Local: Slower, runs on device"
+             )
+
+     # TTS engine selection: two primary engines plus an additional option
+     with gr.Group():
+         gr.Markdown("### TTS Engine Selection")
+         tts_selector = gr.Radio(
+             choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
+             value="Edge-TTS",
+             label="TTS Engine",
+             info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
+         )
+
+         gr.Markdown("""
+         **Recommended:**
+         - 🌟 **Edge-TTS**: Best quality, cloud-based, instant setup
+         - 🤖 **Spark-TTS**: Local AI model (0.5B), zero-shot voice cloning
+
+         **Additional Option:**
+         - ⚡ **MeloTTS**: Local processing, GPU recommended
+         """)
+
+     convert_btn = gr.Button("🎯 Generate Conversation", variant="primary", size="lg")
+
+     with gr.Row():
+         with gr.Column():
+             conversation_output = gr.Textbox(
+                 label="Generated Conversation (Editable)",
+                 lines=15,
+                 max_lines=30,
+                 interactive=True,
+                 placeholder="Generated conversation will appear here. You can edit it before generating audio.",
+                 info="Edit the conversation as needed. Format: 'Speaker Name: Text'"
+             )
+
+             # Audio generation button
+             with gr.Row():
+                 generate_audio_btn = gr.Button("🎙️ Generate Audio from Text", variant="secondary", size="lg")
+                 gr.Markdown("*Edit the conversation above, then click to generate audio*")
+
+         with gr.Column():
+             audio_output = gr.Audio(
+                 label="Podcast Audio",
+                 type="filepath",
+                 interactive=False
+             )
+
+             # Status message
+             status_output = gr.Textbox(
+                 label="Status",
+                 interactive=False,
+                 visible=True
+             )
+
+     # Per-engine descriptions and setup instructions
+     with gr.Row():
+         gr.Markdown("""
+         ### TTS Engine Details:
+
+         - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires an internet connection.
+         - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
+           - **Setup required**: Clone the [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) into the current directory
+           - Features: Bilingual support (Chinese/English), controllable speech generation
+           - License: CC BY-NC-SA (non-commercial use only)
+         - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
+
+         ### Spark-TTS Setup Instructions:
+         ```bash
+         git clone https://github.com/SparkAudio/Spark-TTS.git
+         cd Spark-TTS
+         pip install -r requirements.txt
+         ```
+         """)
+
+     gr.Examples(
+         examples=[
+             ["https://huggingface.co/blog/openfree/cycle-navigator", "API", "Edge-TTS"],
+             ["https://www.bbc.com/news/technology-67988517", "API", "Spark-TTS"],
+             ["https://arxiv.org/abs/2301.00810", "API", "Edge-TTS"],
+         ],
+         inputs=[url_input, mode_selector, tts_selector],
+         outputs=[conversation_output, status_output],
+         fn=synthesize_sync,
+         cache_examples=False,
+     )
+
+     # Wire up the button events
+     convert_btn.click(
+         fn=synthesize_sync,
+         inputs=[url_input, mode_selector, tts_selector],
+         outputs=[conversation_output, status_output]
+     )
+
+     generate_audio_btn.click(
+         fn=regenerate_audio_sync,
+         inputs=[conversation_output, tts_selector],
+         outputs=[status_output, audio_output]
+     )
+
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.queue(api_open=True, default_concurrency_limit=10).launch(
+         show_api=True,
+         share=False,
+         server_name="0.0.0.0",
+         server_port=7860
+     )