openfree committed on
Commit 0991c48 · verified · 1 Parent(s): 32f4b71

Create app.py

Files changed (1): app.py (+913 −0)
app.py ADDED
import gradio as gr
import os
import asyncio
import torch
import io
import json
import re
import httpx
import tempfile
import wave
import base64
import numpy as np
import soundfile as sf
import subprocess
import shutil
from dataclasses import dataclass
from typing import List, Tuple, Dict, Optional
from pathlib import Path
from threading import Thread
from dotenv import load_dotenv

# Edge TTS imports
import edge_tts
from pydub import AudioSegment

# OpenAI imports
from openai import OpenAI

# Transformers imports (for legacy local mode)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    BitsAndBytesConfig,
)

# Llama CPP imports (for new local mode)
try:
    from llama_cpp import Llama
    from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
    from llama_cpp_agent.providers import LlamaCppPythonProvider
    from llama_cpp_agent.chat_history import BasicChatHistory
    from llama_cpp_agent.chat_history.messages import Roles
    from huggingface_hub import hf_hub_download
    LLAMA_CPP_AVAILABLE = True
except ImportError:
    LLAMA_CPP_AVAILABLE = False

# Spark TTS imports
try:
    from huggingface_hub import snapshot_download
    SPARK_AVAILABLE = True
except ImportError:
    SPARK_AVAILABLE = False

# MeloTTS imports (for local mode)
try:
    os.system("python -m unidic download")  # MeloTTS requires the unidic dictionary
    from melo.api import TTS as MeloTTS
    MELO_AVAILABLE = True
except Exception:
    MELO_AVAILABLE = False

load_dotenv()
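# NOTE: the API fallback path below reads TOGETHER_API_KEY from the environment,
# which load_dotenv() populates from a local .env file if one is present.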


@dataclass
class ConversationConfig:
    max_words: int = 6000
    prefix_url: str = "https://r.jina.ai/"
    api_model_name: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
    legacy_local_model_name: str = "NousResearch/Hermes-2-Pro-Llama-3-8B"
    # New local model settings (GGUF model served through llama.cpp)
    local_model_name: str = "Private-BitSix-Mistral-Small-3.1-24B-Instruct-2503.gguf"
    local_model_repo: str = "ginigen/Private-BitSix-Mistral-Small-3.1-24B-Instruct-2503"


class UnifiedAudioConverter:
    def __init__(self, config: ConversationConfig):
        self.config = config
        self.llm_client = None
        self.legacy_local_model = None
        self.legacy_tokenizer = None
        # New local LLM state
        self.local_llm = None
        self.local_llm_model = None
        self.melo_models = None
        self.spark_model_dir = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def initialize_api_mode(self, api_key: str):
        """Initialize API mode with the Together API (now the fallback)"""
        self.llm_client = OpenAI(api_key=api_key, base_url="https://api.together.xyz/v1")

    def initialize_local_mode(self):
        """Initialize new local mode with llama.cpp"""
        if not LLAMA_CPP_AVAILABLE:
            raise RuntimeError("Llama CPP dependencies not available. Please install llama-cpp-python and llama-cpp-agent.")

        if self.local_llm is None or self.local_llm_model != self.config.local_model_name:
            try:
                # Download the GGUF model (hf_hub_download returns the local file path)
                model_path = hf_hub_download(
                    repo_id=self.config.local_model_repo,
                    filename=self.config.local_model_name,
                    local_dir="./models"
                )

                if not os.path.exists(model_path):
                    raise RuntimeError(f"Model file not found at {model_path}")

                # Initialize the Llama model
                self.local_llm = Llama(
                    model_path=model_path,
                    flash_attn=True,
                    n_gpu_layers=81 if torch.cuda.is_available() else 0,
                    n_batch=1024,
                    n_ctx=8192,
                )
                self.local_llm_model = self.config.local_model_name
                print(f"Local LLM initialized: {model_path}")

            except Exception as e:
                print(f"Failed to initialize local LLM: {e}")
                raise RuntimeError(f"Failed to initialize local LLM: {e}")

    def initialize_legacy_local_mode(self):
        """Initialize legacy local mode with Hugging Face model (fallback)"""
        if self.legacy_local_model is None:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            self.legacy_local_model = AutoModelForCausalLM.from_pretrained(
                self.config.legacy_local_model_name,
                quantization_config=quantization_config
            )
            self.legacy_tokenizer = AutoTokenizer.from_pretrained(
                self.config.legacy_local_model_name,
                revision='8ab73a6800796d84448bc936db9bac5ad9f984ae'
            )

    def initialize_spark_tts(self):
        """Initialize Spark TTS model by downloading if needed"""
        if not SPARK_AVAILABLE:
            raise RuntimeError("Spark TTS dependencies not available")

        model_dir = "pretrained_models/Spark-TTS-0.5B"

        # Check if model exists, if not download it
        if not os.path.exists(model_dir):
            print("Downloading Spark-TTS model...")
            try:
                os.makedirs("pretrained_models", exist_ok=True)
                snapshot_download(
                    "SparkAudio/Spark-TTS-0.5B",
                    local_dir=model_dir
                )
                print("Spark-TTS model downloaded successfully")
            except Exception as e:
                raise RuntimeError(f"Failed to download Spark-TTS model: {e}")

        self.spark_model_dir = model_dir

        # Check if we have the CLI inference script
        if not os.path.exists("cli/inference.py"):
            print("Warning: Spark-TTS CLI not found. Please clone the Spark-TTS repository.")

    def initialize_melo_tts(self):
        """Initialize MeloTTS models"""
        if MELO_AVAILABLE and self.melo_models is None:
            self.melo_models = {"EN": MeloTTS(language="EN", device=self.device)}

    def fetch_text(self, url: str) -> str:
        """Fetch text content from URL via the Jina Reader proxy (prefix_url)"""
        if not url:
            raise ValueError("URL cannot be empty")

        if not url.startswith("http://") and not url.startswith("https://"):
            raise ValueError("URL must start with 'http://' or 'https://'")

        full_url = f"{self.config.prefix_url}{url}"
        try:
            response = httpx.get(full_url, timeout=60.0)
            response.raise_for_status()
            return response.text
        except httpx.HTTPError as e:
            raise RuntimeError(f"Failed to fetch URL: {e}")

    def _get_messages_formatter_type(self, model_name):
        """Get appropriate message formatter for the model"""
        if "Mistral" in model_name or "BitSix" in model_name:
            return MessagesFormatterType.CHATML
        else:
            return MessagesFormatterType.LLAMA_3

    def _build_prompt(self, text: str, language: str = "English") -> str:
        """Build prompt for conversation generation"""
        if language == "Korean":
            template = """
            {
                "conversation": [
                    {"speaker": "", "text": ""},
                    {"speaker": "", "text": ""}
                ]
            }
            """
            return (
                f"{text}\n\n제공된 텍스트를 두 명의 전문가 간의 짧고 유익하며 명확한 "
                f"팟캐스트 대화로 변환해주세요. 톤은 전문적이고 매력적이어야 합니다. "
                f"다음 형식을 준수하고 JSON만 반환해주세요:\n{template}"
            )
        else:
            template = """
            {
                "conversation": [
                    {"speaker": "", "text": ""},
                    {"speaker": "", "text": ""}
                ]
            }
            """
            return (
                f"{text}\n\nConvert the provided text into a short, informative and crisp "
                f"podcast conversation between two experts. The tone should be "
                f"professional and engaging. Please adhere to the following "
                f"format and return ONLY the JSON:\n{template}"
            )

    def _build_messages_for_local(self, text: str, language: str = "English") -> List[Dict]:
        """Build messages for local LLM"""
        if language == "Korean":
            system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
        else:
            system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

        return [
            {"role": "system", "content": system_message},
            {"role": "user", "content": self._build_prompt(text, language)}
        ]

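    # LLM fallback chain: extract_conversation_local() tries the GGUF model via
    # llama.cpp first, falls back to the 4-bit legacy Transformers model, and
    # finally returns a canned two-line template so the UI always gets output.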
    def extract_conversation_local(self, text: str, language: str = "English", progress=None) -> Dict:
        """Extract conversation using new local LLM (primary method)"""
        try:
            # Try the new local LLM first
            self.initialize_local_mode()

            chat_template = self._get_messages_formatter_type(self.config.local_model_name)
            provider = LlamaCppPythonProvider(self.local_llm)

            # Language-specific system message
            if language == "Korean":
                system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요. JSON 형식으로만 응답하세요."
            else:
                system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations. Respond only in JSON format."

            agent = LlamaCppAgent(
                provider,
                system_prompt=system_message,
                predefined_messages_formatter_type=chat_template,
                debug_output=False
            )

            settings = provider.get_provider_default_settings()
            settings.temperature = 0.7
            settings.top_k = 40
            settings.top_p = 0.95
            settings.max_tokens = 2048
            settings.repeat_penalty = 1.1
            settings.stream = False

            messages = BasicChatHistory()

            prompt = self._build_prompt(text, language)
            response = agent.get_chat_response(
                prompt,
                llm_sampling_settings=settings,
                chat_history=messages,
                returns_streaming_generator=False,
                print_output=False
            )

            # Extract the first JSON object (the regex tolerates one level of nested braces)
            pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
            json_match = re.search(pattern, response)

            if json_match:
                return json.loads(json_match.group())
            else:
                raise ValueError("No valid JSON found in local LLM response")

        except Exception as e:
            print(f"Local LLM failed: {e}, falling back to legacy local method")
            return self.extract_conversation_legacy_local(text, language, progress)

    def extract_conversation_legacy_local(self, text: str, language: str = "English", progress=None) -> Dict:
        """Extract conversation using legacy local model (fallback)"""
        try:
            self.initialize_legacy_local_mode()

            # Language-specific system message
            if language == "Korean":
                system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
            else:
                system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

            chat = [
                {"role": "system", "content": system_message},
                {"role": "user", "content": self._build_prompt(text, language)}
            ]

            terminators = [
                self.legacy_tokenizer.eos_token_id,
                self.legacy_tokenizer.convert_tokens_to_ids("<|eot_id|>")
            ]

            messages = self.legacy_tokenizer.apply_chat_template(
                chat, tokenize=False, add_generation_prompt=True
            )
            model_inputs = self.legacy_tokenizer([messages], return_tensors="pt").to(self.device)

            streamer = TextIteratorStreamer(
                self.legacy_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
            )

            generate_kwargs = dict(
                model_inputs,
                streamer=streamer,
                max_new_tokens=4000,
                do_sample=True,
                temperature=0.9,
                eos_token_id=terminators,
            )

            # Generate on a background thread and accumulate the streamed text
            t = Thread(target=self.legacy_local_model.generate, kwargs=generate_kwargs)
            t.start()

            partial_text = ""
            for new_text in streamer:
                partial_text += new_text

            pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
            json_match = re.search(pattern, partial_text)

            if json_match:
                return json.loads(json_match.group())
            else:
                raise ValueError("No valid JSON found in legacy local response")

        except Exception as e:
            print(f"Legacy local model also failed: {e}")
            # Return a default template so the UI still gets a conversation
            if language == "Korean":
                return {
                    "conversation": [
                        {"speaker": "진행자", "text": "안녕하세요, 팟캐스트에 오신 것을 환영합니다."},
                        {"speaker": "게스트", "text": "안녕하세요, 초대해 주셔서 감사합니다."}
                    ]
                }
            else:
                return {
                    "conversation": [
                        {"speaker": "Host", "text": "Welcome to our podcast."},
                        {"speaker": "Guest", "text": "Thank you for having me."}
                    ]
                }

    def extract_conversation_api(self, text: str, language: str = "English") -> Dict:
        """Extract conversation using API (fallback method)"""
        if not self.llm_client:
            raise RuntimeError("API mode not initialized")

        try:
            # Language-specific prompt construction
            if language == "Korean":
                system_message = "당신은 한국어로 팟캐스트 대화를 생성하는 전문가입니다. 자연스럽고 유익한 한국어 대화를 만들어주세요."
            else:
                system_message = "You are an expert at creating podcast conversations in English. Create natural and informative English conversations."

            chat_completion = self.llm_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": self._build_prompt(text, language)}
                ],
                model=self.config.api_model_name,
            )

            pattern = r"\{(?:[^{}]|(?:\{[^{}]*\}))*\}"
            json_match = re.search(pattern, chat_completion.choices[0].message.content)

            if not json_match:
                raise ValueError("No valid JSON found in response")

            return json.loads(json_match.group())
        except Exception as e:
            raise RuntimeError(f"Failed to extract conversation: {e}")

    def parse_conversation_text(self, conversation_text: str) -> Dict:
        """Parse conversation text back to JSON format"""
        lines = conversation_text.strip().split('\n')
        conversation_data = {"conversation": []}

        for line in lines:
            if ':' in line:
                speaker, text = line.split(':', 1)
                conversation_data["conversation"].append({
                    "speaker": speaker.strip(),
                    "text": text.strip()
                })

        return conversation_data

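    # Example input accepted by parse_conversation_text (one "Speaker: text" per line):
    #   Host: Welcome to our podcast.
    #   Guest: Thank you for having me.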
    async def text_to_speech_edge(self, conversation_json: Dict, language: str = "English") -> Tuple[str, str]:
        """Convert text to speech using Edge TTS"""
        output_dir = Path(self._create_output_directory())
        filenames = []

        try:
            # Language-specific voices
            if language == "Korean":
                voices = [
                    "ko-KR-SunHiNeural",    # female voice (natural Korean)
                    "ko-KR-HyunsuNeural"    # male voice (natural Korean)
                ]
            else:
                voices = [
                    "en-US-AvaMultilingualNeural",      # female voice
                    "en-US-AndrewMultilingualNeural"    # male voice
                ]

            for i, turn in enumerate(conversation_json["conversation"]):
                filename = output_dir / f"output_{i}.wav"
                voice = voices[i % len(voices)]

                tmp_path = await self._generate_audio_edge(turn["text"], voice)
                # shutil.move works across filesystems, unlike os.rename
                shutil.move(tmp_path, str(filename))
                filenames.append(str(filename))

            # Combine audio files
            final_output = os.path.join(output_dir, "combined_output.wav")
            self._combine_audio_files(filenames, final_output)

            # Generate conversation text
            conversation_text = "\n".join(
                f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
                for i, turn in enumerate(conversation_json["conversation"])
            )

            return final_output, conversation_text
        except Exception as e:
            raise RuntimeError(f"Failed to convert text to speech: {e}")

    async def _generate_audio_edge(self, text: str, voice: str) -> str:
        """Generate audio using Edge TTS"""
        if not text.strip():
            raise ValueError("Text cannot be empty")

        voice_short_name = voice.split(" - ")[0] if " - " in voice else voice
        communicate = edge_tts.Communicate(text, voice_short_name)

        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name
        # Save after the handle is closed so this also works on Windows
        await communicate.save(tmp_path)

        return tmp_path

    def text_to_speech_spark(self, conversation_json: Dict, language: str = "English", progress=None) -> Tuple[str, str]:
        """Convert text to speech using Spark TTS CLI"""
        if not SPARK_AVAILABLE or not self.spark_model_dir:
            raise RuntimeError("Spark TTS not available")

        try:
            output_dir = self._create_output_directory()
            audio_files = []

            # Create different voice characteristics for different speakers
            if language == "Korean":
                voice_configs = [
                    {"prompt_text": "안녕하세요, 오늘 팟캐스트 진행을 맡은 진행자입니다.", "gender": "female"},
                    {"prompt_text": "안녕하세요, 오늘 게스트로 참여하게 되어 기쁩니다.", "gender": "male"}
                ]
            else:
                voice_configs = [
                    {"prompt_text": "Hello, welcome to our podcast. I'm your host today.", "gender": "female"},
                    {"prompt_text": "Thank you for having me. I'm excited to be here.", "gender": "male"}
                ]

            for i, turn in enumerate(conversation_json["conversation"]):
                text = turn["text"]
                if not text.strip():
                    continue

                # Use different voice config for each speaker
                voice_config = voice_configs[i % len(voice_configs)]

                output_file = os.path.join(output_dir, f"spark_output_{i}.wav")

                # Run Spark TTS CLI inference (flags follow the cloned Spark-TTS repo's cli/inference.py)
                cmd = [
                    "python", "-m", "cli.inference",
                    "--text", text,
                    "--device", "0" if torch.cuda.is_available() else "cpu",
                    "--save_dir", output_dir,
                    "--model_dir", self.spark_model_dir,
                    "--prompt_text", voice_config["prompt_text"],
                    "--output_name", f"spark_output_{i}.wav"
                ]

                try:
                    # Run the command
                    result = subprocess.run(
                        cmd,
                        capture_output=True,
                        text=True,
                        timeout=60,
                        cwd="."  # Make sure we're in the right directory
                    )

                    if result.returncode == 0:
                        audio_files.append(output_file)
                    else:
                        print(f"Spark TTS error for turn {i}: {result.stderr}")
                        # Create a short silence as fallback
                        silence = np.zeros(int(22050 * 1.0))  # 1 second of silence
                        sf.write(output_file, silence, 22050)
                        audio_files.append(output_file)

                except subprocess.TimeoutExpired:
                    print(f"Spark TTS timeout for turn {i}")
                    # Create silence as fallback
                    silence = np.zeros(int(22050 * 1.0))
                    sf.write(output_file, silence, 22050)
                    audio_files.append(output_file)
                except Exception as e:
                    print(f"Error running Spark TTS for turn {i}: {e}")
                    # Create silence as fallback
                    silence = np.zeros(int(22050 * 1.0))
                    sf.write(output_file, silence, 22050)
                    audio_files.append(output_file)

            # Combine all audio files
            if audio_files:
                final_output = os.path.join(output_dir, "spark_combined.wav")
                self._combine_audio_files(audio_files, final_output)
            else:
                raise RuntimeError("No audio files generated")

            # Generate conversation text
            conversation_text = "\n".join(
                f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
                for i, turn in enumerate(conversation_json["conversation"])
            )

            return final_output, conversation_text

        except Exception as e:
            raise RuntimeError(f"Failed to convert text to speech with Spark TTS: {e}")

    def text_to_speech_melo(self, conversation_json: Dict, progress=None) -> Tuple[str, str]:
        """Convert text to speech using MeloTTS"""
        if not MELO_AVAILABLE or not self.melo_models:
            raise RuntimeError("MeloTTS not available")

        speakers = ["EN-Default", "EN-US"]
        combined_audio = AudioSegment.empty()

        for i, turn in enumerate(conversation_json["conversation"]):
            bio = io.BytesIO()
            text = turn["text"]
            speaker = speakers[i % 2]
            speaker_id = self.melo_models["EN"].hps.data.spk2id[speaker]

            # Generate audio
            self.melo_models["EN"].tts_to_file(
                text, speaker_id, bio, speed=1.0,
                pbar=progress.tqdm if progress else None,
                format="wav"
            )

            bio.seek(0)
            audio_segment = AudioSegment.from_file(bio, format="wav")
            combined_audio += audio_segment

        # Save final audio
        final_audio_path = "melo_podcast.mp3"
        combined_audio.export(final_audio_path, format="mp3")

        # Generate conversation text
        conversation_text = "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

        return final_audio_path, conversation_text

    def _create_output_directory(self) -> str:
        """Create a unique output directory"""
        random_bytes = os.urandom(8)
        folder_name = base64.urlsafe_b64encode(random_bytes).decode("utf-8")
        os.makedirs(folder_name, exist_ok=True)
        return folder_name

    def _combine_audio_files(self, filenames: List[str], output_file: str) -> None:
        """Combine multiple audio files into one"""
        if not filenames:
            raise ValueError("No input files provided")

        try:
            audio_segments = []
            for filename in filenames:
                if os.path.exists(filename):
                    audio_segments.append(AudioSegment.from_file(filename))

            if audio_segments:
                # Concatenate pairwise; sum() would start from int 0, which
                # AudioSegment addition is not defined for
                combined = audio_segments[0]
                for segment in audio_segments[1:]:
                    combined += segment
                combined.export(output_file, format="wav")

            # Clean up temporary files
            for filename in filenames:
                if os.path.exists(filename):
                    os.remove(filename)

        except Exception as e:
            raise RuntimeError(f"Failed to combine audio files: {e}")



# Global converter instance
converter = UnifiedAudioConverter(ConversationConfig())
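# NOTE: this single module-level instance is shared across all Gradio sessions,
# so loaded models (local LLM, TTS engines) are cached between requests.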


async def synthesize(article_url: str, mode: str = "Local", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Main synthesis function - local mode is primary, API is the fallback"""
    if not article_url:
        return "Please provide a valid URL.", None

    try:
        # Fetch text from URL
        text = converter.fetch_text(article_url)

        # Limit text to max words
        words = text.split()
        if len(words) > converter.config.max_words:
            text = " ".join(words[:converter.config.max_words])

        # Extract conversation based on mode
        if mode == "Local":
            # Local mode is the default (uses the new local LLM)
            try:
                conversation_json = converter.extract_conversation_local(text, language)
            except Exception as e:
                print(f"Local mode failed: {e}, trying API fallback")
                # API fallback
                api_key = os.environ.get("TOGETHER_API_KEY")
                if api_key:
                    converter.initialize_api_mode(api_key)
                    conversation_json = converter.extract_conversation_api(text, language)
                else:
                    raise RuntimeError("Local mode failed and no API key available for fallback")
        else:  # API mode (now secondary)
            api_key = os.environ.get("TOGETHER_API_KEY")
            if not api_key:
                print("API key not found, falling back to local mode")
                conversation_json = converter.extract_conversation_local(text, language)
            else:
                try:
                    converter.initialize_api_mode(api_key)
                    conversation_json = converter.extract_conversation_api(text, language)
                except Exception as e:
                    print(f"API mode failed: {e}, falling back to local mode")
                    conversation_json = converter.extract_conversation_local(text, language)

        # Generate conversation text
        conversation_text = "\n".join(
            f"{turn.get('speaker', f'Speaker {i+1}')}: {turn['text']}"
            for i, turn in enumerate(conversation_json["conversation"])
        )

        return conversation_text, None

    except Exception as e:
        return f"Error: {str(e)}", None


async def regenerate_audio(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Regenerate audio from edited conversation text"""
    if not conversation_text.strip():
        return "Please provide conversation text.", None

    try:
        # Parse the conversation text back to JSON format
        conversation_json = converter.parse_conversation_text(conversation_text)

        if not conversation_json["conversation"]:
            return "No valid conversation found in the text.", None

        # For Korean, only Edge-TTS is used (the other engines have limited Korean support)
        if language == "Korean" and tts_engine != "Edge-TTS":
            return "한국어는 Edge-TTS만 지원됩니다. TTS 엔진이 자동으로 Edge-TTS로 변경됩니다.", None

        # Generate audio based on TTS engine
        if tts_engine == "Edge-TTS":
            output_file, _ = await converter.text_to_speech_edge(conversation_json, language)
        elif tts_engine == "Spark-TTS":
            if not SPARK_AVAILABLE:
                return "Spark TTS not available. Please install required dependencies and clone the Spark-TTS repository.", None
            converter.initialize_spark_tts()
            output_file, _ = converter.text_to_speech_spark(conversation_json, language)
        else:  # MeloTTS
            if not MELO_AVAILABLE:
                return "MeloTTS not available. Please install required dependencies.", None
            if language == "Korean":
                return "MeloTTS does not support Korean. Please use Edge-TTS for Korean.", None
            converter.initialize_melo_tts()
            output_file, _ = converter.text_to_speech_melo(conversation_json)

        return "Audio generated successfully!", output_file

    except Exception as e:
        return f"Error generating audio: {str(e)}", None


def synthesize_sync(article_url: str, mode: str = "Local", tts_engine: str = "Edge-TTS", language: str = "English"):
    """Synchronous wrapper for async synthesis"""
    return asyncio.run(synthesize(article_url, mode, tts_engine, language))


def regenerate_audio_sync(conversation_text: str, tts_engine: str = "Edge-TTS", language: str = "English"):
    """Synchronous wrapper for async audio regeneration"""
    return asyncio.run(regenerate_audio(conversation_text, tts_engine, language))
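# NOTE: asyncio.run() creates a fresh event loop on each call; Gradio invokes these
# sync wrappers from worker threads that have no running loop, so this is safe.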


def update_tts_engine_for_korean(language):
    """Update the TTS engine options when Korean is selected"""
    if language == "Korean":
        return gr.Radio(
            choices=["Edge-TTS"],
            value="Edge-TTS",
            label="TTS Engine",
            info="한국어는 Edge-TTS만 지원됩니다",
            interactive=False
        )
    else:
        return gr.Radio(
            choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
            value="Edge-TTS",
            label="TTS Engine",
            info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU",
            interactive=True
        )


# Gradio Interface
with gr.Blocks(theme='soft', title="URL to Podcast Converter") as demo:
    gr.Markdown("# 🎙️ URL to Podcast Converter")
    gr.Markdown("Convert any article, blog, or news page into an engaging podcast conversation!")

    # Show local LLM status at the top
    with gr.Row():
        gr.Markdown(f"""
        ### 🤖 LLM Configuration:
        - **Primary**: Local LLM ({converter.config.local_model_name}) - Runs on your device
        - **Fallback**: API LLM ({converter.config.api_model_name}) - Used when local fails
        - **Status**: {"✅ Llama CPP Available" if LLAMA_CPP_AVAILABLE else "❌ Llama CPP Not Available - Install llama-cpp-python"}
        """)

    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="Article URL",
                placeholder="Enter the article URL here...",
                value=""
            )
        with gr.Column(scale=1):
            # Language selection
            language_selector = gr.Radio(
                choices=["English", "Korean"],
                value="English",
                label="Language / 언어",
                info="Select output language / 출력 언어를 선택하세요"
            )

            mode_selector = gr.Radio(
                choices=["Local", "API"],
                value="Local",
                label="Processing Mode",
                info="Local: Runs on device (Primary) | API: Cloud-based (Fallback)"
            )

    # TTS engine selection
    with gr.Group():
        gr.Markdown("### TTS Engine Selection")
        tts_selector = gr.Radio(
            choices=["Edge-TTS", "Spark-TTS", "MeloTTS"],
            value="Edge-TTS",
            label="TTS Engine",
            info="Edge-TTS: Cloud-based, natural voices | Spark-TTS: Local AI model | MeloTTS: Local, requires GPU"
        )

        gr.Markdown("""
        **Recommended:**
        - 🌟 **Edge-TTS**: Best quality, cloud-based, instant setup
        - 🤖 **Spark-TTS**: Local AI model (0.5B), zero-shot voice cloning

        **Additional Option:**
        - ⚡ **MeloTTS**: Local processing, GPU recommended

        **한국어 지원:**
        - 🇰🇷 한국어 선택 시 Edge-TTS만 사용 가능합니다
        """)

    convert_btn = gr.Button("🎯 Generate Conversation / 대화 생성", variant="primary", size="lg")

    with gr.Row():
        with gr.Column():
            conversation_output = gr.Textbox(
                label="Generated Conversation (Editable) / 생성된 대화 (편집 가능)",
                lines=15,
                max_lines=30,
                interactive=True,
                placeholder="Generated conversation will appear here. You can edit it before generating audio.\n생성된 대화가 여기에 표시됩니다. 오디오 생성 전에 편집할 수 있습니다.",
                info="Edit the conversation as needed. Format: 'Speaker Name: Text' / 필요에 따라 대화를 편집하세요. 형식: '화자 이름: 텍스트'"
            )

            # Audio generation button
            with gr.Row():
                generate_audio_btn = gr.Button("🎙️ Generate Audio from Text / 텍스트에서 오디오 생성", variant="secondary", size="lg")
                gr.Markdown("*Edit the conversation above, then click to generate audio / 위의 대화를 편집한 후 클릭하여 오디오를 생성하세요*")

        with gr.Column():
            audio_output = gr.Audio(
                label="Podcast Audio / 팟캐스트 오디오",
                type="filepath",
                interactive=False
            )

            # Status message
            status_output = gr.Textbox(
                label="Status / 상태",
                interactive=False,
                visible=True
            )

    # TTS engine descriptions and setup guide
    with gr.Row():
        gr.Markdown("""
        ### TTS Engine Details / TTS 엔진 상세 정보:

        - **Edge-TTS**: Microsoft's cloud TTS service with high-quality natural voices. Requires an internet connection.
          - 🇰🇷 **한국어 지원**: 자연스러운 한국어 음성 (여성: SunHi, 남성: Hyunsu)
        - **Spark-TTS**: SparkAudio's local AI model (0.5B parameters) with zero-shot voice cloning capability.
          - **Setup required**: Clone the [Spark-TTS repository](https://github.com/SparkAudio/Spark-TTS) into the current directory
          - Features: Bilingual support (Chinese/English), controllable speech generation
          - License: CC BY-NC-SA (non-commercial use only)
          - ⚠️ **한국어 미지원**
        - **MeloTTS**: Local TTS with multiple voice options. GPU recommended for better performance.
          - ⚠️ **한국어 미지원**

        ### Local LLM Setup / 로컬 LLM 설정:
        The system now uses **Private-BitSix-Mistral-Small-3.1-24B-Instruct** as the primary LLM, which runs locally on your device for privacy and independence. An API fallback is available when needed.

        로컬 디바이스에서 개인정보 보호와 독립성을 위해 **Private-BitSix-Mistral-Small-3.1-24B-Instruct**를 기본 LLM으로 사용합니다. 필요시 API 폴백이 제공됩니다.

        ### Spark-TTS Setup Instructions:
        ```bash
        git clone https://github.com/SparkAudio/Spark-TTS.git
        cd Spark-TTS
        pip install -r requirements.txt
        ```
        """)

    gr.Examples(
        examples=[
            ["https://huggingface.co/blog/openfree/cycle-navigator", "Local", "Edge-TTS", "English"],
            ["https://www.bbc.com/news/technology-67988517", "Local", "Spark-TTS", "English"],
            ["https://arxiv.org/abs/2301.00810", "Local", "Edge-TTS", "Korean"],
        ],
        inputs=[url_input, mode_selector, tts_selector, language_selector],
        outputs=[conversation_output, status_output],
        fn=synthesize_sync,
        cache_examples=False,
    )

    # Update TTS engine options when the language changes
    language_selector.change(
        fn=update_tts_engine_for_korean,
        inputs=[language_selector],
        outputs=[tts_selector]
    )

    # Wire up events
    convert_btn.click(
        fn=synthesize_sync,
        inputs=[url_input, mode_selector, tts_selector, language_selector],
        outputs=[conversation_output, status_output]
    )

    generate_audio_btn.click(
        fn=regenerate_audio_sync,
        inputs=[conversation_output, tts_selector, language_selector],
        outputs=[status_output, audio_output]
    )


# Launch the app
if __name__ == "__main__":
    demo.queue(api_open=True, default_concurrency_limit=10).launch(
        show_api=True,
        share=False,
        server_name="0.0.0.0",
        server_port=7860
    )
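# To run locally (assuming the dependencies imported above are installed):
#   python app.py
# then open http://localhost:7860 in a browser.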