bluenevus committed on
Commit
5753bc2
·
verified ·
1 Parent(s): 21f3e87

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -85
app.py CHANGED
@@ -9,7 +9,8 @@ import logging
9
  import os
10
  import spaces
11
 
12
- logging.basicConfig(level=logging.INFO)
 
13
  logger = logging.getLogger(__name__)
14
 
15
  def get_device():
@@ -18,7 +19,7 @@ def get_device():
18
  return torch.device("cpu")
19
 
20
  device = get_device()
21
- print(f"Using device: {device}")
22
 
23
  model = None
24
  tokenizer = None
@@ -27,107 +28,132 @@ tokenizer = None
27
  def load_model():
28
  global model, tokenizer
29
 
30
- print("Loading Orpheus model...")
31
  model_name = "canopylabs/orpheus-3b-0.1-ft"
32
 
33
  hf_token = os.environ.get("HUGGINGFACE_TOKEN")
34
  if not hf_token:
35
  raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
36
 
37
- login(token=hf_token)
38
-
39
- snapshot_download(
40
- repo_id=model_name,
41
- use_auth_token=hf_token,
42
- allow_patterns=[
43
- "config.json",
44
- "*.safetensors",
45
- "model.safetensors.index.json",
46
- ],
47
- ignore_patterns=[
48
- "optimizer.pt",
49
- "pytorch_model.bin",
50
- "training_args.bin",
51
- "scheduler.pt",
52
- "tokenizer.json",
53
- "tokenizer_config.json",
54
- "special_tokens_map.json",
55
- "vocab.json",
56
- "merges.txt",
57
- "tokenizer.*"
58
- ]
59
- )
60
-
61
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32 if device.type == 'cpu' else torch.bfloat16)
62
- model.to(device)
63
- tokenizer = AutoTokenizer.from_pretrained(model_name)
64
- print(f"Orpheus model and tokenizer loaded to {device}")
65
-
66
- def generate_podcast_script(api_key, content, duration, num_hosts):
67
- genai.configure(api_key=api_key)
68
- model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
69
-
70
- prompt = f"""
71
- Create a podcast script for {'one person' if num_hosts == 1 else 'two people'} discussing:
72
- {content}
73
-
74
- Duration: {duration}. Include natural speech, humor, and occasional off-topic thoughts.
75
- Use speech fillers like um, ah. Vary emotional tone.
76
-
77
- Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
78
- Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
79
-
80
- Use emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
81
-
82
- Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
83
-
84
- Ensure content flows naturally and stays on topic. Match the script length to {duration}.
85
- """
86
-
87
- response = model.generate_content(prompt)
88
- return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  @spaces.GPU()
91
  def text_to_speech(text, voice):
92
  global model, tokenizer
93
- if model is None or tokenizer is None:
94
- load_model()
95
-
96
- inputs = tokenizer(text, return_tensors="pt").to(device)
97
- with torch.no_grad():
98
- output = model.generate(**inputs, max_new_tokens=256)
99
- mel = output[0].cpu().numpy()
100
- audio = mel_to_audio(mel)
101
- return audio
 
 
 
 
102
 
103
  def mel_to_audio(mel):
104
  return np.zeros(24000, dtype=np.float32) # Placeholder: 1 second of silence
105
 
106
  @spaces.GPU()
107
  def render_podcast(api_key, script, voice1, voice2, num_hosts):
108
- lines = [line for line in script.split('\n') if line.strip()]
109
- audio_segments = []
110
-
111
- for i, line in enumerate(lines):
112
- voice = voice1 if num_hosts == 1 or i % 2 == 0 else voice2
113
- try:
114
- audio = text_to_speech(line, voice)
115
- audio_segments.append(audio)
116
- except Exception as e:
117
- logger.error(f"Error processing audio segment: {str(e)}")
118
-
119
- if not audio_segments:
120
- logger.warning("No valid audio segments were generated.")
121
- return (24000, np.zeros(24000, dtype=np.float32))
122
-
123
- podcast_audio = np.concatenate(audio_segments)
124
- return (24000, podcast_audio)
 
 
 
 
125
 
126
  with gr.Blocks() as demo:
127
  gr.Markdown("# AI Podcast Generator")
128
 
129
  api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
130
- content_input = gr.Textbox(label="Paste your content")
 
 
 
 
131
  duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
132
  num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
133
 
@@ -142,7 +168,7 @@ with gr.Blocks() as demo:
142
  audio_output = gr.Audio(label="Generated Podcast")
143
 
144
  generate_btn.click(generate_podcast_script,
145
- inputs=[api_key_input, content_input, duration, num_hosts],
146
  outputs=script_output)
147
 
148
  render_btn.click(render_podcast,
@@ -154,5 +180,8 @@ with gr.Blocks() as demo:
154
  outputs=[voice2_select])
155
 
156
  if __name__ == "__main__":
157
- load_model()
158
- demo.launch()
 
 
 
 
9
  import os
10
  import spaces
11
 
12
+ # Set up logging
13
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
14
  logger = logging.getLogger(__name__)
15
 
16
  def get_device():
 
19
  return torch.device("cpu")
20
 
21
  device = get_device()
22
+ logger.info(f"Using device: {device}")
23
 
24
  model = None
25
  tokenizer = None
 
28
def load_model():
    """Download and initialize the Orpheus TTS model and tokenizer.

    Populates the module-level ``model`` and ``tokenizer`` globals and moves
    the model to the selected ``device``.

    Raises:
        ValueError: if the HUGGINGFACE_TOKEN environment variable is unset.
        Exception: re-raised (after logging) if download or loading fails.
    """
    global model, tokenizer

    logger.info("Loading Orpheus model...")
    model_name = "canopylabs/orpheus-3b-0.1-ft"

    hf_token = os.environ.get("HUGGINGFACE_TOKEN")
    if not hf_token:
        raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

    try:
        login(token=hf_token)

        # Pre-fetch only the weights/config into the local cache. Tokenizer
        # files are deliberately excluded here; AutoTokenizer below fetches
        # them on its own -- TODO confirm this split is intentional.
        snapshot_download(
            repo_id=model_name,
            token=hf_token,  # `use_auth_token` is deprecated in huggingface_hub
            allow_patterns=[
                "config.json",
                "*.safetensors",
                "model.safetensors.index.json",
            ],
            ignore_patterns=[
                "optimizer.pt",
                "pytorch_model.bin",
                "training_args.bin",
                "scheduler.pt",
                "tokenizer.json",
                "tokenizer_config.json",
                "special_tokens_map.json",
                "vocab.json",
                "merges.txt",
                "tokenizer.*",
            ],
        )

        # float32 on CPU (bfloat16 support varies on CPUs), bfloat16 on GPU.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32 if device.type == 'cpu' else torch.bfloat16,
        )
        model.to(device)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        logger.info(f"Orpheus model and tokenizer loaded to {device}")
    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception(f"Error loading model: {str(e)}")
        raise
70
+
71
def generate_podcast_script(api_key, content, uploaded_file, duration, num_hosts):
    """Generate a podcast script with Gemini from pasted and/or uploaded content.

    Args:
        api_key: Gemini API key supplied by the user.
        content: free-text content pasted into the UI (may be empty/None).
        uploaded_file: optional document from gr.File; depending on the Gradio
            version this may be a file-like object or a filesystem path.
        duration: target duration label (e.g. "1-5 min").
        num_hosts: 1 for a monologue, 2 for alternating dialogue.

    Returns:
        The generated script, stripped of characters outside a small
        whitelist (emotion tags in angle brackets are preserved).

    Raises:
        Exception: re-raised (after logging) if script generation fails.
    """
    try:
        genai.configure(api_key=api_key)
        # Use a distinct local name so the module-level TTS `model` global
        # is not shadowed/confused with the Gemini client.
        gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

        combined_content = content or ""
        if uploaded_file:
            file_content = _read_uploaded_document(uploaded_file)
            combined_content += "\n" + file_content if combined_content else file_content

        prompt = f"""
        Create a podcast script for {'one person' if num_hosts == 1 else 'two people'} discussing:
        {combined_content}

        Duration: {duration}. Include natural speech, humor, and occasional off-topic thoughts.
        Use speech fillers like um, ah. Vary emotional tone.

        Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
        Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.

        Use emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.

        Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."

        Ensure content flows naturally and stays on topic. Match the script length to {duration}.
        """

        response = gemini_model.generate_content(prompt)
        return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
    except Exception as e:
        # logger.exception records the full traceback, not just the message.
        logger.exception(f"Error generating podcast script: {str(e)}")
        raise


def _read_uploaded_document(uploaded_file):
    """Best-effort UTF-8 text extraction from a gr.File value.

    Handles both a file-like object (older Gradio: `.read()` returns bytes or
    str) and a filesystem path / object with a `.name` path (newer Gradio),
    where the original `.read().decode('utf-8')` would raise.
    """
    if hasattr(uploaded_file, "read"):
        raw = uploaded_file.read()
        return raw.decode('utf-8') if isinstance(raw, bytes) else raw
    path = getattr(uploaded_file, "name", uploaded_file)
    with open(path, 'r', encoding='utf-8') as fh:
        return fh.read()
103
 
104
@spaces.GPU()
def text_to_speech(text, voice):
    """Synthesize audio for *text* with the Orpheus model.

    Lazily loads the model/tokenizer on first call. NOTE(review): the *voice*
    argument is accepted but not used by this implementation -- confirm
    whether voice conditioning is still planned.
    """
    global model, tokenizer
    try:
        if model is None or tokenizer is None:
            load_model()

        encoded = tokenizer(text, return_tensors="pt").to(device)
        with torch.no_grad():
            generated = model.generate(**encoded, max_new_tokens=256)

        # Treat the first generated sequence as a mel-style representation
        # and hand it to the (placeholder) vocoder.
        return mel_to_audio(generated[0].cpu().numpy())
    except Exception as e:
        logger.error(f"Error in text_to_speech: {str(e)}")
        raise
120
 
121
def mel_to_audio(mel):
    """Convert a mel representation into waveform samples.

    Placeholder implementation: the *mel* input is ignored and one second of
    silence at 24 kHz is returned as float32 samples.
    """
    silence = np.zeros(24000, dtype=np.float32)
    return silence
123
 
124
@spaces.GPU()
def render_podcast(api_key, script, voice1, voice2, num_hosts):
    """Render a script to audio, one non-empty line per segment.

    With one host every line uses voice1; with two hosts, even-indexed lines
    use voice1 and odd-indexed lines voice2. Failed segments are logged and
    skipped; if nothing succeeds, one second of silence is returned.

    Returns:
        (sample_rate, samples) tuple suitable for gr.Audio.
    """
    try:
        speakable = [ln for ln in script.split('\n') if ln.strip()]
        segments = []

        for idx, ln in enumerate(speakable):
            chosen = voice2 if (num_hosts != 1 and idx % 2 != 0) else voice1
            try:
                segments.append(text_to_speech(ln, chosen))
            except Exception as e:
                logger.error(f"Error processing audio segment: {str(e)}")

        if not segments:
            logger.warning("No valid audio segments were generated.")
            return (24000, np.zeros(24000, dtype=np.float32))

        return (24000, np.concatenate(segments))
    except Exception as e:
        logger.error(f"Error rendering podcast: {str(e)}")
        raise
147
 
148
  with gr.Blocks() as demo:
149
  gr.Markdown("# AI Podcast Generator")
150
 
151
  api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
152
+
153
+ with gr.Row():
154
+ content_input = gr.Textbox(label="Paste your content (optional)")
155
+ document_upload = gr.File(label="Upload Document (optional)")
156
+
157
  duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
158
  num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
159
 
 
168
  audio_output = gr.Audio(label="Generated Podcast")
169
 
170
  generate_btn.click(generate_podcast_script,
171
+ inputs=[api_key_input, content_input, document_upload, duration, num_hosts],
172
  outputs=script_output)
173
 
174
  render_btn.click(render_podcast,
 
180
  outputs=[voice2_select])
181
 
182
if __name__ == "__main__":
    try:
        load_model()
        demo.launch()
    except Exception as e:
        # logger.exception keeps the traceback; plain error() would hide
        # where the startup failure actually occurred.
        logger.exception(f"Error launching the application: {str(e)}")