Update app.py
app.py CHANGED
@@ -80,108 +80,12 @@ def load_model():
         logger.error(f"Error loading model: {str(e)}")
         raise
 
-def generate_podcast_script(api_key, content, uploaded_file, duration, num_hosts):
-    try:
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
-
-        combined_content = content or ""
-        if uploaded_file:
-            file_content = uploaded_file.read().decode('utf-8')
-            combined_content += "\n" + file_content if combined_content else file_content
-
-        prompt = f"""
-        Create a podcast script for {'one person' if num_hosts == 1 else 'two people'} discussing:
-        {combined_content}
-
-        Duration: {duration}. Include natural speech, humor, and occasional off-topic thoughts.
-        Use speech fillers like um, ah. Vary emotional tone.
-
-        Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
-        Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
-
-        Use emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
-
-        Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
-
-        Ensure content flows naturally and stays on topic. Match the script length to {duration}.
-        """
-
-        response = model.generate_content(prompt)
-        return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
-    except Exception as e:
-        logger.error(f"Error generating podcast script: {str(e)}")
-        raise
-
-def process_prompt(prompt, voice, tokenizer, device):
-    prompt = f"{voice}: {prompt}"
-    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-
-    start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
-    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
-
-    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
-
-    # No padding needed for single input
-    attention_mask = torch.ones_like(modified_input_ids)
-
-    return modified_input_ids.to(device), attention_mask.to(device)
-
-def parse_output(generated_ids):
-    token_to_find = 128257
-    token_to_remove = 128258
-
-    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
-
-    if len(token_indices[1]) > 0:
-        last_occurrence_idx = token_indices[1][-1].item()
-        cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
-    else:
-        cropped_tensor = generated_ids
-
-    processed_rows = []
-    for row in cropped_tensor:
-        masked_row = row[row != token_to_remove]
-        processed_rows.append(masked_row)
-
-    code_lists = []
-    for row in processed_rows:
-        row_length = row.size(0)
-        new_length = (row_length // 7) * 7
-        trimmed_row = row[:new_length]
-        trimmed_row = [t - 128266 for t in trimmed_row]
-        code_lists.append(trimmed_row)
-
-    return code_lists[0]  # Return just the first one for single sample
-
-def redistribute_codes(code_list, snac_model):
-    device = next(snac_model.parameters()).device  # Get the device of SNAC model
-
-    layer_1 = []
-    layer_2 = []
-    layer_3 = []
-    for i in range((len(code_list)+1)//7):
-        layer_1.append(code_list[7*i])
-        layer_2.append(code_list[7*i+1]-4096)
-        layer_3.append(code_list[7*i+2]-(2*4096))
-        layer_3.append(code_list[7*i+3]-(3*4096))
-        layer_2.append(code_list[7*i+4]-(4*4096))
-        layer_3.append(code_list[7*i+5]-(5*4096))
-        layer_3.append(code_list[7*i+6]-(6*4096))
-
-    # Move tensors to the same device as the SNAC model
-    codes = [
-        torch.tensor(layer_1, device=device).unsqueeze(0),
-        torch.tensor(layer_2, device=device).unsqueeze(0),
-        torch.tensor(layer_3, device=device).unsqueeze(0)
-    ]
-
-    audio_hat = snac_model.decode(codes)
-    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
-
 @spaces.GPU()
 def text_to_speech(text, voice, temperature=0.6, top_p=0.95, repetition_penalty=1.1, max_new_tokens=1200):
     global model, tokenizer, snac_model
+    if model is None or tokenizer is None or snac_model is None:
+        load_model()
+
     if not text.strip():
         return None
 
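The guard added inside text_to_speech lets the function initialise the module-level model, tokenizer, and snac_model globals on first use instead of assuming load_model() already ran in that process. A minimal, self-contained sketch of the same lazy-initialisation pattern; the stand-in loader and the placeholder objects below are illustrative only, not the real loading code:

model = None
tokenizer = None
snac_model = None

def load_model():
    # Stand-in loader for illustration; the real load_model() in app.py
    # populates these globals with the LLM, its tokenizer, and the SNAC vocoder.
    global model, tokenizer, snac_model
    model, tokenizer, snac_model = object(), object(), object()

def text_to_speech(text, voice="tara"):
    # Lazy-initialisation guard: load on the first call instead of assuming
    # load_model() already ran in this worker process.
    if model is None or tokenizer is None or snac_model is None:
        load_model()
    if not text.strip():
        return None
    return None  # tokenisation, generation, and SNAC decoding elided in this sketch
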
@@ -238,44 +142,11 @@ def render_podcast(api_key, script, voice1, voice2, num_hosts):
         logger.error(f"Error rendering podcast: {str(e)}")
         raise
 
-#
-with gr.Blocks() as demo:
-    gr.Markdown("# AI Podcast Generator")
-
-    api_key_input = gr.Textbox(label="Enter your Gemini API Key", type="password")
-
-    with gr.Row():
-        content_input = gr.Textbox(label="Paste your content (optional)")
-        document_upload = gr.File(label="Upload Document (optional)")
-
-    duration = gr.Radio(["1-5 min", "5-10 min", "10-15 min"], label="Estimated podcast duration")
-    num_hosts = gr.Radio([1, 2], label="Number of podcast hosts", value=2)
-
-    voice_options = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
-    voice1_select = gr.Dropdown(label="Select Voice 1", choices=voice_options, value="tara")
-    voice2_select = gr.Dropdown(label="Select Voice 2", choices=voice_options, value="leo")
-
-    generate_btn = gr.Button("Generate Script")
-    script_output = gr.Textbox(label="Generated Script", lines=10)
-
-    render_btn = gr.Button("Render Podcast")
-    audio_output = gr.Audio(label="Generated Podcast")
-
-    generate_btn.click(generate_podcast_script,
-                       inputs=[api_key_input, content_input, document_upload, duration, num_hosts],
-                       outputs=script_output)
-
-    render_btn.click(render_podcast,
-                     inputs=[api_key_input, script_output, voice1_select, voice2_select, num_hosts],
-                     outputs=audio_output)
-
-    num_hosts.change(lambda x: gr.update(visible=x == 2),
-                     inputs=[num_hosts],
-                     outputs=[voice2_select])
+# ... (rest of the code remains the same)
 
 if __name__ == "__main__":
     try:
-        load_model()
+        load_model()  # Load models at startup
         demo.launch()
     except Exception as e:
         logger.error(f"Error launching the application: {str(e)}")