Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

HaiderAUT committed on May 7

Commit

f0eca57

verified ·

1 Parent(s): 2a72cc8

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -37

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 # =============================================================
-# Hugging Face Space – Lecture → Multilingual Podcast Generator
 # =============================================================
-# Uses SmolAgents HfApiModel for text generation and HF audio
-# pipeline for speech. Generates two‑host dialogues in five
-# languages (English, Bangla, Chinese, Urdu, Nepali) directly
-# from a PDF lecture upload.
 # -----------------------------------------------------------------
 import os
@@ -15,35 +15,33 @@ from typing import List, Dict
 import gradio as gr
 from PyPDF2 import PdfReader
-from transformers import pipeline  # for audio generation (e.g., xtts)
-from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
 # ------------------------------------------------------------------
 # LLM configuration (SmolAgents wrapper for HF Inference API)
 # ------------------------------------------------------------------
 llm = HfApiModel(
-    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # 32B-parameter instruct model
     max_tokens=2096,
     temperature=0.5,
     custom_role_conversions=None,
 )
 # ------------------------------------------------------------------
-# Audio model (multilingual text ➜ speech); choose an open xtts‑v2
-# model that supports our languages. Switch model id if you prefer.
 # ------------------------------------------------------------------
-audio_pipe = pipeline(
-    "text-to-audio",
-    model="suno/xtts_v2",
-    framework="pt",
-)
 LANG_INFO: Dict[str, Dict[str, str]] = {
-    "en": {"name": "English", "speaker": "hostA"},
-    "bn": {"name": "Bangla", "speaker": "hostB"},
-    "zh": {"name": "Chinese", "speaker": "hostC"},
-    "ur": {"name": "Urdu", "speaker": "hostD"},
-    "ne": {"name": "Nepali", "speaker": "hostE"},
 }
 PROMPT_TEMPLATE = textwrap.dedent(
@@ -59,59 +57,66 @@ PROMPT_TEMPLATE = textwrap.dedent(
 )
 # ------------------------------------------------------------------
-# Utility: extract & truncate PDF text to fit LLM token budget
 # ------------------------------------------------------------------
 def extract_pdf_text(pdf_file) -> str:
     reader = PdfReader(pdf_file)
-    raw = "\n".join(p.extract_text() or "" for p in reader.pages)
-    return raw
-TOKEN_LIMIT = 6000  # conservative words (≈ tokens) for prompt+response
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
     return " ".join(words[:limit])
 # ------------------------------------------------------------------
-# Main generation function
 # ------------------------------------------------------------------
 def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
     with tempfile.TemporaryDirectory() as tmpdir:
         lecture_text = truncate_text(extract_pdf_text(pdf.name))
         audio_outputs = []
         for lang_code, info in LANG_INFO.items():
             prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
-            # --- Generate dialogue ---
             dialogue = llm(prompt)
-            # Save text for transparency/debug
-            text_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
-            with open(text_path, "w", encoding="utf-8") as f:
                 f.write(dialogue)
-            # --- TTS ---
-            audio = audio_pipe(dialogue, forward_params={"language": lang_code})
             wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
-            audio["audio"].export(wav_path, format="wav")
-            audio_outputs.append((wav_path, None))  # Gradio Audio expects (file, label)
         return audio_outputs
 # ------------------------------------------------------------------
-# Gradio Interface
 # ------------------------------------------------------------------
-audio_components = [gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()]
 iface = gr.Interface(
     fn=generate_podcast,
     inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
     outputs=audio_components,
     title="Lecture → Multilingual Podcast Generator",
-    description="Upload a lecture PDF and get a two‑host audio podcast in English, Bangla, Chinese, Urdu, and Nepali."
 )
 if __name__ == "__main__":

 # =============================================================
+# Hugging Face Space – Lecture → Multilingual Podcast Generator
 # =============================================================
+# * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
+# * Speech synthesis: **Coqui XTTS‑v2** open model via the TTS lib
+#   (no private / gated repo, so it runs without a HF token).
+# * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
 # -----------------------------------------------------------------
 import os
 import gradio as gr
 from PyPDF2 import PdfReader
+from smolagents import HfApiModel
+from TTS.api import TTS  # ↳ Coqui TTS
 # ------------------------------------------------------------------
 # LLM configuration (SmolAgents wrapper for HF Inference API)
 # ------------------------------------------------------------------
 # Text-generation model used to write the podcast dialogue; HfApiModel is the
 # smolagents wrapper around the Hugging Face Inference API.
 llm = HfApiModel(
     model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
     temperature=0.5,
     max_tokens=2096,
     custom_role_conversions=None,
 )
 # ------------------------------------------------------------------
+# XTTS‑v2 multilingual text‑to‑speech (≈ 1.2 GB, CPU OK)
 # ------------------------------------------------------------------
+TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
+tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
+# Automatically downloads and caches the model on first run.
 # Target languages, keyed by the language code that is handed to the TTS call;
 # "name" is the human-readable label used in prompts and UI labels.
 LANG_INFO: Dict[str, Dict[str, str]] = {
     code: {"name": language}
     for code, language in (
         ("en", "English"),
         ("bn", "Bangla"),
         ("zh", "Chinese"),
         ("ur", "Urdu"),
         ("ne", "Nepali"),
     )
 }
 PROMPT_TEMPLATE = textwrap.dedent(
 )
 # ------------------------------------------------------------------
+# Utility: extract & truncate PDF text to fit the LLM token budget
 # ------------------------------------------------------------------
 def extract_pdf_text(pdf_file) -> str:
     """Return the text of every page in ``pdf_file``, joined by newlines.

     ``pdf_file`` is passed straight to ``PdfReader``. A page whose
     ``extract_text()`` result is falsy contributes an empty string, so the
     number of joined segments always equals the number of pages.
     """
     page_texts = []
     for page in PdfReader(pdf_file).pages:
         page_texts.append(page.extract_text() or "")
     return "\n".join(page_texts)
+TOKEN_LIMIT = 6000  # ≈ tokens (safe margin for prompt + response)
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
     return " ".join(words[:limit])
 # ------------------------------------------------------------------
+# Main generation routine
 # ------------------------------------------------------------------
 def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
     """Turn an uploaded lecture PDF into one synthesized podcast per language.

     For every entry of ``LANG_INFO`` the (truncated) lecture text is inserted
     into ``PROMPT_TEMPLATE``, sent to ``llm``, saved as a ``.txt`` file and
     rendered to a ``.wav`` file with ``tts.tts_to_file``.

     Args:
         pdf: Uploaded file object; only its ``.name`` attribute (used as the
             path of the PDF) is read.

     Returns:
         A list of WAV file paths, one per ``LANG_INFO`` entry and in the same
         iteration order, matching the ``gr.Audio`` output components that are
         built from ``LANG_INFO``.
     """
     # Use mkdtemp() rather than a TemporaryDirectory context manager: the
     # context manager deletes the directory (and the WAV files in it) as soon
     # as the function returns, i.e. before Gradio can read the returned paths.
     out_dir = tempfile.mkdtemp(prefix="podcast_")
     lecture_text = truncate_text(extract_pdf_text(pdf.name))

     audio_outputs = []
     for lang_code, info in LANG_INFO.items():
         # 1. Build the prompt and generate the dialogue.
         # NOTE(review): assumes llm(prompt) accepts a plain string and returns
         # a plain string — confirm against the installed smolagents version.
         prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
         dialogue = llm(prompt)

         # 2. Keep the raw dialogue text next to the audio for reference.
         txt_path = os.path.join(out_dir, f"podcast_{lang_code}.txt")
         with open(txt_path, "w", encoding="utf-8") as f:
             f.write(dialogue)

         # 3. Synthesise speech with XTTS-v2.
         # NOTE(review): the LANG_INFO key is passed through unchanged as the
         # TTS language code — verify each key is accepted by the model.
         wav_path = os.path.join(out_dir, f"podcast_{lang_code}.wav")
         tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)

         # A filepath-typed gr.Audio output takes the bare path; a tuple would
         # be interpreted as (sample_rate, data) instead.
         audio_outputs.append(wav_path)
     return audio_outputs
 # ------------------------------------------------------------------
+# Gradio UI
 # ------------------------------------------------------------------
+audio_components = [
+    gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
+]
 # Single-function Gradio app: one PDF upload in, one audio player per language out.
 iface = gr.Interface(
     title="Lecture → Multilingual Podcast Generator",
     description=(
         "Upload a lecture PDF and receive a two‑host audio podcast in "
         "English, Bangla, Chinese, Urdu, and Nepali. "
         "Generation uses Qwen‑32B for the dialogue and Coqui XTTS‑v2 for "
         "speech synthesis — no private repos or API keys needed."
     ),
     fn=generate_podcast,
     inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
     outputs=audio_components,
 )
 if __name__ == "__main__":