HaiderAUT committed on
Commit
50d2a40
·
verified ·
1 Parent(s): f0eca57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -51
app.py CHANGED
@@ -1,25 +1,27 @@
1
  # =============================================================
2
- # HuggingFace Space – Lecture Multilingual Podcast Generator
3
  # =============================================================
4
- # * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
5
- # * Speech synthesis: **Coqui XTTSv2** open model via the TTS lib
6
- # (no private / gated repo, so it runs without a HF token).
7
- # * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
 
 
8
  # -----------------------------------------------------------------
9
 
10
  import os
11
  import tempfile
12
- import uuid
13
  import textwrap
 
14
  from typing import List, Dict
15
 
16
  import gradio as gr
 
17
  from PyPDF2 import PdfReader
18
  from smolagents import HfApiModel
19
- from TTS.api import TTS # ↳ Coqui TTS
20
 
21
  # ------------------------------------------------------------------
22
- # LLM configuration (SmolAgents wrapper for HF Inference API)
23
  # ------------------------------------------------------------------
24
  llm = HfApiModel(
25
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
@@ -29,93 +31,92 @@ llm = HfApiModel(
29
  )
30
 
31
  # ------------------------------------------------------------------
32
- # XTTS‑v2 multilingual text‑to‑speech ( 1.2 GB, CPU OK)
33
  # ------------------------------------------------------------------
34
- TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
35
-
36
- tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
37
- # Automatically downloads and caches the model on first run.
38
 
 
 
 
39
  LANG_INFO: Dict[str, Dict[str, str]] = {
40
- "en": {"name": "English"},
41
- "bn": {"name": "Bangla"},
42
- "zh": {"name": "Chinese"},
43
- "ur": {"name": "Urdu"},
44
- "ne": {"name": "Nepali"},
 
45
  }
46
 
47
  PROMPT_TEMPLATE = textwrap.dedent(
48
  """
49
  You are producing a lively two‑host educational podcast in {lang_name}.
50
- Summarize the following lecture content into a dialogue of about 1200 words.
51
- Use an engaging style: hosts ask each other questions, clarify ideas, add
52
- simple analogies, and conclude with a short recap. Keep technical accuracy.
53
 
54
- ### Lecture Content
55
  {content}
56
  """
57
  )
58
 
59
  # ------------------------------------------------------------------
60
- # Utility: extract & truncate PDF text to fit the LLM token budget
61
  # ------------------------------------------------------------------
62
 
63
- def extract_pdf_text(pdf_file) -> str:
64
- reader = PdfReader(pdf_file)
65
- return "\n".join(p.extract_text() or "" for p in reader.pages)
66
 
67
- TOKEN_LIMIT = 6000 # tokens (safe margin for prompt + response)
68
 
69
  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
70
  words = text.split()
71
  return " ".join(words[:limit])
72
 
73
  # ------------------------------------------------------------------
74
- # Main generation routine
75
  # ------------------------------------------------------------------
76
 
77
  def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
 
78
  with tempfile.TemporaryDirectory() as tmpdir:
79
- lecture_text = truncate_text(extract_pdf_text(pdf.name))
80
- audio_outputs = []
 
81
 
82
- for lang_code, info in LANG_INFO.items():
83
- # 1️⃣ Create prompt + generate dialogue
84
  prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
85
- dialogue = llm(prompt)
86
-
87
- # 2️⃣ Save raw dialogue text (for reference)
88
- txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
89
- with open(txt_path, "w", encoding="utf-8") as f:
90
- f.write(dialogue)
91
 
92
- # 3️⃣ Synthesise speech with XTTS‑v2
93
- wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
94
- # xtts_v2 accepts ISO‑639‑1 language codes directly
95
- tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)
96
 
97
- audio_outputs.append((wav_path, None)) # (file, label) for Gradio Audio
98
 
99
- return audio_outputs
100
 
101
  # ------------------------------------------------------------------
102
- # Gradio UI
103
  # ------------------------------------------------------------------
104
 
105
  audio_components = [
106
- gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
 
107
  ]
108
 
109
  iface = gr.Interface(
110
  fn=generate_podcast,
111
  inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
112
  outputs=audio_components,
113
- title="Lecture Multilingual Podcast Generator",
114
  description=(
115
- "Upload a lecture PDF and receive a two‑host audio podcast in English, "
116
- "Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the "
117
- "dialogue and Coqui XTTSv2 for speech synthesis no private repos "
118
- "or API keys needed."
119
  ),
120
  )
121
 
 
1
  # =============================================================
2
+ # Hugging Face Space – Lecture  Multilingual Podcast Generator
3
  # =============================================================
4
+ # * **Text generation** – SmolAgents `HfApiModel` running the remote
5
+ # Qwen/Qwen2.5Coder‑32B‑Instruct model.
6
+ # * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`
7
+ # (serverless) with open models per language no heavy local
8
+ # downloads.
9
+ # * Outputs five FLAC files (English, Bangla, Chinese, Urdu, Nepali).
10
  # -----------------------------------------------------------------
11
 
12
  import os
13
  import tempfile
 
14
  import textwrap
15
+ from pathlib import Path
16
  from typing import List, Dict
17
 
18
  import gradio as gr
19
+ from huggingface_hub import InferenceClient
20
  from PyPDF2 import PdfReader
21
  from smolagents import HfApiModel
 
22
 
23
  # ------------------------------------------------------------------
24
+ # LLM: Qwen 32‑B via SmolAgents
25
  # ------------------------------------------------------------------
26
  llm = HfApiModel(
27
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
 
31
  )
32
 
33
  # ------------------------------------------------------------------
34
+ # HF Inference API client (reads HF_TOKEN secret if set)
35
  # ------------------------------------------------------------------
36
# Serverless HF Inference API client; picks up the HF_TOKEN secret when set
# (os.getenv returns None otherwise, i.e. anonymous access).
client = InferenceClient(token=os.getenv("HF_TOKEN"))
 
 
 
37
 
38
# ------------------------------------------------------------------
# Language metadata and the matching serverless TTS model IDs
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": name, "tts_model": model}
    for code, name, model in (
        ("en", "English", "facebook/mms-tts-eng"),
        ("bn", "Bangla", "facebook/mms-tts-ben"),
        # MMS lacks mainstream Mandarin — fall back to an open Chinese TTS.
        ("zh", "Chinese", "myshell-ai/MeloTTS-Chinese"),
        ("ur", "Urdu", "facebook/mms-tts-urd-script_arabic"),
        ("ne", "Nepali", "facebook/mms-tts-npi"),
    )
}
49
 
50
  PROMPT_TEMPLATE = textwrap.dedent(
51
  """
52
  You are producing a lively two‑host educational podcast in {lang_name}.
53
+ Summarize the following lecture content into a dialogue of 1200 words.
54
+ Make it engaging: hosts ask questions, clarify ideas with analogies, and
55
+ wrap up with a concise recap. Preserve technical accuracy.
56
 
57
+ ### Lecture Content
58
  {content}
59
  """
60
  )
61
 
62
  # ------------------------------------------------------------------
63
+ # Helpers: extract and truncate PDF text
64
  # ------------------------------------------------------------------
65
 
66
def extract_pdf_text(pdf_path: str) -> str:
    """Concatenate the extracted text of every page in the PDF at *pdf_path*.

    Pages where PyPDF2 cannot extract text contribute an empty string, so
    the result is always a str (never None-polluted).
    """
    pages = PdfReader(pdf_path).pages
    return "\n".join(pg.extract_text() or "" for pg in pages)
69
 
70
# Rough word-level cap so prompt + response stay inside the model context.
TOKEN_LIMIT = 6000


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most *limit* whitespace-separated words of *text*."""
    return " ".join(text.split()[:limit])
75
 
76
  # ------------------------------------------------------------------
77
+ # Main pipeline
78
  # ------------------------------------------------------------------
79
 
80
def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
    """Generate a multilingual two-host podcast from a lecture PDF.

    For each language in LANG_INFO the (truncated) lecture text is turned
    into a dialogue by the LLM, then synthesized to a FLAC file via the HF
    Inference API.  Returns one audio filepath per language, in LANG_INFO
    order (matching the `audio_components` outputs).
    """
    # BUG FIX: the original wrapped everything in
    # `with tempfile.TemporaryDirectory()`, which deletes the directory —
    # and every generated FLAC — the moment the function returns, i.e.
    # before Gradio can read the filepaths it was handed.  mkdtemp()
    # persists the files for the lifetime of the Space process.
    tmpdir = tempfile.mkdtemp(prefix="podcast_")

    lecture_text = truncate_text(extract_pdf_text(pdf.name))
    outputs: List[str] = []

    for code, info in LANG_INFO.items():
        # 1) Draft the dialogue in the target language.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)

        # 2) Synthesize speech with the per-language serverless TTS model.
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = Path(tmpdir) / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)

        # BUG FIX: gr.Audio(type="filepath") expects a plain path string;
        # the original appended a (path, None) tuple, which Gradio parses
        # as a (sample_rate, data) pair and rejects.
        outputs.append(str(flac_path))

    return outputs
100
 
101
  # ------------------------------------------------------------------
102
+ # Gradio interface
103
  # ------------------------------------------------------------------
104
 
105
  audio_components = [
106
+ gr.Audio(label=f"{info['name']} Podcast", type="filepath")
107
+ for info in LANG_INFO.values()
108
  ]
109
 
110
# Gradio front end: one PDF in, five audio players out.
# BUG FIX: the title and description strings had dropped dash/arrow
# characters ("Lecture  Multilingual", "Qwen32B", "API no heavy
# downloads") — restored the missing punctuation in the user-facing text.
iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and receive a two-host audio podcast in five "
        "languages (English, Bangla, Chinese, Urdu, Nepali). Dialogue is "
        "crafted by Qwen-32B; speech is synthesized on-the-fly using the "
        "Hugging Face Inference API — no heavy downloads or GPUs required."
    ),
)
122