HaiderAUT committed on
Commit
f1adb14
·
verified ·
1 Parent(s): 910bbfc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -195
app.py CHANGED
@@ -1,195 +1,118 @@
1
- # =============================================================
2
- # Hugging Face Space – Lecture → Multilingual Podcast Generator
3
- # =============================================================
4
- # Upload a lecture PDF generate a two‑host dialogue (podcast)
5
- # directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
6
- # using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
7
- # Face *audio‑generation* model for speech (no external TTS APIs).
8
- # -----------------------------------------------------------------
9
- # Files for your Space:
10
- # • app.py (this file)
11
- # • requirements.txt (see bottom comment block)
12
- # -----------------------------------------------------------------
13
- # Add your HF_TOKEN as a Space secret if required for gated models.
14
- # =============================================================
15
-
16
- import os
17
- import tempfile
18
- import uuid
19
- import textwrap
20
- from typing import Dict, Tuple
21
-
22
- import gradio as gr
23
- from PyPDF2 import PdfReader
24
- import nltk # sentence tokenisation
25
- from llama_index.llms.huggingface import HfApiModel
26
- from transformers import pipeline # HF TTS pipeline
27
- import soundfile as sf # save audio
28
-
29
- # ---------------------------------------------------------------
30
- # Ensure NLTK punkt is present on first launch
31
- # ---------------------------------------------------------------
32
- try:
33
- nltk.data.find("tokenizers/punkt")
34
- except LookupError:
35
- nltk.download("punkt")
36
-
37
- # --------------------------- LLM Setup ---------------------------
38
- llm = HfApiModel(
39
- max_tokens=2096,
40
- temperature=0.5,
41
- model_id="Qwen/Qwen2.5-Coder-32B-Instruct", # text generation
42
- custom_role_conversions=None,
43
- )
44
-
45
- # ------------------------ TTS Setup ------------------------------
46
- # Multilingual TTS model (≈ 500 MB). It supports >100 languages.
47
- # If you need lighter weights choose language‑specific VITS models.
48
- # ----------------------------------------------------------------
49
- TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
50
- # Load once; Space queues requests so single GPU/CPU is okay.
51
- try:
52
- tts_pipeline = pipeline(
53
- "text-to-speech",
54
- model=TTS_MODEL_ID,
55
- device_map="auto", # GPU if available, else CPU
56
- )
57
- except Exception as e:
58
- raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}")
59
-
60
- # ------------------------ Helpers --------------------------------
61
- LANG_CONFIG = {
62
- "English": {
63
- "tts_lang": "en",
64
- "prompt_tag": "English",
65
- },
66
- "Bangla": {
67
- "tts_lang": "bn",
68
- "prompt_tag": "Bangla (বাংলা)",
69
- },
70
- "Chinese": {
71
- "tts_lang": "zh",
72
- "prompt_tag": "Mandarin Chinese",
73
- },
74
- "Urdu": {
75
- "tts_lang": "ur",
76
- "prompt_tag": "Urdu (اردو)",
77
- },
78
- "Nepali": {
79
- "tts_lang": "ne",
80
- "prompt_tag": "Nepali (नेपाली)",
81
- },
82
- }
83
-
84
-
85
- def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
86
- """Extract raw text from PDF, truncate to avoid token overflow."""
87
- reader = PdfReader(pdf_path)
88
- text_parts = []
89
- for page in reader.pages:
90
- page_text = page.extract_text() or ""
91
- text_parts.append(page_text)
92
- if sum(len(t) for t in text_parts) >= max_chars:
93
- break
94
- raw_text = "\n".join(text_parts)[:max_chars]
95
- return raw_text
96
-
97
-
98
- def build_prompt(lecture_text: str, lang: str) -> str:
99
- """Craft a prompt instructing the LLM to return a dialogue in `lang`."""
100
- # Compress lecture to ~150 sentences to stay under token budget
101
- sentences = nltk.sent_tokenize(lecture_text)
102
- short_text = " ".join(sentences[: min(len(sentences), 150)])
103
-
104
- prompt = textwrap.dedent(
105
- f"""
106
- You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.
107
-
108
- Lecture content (for reference):
109
- """.strip()
110
- ) + "\n" + short_text + "\n"
111
- return prompt
112
-
113
-
114
- def generate_dialogue(lecture_text: str, lang: str) -> str:
115
- """Call the Qwen model to generate podcast script for the given language."""
116
- prompt = build_prompt(lecture_text, lang)
117
- try:
118
- response = llm.complete(prompt)
119
- dialogue = response.text.strip()
120
- except Exception as e:
121
- dialogue = f"Error generating dialogue in {lang}: {e}"
122
- return dialogue
123
-
124
-
125
- def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
126
- """Convert text to speech via HF TTS; returns (filepath, mime)."""
127
- language_code = LANG_CONFIG[lang]["tts_lang"]
128
- tmpdir = tempfile.gettempdir()
129
- filename = os.path.join(tmpdir, f"{lang}_{uuid.uuid4().hex}.wav")
130
- try:
131
- # xtts_v2 accepts a `language` forward param
132
- speech = tts_pipeline(text, forward_params={"language": language_code})
133
- sf.write(filename, speech["audio"], speech["sampling_rate"])
134
- return filename, "audio/wav"
135
- except Exception as e:
136
- # Return dummy text file explaining error
137
- errfile = os.path.join(tmpdir, f"err_{uuid.uuid4().hex}.txt")
138
- with open(errfile, "w", encoding="utf-8") as fh:
139
- fh.write(f"TTS error for {lang}: {e}\n")
140
- return errfile, "text/plain"
141
-
142
-
143
- def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
144
- """Full pipeline returning a dict: language → (file_path, mime)."""
145
- if pdf_file is None:
146
- raise gr.Error("Please upload a PDF lecture first.")
147
- lecture_text = extract_text(pdf_file)
148
-
149
- audio_outputs = {}
150
- for lang in LANG_CONFIG.keys():
151
- dialogue = generate_dialogue(lecture_text, lang)
152
- path, mime = tts_for_dialogue(lang, dialogue)
153
- audio_outputs[lang] = (path, mime)
154
- return audio_outputs
155
-
156
-
157
- # ------------------------ Gradio UI --------------------------------
158
-
159
- with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
160
- gr.Markdown(
161
- """# 📚🎙️ Lecture → Podcast
162
- Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.
163
- """
164
- )
165
- with gr.Row():
166
- inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
167
- btn = gr.Button("Generate Podcast")
168
- with gr.Group():
169
- audio_components = [
170
- gr.Audio(label=lang, interactive=False, type="filepath")
171
- for lang in LANG_CONFIG.keys()
172
- ]
173
-
174
-
175
- def gradio_wrapper(pdf_file):
176
- results = pipeline_runner(pdf_file)
177
- return [results[lang][0] for lang in LANG_CONFIG.keys()]
178
-
179
-
180
- btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)
181
-
182
- if __name__ == "__main__":
183
- demo.launch()
184
-
185
- # ---------------------------------------------------------------
186
- # requirements.txt (commit as separate file in the Space root)
187
- # ---------------------------------------------------------------
188
- # gradio>=4.28.0
189
- # PyPDF2>=3.0.1
190
- # nltk>=3.8.1
191
- # transformers>=4.39.0
192
- # torch>=2.1.2
193
- # soundfile>=0.12.1
194
- # llama-index>=0.11.47
195
- # huggingface-hub>=0.23.0
 
1
+ # =============================================================
2
+ # Hugging Face Space – Lecture → Multilingual Podcast Generator
3
+ # =============================================================
4
+ # Uses SmolAgents HfApiModel for text generation and HF audio
5
+ # pipeline for speech. Generates two‑host dialogues in five
6
+ # languages (English, Bangla, Chinese, Urdu, Nepali) directly
7
+ # from a PDF lecture upload.
8
+ # -----------------------------------------------------------------
9
+
10
import os
import tempfile
import textwrap
import uuid
from typing import Dict, List

import gradio as gr
from PyPDF2 import PdfReader
from scipy.io import wavfile  # WAV writer for the TTS pipeline's numpy output
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
from transformers import pipeline  # for audio generation (e.g., xtts)
20
+
21
# ------------------------------------------------------------------
# LLM configuration (SmolAgents wrapper for HF Inference API)
# ------------------------------------------------------------------
# Text-generation backend for the podcast dialogue. HfApiModel proxies the
# HF Inference API, so no model weights are downloaded locally.
llm = HfApiModel(
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # 32B-parameter instruct model (comment previously said 34B)
    max_tokens=2096,                 # generation budget per call
    temperature=0.5,                 # moderate creativity for dialogue
    custom_role_conversions=None,
)
30
+
31
# ------------------------------------------------------------------
# Audio model (multilingual text ➜ speech); choose an open xtts‑v2
# model that supports our languages. Switch model id if you prefer.
# ------------------------------------------------------------------
# NOTE(review): "suno/xtts_v2" does not look like a real Hub id — XTTS-v2 is
# published as "coqui/XTTS-v2", and XTTS is not natively supported by the
# transformers "text-to-audio" pipeline. Confirm the model id before deploy.
try:
    audio_pipe = pipeline(
        "text-to-audio",
        model="suno/xtts_v2",
        framework="pt",  # force PyTorch weights
    )
except Exception as e:
    # Fail fast at startup with a clear message (matches the error handling
    # style used elsewhere in this Space) instead of a bare stack trace.
    raise RuntimeError(f"Failed to load TTS model 'suno/xtts_v2': {e}") from e
40
+
41
# Supported output languages: ISO 639-1 code → display name + speaker id.
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": name, "speaker": speaker}
    for code, name, speaker in (
        ("en", "English", "hostA"),
        ("bn", "Bangla", "hostB"),
        ("zh", "Chinese", "hostC"),
        ("ur", "Urdu", "hostD"),
        ("ne", "Nepali", "hostE"),
    )
}
48
+
49
# Dialogue prompt; filled per language via .format(lang_name=..., content=...).
PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are producing a lively two‑host educational podcast in {lang_name}.
    Summarize the following lecture content into a dialogue of about 1200 words.
    Use an engaging style: hosts ask each other questions, clarify ideas, add
    simple analogies, and conclude with a short recap. Keep technical accuracy.

    ### Lecture Content
    {content}
    """
)
60
+
61
# ------------------------------------------------------------------
# Utility: extract & truncate PDF text to fit LLM token budget
# ------------------------------------------------------------------

def extract_pdf_text(pdf_file) -> str:
    """Return the text of every page in *pdf_file*, newline-separated.

    Pages with no extractable text contribute an empty string.
    """
    pages = PdfReader(pdf_file).pages
    chunks = [page.extract_text() or "" for page in pages]
    return "\n".join(chunks)
69
+
70
# Rough prompt budget: cap the lecture at this many whitespace-separated words.
TOKEN_LIMIT = 6000  # conservative words (≈ tokens) for prompt+response


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return *text* cut to at most *limit* words (whitespace-tokenised).

    Note: output is re-joined with single spaces, so runs of whitespace
    (including newlines) are normalised even when no truncation occurs.
    """
    tokens = text.split()
    if len(tokens) <= limit:
        return " ".join(tokens)
    return " ".join(tokens[:limit])
76
+
77
# ------------------------------------------------------------------
# Main generation function
# ------------------------------------------------------------------

def generate_podcast(pdf: gr.File) -> List[str]:
    """Generate one podcast WAV per language from an uploaded lecture PDF.

    Returns a list of WAV file paths in LANG_INFO order, matching the order
    of the Gradio output components.
    """
    # BUG FIX: the original used `with tempfile.TemporaryDirectory()`, which
    # deleted the WAV files as soon as the function returned — Gradio then
    # received dead paths. mkdtemp() persists until the OS cleans it up.
    out_dir = tempfile.mkdtemp(prefix=f"podcast_{uuid.uuid4().hex}_")
    lecture_text = truncate_text(extract_pdf_text(pdf.name))

    wav_paths: List[str] = []
    for lang_code, info in LANG_INFO.items():
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)

        # --- Generate dialogue ---
        dialogue = llm(prompt)
        if not isinstance(dialogue, str):
            # Some model wrappers return a message object rather than a plain
            # string; unwrap defensively so file-write and TTS get text.
            dialogue = getattr(dialogue, "content", None) or str(dialogue)

        # Save the transcript alongside the audio for transparency/debugging.
        text_path = os.path.join(out_dir, f"podcast_{lang_code}.txt")
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(dialogue)

        # --- TTS ---
        result = audio_pipe(dialogue, forward_params={"language": lang_code})
        wav_path = os.path.join(out_dir, f"podcast_{lang_code}.wav")
        # BUG FIX: the pipeline returns {"audio": ndarray, "sampling_rate": int};
        # ndarrays have no .export() (that is a pydub API) — write via scipy.
        wavfile.write(wav_path, result["sampling_rate"], result["audio"])
        # BUG FIX: gr.Audio(type="filepath") expects plain paths, not
        # (path, label) tuples.
        wav_paths.append(wav_path)

    return wav_paths
102
+
103
# ------------------------------------------------------------------
# Gradio Interface
# ------------------------------------------------------------------

# One audio player per language, in LANG_INFO order (the same order the
# generation function yields its outputs).
audio_components = [
    gr.Audio(label=f"{info['name']} Podcast", type="filepath")
    for info in LANG_INFO.values()
]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and get a two‑host audio podcast in "
        "English, Bangla, Chinese, Urdu, and Nepali."
    ),
)

if __name__ == "__main__":
    iface.launch()