Upload 2 files
Browse files- app.py +195 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# =============================================================
|
2 |
+
# Hugging Face Space – Lecture → Multilingual Podcast Generator
|
3 |
+
# =============================================================
|
4 |
+
# Upload a lecture PDF ⟶ generate a two‑host dialogue (podcast)
|
5 |
+
# directly in five languages (English, Bangla, Chinese, Urdu, Nepali)
|
6 |
+
# using **Qwen/Qwen2.5‑Coder‑32B‑Instruct** for text AND a Hugging
|
7 |
+
# Face *audio‑generation* model for speech (no external TTS APIs).
|
8 |
+
# -----------------------------------------------------------------
|
9 |
+
# Files for your Space:
|
10 |
+
# • app.py (this file)
|
11 |
+
# • requirements.txt (see bottom comment block)
|
12 |
+
# -----------------------------------------------------------------
|
13 |
+
# Add your HF_TOKEN as a Space secret if required for gated models.
|
14 |
+
# =============================================================
|
15 |
+
|
16 |
+
import os
|
17 |
+
import tempfile
|
18 |
+
import uuid
|
19 |
+
import textwrap
|
20 |
+
from typing import Dict, Tuple
|
21 |
+
|
22 |
+
import gradio as gr
|
23 |
+
from PyPDF2 import PdfReader
|
24 |
+
import nltk # sentence tokenisation
|
25 |
+
from llama_index.llms.huggingface import HfApiModel
|
26 |
+
from transformers import pipeline # HF TTS pipeline
|
27 |
+
import soundfile as sf # save audio
|
28 |
+
|
29 |
+
# ---------------------------------------------------------------
# Ensure NLTK sentence-tokenizer data is present on first launch.
# NLTK < 3.9 resolves sent_tokenize via "punkt"; NLTK >= 3.9 switched
# to "punkt_tab". requirements.txt pins `nltk>=3.8.1` with no upper
# bound, so fetch both resources to avoid a runtime LookupError on a
# fresh install with a newer NLTK.
# ---------------------------------------------------------------
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)
36 |
+
|
37 |
+
# --------------------------- LLM Setup ---------------------------
# Shared text-generation client used by generate_dialogue().
# NOTE(review): `HfApiModel` is the smolagents inference-API class;
# `llama_index.llms.huggingface` does not export a name by that spelling
# (it provides HuggingFaceInferenceAPI / HuggingFaceLLM) — confirm the
# import at the top of this file actually resolves with the pinned
# llama-index version.
llm = HfApiModel(
    max_tokens=2096,  # NOTE(review): 2096 looks like a typo for 2048 — confirm intended budget
    temperature=0.5,
    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",  # text generation
    custom_role_conversions=None,
)
|
44 |
+
|
45 |
+
# ------------------------ TTS Setup ------------------------------
# Multilingual TTS model (≈ 500 MB). It supports >100 languages.
# If you need lighter weights choose language‑specific VITS models.
# ----------------------------------------------------------------
# NOTE(review): "tts_models/multilingual/multi-dataset/xtts_v2" is a
# Coqui-TTS model *path*, not a Hugging Face Hub repo id, so the
# transformers "text-to-speech" pipeline cannot download or load it
# (XTTS-v2 lives at "coqui/XTTS-v2" and is not pipeline-compatible
# anyway). As written, the RuntimeError below will fire at startup —
# verify against a transformers-supported checkpoint (e.g. an MMS-TTS
# model) or switch to the Coqui TTS library.
TTS_MODEL_ID = "tts_models/multilingual/multi-dataset/xtts_v2"
# Load once; Space queues requests so single GPU/CPU is okay.
try:
    tts_pipeline = pipeline(
        "text-to-speech",
        model=TTS_MODEL_ID,
        device_map="auto",  # GPU if available, else CPU
    )
except Exception as e:
    # Fail fast with a clear message rather than at first synthesis call.
    raise RuntimeError(f"Failed to load TTS model {TTS_MODEL_ID}: {e}")
|
59 |
+
|
60 |
+
# ------------------------ Helpers --------------------------------
# Per-language settings used throughout the pipeline:
#   tts_lang   — language code forwarded to the TTS pipeline
#   prompt_tag — human-readable language name injected into the LLM prompt
LANG_CONFIG: Dict[str, Dict[str, str]] = {
    "English": {
        "tts_lang": "en",
        "prompt_tag": "English",
    },
    "Bangla": {
        "tts_lang": "bn",
        "prompt_tag": "Bangla (বাংলা)",
    },
    "Chinese": {
        "tts_lang": "zh",
        "prompt_tag": "Mandarin Chinese",
    },
    "Urdu": {
        "tts_lang": "ur",
        "prompt_tag": "Urdu (اردو)",
    },
    "Nepali": {
        "tts_lang": "ne",
        "prompt_tag": "Nepali (नेपाली)",
    },
}
|
83 |
+
|
84 |
+
|
85 |
+
def extract_text(pdf_path: str, max_chars: int = 16000) -> str:
    """Extract raw text from a PDF, truncated to at most ``max_chars``.

    Args:
        pdf_path: Filesystem path to the PDF to read.
        max_chars: Character budget; extraction stops once reached to
            avoid overflowing the LLM token limit downstream.

    Returns:
        The concatenated page text (pages joined with newlines), cut to
        ``max_chars`` characters.
    """
    reader = PdfReader(pdf_path)
    text_parts = []
    # Keep a running total instead of re-summing all collected parts on
    # every page (the original check was O(pages²) in total characters).
    total = 0
    for page in reader.pages:
        page_text = page.extract_text() or ""  # extract_text() may return None
        text_parts.append(page_text)
        total += len(page_text)
        if total >= max_chars:
            break
    return "\n".join(text_parts)[:max_chars]
|
96 |
+
|
97 |
+
|
98 |
+
def build_prompt(lecture_text: str, lang: str) -> str:
    """Craft a prompt instructing the LLM to return a dialogue in `lang`.

    Args:
        lecture_text: Raw lecture text (already truncated by extract_text).
        lang: A key of LANG_CONFIG selecting the output language.

    Returns:
        The full prompt string: instructions followed by the compressed
        lecture content.
    """
    # Compress lecture to ~150 sentences to stay under token budget.
    sentences = nltk.sent_tokenize(lecture_text)
    short_text = " ".join(sentences[:150])

    # Fix: dedent() must run BEFORE strip(). Stripping first removes the
    # leading newline, so the first line carries no margin, the common
    # prefix becomes empty, and dedent() is a no-op — leaving interior
    # lines of the prompt indented.
    instructions = textwrap.dedent(
        f"""
        You are simulating a podcast with two lively hosts, A and B. Their job is to discuss the following lecture, summarise key points, quiz each other, and clarify concepts so a student listening can follow along. Produce a back‑and‑forth conversation **in {LANG_CONFIG[lang]['prompt_tag']}**, roughly 40 turns, totalling about 800–1000 words. Prefix each utterance with the speaker name (A: ... / B: ...). Avoid adding any narration outside the dialogue.

        Lecture content (for reference):
        """
    ).strip()
    return instructions + "\n" + short_text + "\n"
|
112 |
+
|
113 |
+
|
114 |
+
def generate_dialogue(lecture_text: str, lang: str) -> str:
    """Ask the Qwen model for a podcast script in the given language.

    On any LLM failure the error is folded into the returned string so
    the per-language pipeline keeps going instead of aborting.
    """
    # Build the prompt outside the try-block: a prompt-construction error
    # (e.g. unknown `lang`) should propagate, not be masked as an LLM error.
    prompt = build_prompt(lecture_text, lang)
    try:
        return llm.complete(prompt).text.strip()
    except Exception as exc:
        return f"Error generating dialogue in {lang}: {exc}"
|
123 |
+
|
124 |
+
|
125 |
+
def tts_for_dialogue(lang: str, text: str) -> Tuple[str, str]:
    """Synthesize `text` with the HF TTS pipeline.

    Returns a ``(file_path, mime_type)`` pair: a WAV file on success, or
    a plain-text file describing the failure on error.
    """
    # Resolve the language code up front so a bad `lang` key raises
    # instead of being swallowed by the TTS error handler below.
    language = LANG_CONFIG[lang]["tts_lang"]
    tmp_root = tempfile.gettempdir()
    wav_path = os.path.join(tmp_root, f"{lang}_{uuid.uuid4().hex}.wav")
    try:
        # xtts_v2 accepts a `language` forward param
        result = tts_pipeline(text, forward_params={"language": language})
        sf.write(wav_path, result["audio"], result["sampling_rate"])
    except Exception as exc:
        # Degrade gracefully: hand back a text file explaining the error.
        err_path = os.path.join(tmp_root, f"err_{uuid.uuid4().hex}.txt")
        with open(err_path, "w", encoding="utf-8") as handle:
            handle.write(f"TTS error for {lang}: {exc}\n")
        return err_path, "text/plain"
    return wav_path, "audio/wav"
|
141 |
+
|
142 |
+
|
143 |
+
def pipeline_runner(pdf_file) -> Dict[str, Tuple[str, str]]:
    """Run the full pipeline: PDF → dialogue → speech for every language.

    Returns:
        Mapping of language name → (file_path, mime_type).

    Raises:
        gr.Error: if no PDF was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF lecture first.")
    lecture_text = extract_text(pdf_file)

    # Generate script and audio per language, preserving LANG_CONFIG order.
    return {
        lang: tts_for_dialogue(lang, generate_dialogue(lecture_text, lang))
        for lang in LANG_CONFIG
    }
|
155 |
+
|
156 |
+
|
157 |
+
# ------------------------ Gradio UI --------------------------------

# Blocks layout: a file input + button on top, one audio player per
# supported language below. The click handler fans out to the full
# pipeline and fills the players in LANG_CONFIG order.
with gr.Blocks(title="Multilingual Lecture Podcast (LLM + HF‑TTS)") as demo:
    gr.Markdown(
        """# 📚🎙️ Lecture → Podcast
Upload a lecture PDF and receive a two‑host audio podcast generated **directly** in five languages using Qwen for text and XTTS‑v2 for speech.
"""
    )
    with gr.Row():
        inp = gr.File(label="Lecture PDF", file_types=[".pdf"])
        btn = gr.Button("Generate Podcast")
    with gr.Group():
        # One read-only player per language; order must match the list
        # returned by gradio_wrapper below.
        audio_components = [
            gr.Audio(label=lang, interactive=False, type="filepath")
            for lang in LANG_CONFIG.keys()
        ]

    def gradio_wrapper(pdf_file):
        # Flatten the language→(path, mime) dict into the ordered list of
        # file paths Gradio expects for the audio components.
        results = pipeline_runner(pdf_file)
        return [results[lang][0] for lang in LANG_CONFIG.keys()]

    btn.click(gradio_wrapper, inputs=inp, outputs=audio_components)

if __name__ == "__main__":
    demo.launch()
|
184 |
+
|
185 |
+
# ---------------------------------------------------------------
|
186 |
+
# requirements.txt (commit as separate file in the Space root)
|
187 |
+
# ---------------------------------------------------------------
|
188 |
+
# gradio>=4.28.0
|
189 |
+
# PyPDF2>=3.0.1
|
190 |
+
# nltk>=3.8.1
|
191 |
+
# transformers>=4.39.0
|
192 |
+
# torch>=2.1.2
|
193 |
+
# soundfile>=0.12.1
|
194 |
+
# llama-index>=0.11.47
|
195 |
+
# huggingface-hub>=0.23.0
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.28.0
|
2 |
+
PyPDF2>=3.0.1
|
3 |
+
nltk>=3.8.1
|
4 |
+
transformers>=4.39.0
|
5 |
+
torch>=2.1.2
|
6 |
+
soundfile>=0.12.1
|
7 |
+
llama-index>=0.11.47
|
8 |
+
huggingface-hub>=0.23.0
|