Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

HaiderAUT committed on May 8

Commit

4c19533

verified ·

1 Parent(s): 764a881

Update app.py

Browse files

Files changed (1) hide show

app.py +182 -96

app.py CHANGED Viewed

@@ -1,7 +1,10 @@
 # =============================================================
-# Lecture → Podcast & Script Generator (Gemini + HF TTS)
-# Modified: Script outputs rendered as HTML
 # =============================================================
 import os
 import re
 import tempfile
@@ -14,19 +17,28 @@ from PyPDF2 import PdfReader
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
-# Hugging Face TTS
 from huggingface_hub import InferenceClient
-# Google Gemini
-import google.generativeai as genai
 # ------------------------------------------------------------------
-# HF TTS client
 # ------------------------------------------------------------------
 hf_token = os.getenv("HF_TOKEN")
-hf_tts_client: Optional[InferenceClient] = InferenceClient(token=hf_token) if hf_token else None
-# Language metadata
 LANG_INFO: Dict[str, Dict[str, str]] = {
     "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
     "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
@@ -36,122 +48,196 @@ LANG_INFO: Dict[str, Dict[str, str]] = {
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
-# Prompt template
 PROMPT_TEMPLATE = textwrap.dedent(
     """
     You are producing a lively two-host educational podcast in {lang_name}.
     Summarize the following lecture content into a dialogue of **approximately 300 words**.
     Make it engaging: hosts ask questions, clarify ideas with analogies, and
-    wrap up with a concise recap. Preserve technical accuracy.
     ### Lecture Content
     {content}
     """
 )
-# PDF extraction
-TOKEN_LIMIT = 8000
-def extract_pdf_text(path: str) -> str:
-    reader = PdfReader(path)
-    return "\n".join(p.extract_text() or "" for p in reader.pages)
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
-    return " ".join(words[:limit]) if len(words) > limit else text
-# TTS chunking
-CHUNK_CHAR_LIMIT = 280
-def split_chunks(text: str) -> List[str]:
-    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
-    chunks, curr = [], ""
-    for s in sentences:
-        if curr and len(curr) + len(s) + 1 > CHUNK_CHAR_LIMIT:
-            chunks.append(curr)
-            curr = s
         else:
-            curr = f"{curr} {s}" if curr else s
-    if curr: chunks.append(curr)
-    return chunks
-# Synthesize speech
-def synthesize(text: str, model_id: str, outdir: Path) -> str:
-    segments = []
-    for i, chunk in enumerate(split_chunks(text)):
-        audio_bytes = hf_tts_client.text_to_speech(chunk, model=model_id)
-        path = outdir / f"part{i}.flac"
-        path.write_bytes(audio_bytes)
-        seg = AudioSegment.from_file(path, format="flac")
-        segments.append(seg)
-    final = sum(segments, AudioSegment.empty())
-    out = outdir / "podcast.flac"
-    final.export(out, format="flac")
-    return str(out)
-# Main pipeline
 def generate_podcast(
-    gemini_key: str,
-    pdf_file: gr.File,
-    langs: List[str]
 ) -> List[Optional[Any]]:
-    if not gemini_key:
-        raise gr.Error("Enter Google AI Studio API Key.")
-    if not pdf_file:
-        raise gr.Error("Upload a PDF file.")
-    if not langs:
-        raise gr.Error("Select at least one language.")
-    genai.configure(api_key=gemini_key)
-    raw = extract_pdf_text(pdf_file.name)
-    content = truncate_text(raw)
-    tmp = Path(tempfile.mkdtemp())
-    results = []
-    data = {}
-    for code, info in LANG_INFO.items():
-        if info["name"] not in langs:
-            results.extend([None, None, None])
-            continue
-        # Generate script
-        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=content)
-        model = genai.GenerativeModel('gemini-1.5-flash-latest')
-        resp = model.generate_content(prompt)
-        script = resp.text.strip()
-        # Save plain text
-        script_path = tmp / f"script_{code}.txt"
-        script_path.write_text(script, encoding="utf-8")
-        # Render HTML version
-        html_script = f"<pre>{script}</pre>"
-        # Synthesize audio if available
-        audio_path = None
-        if hf_tts_client:
-            audio_path = synthesize(script, info["tts_model"], tmp / code)
-        results.extend([audio_path, html_script, str(script_path)])
-    return results
-# Interface
 inputs = [
-    gr.Textbox(label="Google AI Studio API Key", type="password"),
-    gr.File(label="Lecture PDF", file_types=[".pdf"]),
-    gr.CheckboxGroup(choices=[info["name"] for info in LANG_INFO.values()],
-                     value=["English"], label="Languages")
 ]
 outputs = []
-for code, info in LANG_INFO.items():
-    outputs.append(gr.Audio(label=f"{info['name']} Podcast", type="filepath"))
-    outputs.append(gr.HTML(label=f"{info['name']} Script HTML"))
-    outputs.append(gr.File(label=f"Download {info['name']} Script"))
 iface = gr.Interface(
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
     title="Lecture → Podcast & Script",
 )
 if __name__ == "__main__":
     iface.launch()

 # =============================================================
+# Hugging Face Space – Lecture → Podcast Generator (Gemini + HF TTS)
 # =============================================================
+# • **Text generation** – Google Gemini API (via user-provided genai API Key)
+# • **Speech synthesis** – Hugging Face Inference API for TTS (via HF_TOKEN secret)
+# -----------------------------------------------------------------
 import os
 import re
 import tempfile
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
+# For Hugging Face TTS
 from huggingface_hub import InferenceClient
+# For Google Gemini
+try:
+    import google.generativeai as genai
+except ImportError:
+    raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
 # ------------------------------------------------------------------
+# Hugging Face Inference API client for TTS (uses HF_TOKEN secret)
 # ------------------------------------------------------------------
+hf_tts_client: Optional[InferenceClient] = None
 hf_token = os.getenv("HF_TOKEN")
+if hf_token:
+    hf_tts_client = InferenceClient(token=hf_token)
+else:
+    print("WARNING: HF_TOKEN secret not found. Hugging Face TTS will not be available.")
+# ------------------------------------------------------------------
+# Language metadata for Hugging Face MMS-TTS models
+# ------------------------------------------------------------------
 LANG_INFO: Dict[str, Dict[str, str]] = {
     "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
     "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
+# ------------------------------------------------------------------
+# Prompt template for Gemini
+# ------------------------------------------------------------------
 PROMPT_TEMPLATE = textwrap.dedent(
     """
     You are producing a lively two-host educational podcast in {lang_name}.
     Summarize the following lecture content into a dialogue of **approximately 300 words**.
     Make it engaging: hosts ask questions, clarify ideas with analogies, and
+    wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
     ### Lecture Content
     {content}
     """
 )
+# PDF helpers -------------------------------------------------------
+def extract_pdf_text(pdf_path: str) -> str:
+    try:
+        reader = PdfReader(pdf_path)
+        return "\n".join(page.extract_text() or "" for page in reader.pages)
+    except Exception as e:
+        raise gr.Error(f"Failed to process PDF: {e}")
+TOKEN_LIMIT = 8000
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
+    if len(words) > limit:
+        gr.Warning(f"Input text was truncated from {len(words)} to {limit} words to fit LLM context window.")
+        return " ".join(words[:limit])
+    return text
+# ------------------------------------------------------------------
+# TTS helper using Hugging Face Inference API
+# ------------------------------------------------------------------
+CHUNK_CHAR_LIMIT_HF = 280
+def _split_to_chunks_hf(text: str, limit: int = CHUNK_CHAR_LIMIT_HF) -> List[str]:
+    sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
+    sentences = [s.strip() for s in sentences_raw if s.strip()]
+    chunks, current_chunk = [], ""
+    for sent in sentences:
+        if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
+            chunks.append(current_chunk)
+            current_chunk = sent
         else:
+            current_chunk += (" " + sent) if current_chunk else sent
+    if current_chunk:
+        chunks.append(current_chunk)
+    return [chunk for chunk in chunks if chunk.strip()]
+def synthesize_speech_hf(
+    text: str,
+    hf_model_id: str,
+    lang_tmpdir: Path,
+    tts_client: InferenceClient
+) -> Path:
+    chunks = _split_to_chunks_hf(text)
+    if not chunks:
+        raise ValueError("Text resulted in no speakable chunks after splitting.")
+    audio_segments: List[AudioSegment] = []
+    for idx, chunk in enumerate(chunks):
+        gr.Info(f"Synthesizing audio for chunk {idx + 1}/{len(chunks)} with HF TTS ({hf_model_id})...")
+        try:
+            audio_bytes = tts_client.text_to_speech(chunk, model=hf_model_id)
+        except Exception as e:
+            raise RuntimeError(f"HF TTS client error for chunk {idx+1}: {e}") from e
+        part_path = lang_tmpdir / f"part_{idx}.flac"
+        part_path.write_bytes(audio_bytes)
+        try:
+            segment = AudioSegment.from_file(part_path, format="flac")
+            audio_segments.append(segment)
+        except CouldntDecodeError as e:
+            raise RuntimeError(f"Failed to decode audio chunk {idx+1}: {e}") from e
+    combined_audio = sum(audio_segments, AudioSegment.empty())
+    final_path = lang_tmpdir / "podcast_audio.flac"
+    combined_audio.export(final_path, format="flac")
+    return final_path
+# ------------------------------------------------------------------
+# Main pipeline function for Gradio
+# ------------------------------------------------------------------
 def generate_podcast(
+    gemini_api_key_from_ui: Optional[str],
+    pdf_file_obj: Optional[gr.File],
+    selected_lang_names: List[str]
 ) -> List[Optional[Any]]:
+    """Generate a podcast script (and audio, when TTS is available) per language.
+
+    Args:
+        gemini_api_key_from_ui: Google AI Studio API key typed into the UI.
+        pdf_file_obj: Uploaded lecture PDF; its ``.name`` is used as the path.
+        selected_lang_names: Display names of the languages to generate.
+
+    Returns:
+        A flat list with three entries per language, in ``LANG_INFO`` order:
+        audio file path, Markdown script, script file path. Entries stay
+        ``None`` for languages that were not selected or produced no output.
+
+    Raises:
+        gr.Error: On missing inputs, a Gemini configuration or generation
+            failure, or when the extracted PDF text is empty. A TTS failure
+            only emits a warning so the script is still returned.
+    """
+    if not gemini_api_key_from_ui:
+        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
+    if not pdf_file_obj:
+        raise gr.Error("Please upload a PDF file.")
+    if not selected_lang_names:
+        raise gr.Error("Please select at least one language.")
+    try:
+        genai.configure(api_key=gemini_api_key_from_ui)
+    except Exception as e:
+        raise gr.Error(f"Failed to configure Gemini API: {e}") from e
+    if not hf_tts_client:
+        gr.Warning("HF TTS unavailable; only script will be generated.")
+    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
+    results_data = {
+        code: {"audio": None, "script_md": None, "script_file": None}
+        for code in LANG_INFO
+    }
+    lecture_raw = extract_pdf_text(pdf_file_obj.name)
+    lecture_text = truncate_text(lecture_raw)
+    if not lecture_text.strip():
+        raise gr.Error("Extracted PDF text is empty.")
+    # Gradio reads the returned file paths only after this function returns,
+    # so the directory has to outlive the call. A TemporaryDirectory context
+    # manager would delete the script/audio files before they are served.
+    tmpdir_base = Path(tempfile.mkdtemp())
+    gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')
+    for code in selected_codes:
+        info = LANG_INFO[code]
+        lang_name = info["name"]
+        hf_tts_model_id = info["tts_model"]
+        lang_tmpdir = tmpdir_base / code
+        lang_tmpdir.mkdir(parents=True, exist_ok=True)
+        # 1️⃣ Generate script via Gemini
+        prompt = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
+        try:
+            resp = gemini_model.generate_content(prompt)
+            dialogue = resp.text or ""
+        except Exception as e:
+            raise gr.Error(f"Gemini error for {lang_name}: {e}") from e
+        if not dialogue:
+            # Nothing to show, save, or speak for this language.
+            continue
+        # store Markdown script
+        results_data[code]["script_md"] = dialogue
+        # write .txt file
+        script_path = lang_tmpdir / f"podcast_script_{code}.txt"
+        script_path.write_text(dialogue, encoding="utf-8")
+        results_data[code]["script_file"] = str(script_path)
+        # 2️⃣ Synthesize audio via HF TTS
+        if not hf_tts_client:
+            continue
+        try:
+            audio_path = synthesize_speech_hf(dialogue, hf_tts_model_id, lang_tmpdir, hf_tts_client)
+            results_data[code]["audio"] = str(audio_path)
+        except Exception as e:
+            # Best effort: keep the script and tell the user audio failed.
+            # A bare gr.Error(...) that is never raised shows nothing at all.
+            gr.Warning(f"TTS error for {lang_name}: {e}")
+    # assemble outputs in the order: Audio, Markdown, File for each language
+    final_outputs: List[Optional[Any]] = []
+    for code in LANG_INFO:
+        out = results_data[code]
+        final_outputs.extend([out["audio"], out["script_md"], out["script_file"]])
+    return final_outputs
+# ------------------------------------------------------------------
+# Gradio Interface Setup
+# ------------------------------------------------------------------
+language_names_ordered = [info["name"] for info in LANG_INFO.values()]
 inputs = [
+    gr.Textbox(label="Google Gemini API Key", type="password", placeholder="Paste your key here"),
+    gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
+    gr.CheckboxGroup(choices=language_names_ordered, value=["English"], label="Select language(s)"),
 ]
 outputs = []
+for code in LANG_INFO.keys():
+    lang_name = LANG_INFO[code]["name"]
+    outputs.append(gr.Audio(label=f"{lang_name} Podcast", type="filepath"))
+    outputs.append(gr.Markdown(label=f"{lang_name} Script"))
+    outputs.append(gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"))
 iface = gr.Interface(
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
     title="Lecture → Podcast & Script",
+    description=(
+        "Enter your Gemini API Key, upload a lecture PDF, choose language(s), "
+        "and get a two-host podcast (audio) plus the Markdown script & downloadable text."
+    ),
+    allow_flagging="never",
 )
 if __name__ == "__main__":
+    if not os.getenv("HF_TOKEN"):
+        print("Reminder: set HF_TOKEN in Secrets for TTS to work.")
     iface.launch()