Spaces:

HaiderAUT
/

PodCastIt

Build error

App Files Files Community

HaiderAUT committed on May 8

Commit

53744b5

verified ·

1 Parent(s): f036ad8

Update app.py

Browse files

Files changed (1) hide show

app.py +274 -102

app.py CHANGED Viewed

@@ -1,61 +1,62 @@
 import os
 import re
 import tempfile
 import textwrap
 from pathlib import Path
-from typing import List, Dict, Optional
 import gradio as gr
-from huggingface_hub import InferenceClient
-from PyPDF2 import PdfReader  # For PDF processing
-from smolagents import HfApiModel  # For LLM interaction
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
-# ------------------------------------------------------------------
-# LLM setup – remote Qwen model via SmolAgents
-# ------------------------------------------------------------------
-llm = HfApiModel(
-    model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
-    max_tokens=2048,
-    temperature=0.5,
-)
-# ------------------------------------------------------------------
-# Hugging Face Inference API client
-# ------------------------------------------------------------------
-client = InferenceClient(token=os.getenv("HF_TOKEN", None))
 # ------------------------------------------------------------------
-# Language metadata and open TTS models
 # ------------------------------------------------------------------
 LANG_INFO: Dict[str, Dict[str, str]] = {
-    "en": {"name": "English", "tts_model": "facebook/mms-tts-eng"},
-    "bn": {"name": "Bangla",  "tts_model": "facebook/mms-tts-ben"},
-    "zh": {"name": "Chinese", "tts_model": "facebook/mms-tts-zho"},
-    "ur": {"name": "Urdu",    "tts_model": "facebook/mms-tts-urd"},
-    "ne": {"name": "Nepali",  "tts_model": "facebook/mms-tts-npi"},
 }
 LANG_CODE_BY_NAME = {info["name"]: code for code, info in LANG_INFO.items()}
 PROMPT_TEMPLATE = textwrap.dedent(
     """
     You are producing a lively two-host educational podcast in {lang_name}.
-    Summarize the following lecture content into a dialogue of ~300 words.
     Make it engaging: hosts ask questions, clarify ideas with analogies, and
-    wrap up with a concise recap. Preserve technical accuracy.
     ### Lecture Content
     {content}
     """
 )
-TOKEN_LIMIT = 8000
-CHUNK_CHAR_LIMIT = 280
-# ------------------------------------------------------------------
-# PDF text extraction
-# ------------------------------------------------------------------
 def extract_pdf_text(pdf_path: str) -> str:
     try:
         reader = PdfReader(pdf_path)
@@ -63,114 +64,285 @@ def extract_pdf_text(pdf_path: str) -> str:
     except Exception as e:
         raise gr.Error(f"Failed to process PDF: {e}")
-# ------------------------------------------------------------------
-# Helpers
-# ------------------------------------------------------------------
 def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
     words = text.split()
     if len(words) > limit:
         return " ".join(words[:limit])
     return text
 def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
-    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
-    chunks, current = [], ""
     for sent in sentences:
-        if current and len(current) + len(sent) + 1 > limit:
-            chunks.append(current)
-            current = sent
         else:
-            current = f"{current} {sent}".strip()
-    if current:
-        chunks.append(current)
-    return chunks
-def synthesize_speech(text: str, model_id: str, tempdir: Path) -> Path:
     chunks = _split_to_chunks(text)
     if not chunks:
-        raise ValueError("No text chunks to synthesize.")
-    segments = []
-    for i, chunk in enumerate(chunks):
         try:
-            audio_bytes = client.text_to_speech(chunk, model=model_id)
-        except HubHTTPError as e:
-            raise RuntimeError(f"TTS error on chunk {i}: {e}")
-        part = tempdir / f"seg_{i}.flac"
-        part.write_bytes(audio_bytes)
         try:
-            seg = AudioSegment.from_file(part, format="flac")
         except CouldntDecodeError as e:
-            raise RuntimeError(f"Decode error on chunk {i}: {e}")
-        segments.append(seg)
-    combined = sum(segments, AudioSegment.empty())
-    outpath = tempdir / "podcast.flac"
-    combined.export(outpath, format="flac")
-    return outpath
 # ------------------------------------------------------------------
-# Main pipeline
 # ------------------------------------------------------------------
-def generate_podcast(pdf_file: Optional[gr.File], languages: List[str]):
-    if not pdf_file:
         raise gr.Error("Please upload a PDF file.")
-    if not languages:
-        raise gr.Error("Select at least one language.")
-    # Extract and truncate
-    text = extract_pdf_text(pdf_file.name)
-    if not text.strip():
-        raise gr.Error("No text found in PDF.")
-    lecture = truncate_text(text)
-    transcripts, audios = [], []
-    with tempfile.TemporaryDirectory() as td:
-        base = Path(td)
-        for name in languages:
-            code = LANG_CODE_BY_NAME[name]
-            # 1️⃣ Dialogue
-            prompt = PROMPT_TEMPLATE.format(lang_name=name, content=lecture)
-            dialogue = llm(prompt).strip()
-            transcripts.append(dialogue)
-            # 2️⃣ Speech
-            tempdir = base / code
-            tempdir.mkdir(parents=True, exist_ok=True)
-            audio_path = synthesize_speech(dialogue, LANG_INFO[code]["tts_model"], tempdir)
-            audios.append(str(audio_path))
-    # Return alternating transcript and audio path
-    results: List = []
-    for t, a in zip(transcripts, audios):
-        results.extend([t, a])
-    return results
 # ------------------------------------------------------------------
-# Gradio UI
 # ------------------------------------------------------------------
-languages = [info["name"] for info in LANG_INFO.values()]
 inputs = [
-    gr.File(label="Lecture PDF", file_types=[".pdf"]),
-    gr.CheckboxGroup(languages, value=["English"], label="Languages"),
 ]
-# Two outputs per language: transcript and audio
 outputs = []
-for name in languages:
-    outputs.append(gr.Textbox(label=f"{name} Transcript", interactive=False))
-    outputs.append(gr.Audio(label=f"{name} Podcast", type="filepath"))
 iface = gr.Interface(
     fn=generate_podcast,
     inputs=inputs,
     outputs=outputs,
-    title="Lecture → Podcast Generator",
-    description="Upload a lecture PDF, select languages, get dialogue transcript and audio podcast."
 )
 if __name__ == "__main__":
-    iface.launch()

+# =============================================================
+# Hugging Face Space – Lecture → Podcast Generator (Google Gemini & TTS)
+# =============================================================
+# • **Text generation** – Google Gemini API
+# • **Speech synthesis** – Google Cloud Text-to-Speech API
+# -----------------------------------------------------------------
 import os
 import re
 import tempfile
 import textwrap
 from pathlib import Path
+from typing import List, Dict, Optional, Any
 import gradio as gr
+from PyPDF2 import PdfReader
 from pydub import AudioSegment
 from pydub.exceptions import CouldntDecodeError
+# Import Google Cloud libraries
+try:
+    import google.generativeai as genai
+    from google.cloud import texttospeech
+except ImportError:
+    raise ImportError(
+        "Please install required Google libraries: "
+        "pip install google-generativeai google-cloud-texttospeech"
+    )
 # ------------------------------------------------------------------
# Languages offered in the UI, keyed by a short internal code.
# Each entry holds the display name shown to the user and the BCP-47 tag
# passed to Google Cloud Text-to-Speech. No explicit voice name is set, so
# the service is left to pick a voice for the language tag.
# NOTE(review): confirm every tag below appears in Google's supported-voices
# list; an unsupported tag would only surface as a TTS request failure.
# ------------------------------------------------------------------
_LANGUAGE_ROWS = (
    # (internal code, display name, TTS language tag)
    ("en", "English", "en-US"),
    ("bn", "Bangla", "bn-IN"),
    ("zh", "Chinese (Mandarin)", "cmn-CN"),  # "cmn" is the tag used for Mandarin
    ("ur", "Urdu", "ur-PK"),
    ("ne", "Nepali", "ne-NP"),
)
LANG_INFO: Dict[str, Dict[str, str]] = {
    short_code: {"name": display_name, "tts_lang_code": bcp47_tag}
    for short_code, display_name, bcp47_tag in _LANGUAGE_ROWS
}
# Reverse lookup: display name (as shown in the checkbox group) -> internal code.
LANG_CODE_BY_NAME = {entry["name"]: short_code for short_code, entry in LANG_INFO.items()}
# ------------------------------------------------------------------
# Prompt sent to Gemini. The `{lang_name}` and `{content}` placeholders are
# filled in with str.format() before each request.
# ------------------------------------------------------------------
PROMPT_TEMPLATE = "\n".join(
    (
        "",  # the template starts with a newline
        "You are producing a lively two-host educational podcast in {lang_name}.",
        "Summarize the following lecture content into a dialogue of **approximately 300 words**.",
        "Make it engaging: hosts ask questions, clarify ideas with analogies, and",
        "wrap up with a concise recap. Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).",
        "### Lecture Content",
        "{content}",
        "",  # ...and ends with one
    )
)
+# PDF helpers (unchanged) -------------------------------------------
 def extract_pdf_text(pdf_path: str) -> str:
     try:
         reader = PdfReader(pdf_path)
     except Exception as e:
         raise gr.Error(f"Failed to process PDF: {e}")
TOKEN_LIMIT = 8000  # Despite the name, this is a cap on whitespace-separated words


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Cap *text* at *limit* whitespace-separated words.

    Text at or under the cap is returned unchanged (original whitespace
    included). Longer text is cut to its first *limit* words, re-joined with
    single spaces, and a Gradio warning reporting the truncation is shown.
    """
    tokens = text.split()
    total = len(tokens)
    if total <= limit:
        return text
    gr.Warning(f"Input text was truncated from {total} to {limit} words to fit LLM context window.")
    return " ".join(tokens[:limit])
+# ------------------------------------------------------------------
+# TTS helper – chunk long text (Google TTS has a limit of 5000 bytes per request)
+# ------------------------------------------------------------------
+CHUNK_CHAR_LIMIT = 1500  # Google TTS limit is 5000 bytes. Characters are safer.
+                        # Average 3 bytes/char for UTF-8, so 1500 chars is ~4500 bytes.
 def _split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
+    sentences_raw = re.split(r"(?<=[.!?])\s+", text.strip())
+    sentences = [s.strip() for s in sentences_raw if s.strip()]
+    if not sentences: return []
+    chunks, current_chunk = [], ""
     for sent in sentences:
+        if current_chunk and (len(current_chunk) + len(sent) + 1 > limit):
+            chunks.append(current_chunk)
+            current_chunk = sent
         else:
+            current_chunk += (" " + sent) if current_chunk else sent
+    if current_chunk: chunks.append(current_chunk)
+    return [chunk for chunk in chunks if chunk.strip()]
def synthesize_speech_google(
    text: str,
    google_lang_code: str,
    lang_tmpdir: Path,
    tts_client: texttospeech.TextToSpeechClient
) -> Path:
    """Render *text* to one MP3 file with Google Cloud Text-to-Speech.

    The text is split with ``_split_to_chunks``; each chunk is synthesized as
    MP3, written to ``part_<index>.mp3`` inside *lang_tmpdir*, decoded with
    pydub, and finally all decoded parts are concatenated in order.

    Args:
        text: The script to speak.
        google_lang_code: Language code passed to ``VoiceSelectionParams``.
        lang_tmpdir: Existing directory that receives the part files and the
            final ``podcast_audio.mp3``.
        tts_client: Client used for the ``synthesize_speech`` requests.

    Returns:
        Path of the combined ``podcast_audio.mp3``.

    Raises:
        ValueError: If splitting *text* yields no chunks.
        RuntimeError: If a TTS request fails, a part cannot be decoded, or no
            audio segment was produced.
    """
    pieces = _split_to_chunks(text)
    if not pieces:
        raise ValueError("Text resulted in no speakable chunks after splitting.")

    total = len(pieces)
    decoded: List[AudioSegment] = []
    for position, piece in enumerate(pieces, start=1):
        gr.Info(f"Synthesizing audio for chunk {position}/{total} with Google TTS...")

        # Only the language code is given; no voice name or gender is pinned.
        request = dict(
            input=texttospeech.SynthesisInput(text=piece),
            voice=texttospeech.VoiceSelectionParams(language_code=google_lang_code),
            audio_config=texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3
            ),
        )
        try:
            reply = tts_client.synthesize_speech(**request)
        except Exception as e:
            raise RuntimeError(f"Google TTS request failed for chunk {position}: {e}") from e

        # Part files are numbered from zero.
        mp3_path = lang_tmpdir / f"part_{position - 1}.mp3"
        mp3_path.write_bytes(reply.audio_content)

        try:
            decoded.append(AudioSegment.from_mp3(mp3_path))
        except CouldntDecodeError as e:
            raise RuntimeError(f"Failed to decode MP3 audio chunk {position} from {mp3_path}. Error: {e}") from e

    if not decoded:
        raise RuntimeError("No audio segments were successfully synthesized or decoded.")

    # Concatenate in order, starting from an empty segment.
    merged = AudioSegment.empty()
    for clip in decoded:
        merged = merged + clip

    output_path = lang_tmpdir / "podcast_audio.mp3"
    merged.export(output_path, format="mp3")
    return output_path
 # ------------------------------------------------------------------
+# Main pipeline function for Gradio
 # ------------------------------------------------------------------
def _create_tts_client() -> "texttospeech.TextToSpeechClient":
    """Build the Google Cloud Text-to-Speech client for this request.

    If the ``GOOGLE_CREDS_JSON_CONTENT`` environment variable (a Space secret
    holding the full service-account JSON) is set, the client is built directly
    from it. Nothing is written to disk and ``GOOGLE_APPLICATION_CREDENTIALS``
    is not modified. If the secret is absent or unusable, the client falls
    back to the library's default credential lookup.

    Raises:
        gr.Error: If the fallback client cannot be constructed.
    """
    creds_json = os.getenv("GOOGLE_CREDS_JSON_CONTENT")
    if creds_json:
        import json  # local import: only needed on this path

        try:
            client = texttospeech.TextToSpeechClient.from_service_account_info(
                json.loads(creds_json)
            )
        except Exception as e:
            # Best effort: report the problem, then try default credentials.
            gr.Warning(f"Could not process GOOGLE_CREDS_JSON_CONTENT secret: {e}. TTS might fail.")
        else:
            gr.Info("Using GOOGLE_CREDS_JSON_CONTENT secret for Text-to-Speech API authentication.")
            return client
    elif not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
        gr.Warning(
            "GOOGLE_APPLICATION_CREDENTIALS environment variable not set, and no "
            "GOOGLE_CREDS_JSON_CONTENT secret found. "
            "Google Text-to-Speech API calls may fail. "
            "Please set up authentication for Google Cloud Text-to-Speech."
        )

    try:
        return texttospeech.TextToSpeechClient()
    except Exception as e:
        raise gr.Error(
            f"Failed to initialize Google Text-to-Speech client. Ensure authentication is set up. Error: {e}"
        ) from e


def generate_podcast(
    gemini_api_key: Optional[str],
    pdf_file_obj: Optional[gr.File],
    selected_lang_names: List[str]
) -> List[Optional[Any]]:
    """Turn a lecture PDF into a podcast script and audio for each language.

    For each selected language the lecture text is sent to Gemini to produce a
    dialogue, the dialogue is saved as a ``.txt`` file, and it is synthesized
    to MP3 with Google Cloud Text-to-Speech. A failure for one language is
    reported as a warning and does not stop the remaining languages.

    Args:
        gemini_api_key: API key passed to ``genai.configure``.
        pdf_file_obj: Uploaded file; its ``.name`` attribute is used as the
            path of the PDF.
        selected_lang_names: Display names as listed in ``LANG_INFO``.

    Returns:
        A flat list with three entries per language in ``LANG_INFO`` order:
        audio file path, script text, script file path. Entries are ``None``
        for languages that were not selected or that failed.

    Raises:
        gr.Error: On missing inputs, configuration/authentication failure, an
            empty PDF, or any unexpected error.
    """
    if not gemini_api_key:
        raise gr.Error("Please enter your Google AI Studio API Key for Gemini.")
    if not pdf_file_obj:
        raise gr.Error("Please upload a PDF file.")
    if not selected_lang_names:
        raise gr.Error("Please select at least one language for the podcast.")

    try:
        genai.configure(api_key=gemini_api_key)
    except Exception as e:
        raise gr.Error(f"Failed to configure Gemini API. Check your API key. Error: {e}") from e

    tts_client = _create_tts_client()

    selected_codes = [LANG_CODE_BY_NAME[name] for name in selected_lang_names]
    results_data: Dict[str, Dict[str, Optional[str]]] = {
        code: {"audio": None, "script_text": None, "script_file": None}
        for code in LANG_INFO
    }

    try:
        gr.Info("Extracting text from PDF...")
        lecture_raw = extract_pdf_text(pdf_file_obj.name)
        lecture_text = truncate_text(lecture_raw)
        if not lecture_text.strip():
            raise gr.Error("Could not extract any text from the PDF, or the PDF content is empty.")

        gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')

        # The returned audio/script paths must still exist after this function
        # returns, so the directory is created with mkdtemp() rather than a
        # TemporaryDirectory context (which would delete the files on exit,
        # before the caller ever sees them). The directory is intentionally
        # not removed here.
        output_root = Path(tempfile.mkdtemp(prefix="podcast_"))

        for code in selected_codes:
            info = LANG_INFO[code]
            lang_name = info["name"]
            google_tts_lang = info["tts_lang_code"]
            gr.Info(f"Processing for {lang_name}...")

            lang_dir = output_root / code
            lang_dir.mkdir(parents=True, exist_ok=True)

            # --- 1. Dialogue ------------------------------------------------
            gr.Info(f"Generating dialogue for {lang_name} with Gemini...")
            prompt_for_gemini = PROMPT_TEMPLATE.format(lang_name=lang_name, content=lecture_text)
            try:
                response = gemini_model.generate_content(prompt_for_gemini)
                dialogue = response.text
                if not dialogue or not dialogue.strip():
                    gr.Warning(f"Gemini returned empty dialogue for {lang_name}. Skipping.")
                    continue
                results_data[code]["script_text"] = dialogue
                script_file_path = lang_dir / f"podcast_script_{code}.txt"
                script_file_path.write_text(dialogue, encoding="utf-8")
                results_data[code]["script_file"] = str(script_file_path)
            except Exception as e:
                # gr.Error only has an effect when raised; a warning lets the
                # user see the failure while other languages still run.
                gr.Warning(f"Error generating dialogue with Gemini for {lang_name}: {e}")
                continue

            # --- 2. Speech --------------------------------------------------
            gr.Info(f"Synthesizing speech for {lang_name} with Google TTS...")
            try:
                tts_path = synthesize_speech_google(dialogue, google_tts_lang, lang_dir, tts_client)
            except ValueError as e:
                gr.Warning(f"Could not synthesize speech for {lang_name} (ValueError): {e}")
            except RuntimeError as e:
                gr.Warning(f"Error synthesizing speech for {lang_name} (RuntimeError): {e}")
            except Exception as e:
                gr.Warning(f"Unexpected error during speech synthesis for {lang_name}: {e}")
            else:
                results_data[code]["audio"] = str(tts_path)

        # Flatten in LANG_INFO order: audio, script text, script file.
        final_ordered_results: List[Optional[Any]] = []
        for lang_output_data in results_data.values():
            final_ordered_results.append(lang_output_data["audio"])
            final_ordered_results.append(lang_output_data["script_text"])
            final_ordered_results.append(lang_output_data["script_file"])

        gr.Info("Podcast generation complete!")
        return final_ordered_results

    except gr.Error:
        # Already a user-facing error; re-raise with its traceback intact.
        raise
    except Exception as e:
        import traceback
        print("An unexpected error occurred in generate_podcast:")
        traceback.print_exc()
        raise gr.Error(f"An unexpected server error occurred. Details: {str(e)[:100]}...") from e
 # ------------------------------------------------------------------
+# Gradio Interface Setup
 # ------------------------------------------------------------------
# Display names in LANG_INFO order; used as the checkbox choices.
language_names_ordered = [entry["name"] for entry in LANG_INFO.values()]

# Input widgets, in the order generate_podcast() receives its arguments.
api_key_box = gr.Textbox(
    label="Enter your Google AI Studio API Key (for Gemini)",
    type="password",
    placeholder="Paste your API key here",
)
pdf_upload = gr.File(label="Upload Lecture PDF", file_types=[".pdf"])
language_picker = gr.CheckboxGroup(
    choices=language_names_ordered,
    value=["English"],
    label="Select podcast language(s) to generate",
)
inputs = [api_key_box, pdf_upload, language_picker]

# Three output widgets per language, in LANG_INFO order:
# audio player, rendered script, downloadable script file.
outputs = []
for code, info in LANG_INFO.items():
    lang_name = info["name"]
    outputs += [
        gr.Audio(label=f"{lang_name} Podcast (.mp3)", type="filepath"),
        gr.Markdown(label=f"{lang_name} Script"),
        gr.File(label=f"Download {lang_name} Script (.txt)", type="filepath"),
    ]

iface = gr.Interface(
    fn=generate_podcast,
    inputs=inputs,
    outputs=outputs,
    title="Lecture → Podcast & Script (Google Gemini & TTS)",
    description=(
        "**IMPORTANT SETUP:**\n"
        "1. Enter your Google AI Studio API Key for Gemini text generation.\n"
        "2. For Text-to-Speech: Enable the 'Cloud Text-to-Speech API' in your Google Cloud Project. "
        "Create a service account with 'Cloud Text-to-Speech API User' role, download its JSON key. "
        "In this Hugging Face Space, go to 'Settings' -> 'Secrets' and add a new secret named `GOOGLE_CREDS_JSON_CONTENT`. "
        "Paste the *entire content* of your service account JSON key file as the value for this secret.\n\n"
        "Upload a lecture PDF, choose language(s), and receive an audio podcast "
        "and its script. Dialogue by Google Gemini, speech by Google Cloud TTS."
    ),
    allow_flagging="never",
)

if __name__ == "__main__":
    # Text-to-Speech needs credentials at launch time: either the
    # GOOGLE_CREDS_JSON_CONTENT environment variable, or (e.g. for local
    # testing) GOOGLE_APPLICATION_CREDENTIALS pointing at a service-account
    # key file.
    iface.launch()