HaiderAUT committed on
Commit
cca7e91
·
verified ·
1 Parent(s): d4adc2b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +90 -77
app.py CHANGED
@@ -1,29 +1,28 @@
1
  # =============================================================
2
  # Lecture → Podcast & Script Generator (English Only)
3
- # • Text: Google Gemini API (via UI-provided key)
4
- # • Audio: Hugging Face InferenceClient.text_to_speech (public MMS-TTS for English)
5
  # =============================================================
6
 
7
- import os
8
  import re
9
  import tempfile
10
  import textwrap
11
  from pathlib import Path
12
- from typing import List, Optional, Any
13
 
14
  import gradio as gr
15
  from PyPDF2 import PdfReader
16
  from pydub import AudioSegment
17
  from pydub.exceptions import CouldntDecodeError
18
 
19
- # Hugging Face TTS client (anonymous/public access)
20
- from huggingface_hub import InferenceClient
21
-
22
  # Google Gemini SDK
23
  try:
24
  import google.generativeai as genai
25
  except ImportError:
26
- raise ImportError("Please install Google Generative AI SDK: pip install google-generativeai")
 
 
 
 
27
 
28
  # ------------------------------------------------------------------
29
  # Globals & templates
@@ -43,10 +42,11 @@ PROMPT_TEMPLATE = textwrap.dedent(
43
  HF_TTS_MODEL = "facebook/mms-tts-eng"
44
  CHUNK_CHAR_LIMIT = 280
45
 
 
46
  tts_client = InferenceClient()
47
 
48
  # ------------------------------------------------------------------
49
- # Helpers
50
  # ------------------------------------------------------------------
51
  def extract_pdf_text(pdf_path: str) -> str:
52
  reader = PdfReader(pdf_path)
@@ -69,98 +69,111 @@ def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
69
  chunks.append(current)
70
  return chunks
71
 
72
- def synthesize_speech(text: str, model_id: str, out_dir: Path) -> Path:
73
- chunks = split_to_chunks(text)
74
  if not chunks:
75
- raise ValueError("No text to synthesize.")
76
  segments = []
77
- for i, chunk in enumerate(chunks):
78
- try:
79
- audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
80
- except Exception as e:
81
- raise RuntimeError(f"TTS failed on chunk {i+1}: {e}")
82
- part_path = out_dir / f"seg_{i}.flac"
83
  part_path.write_bytes(audio_bytes)
84
  try:
85
  seg = AudioSegment.from_file(part_path, format="flac")
86
  segments.append(seg)
87
  except CouldntDecodeError as e:
88
- raise RuntimeError(f"Could not decode segment {i+1}: {e}")
89
  final_audio = sum(segments, AudioSegment.empty())
90
- out_path = out_dir / "podcast_audio.flac"
91
- final_audio.export(out_path, format="flac")
92
- return out_path
93
 
94
  # ------------------------------------------------------------------
95
- # Main pipeline
96
  # ------------------------------------------------------------------
97
- def generate_podcast(
98
- gemini_api_key: Optional[str],
99
- lecture_pdf: Optional[gr.File]
100
- ) -> List[Optional[Any]]:
101
  if not gemini_api_key:
102
- raise gr.Error("Enter your Google AI Studio API Key.")
103
  if not lecture_pdf:
104
- raise gr.Error("Upload a lecture PDF file.")
105
-
106
- genai.configure(api_key=gemini_api_key)
107
-
108
- raw = extract_pdf_text(lecture_pdf.name)
109
- content = truncate_text(raw)
110
- if not content.strip():
111
- raise gr.Error("Lecture PDF contained no extractable text.")
112
-
113
  try:
114
- gemini_model = genai.GenerativeModel("gemini-1.5-flash-latest")
 
115
  except Exception as e:
116
- raise gr.Error(f"Gemini init failed: {e}")
117
 
 
 
 
 
 
 
 
118
  prompt = PROMPT_TEMPLATE.format(content=content)
119
  try:
120
- resp = gemini_model.generate_content(prompt)
121
- script = resp.text or ""
122
  except Exception as e:
123
  raise gr.Error(f"Gemini generation error: {e}")
124
 
 
 
 
 
 
 
 
 
 
 
 
125
  with tempfile.TemporaryDirectory() as td:
126
- tmp = Path(td)
127
- # Save script file
128
- script_path = tmp / "podcast_script.txt"
129
- script_path.write_text(script, encoding="utf-8")
130
- # Synthesize audio
131
- try:
132
- audio_path = synthesize_speech(script, HF_TTS_MODEL, tmp)
133
- except Exception as e:
134
- raise gr.Error(f"Speech synthesis error: {e}")
135
- # Return [audio, markdown script, txt file]
136
- return [str(audio_path), script, str(script_path)]
137
 
138
  # ------------------------------------------------------------------
139
- # Gradio Interface
140
  # ------------------------------------------------------------------
141
- iface = gr.Interface(
142
- fn=generate_podcast,
143
- inputs=[
144
- gr.Textbox(
 
 
145
  label="Google Gemini API Key",
146
  type="password",
147
- placeholder="Paste your key here"
148
- ),
149
- gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
150
- ],
151
- outputs=[
152
- gr.Audio(label="English Podcast", type="filepath"),
153
- gr.Markdown(label="English Script"), # renders the script
154
- gr.File(label="Download English Script (.txt)", type="filepath"),
155
- ],
156
- title="Lecture → English Podcast & Script",
157
- description=(
158
- "Enter your Gemini API Key and upload a lecture PDF. "
159
- "Generates a two-host podcast audio and a Markdown script in English "
160
- "using Google Gemini for text and Hugging Face MMS-TTS for audio."
161
- ),
162
- allow_flagging="never",
163
- )
164
-
165
- if __name__ == "__main__":
166
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
1
  # =============================================================
2
  # Lecture → Podcast & Script Generator (English Only)
3
+ # Two-step: 1) Gemini script 2) HF MMS-TTS audio
 
4
  # =============================================================
5
 
 
6
  import re
7
  import tempfile
8
  import textwrap
9
  from pathlib import Path
10
+ from typing import List, Optional
11
 
12
  import gradio as gr
13
  from PyPDF2 import PdfReader
14
  from pydub import AudioSegment
15
  from pydub.exceptions import CouldntDecodeError
16
 
 
 
 
17
  # Google Gemini SDK
18
  try:
19
  import google.generativeai as genai
20
  except ImportError:
21
+ raise ImportError("Please install the Google Generative AI SDK:\n"
22
+ " pip install google-generativeai")
23
+
24
+ # Hugging Face TTS client (anonymous/public)
25
+ from huggingface_hub import InferenceClient
26
 
27
  # ------------------------------------------------------------------
28
  # Globals & templates
 
42
  HF_TTS_MODEL = "facebook/mms-tts-eng"
43
  CHUNK_CHAR_LIMIT = 280
44
 
45
+ # Initialize the HF TTS client once
46
  tts_client = InferenceClient()
47
 
48
  # ------------------------------------------------------------------
49
+ # Helper functions
50
  # ------------------------------------------------------------------
51
  def extract_pdf_text(pdf_path: str) -> str:
52
  reader = PdfReader(pdf_path)
 
69
  chunks.append(current)
70
  return chunks
71
 
72
+ def synthesize_speech(script: str, model_id: str, out_dir: Path) -> str:
73
+ chunks = split_to_chunks(script)
74
  if not chunks:
75
+ raise RuntimeError("No text chunks to synthesize.")
76
  segments = []
77
+ for idx, chunk in enumerate(chunks):
78
+ audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
79
+ part_path = out_dir / f"seg_{idx}.flac"
 
 
 
80
  part_path.write_bytes(audio_bytes)
81
  try:
82
  seg = AudioSegment.from_file(part_path, format="flac")
83
  segments.append(seg)
84
  except CouldntDecodeError as e:
85
+ raise RuntimeError(f"Failed to decode chunk {idx}: {e}") from e
86
  final_audio = sum(segments, AudioSegment.empty())
87
+ final_path = out_dir / "podcast_audio.flac"
88
+ final_audio.export(final_path, format="flac")
89
+ return str(final_path)
90
 
91
  # ------------------------------------------------------------------
92
+ # Step 1: Generate script via Gemini
93
  # ------------------------------------------------------------------
94
+ def generate_script(
95
+ gemini_api_key: str,
96
+ lecture_pdf: gr.File
97
+ ) -> List[str]:
98
  if not gemini_api_key:
99
+ raise gr.Error("Please enter your Google AI Studio API Key.")
100
  if not lecture_pdf:
101
+ raise gr.Error("Please upload a lecture PDF.")
102
+ # Configure Gemini
 
 
 
 
 
 
 
103
  try:
104
+ genai.configure(api_key=gemini_api_key)
105
+ model = genai.GenerativeModel("gemini-1.5-flash-latest")
106
  except Exception as e:
107
+ raise gr.Error(f"Gemini init/config error: {e}")
108
 
109
+ # Extract and truncate text
110
+ raw_text = extract_pdf_text(lecture_pdf.name)
111
+ content = truncate_text(raw_text)
112
+ if not content.strip():
113
+ raise gr.Error("No extractable text found in the PDF.")
114
+
115
+ # Generate dialogue script
116
  prompt = PROMPT_TEMPLATE.format(content=content)
117
  try:
118
+ response = model.generate_content(prompt)
119
+ script = response.text or ""
120
  except Exception as e:
121
  raise gr.Error(f"Gemini generation error: {e}")
122
 
123
+ return [script, script] # [for Markdown display, for state storage]
124
+
125
+ # ------------------------------------------------------------------
126
+ # Step 2: Generate audio from provided script
127
+ # ------------------------------------------------------------------
128
+ def generate_audio(
129
+ script: str
130
+ ) -> str:
131
+ if not script:
132
+ raise gr.Error("No script available. Please generate the script first.")
133
+ # Create a temp dir for audio parts
134
  with tempfile.TemporaryDirectory() as td:
135
+ out_dir = Path(td)
136
+ audio_path = synthesize_speech(script, HF_TTS_MODEL, out_dir)
137
+ return audio_path
 
 
 
 
 
 
 
 
138
 
139
  # ------------------------------------------------------------------
140
+ # Gradio UI
141
  # ------------------------------------------------------------------
142
+ with gr.Blocks() as demo:
143
+ # Shared state for the script
144
+ script_state = gr.State()
145
+
146
+ with gr.Tab("Generate Script"):
147
+ api_key_input = gr.Textbox(
148
  label="Google Gemini API Key",
149
  type="password",
150
+ placeholder="Enter your key"
151
+ )
152
+ pdf_input = gr.File(
153
+ label="Upload Lecture PDF",
154
+ file_types=[".pdf"]
155
+ )
156
+ script_md = gr.Markdown(
157
+ label="Generated Script",
158
+ placeholder="Your script will appear here..."
159
+ )
160
+ gen_script_btn = gr.Button("Generate Script")
161
+ gen_script_btn.click(
162
+ fn=generate_script,
163
+ inputs=[api_key_input, pdf_input],
164
+ outputs=[script_md, script_state]
165
+ )
166
+
167
+ with gr.Tab("Generate Audio"):
168
+ gen_audio_btn = gr.Button("Generate Audio")
169
+ audio_out = gr.Audio(
170
+ label="Podcast Audio",
171
+ type="filepath"
172
+ )
173
+ gen_audio_btn.click(
174
+ fn=generate_audio,
175
+ inputs=[script_state],
176
+ outputs=[audio_out]
177
+ )
178
+
179
+ demo.launch()