HaiderAUT committed on
Commit
03ef672
·
verified ·
1 Parent(s): cf56cc8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -155
app.py CHANGED
@@ -1,179 +1,113 @@
1
  # =============================================================
2
- # Lecture → Podcast & Script Generator (English Only)
3
- # Two-step: 1) Gemini script 2) HF MMS-TTS audio
 
4
  # =============================================================
5
 
 
6
  import re
7
  import tempfile
8
  import textwrap
9
  from pathlib import Path
10
- from typing import List, Optional
11
 
12
  import gradio as gr
13
  from PyPDF2 import PdfReader
14
- from pydub import AudioSegment
15
- from pydub.exceptions import CouldntDecodeError
16
-
17
- # Google Gemini SDK
18
- try:
19
- import google.generativeai as genai
20
- except ImportError:
21
- raise ImportError("Please install the Google Generative AI SDK:\n"
22
- " pip install google-generativeai")
23
-
24
- # Hugging Face TTS client (anonymous/public)
25
  from huggingface_hub import InferenceClient
26
 
27
- # ------------------------------------------------------------------
28
- # Globals & templates
29
- # ------------------------------------------------------------------
30
- PROMPT_TEMPLATE = textwrap.dedent(
31
- """
32
- You are producing a lively two-host educational podcast in English.
33
- Summarize the following lecture content into a dialogue of approximately 300 words.
34
- Make it engaging: hosts ask questions, clarify ideas with analogies, and wrap up with a concise recap.
35
- Preserve technical accuracy. Use Markdown for host names (e.g., **Host 1:**).
36
-
37
- ### Lecture Content
38
- {content}
39
- """
40
- )
41
-
42
- HF_TTS_MODEL = "facebook/mms-tts-eng"
43
- CHUNK_CHAR_LIMIT = 280
44
-
45
- # Initialize the HF TTS client once
46
- tts_client = InferenceClient()
47
-
48
- # ------------------------------------------------------------------
49
- # Helper functions
50
- # ------------------------------------------------------------------
 
 
 
 
 
 
 
 
51
  def extract_pdf_text(pdf_path: str) -> str:
52
  reader = PdfReader(pdf_path)
53
  return "\n".join(page.extract_text() or "" for page in reader.pages)
54
 
55
- def truncate_text(text: str, max_words: int = 8000) -> str:
56
- words = text.split()
57
- return " ".join(words[:max_words])
58
-
59
- def split_to_chunks(text: str, limit: int = CHUNK_CHAR_LIMIT) -> List[str]:
60
- sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
61
- chunks, current = [], ""
62
- for sent in sentences:
63
- if current and len(current) + len(sent) + 1 > limit:
64
- chunks.append(current)
65
- current = sent
66
  else:
67
- current = f"{current} {sent}".strip() if current else sent
68
- if current:
69
- chunks.append(current)
70
  return chunks
71
 
72
- def synthesize_speech(script: str, model_id: str, out_dir: Path) -> str:
73
- chunks = split_to_chunks(script)
74
- if not chunks:
75
- raise RuntimeError("No text chunks to synthesize.")
76
- segments = []
77
- for idx, chunk in enumerate(chunks):
78
- audio_bytes = tts_client.text_to_speech(chunk, model=model_id)
79
- part_path = out_dir / f"seg_{idx}.flac"
80
- part_path.write_bytes(audio_bytes)
81
- try:
82
- seg = AudioSegment.from_file(part_path, format="flac")
83
- segments.append(seg)
84
- except CouldntDecodeError as e:
85
- raise RuntimeError(f"Failed to decode chunk {idx}: {e}") from e
86
- final_audio = sum(segments, AudioSegment.empty())
87
- final_path = out_dir / "podcast_audio.flac"
88
- final_audio.export(final_path, format="flac")
89
- return str(final_path)
90
-
91
- # ------------------------------------------------------------------
92
- # Step 1: Generate script via Gemini
93
- # ------------------------------------------------------------------
94
- def generate_script(
95
- gemini_api_key: str,
96
- lecture_pdf: gr.File
97
- ) -> List[str]:
98
- if not gemini_api_key:
99
- raise gr.Error("Please enter your Google AI Studio API Key.")
100
  if not lecture_pdf:
101
  raise gr.Error("Please upload a lecture PDF.")
102
- # Configure Gemini
103
- try:
104
- genai.configure(api_key=gemini_api_key)
105
- model = genai.GenerativeModel("gemini-1.5-flash-latest")
106
- except Exception as e:
107
- raise gr.Error(f"Gemini init/config error: {e}")
108
-
109
- # Extract and truncate text
110
- raw_text = extract_pdf_text(lecture_pdf.name)
111
- content = truncate_text(raw_text)
112
- if not content.strip():
113
- raise gr.Error("No extractable text found in the PDF.")
114
-
115
- # Generate dialogue script
116
- prompt = PROMPT_TEMPLATE.format(content=content)
117
- try:
118
- response = model.generate_content(prompt)
119
- script = response.text or ""
120
- except Exception as e:
121
- raise gr.Error(f"Gemini generation error: {e}")
122
-
123
- return [script, script] # [for Markdown display, for state storage]
124
-
125
- # ------------------------------------------------------------------
126
- # Step 2: Generate audio from provided script
127
- # ------------------------------------------------------------------
128
- def generate_audio(
129
- script: str
130
- ) -> str:
131
- if not script:
132
- raise gr.Error("No script available. Please generate the script first.")
133
- # Create a temp dir for audio parts
134
- with tempfile.TemporaryDirectory() as td:
135
- out_dir = Path(td)
136
- audio_path = synthesize_speech(script, HF_TTS_MODEL, out_dir)
137
- return audio_path
138
-
139
- # ------------------------------------------------------------------
140
- # Gradio UI
141
- # ------------------------------------------------------------------
142
  with gr.Blocks() as demo:
143
- # Shared state for the script
144
- script_state = gr.State()
145
-
146
- with gr.Tab("Generate Script"):
147
- api_key_input = gr.Textbox(
148
- label="Google Gemini API Key",
149
- type="password",
150
- placeholder="Enter your key"
151
- )
152
- pdf_input = gr.File(
153
- label="Upload Lecture PDF",
154
- file_types=[".pdf"]
155
- )
156
- script_md = gr.Markdown(
157
- label="Generated Script",
158
-
159
- )
160
- gen_script_btn = gr.Button("Generate Script")
161
- gen_script_btn.click(
162
- fn=generate_script,
163
- inputs=[api_key_input, pdf_input],
164
- outputs=[script_md, script_state]
165
- )
166
-
167
- with gr.Tab("Generate Audio"):
168
- gen_audio_btn = gr.Button("Generate Audio")
169
- audio_out = gr.Audio(
170
- label="Podcast Audio",
171
- type="filepath"
172
- )
173
- gen_audio_btn.click(
174
- fn=generate_audio,
175
- inputs=[script_state],
176
- outputs=[audio_out]
177
- )
178
 
179
  demo.launch()
 
1
  # =============================================================
2
+ # Lecture → English Podcast Generator
3
+ # ‒ Script: HF Inference API (Qwen/Qwen2.5-Coder-32B-Instruct)
4
+ # ‒ Audio: MeloTTS (English)
5
  # =============================================================
6
 
7
+ import io
8
  import re
9
  import tempfile
10
  import textwrap
11
  from pathlib import Path
12
+ from typing import List
13
 
14
  import gradio as gr
15
  from PyPDF2 import PdfReader
 
 
 
 
 
 
 
 
 
 
 
16
  from huggingface_hub import InferenceClient
17
 
18
+ import torch
19
+ import nltk
20
+ nltk.download('averaged_perceptron_tagger_eng')
21
+ from melo.api import TTS
22
+
23
+ # ────────────────────────────────────────────────────────────────────
24
+ # 1) Setup HF client & MeloTTS for English
25
+ # ────────────────────────────────────────────────────────────────────
26
+ hf_client = InferenceClient() # anonymous/public access
27
+
28
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
29
+ melo_en = TTS(language='EN', device=device)
30
+ speaker_ids = melo_en.hps.data.spk2id
31
+ default_speaker = next(iter(speaker_ids.keys()))
32
+
33
+ # ────────────────────────────────────────────────────────────────────
34
+ # 2) Prompt template
35
+ # ────────────────────────────────────────────────────────────────────
36
+ PROMPT = textwrap.dedent("""
37
+ You are producing a lively two-host educational podcast in English.
38
+ Summarize the following lecture content into a dialogue of approximately 300 words.
39
+ Make it engaging: hosts ask questions, clarify ideas with analogies,
40
+ and wrap up with a concise recap. Preserve technical accuracy.
41
+ Use Markdown for host names (e.g., **Host 1:**).
42
+
43
+ ### Lecture Content
44
+ {content}
45
+ """)
46
+
47
+ # ────────────────────────────────────────────────────────────────────
48
+ # 3) Helpers
49
+ # ────────────────────────────────────────────────────────────────────
50
  def extract_pdf_text(pdf_path: str) -> str:
51
  reader = PdfReader(pdf_path)
52
  return "\n".join(page.extract_text() or "" for page in reader.pages)
53
 
54
+ def split_to_chunks(text: str, limit: int = 280) -> List[str]:
55
+ sents = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
56
+ chunks, curr = [], ""
57
+ for sent in sents:
58
+ if curr and len(curr) + len(sent) + 1 > limit:
59
+ chunks.append(curr)
60
+ curr = sent
 
 
 
 
61
  else:
62
+ curr = f"{curr} {sent}".strip() if curr else sent
63
+ if curr:
64
+ chunks.append(curr)
65
  return chunks
66
 
67
+ # ────────────────────────────────────────────────────────────────────
68
+ # 4) Main generate function
69
+ # ────────────────────────────────────────────────────────────────────
70
+ def generate_podcast(lecture_pdf: gr.File):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  if not lecture_pdf:
72
  raise gr.Error("Please upload a lecture PDF.")
73
+ # 1️⃣ Extract & prompt
74
+ raw = extract_pdf_text(lecture_pdf.name)
75
+ prompt = PROMPT.format(content=raw)
76
+ # 2️⃣ HF text generation
77
+ out = hf_client.text_generation(
78
+ inputs=prompt,
79
+ model="Qwen/Qwen2.5-Coder-32B-Instruct",
80
+ parameters={"max_new_tokens": 512, "temperature": 0.5}
81
+ )
82
+ # InferenceClient returns a dict or a str depending on version
83
+ script = out.get("generated_text") if isinstance(out, dict) else out
84
+
85
+ # 3️⃣ MeloTTS audio
86
+ tmpdir = Path(tempfile.mkdtemp())
87
+ bio = io.BytesIO()
88
+ progress = gr.Progress()
89
+ # use the default English speaker
90
+ melo_en.tts_to_file(
91
+ script,
92
+ speaker_ids[default_speaker],
93
+ bio,
94
+ speed=1.0,
95
+ pbar=progress.tqdm,
96
+ format="wav"
97
+ )
98
+ audio_bytes = bio.getvalue()
99
+
100
+ return script, audio_bytes
101
+
102
+ # ────────────────────────────────────────────────────────────────────
103
+ # 5) Gradio UI
104
+ # ────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
105
  with gr.Blocks() as demo:
106
+ gr.Markdown("## Lecture → English Podcast")
107
+ pdf_in = gr.File(label="Upload Lecture PDF", file_types=[".pdf"])
108
+ btn = gr.Button("Generate Podcast")
109
+ script_md = gr.Markdown(label="Podcast Script")
110
+ audio_out = gr.Audio(label="Podcast Audio", type="bytes")
111
+ btn.click(fn=generate_podcast, inputs=[pdf_in], outputs=[script_md, audio_out])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  demo.launch()