HaiderAUT committed on
Commit
50d2a40
·
verified ·
1 Parent(s): f0eca57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -51
app.py CHANGED
@@ -1,25 +1,27 @@
1
  # =============================================================
2
- # HuggingFace Space – Lecture Multilingual Podcast Generator
3
  # =============================================================
4
- # * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
5
- # * Speech synthesis: **Coqui XTTSv2** open model via the TTS lib
6
- # (no private / gated repo, so it runs without a HF token).
7
- # * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
 
 
8
  # -----------------------------------------------------------------
9
 
10
  import os
11
  import tempfile
12
- import uuid
13
  import textwrap
 
14
  from typing import List, Dict
15
 
16
  import gradio as gr
 
17
  from PyPDF2 import PdfReader
18
  from smolagents import HfApiModel
19
- from TTS.api import TTS # ↳ Coqui TTS
20
 
21
  # ------------------------------------------------------------------
22
- # LLM configuration (SmolAgents wrapper for HF Inference API)
23
  # ------------------------------------------------------------------
24
  llm = HfApiModel(
25
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
@@ -29,93 +31,92 @@ llm = HfApiModel(
29
  )
30
 
31
  # ------------------------------------------------------------------
32
- # XTTS‑v2 multilingual text‑to‑speech ( 1.2 GB, CPU OK)
33
  # ------------------------------------------------------------------
34
- TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
35
-
36
- tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
37
- # Automatically downloads and caches the model on first run.
38
 
 
 
 
39
  LANG_INFO: Dict[str, Dict[str, str]] = {
40
- "en": {"name": "English"},
41
- "bn": {"name": "Bangla"},
42
- "zh": {"name": "Chinese"},
43
- "ur": {"name": "Urdu"},
44
- "ne": {"name": "Nepali"},
 
45
  }
46
 
47
  PROMPT_TEMPLATE = textwrap.dedent(
48
  """
49
  You are producing a lively two‑host educational podcast in {lang_name}.
50
- Summarize the following lecture content into a dialogue of about 1200 words.
51
- Use an engaging style: hosts ask each other questions, clarify ideas, add
52
- simple analogies, and conclude with a short recap. Keep technical accuracy.
53
 
54
- ### Lecture Content
55
  {content}
56
  """
57
  )
58
 
59
  # ------------------------------------------------------------------
60
- # Utility: extract & truncate PDF text to fit the LLM token budget
61
  # ------------------------------------------------------------------
62
 
63
- def extract_pdf_text(pdf_file) -> str:
64
- reader = PdfReader(pdf_file)
65
- return "\n".join(p.extract_text() or "" for p in reader.pages)
66
 
67
- TOKEN_LIMIT = 6000 # tokens (safe margin for prompt + response)
68
 
69
  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
70
  words = text.split()
71
  return " ".join(words[:limit])
72
 
73
  # ------------------------------------------------------------------
74
- # Main generation routine
75
  # ------------------------------------------------------------------
76
 
77
  def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
 
78
  with tempfile.TemporaryDirectory() as tmpdir:
79
- lecture_text = truncate_text(extract_pdf_text(pdf.name))
80
- audio_outputs = []
 
81
 
82
- for lang_code, info in LANG_INFO.items():
83
- # 1️⃣ Create prompt + generate dialogue
84
  prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
85
- dialogue = llm(prompt)
86
-
87
- # 2️⃣ Save raw dialogue text (for reference)
88
- txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
89
- with open(txt_path, "w", encoding="utf-8") as f:
90
- f.write(dialogue)
91
 
92
- # 3️⃣ Synthesise speech with XTTS‑v2
93
- wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
94
- # xtts_v2 accepts ISO‑639‑1 language codes directly
95
- tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)
96
 
97
- audio_outputs.append((wav_path, None)) # (file, label) for Gradio Audio
98
 
99
- return audio_outputs
100
 
101
  # ------------------------------------------------------------------
102
- # Gradio UI
103
  # ------------------------------------------------------------------
104
 
105
  audio_components = [
106
- gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
 
107
  ]
108
 
109
  iface = gr.Interface(
110
  fn=generate_podcast,
111
  inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
112
  outputs=audio_components,
113
- title="Lecture Multilingual Podcast Generator",
114
  description=(
115
- "Upload a lecture PDF and receive a two‑host audio podcast in English, "
116
- "Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the "
117
- "dialogue and Coqui XTTSv2 for speech synthesis no private repos "
118
- "or API keys needed."
119
  ),
120
  )
121
 
 
1
  # =============================================================
2
+ # Hugging Face Space – Lecture  Multilingual Podcast Generator
3
  # =============================================================
4
+ # * **Text generation** – SmolAgents `HfApiModel` running the remote
5
+ # Qwen/Qwen2.5Coder‑32B‑Instruct model.
6
+ # * **Speech synthesis** – `huggingface_hub.InferenceClient.text_to_speech`
7
+ # (serverless) with open models per language no heavy local
8
+ # downloads.
9
+ # * Outputs five FLAC files (English, Bangla, Chinese, Urdu, Nepali).
10
  # -----------------------------------------------------------------
11
 
12
  import os
13
  import tempfile
 
14
  import textwrap
15
+ from pathlib import Path
16
  from typing import List, Dict
17
 
18
  import gradio as gr
19
+ from huggingface_hub import InferenceClient
20
  from PyPDF2 import PdfReader
21
  from smolagents import HfApiModel
 
22
 
23
  # ------------------------------------------------------------------
24
+ # LLM: Qwen 32‑B via SmolAgents
25
  # ------------------------------------------------------------------
26
  llm = HfApiModel(
27
  model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
 
31
  )
32
 
33
  # ------------------------------------------------------------------
34
+ # HF Inference API client (reads HF_TOKEN secret if set)
35
  # ------------------------------------------------------------------
36
# Serverless HF Inference API client; picks up the HF_TOKEN secret when set
# (os.getenv returns None otherwise, i.e. anonymous access).
client = InferenceClient(token=os.getenv("HF_TOKEN"))
 
 
 
37
 
38
# ------------------------------------------------------------------
# Language metadata and the matching serverless TTS model IDs
# ------------------------------------------------------------------
LANG_INFO: Dict[str, Dict[str, str]] = {
    code: {"name": name, "tts_model": model}
    for code, name, model in (
        ("en", "English", "facebook/mms-tts-eng"),
        ("bn", "Bangla", "facebook/mms-tts-ben"),
        # MMS lacks mainstream Mandarin — fall back to an open Chinese TTS.
        ("zh", "Chinese", "myshell-ai/MeloTTS-Chinese"),
        ("ur", "Urdu", "facebook/mms-tts-urd-script_arabic"),
        ("ne", "Nepali", "facebook/mms-tts-npi"),
    )
}
49
 
50
  PROMPT_TEMPLATE = textwrap.dedent(
51
  """
52
  You are producing a lively two‑host educational podcast in {lang_name}.
53
+ Summarize the following lecture content into a dialogue of 1200 words.
54
+ Make it engaging: hosts ask questions, clarify ideas with analogies, and
55
+ wrap up with a concise recap. Preserve technical accuracy.
56
 
57
+ ### Lecture Content
58
  {content}
59
  """
60
  )
61
 
62
  # ------------------------------------------------------------------
63
+ # Helpers: extract and truncate PDF text
64
  # ------------------------------------------------------------------
65
 
66
def extract_pdf_text(pdf_path: str) -> str:
    """Concatenate the extracted text of every page in the PDF at *pdf_path*.

    Pages where PyPDF2 cannot extract text contribute an empty string, so
    the result is always a str (never None-polluted).
    """
    pages = PdfReader(pdf_path).pages
    return "\n".join(pg.extract_text() or "" for pg in pages)
69
 
70
# Rough word-level cap so prompt + response stay inside the model context.
TOKEN_LIMIT = 6000


def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
    """Return at most *limit* whitespace-separated words of *text*."""
    return " ".join(text.split()[:limit])
75
 
76
  # ------------------------------------------------------------------
77
+ # Main pipeline
78
  # ------------------------------------------------------------------
79
 
80
def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
    """Generate a multilingual two-host podcast from a lecture PDF.

    For each language in LANG_INFO the (truncated) lecture text is turned
    into a dialogue by the LLM, then synthesized to a FLAC file via the HF
    Inference API.  Returns one audio filepath per language, in LANG_INFO
    order (matching the `audio_components` outputs).
    """
    # BUG FIX: the original wrapped everything in
    # `with tempfile.TemporaryDirectory()`, which deletes the directory —
    # and every generated FLAC — the moment the function returns, i.e.
    # before Gradio can read the filepaths it was handed.  mkdtemp()
    # persists the files for the lifetime of the Space process.
    tmpdir = tempfile.mkdtemp(prefix="podcast_")

    lecture_text = truncate_text(extract_pdf_text(pdf.name))
    outputs: List[str] = []

    for code, info in LANG_INFO.items():
        # 1) Draft the dialogue in the target language.
        prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
        dialogue: str = llm(prompt)

        # 2) Synthesize speech with the per-language serverless TTS model.
        audio_bytes: bytes = client.text_to_speech(dialogue, model=info["tts_model"])
        flac_path = Path(tmpdir) / f"podcast_{code}.flac"
        flac_path.write_bytes(audio_bytes)

        # BUG FIX: gr.Audio(type="filepath") expects a plain path string;
        # the original appended a (path, None) tuple, which Gradio parses
        # as a (sample_rate, data) pair and rejects.
        outputs.append(str(flac_path))

    return outputs
100
 
101
  # ------------------------------------------------------------------
102
+ # Gradio interface
103
  # ------------------------------------------------------------------
104
 
105
  audio_components = [
106
+ gr.Audio(label=f"{info['name']} Podcast", type="filepath")
107
+ for info in LANG_INFO.values()
108
  ]
109
 
110
# Gradio front end: one PDF in, five audio players out.
# BUG FIX: the title and description strings had dropped dash/arrow
# characters ("Lecture  Multilingual", "Qwen32B", "API no heavy
# downloads") — restored the missing punctuation in the user-facing text.
iface = gr.Interface(
    fn=generate_podcast,
    inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
    outputs=audio_components,
    title="Lecture → Multilingual Podcast Generator",
    description=(
        "Upload a lecture PDF and receive a two-host audio podcast in five "
        "languages (English, Bangla, Chinese, Urdu, Nepali). Dialogue is "
        "crafted by Qwen-32B; speech is synthesized on-the-fly using the "
        "Hugging Face Inference API — no heavy downloads or GPUs required."
    ),
)
122