HaiderAUT committed on
Commit
f0eca57
·
verified ·
1 Parent(s): 2a72cc8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -37
app.py CHANGED
@@ -1,10 +1,10 @@
1
  # =============================================================
2
- # Hugging Face Space – Lecture → Multilingual Podcast Generator
3
  # =============================================================
4
- # Uses SmolAgents HfApiModel for text generation and HF audio
5
- # pipeline for speech. Generates two-host dialogues in five
6
- # languages (English, Bangla, Chinese, Urdu, Nepali) directly
7
- # from a PDF lecture upload.
8
  # -----------------------------------------------------------------
9
 
10
  import os
@@ -15,35 +15,33 @@ from typing import List, Dict
15
 
16
  import gradio as gr
17
  from PyPDF2 import PdfReader
18
- from transformers import pipeline # for audio generation (e.g., xtts)
19
- from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
20
 
21
  # ------------------------------------------------------------------
22
  # LLM configuration (SmolAgents wrapper for HF Inference API)
23
  # ------------------------------------------------------------------
24
  llm = HfApiModel(
25
- model_id='Qwen/Qwen2.5-Coder-32B-Instruct', # 32B-parameter instruct model
26
  max_tokens=2096,
27
  temperature=0.5,
28
  custom_role_conversions=None,
29
  )
30
 
31
  # ------------------------------------------------------------------
32
- # Audio model (multilingual text-to-speech); choose an open xtts‑v2
33
- # model that supports our languages. Switch model id if you prefer.
34
  # ------------------------------------------------------------------
35
- audio_pipe = pipeline(
36
- "text-to-audio",
37
- model="suno/xtts_v2",
38
- framework="pt",
39
- )
40
 
41
  LANG_INFO: Dict[str, Dict[str, str]] = {
42
- "en": {"name": "English", "speaker": "hostA"},
43
- "bn": {"name": "Bangla", "speaker": "hostB"},
44
- "zh": {"name": "Chinese", "speaker": "hostC"},
45
- "ur": {"name": "Urdu", "speaker": "hostD"},
46
- "ne": {"name": "Nepali", "speaker": "hostE"},
47
  }
48
 
49
  PROMPT_TEMPLATE = textwrap.dedent(
@@ -59,59 +57,66 @@ PROMPT_TEMPLATE = textwrap.dedent(
59
  )
60
 
61
  # ------------------------------------------------------------------
62
- # Utility: extract & truncate PDF text to fit LLM token budget
63
  # ------------------------------------------------------------------
64
 
65
  def extract_pdf_text(pdf_file) -> str:
66
  reader = PdfReader(pdf_file)
67
- raw = "\n".join(p.extract_text() or "" for p in reader.pages)
68
- return raw
69
-
70
- TOKEN_LIMIT = 6000 # conservative words (≈ tokens) for prompt+response
71
 
 
72
 
73
  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
74
  words = text.split()
75
  return " ".join(words[:limit])
76
 
77
  # ------------------------------------------------------------------
78
- # Main generation function
79
  # ------------------------------------------------------------------
80
 
81
  def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
82
  with tempfile.TemporaryDirectory() as tmpdir:
83
  lecture_text = truncate_text(extract_pdf_text(pdf.name))
84
  audio_outputs = []
 
85
  for lang_code, info in LANG_INFO.items():
 
86
  prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
87
- # --- Generate dialogue ---
88
  dialogue = llm(prompt)
89
 
90
- # Save text for transparency/debug
91
- text_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
92
- with open(text_path, "w", encoding="utf-8") as f:
93
  f.write(dialogue)
94
 
95
- # --- TTS ---
96
- audio = audio_pipe(dialogue, forward_params={"language": lang_code})
97
  wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
98
- audio["audio"].export(wav_path, format="wav")
99
- audio_outputs.append((wav_path, None)) # Gradio Audio expects (file, label)
 
 
100
 
101
  return audio_outputs
102
 
103
  # ------------------------------------------------------------------
104
- # Gradio Interface
105
  # ------------------------------------------------------------------
106
 
107
- audio_components = [gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()]
 
 
108
 
109
  iface = gr.Interface(
110
  fn=generate_podcast,
111
  inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
112
  outputs=audio_components,
113
  title="Lecture → Multilingual Podcast Generator",
114
- description="Upload a lecture PDF and get a two‑host audio podcast in English, Bangla, Chinese, Urdu, and Nepali."
 
 
 
 
 
115
  )
116
 
117
  if __name__ == "__main__":
 
1
  # =============================================================
2
+ # HuggingFace Space – Lecture → Multilingual Podcast Generator
3
  # =============================================================
4
+ # * Text generation: SmolAgents HfApiModel (Qwen/Qwen2.5‑Coder‑32B)
5
+ # * Speech synthesis: **Coqui XTTSv2** open model via the TTS lib
6
+ # (no private / gated repo, so it runs without a HF token).
7
+ # * Outputs five WAV files: English, Bangla, Chinese, Urdu, Nepali.
8
  # -----------------------------------------------------------------
9
 
10
  import os
 
15
 
16
  import gradio as gr
17
  from PyPDF2 import PdfReader
18
+ from smolagents import HfApiModel
19
+ from TTS.api import TTS # Coqui TTS
20
 
21
  # ------------------------------------------------------------------
22
  # LLM configuration (SmolAgents wrapper for HF Inference API)
23
  # ------------------------------------------------------------------
24
  llm = HfApiModel(
25
+ model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
26
  max_tokens=2096,
27
  temperature=0.5,
28
  custom_role_conversions=None,
29
  )
30
 
31
  # ------------------------------------------------------------------
32
+ # XTTS‑v2 multilingual text‑to‑speech (≈ 1.2 GB, CPU OK)
 
33
  # ------------------------------------------------------------------
34
+ TTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
35
+
36
+ tts = TTS(model_name=TTS_MODEL_NAME, progress_bar=False)
37
+ # Automatically downloads and caches the model on first run.
 
38
 
39
  LANG_INFO: Dict[str, Dict[str, str]] = {
40
+ "en": {"name": "English"},
41
+ "bn": {"name": "Bangla"},
42
+ "zh": {"name": "Chinese"},
43
+ "ur": {"name": "Urdu"},
44
+ "ne": {"name": "Nepali"},
45
  }
46
 
47
  PROMPT_TEMPLATE = textwrap.dedent(
 
57
  )
58
 
59
  # ------------------------------------------------------------------
60
+ # Utility: extract & truncate PDF text to fit the LLM token budget
61
  # ------------------------------------------------------------------
62
 
63
  def extract_pdf_text(pdf_file) -> str:
64
  reader = PdfReader(pdf_file)
65
+ return "\n".join(p.extract_text() or "" for p in reader.pages)
 
 
 
66
 
67
+ TOKEN_LIMIT = 6000 # max whitespace-separated words kept (≈ tokens; safe margin for prompt + response)
68
 
69
  def truncate_text(text: str, limit: int = TOKEN_LIMIT) -> str:
70
  words = text.split()
71
  return " ".join(words[:limit])
72
 
73
  # ------------------------------------------------------------------
74
+ # Main generation routine
75
  # ------------------------------------------------------------------
76
 
77
  def generate_podcast(pdf: gr.File) -> List[gr.Audio]:
78
  with tempfile.TemporaryDirectory() as tmpdir:
79
  lecture_text = truncate_text(extract_pdf_text(pdf.name))
80
  audio_outputs = []
81
+
82
  for lang_code, info in LANG_INFO.items():
83
+ # 1️⃣ Create prompt + generate dialogue
84
  prompt = PROMPT_TEMPLATE.format(lang_name=info["name"], content=lecture_text)
 
85
  dialogue = llm(prompt)
86
 
87
+ # 2️⃣ Save raw dialogue text (for reference)
88
+ txt_path = os.path.join(tmpdir, f"podcast_{lang_code}.txt")
89
+ with open(txt_path, "w", encoding="utf-8") as f:
90
  f.write(dialogue)
91
 
92
+ # 3️⃣ Synthesise speech with XTTS‑v2
 
93
  wav_path = os.path.join(tmpdir, f"podcast_{lang_code}.wav")
94
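+ # ► xtts_v2 takes a language code; NOTE(review): confirm every LANG_INFO code (bn, ur, ne, and "zh" vs "zh-cn") is in the model's supported-language list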
95
+ tts.tts_to_file(text=dialogue, language=lang_code, file_path=wav_path)
96
+
97
+ audio_outputs.append((wav_path, None)) # (file, label) for Gradio Audio
98
 
99
  return audio_outputs
100
 
101
  # ------------------------------------------------------------------
102
+ # Gradio UI
103
  # ------------------------------------------------------------------
104
 
105
+ audio_components = [
106
+ gr.Audio(label=f"{info['name']} Podcast", type="filepath") for info in LANG_INFO.values()
107
+ ]
108
 
109
  iface = gr.Interface(
110
  fn=generate_podcast,
111
  inputs=gr.File(label="Upload Lecture PDF", file_types=[".pdf"]),
112
  outputs=audio_components,
113
  title="Lecture → Multilingual Podcast Generator",
114
+ description=(
115
+ "Upload a lecture PDF and receive a two‑host audio podcast in English, "
116
+ "Bangla, Chinese, Urdu, and Nepali. Generation uses Qwen‑32B for the "
117
+ "dialogue and Coqui XTTS‑v2 for speech synthesis — no private repos "
118
+ "or API keys needed."
119
+ ),
120
  )
121
 
122
  if __name__ == "__main__":