Spaces:
Running
Running
added files
Browse files- app.py +101 -0
- requirements.txt +8 -0
app.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import time
|
4 |
+
from parler_tts import ParlerTTSForConditionalGeneration
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
import soundfile as sf
|
7 |
+
from google.generativeai import GenerativeModel, configure
|
8 |
+
import gradio as gr
|
9 |
+
|
10 |
+
# Start-up: build the Gemini client and load the TTS model, printing progress.
print("⚡ Initializing models...")
start_load = time.time()

# Resolve the torch device once; both the status line and the TTS model use it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Gemini client, keyed from the GEMINI_API_KEY environment variable.
GEMINI_KEY = os.environ.get('GEMINI_API_KEY')
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
print(f" ✅ Gemini loaded (device: {'GPU' if device == 'cuda' else 'CPU'})")

# 2. Indic-TTS: the model, the tokenizer for the text to be spoken, and a
#    second tokenizer (taken from the model's text-encoder config) for the
#    voice description.
_TTS_CHECKPOINT = "ai4bharat/indic-parler-tts"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(_TTS_CHECKPOINT).to(device)
tts_tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
desc_tokenizer = AutoTokenizer.from_pretrained(
    tts_model.config.text_encoder._name_or_path
)
print(f" ✅ Indic-TTS loaded in {time.time() - start_load:.2f}s\n")
|
26 |
+
|
27 |
+
def hinglish_to_devnagri(text):
    """Convert romanised mixed-language text to native script using Gemini.

    Sends ``text`` to the module-level ``gemini`` model with a prompt asking
    for the text rewritten in the detected regional script (Hindi in
    Devanagari when no regional language is detected).

    Args:
        text: Raw user input, e.g. a Hinglish sentence.

    Returns:
        The model's reply with surrounding whitespace removed.

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only, if the Gemini call
            fails, or if the reply contains no text.
    """
    # Reject blank input before spending an API call on it.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
    start = time.time()

    try:
        response = gemini.generate_content(
            f"""
            Analyze this mixed-language text containing English and a regional language:
            "{text}"

            Perform:
            1. Detect the regional language (e.g. Hindi, Tamil, Bengali) if you don't find regional language, convert to Hindi (pure devanagari script).
            2. Convert to pure regional language script (Devanagari/Tamil/Bangla)
            3. Preserve complex words (technical/medical terms) in their original form
            4. Maintain natural flow and meaning
            5. Remove the code if you find them in backticks ```.

            Rules:
            - Keep proper nouns unchanged
            - Use colloquial spellings (e.g. "कॉलेज" not "विद्यालय" for "college")

            Output ONLY the converted text in the detected script.
            """
        )
        # Reading ``.text`` stays inside the try: the accessor may itself
        # raise when the reply has no usable text part (see the library docs).
        # Strip so stray newlines do not reach the TTS tokenizer or the UI.
        converted = response.text.strip()
    except Exception as e:
        print(f"❌ Gemini error: {str(e)}")
        # Chain the cause so the original traceback is kept in the logs.
        raise gr.Error(f"Gemini error: {str(e)}") from e

    if not converted:
        print("❌ Gemini error: empty response")
        raise gr.Error("Gemini error: empty response")

    print(f" ✓ Translation done in {time.time() - start:.2f}s")
    return converted
|
57 |
+
|
58 |
+
def generate_speech(text):
    """Run the full pipeline: convert the text, synthesise it, save a WAV.

    Args:
        text: Raw user input passed to ``hinglish_to_devnagri``.

    Returns:
        A ``(wav_path, converted_text)`` tuple: the path of the written WAV
        file and the converted text that was spoken.

    Raises:
        gr.Error: Propagated from ``hinglish_to_devnagri``, or raised here
            when the conversion produced no text to speak.
    """
    # Local import keeps this block self-contained.
    import tempfile

    print("\n" + "="*50)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # 1. Text Conversion
    hindi_text = hinglish_to_devnagri(text)
    if not hindi_text or not hindi_text.strip():
        # Nothing to synthesise; fail with a UI-visible message instead of
        # feeding an empty prompt to the TTS model.
        raise gr.Error("No text to synthesise after conversion.")
    print(f" Hindi text: {hindi_text[:50]}...")

    # 2. Audio Generation
    print("\n🔊 Generating audio...")
    start_audio = time.time()

    # NOTE(review): this voice description is written in Hindi, while it is
    # tokenised with the text-encoder's own tokenizer; confirm against the
    # model card whether descriptions are expected to be in English.
    desc = "एक महिला वक्ता स्पष्ट हिंदी में बोल रही हैं"
    desc_inputs = desc_tokenizer(desc, return_tensors="pt").to(device)
    text_inputs = tts_tokenizer(hindi_text, return_tensors="pt").to(device)

    audio = tts_model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        prompt_attention_mask=text_inputs.attention_mask,
    )

    # 3. Save Output
    # Write to a unique temp file per request: a fixed "output.wav" can be
    # overwritten by an overlapping request before it is served.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        wav_path = tmp.name
    waveform = audio.cpu().numpy().squeeze()
    sf.write(wav_path, waveform, tts_model.config.sampling_rate)
    print(f"\n💾 Audio generated in {time.time() - start_audio:.2f}s")
    print("="*50 + "\n")

    return wav_path, hindi_text
|
87 |
+
|
88 |
+
# Gradio front end: one text input, a trigger button, and two outputs
# (the synthesised audio and the converted text).
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")

    # Input row: text box plus the button that starts the pipeline.
    with gr.Row():
        inp = gr.Textbox(
            label="Enter Hinglish Text",
            placeholder="Aaj mood nahi hai...",
        )
        btn = gr.Button("Generate")

    # Output row: audio player and the converted text.
    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")

    # generate_speech returns (audio path, converted text), in output order.
    btn.click(
        fn=generate_speech,
        inputs=inp,
        outputs=[audio_out, text_out],
    )

print("\n🚀 App ready! Waiting for input...")
app.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
torch>=2.0.0
|
2 |
+
transformers>=4.30.0
|
3 |
+
parler-tts>=0.1.0
|
4 |
+
soundfile>=0.12.0
|
5 |
+
google-generativeai>=0.3.0
|
6 |
+
gradio>=3.40.0
|
7 |
+
|
8 |
+
|