argodinho committed on
Commit
62e10c5
·
1 Parent(s): 52c9496

added files

Browse files
Files changed (2) hide show
  1. app.py +101 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import time
4
+ from parler_tts import ParlerTTSForConditionalGeneration
5
+ from transformers import AutoTokenizer
6
+ import soundfile as sf
7
+ from google.generativeai import GenerativeModel, configure
8
+ import gradio as gr
9
+
10
# Initialize with prints
# Module-level setup: configures the Gemini client and loads the Indic
# Parler-TTS model plus its two tokenizers once, at import time, so request
# handlers can reuse them.
print("⚡ Initializing models...")
start_load = time.time()

# Pick the torch device once; only the locally loaded TTS model uses it.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load Gemini
# Gemini is reached through a client object, so no torch device applies to it.
GEMINI_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_KEY:
    # Warn instead of aborting: the SDK may still pick up a key from its own
    # default environment variable (NOTE(review): confirm for the pinned SDK).
    print(" ⚠️ GEMINI_API_KEY is not set; Gemini requests may fail")
configure(api_key=GEMINI_KEY)
gemini = GenerativeModel('gemini-2.0-flash')
print(" ✅ Gemini client configured (model: gemini-2.0-flash)")

# 2. Load Indic-TTS
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
# Tokenizer for the text to be spoken (the "prompt").
tts_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
# Tokenizer for the voice description; taken from the model's text encoder.
desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)
print(f" ✅ Indic-TTS loaded on {device} in {time.time() - start_load:.2f}s\n")
26
+
27
def hinglish_to_devnagri(text):
    """Convert mixed English/regional-language text to native script via Gemini.

    Args:
        text: The user-supplied mixed-language (e.g. Hinglish) string.

    Returns:
        The converted text returned by Gemini, with surrounding whitespace
        removed.

    Raises:
        gr.Error: If ``text`` is empty/blank, if the Gemini call fails, or if
            Gemini returns an empty conversion.
    """
    # Reject blank input up front so no API call is spent on it. This guard
    # sits outside the try block so it is not re-wrapped as a "Gemini error".
    if not text or not text.strip():
        raise gr.Error("Please enter some text to convert.")

    prompt = f"""
    Analyze this mixed-language text containing English and a regional language:
    "{text}"

    Perform:
    1. Detect the regional language (e.g. Hindi, Tamil, Bengali) if you don't find regional language, convert to Hindi (pure devanagari script).
    2. Convert to pure regional language script (Devanagari/Tamil/Bangla)
    3. Preserve complex words (technical/medical terms) in their original form
    4. Maintain natural flow and meaning
    5. Remove the code if you find them in backticks ```.

    Rules:
    - Keep proper nouns unchanged
    - Use colloquial spellings (e.g. "कॉलेज" not "विद्यालय" for "college")

    Output ONLY the converted text in the detected script.
    """

    try:
        print(f"🔠 Converting Hinglish to Devnagri: '{text[:30]}...'")
        start = time.time()

        response = gemini.generate_content(prompt)
        # Reading .text stays inside the try so a failure there is reported
        # the same way as a failed request.
        converted = response.text.strip()

        print(f" ✓ Translation done in {time.time() - start:.2f}s")
    except Exception as e:
        print(f"❌ Gemini error: {str(e)}")
        # Chain the cause so the original traceback is kept in the server logs.
        raise gr.Error(f"Gemini error: {str(e)}") from e

    # An empty conversion would otherwise be passed on to the TTS step.
    if not converted:
        raise gr.Error("Gemini error: empty conversion returned")

    return converted
57
+
58
def generate_speech(text):
    """Run the full pipeline: script conversion, then speech synthesis.

    Args:
        text: The user-supplied mixed-language (e.g. Hinglish) string.

    Returns:
        A ``(wav_path, converted_text)`` tuple: the path of the generated WAV
        file and the native-script text that was spoken.

    Raises:
        gr.Error: Propagated from ``hinglish_to_devnagri`` when the text
            conversion fails.
    """
    # Local import keeps this block self-contained.
    import tempfile

    print("\n" + "="*50)
    print("🎤 Starting Hinglish-to-Speech pipeline")

    # 1. Text Conversion
    hindi_text = hinglish_to_devnagri(text)
    print(f" Hindi text: {hindi_text[:50]}...")

    # 2. Audio Generation
    print("\n🔊 Generating audio...")
    start_audio = time.time()

    # NOTE(review): the description is tokenized with the text encoder's
    # tokenizer, and the model card's examples use English descriptions, so
    # the former Devanagari description is replaced with an English one.
    # Confirm the resulting voice is as intended.
    desc = (
        "A female speaker delivers clear Hindi speech at a moderate pace. "
        "The recording is of very high quality, with the speaker's voice "
        "sounding clear and very close up."
    )
    desc_inputs = desc_tokenizer(desc, return_tensors="pt").to(device)
    text_inputs = tts_tokenizer(hindi_text, return_tensors="pt").to(device)

    # The description conditions the voice; the "prompt" is the text to speak.
    audio = tts_model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=text_inputs.input_ids,
        prompt_attention_mask=text_inputs.attention_mask,
    )

    # 3. Save Output
    # Write to a unique file per request: a fixed "output.wav" would be
    # overwritten when two requests run at the same time.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, audio.cpu().numpy().squeeze(), tts_model.config.sampling_rate)
    print(f"\n💾 Audio generated in {time.time() - start_audio:.2f}s")
    print("="*50 + "\n")

    return out_path, hindi_text
87
+
88
# Gradio UI
# Layout: one row with the input textbox and the trigger button, and a second
# row with the synthesized audio and the converted text.
with gr.Blocks() as app:
    gr.Markdown("## 🚀 Hinglish-to-Speech (Gemini + Indic-TTS)")
    with gr.Row():
        inp = gr.Textbox(label="Enter Hinglish Text", placeholder="Aaj mood nahi hai...")
        btn = gr.Button("Generate")
    with gr.Row():
        audio_out = gr.Audio(label="Speech Output")
        text_out = gr.Textbox(label="Devnagri Translation")

    # generate_speech returns (wav_path, converted_text), matching the order
    # of the two output components.
    btn.click(fn=generate_speech, inputs=inp, outputs=[audio_out, text_out])

# Launch only when run as a script, so importing this module does not start
# a web server.
if __name__ == "__main__":
    print("\n🚀 App ready! Waiting for input...")
    app.launch()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=4.30.0
3
+ parler-tts>=0.1.0
4
+ soundfile>=0.12.0
5
+ google-generativeai>=0.3.0
6
+ gradio>=3.40.0
7
+
8
+