saeedzou committed on
Commit
9ebf5e3
·
verified ·
1 Parent(s): 6560d59

Update app.py

Files changed (1)
  1. app.py +222 -222
app.py CHANGED
@@ -1,222 +1,222 @@
- import gradio as gr
- from google import genai
- import nemo.collections.asr as nemo_asr
- from pydub import AudioSegment
- import os
- from huggingface_hub import login
- from hazm import Normalizer
- import numpy as np
- import re
-
- # Fetch API Keys from environment variables
- HF_TOKEN = os.getenv("HF_TOKEN")
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-
- if not HF_TOKEN:
-     raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")
-
- if not GEMINI_API_KEY:
-     raise ValueError("GEMINI_API_KEY environment variable not set. Please provide a valid GEMINI_API_KEY.")
-
- # Authenticate with Hugging Face
- login(HF_TOKEN)
-
- # Load the NeMo ASR Model
- try:
-     asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
-         model_name="faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30"
-     )
- except Exception as e:
-     raise RuntimeError(f"Failed to load model: {str(e)}")
-
- # Persian text normalizer
- normalizer = Normalizer()
-
- # Function to load audio
- def load_audio(audio_path):
-     audio = AudioSegment.from_file(audio_path)
-     audio = audio.set_channels(1).set_frame_rate(16000)
-     audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
-     audio_samples /= np.max(np.abs(audio_samples))
-     return audio_samples, audio.frame_rate
-
- # Transcribe audio chunk
- def transcribe_chunk(audio_chunk, model):
-     transcription = model.transcribe([audio_chunk], batch_size=1, verbose=False)
-     return transcription[0].text
-
- # Transcribe the full audio file
- def transcribe_audio(file_path, model, chunk_size=30*16000):
-     waveform, _ = load_audio(file_path)
-     transcriptions = []
-     for start in range(0, len(waveform), chunk_size):
-         end = min(len(waveform), start + chunk_size)
-         transcription = transcribe_chunk(waveform[start:end], model)
-         transcriptions.append(transcription)
-
-     transcriptions = ' '.join(transcriptions)
-     transcriptions = re.sub(' +', ' ', transcriptions)
-     transcriptions = normalizer.normalize(transcriptions)
-
-     return transcriptions
-
- # Main transcription function
- def transcribe(audio):
-     if audio is None:
-         return "Please upload an audio file.", gr.update(interactive=False), gr.update(interactive=False)
-
-     transcription = transcribe_audio(audio, asr_model)
-
-     # Enable summarize & translate buttons once transcription is done
-     return transcription, gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)
-
- # List of languages for summarization
- languages = [
-     "English", "Persian", "French", "Spanish", "German", "Italian", "Portuguese", "Dutch", "Swedish", "Danish",
-     "Finnish", "Norwegian", "Russian", "Polish", "Turkish", "Arabic", "Hindi", "Chinese", "Japanese", "Korean",
-     "Thai", "Vietnamese", "Indonesian", "Hebrew", "Greek", "Czech", "Hungarian", "Romanian", "Bulgarian", "Serbian",
-     "Croatian", "Slovak", "Slovenian", "Ukrainian", "Lithuanian", "Latvian", "Estonian", "Macedonian", "Albanian",
-     "Basque", "Catalan", "Maltese", "Icelandic", "Georgian", "Armenian", "Belarusian", "Yiddish", "Pashto", "Urdu",
-     "Bengali", "Punjabi", "Tamil", "Telugu", "Malayalam", "Sinhala", "Burmese", "Lao", "Khmer", "Mongolian",
-     "Nepali", "Marathi", "Gujarati", "Kannada", "Odia", "Assamese", "Maithili", "Kurdish", "Azerbaijani", "Kazakh",
-     "Uzbek", "Turkmen", "Tajik", "Kyrgyz", "Uighur", "Tatar", "Haitian Creole", "Swahili", "Hausa", "Yoruba",
-     "Zulu", "Xhosa", "Amharic", "Somali", "Tigrinya", "Shona", "Igbo", "Malagasy", "Quechua", "Aymara", "Guarani",
-     "Sundanese", "Javanese", "Filipino", "Hmong", "Fijian", "Tongan", "Samoan", "Chamorro", "Hawaiian"
- ]
- languages = sorted(languages)
-
- # List of available AI models
- model_selections = [
-     'gemini-2.0-flash',
-     'gemini-2.0-pro-exp-02-05',
-     'gemini-2.0-flash-lite'
- ]
-
- def translate(text, target_language, model_sel):
-     client = genai.Client(api_key=GEMINI_API_KEY)
-     prompt = f"Translate the following text into {target_language}. Only reply with the translation.\n'{text}'."
-
-     response = client.models.generate_content(
-         model=model_sel,
-         contents=[prompt]
-     )
-
-     return response.text
-
- # Function to summarize transcribed text
- def summarize(transcript_text, word_count, model_sel, lang_sel):
-     client = genai.Client(api_key=GEMINI_API_KEY)
-     prompt = f"Summarize the following text in {word_count} words in {lang_sel}: '{transcript_text}'."
-
-     response = client.models.generate_content(
-         model=model_sel,
-         contents=[prompt]
-     )
-
-     return response.text
-
- def punctuate(transcript, model_sel):
-     client = genai.Client(api_key=GEMINI_API_KEY)
-     prompt = f"Restore the punctuation of the given ASR transcript. Maintain the original content. Only reply with the output. text: \n{transcript}"
-
-     response = client.models.generate_content(
-         model=model_sel,
-         contents=[prompt]
-     )
-
-     return response.text
-
- # Gradio Interface
- with gr.Blocks(theme="huggingface") as demo:
-     gr.Markdown(
-         """
-         # 📝 Persian ASR, Translation & Summarization
-
-         Welcome to the **Persian Speech-to-Text & NLP** platform! This app allows you to upload an audio file,
-         get an accurate **transcription**, and enhance the output with **translation**, **summarization**,
-         and **punctuation restoration**.
-
-         ## 🎯 How It Works
-         1️⃣ Upload an **audio file** containing Persian speech.
-         2️⃣ Click **"Transcribe"** to generate the text output.
-         3️⃣ Use additional features: **Translate**, **Summarize**, or **Restore Punctuation**.
-         4️⃣ Customize settings: Select a **language**, **AI model**, and **summary length**.
-         5️⃣ View and copy the processed text for your use!
-
-         """
-     )
-
-     with gr.Row():
-         audio_input = gr.Audio(type="filepath", label="🎡 Upload Audio File")
-         transcript_output = gr.Textbox(label="📝 Transcription", interactive=False)
-         translation_output = gr.Textbox(label="🌍 Translation", interactive=False)
-         summarized_output = gr.Textbox(label="📖 Summarized Text", interactive=False)
-
-     transcribe_button = gr.Button("🎙️ Transcribe")
-     translate_button = gr.Button("🌐 Translate", interactive=False)
-     summarize_button = gr.Button("✂️ Summarize", interactive=False)
-     punctuate_button = gr.Button("🔀 Restore Punctuation", interactive=False)
-
-     with gr.Row():
-         word_count_input = gr.Number(value=50, label="📏 Word Count for Summary")
-         lang_selection = gr.Dropdown(choices=languages, value="English", label="🌎 Select Language")
-         model_selection = gr.Dropdown(choices=model_selections, value="gemini-2.0-flash", label="🤖 Select AI Model")
-
-     # Button Click Actions
-     transcribe_button.click(
-         transcribe,
-         inputs=audio_input,
-         outputs=[transcript_output, summarize_button, translate_button, punctuate_button]
-     )
-
-     translate_button.click(
-         translate,
-         inputs=[transcript_output, lang_selection, model_selection],
-         outputs=translation_output
-     )
-
-     summarize_button.click(
-         summarize,
-         inputs=[transcript_output, word_count_input, model_selection, lang_selection],
-         outputs=summarized_output
-     )
-
-     punctuate_button.click(
-         punctuate,
-         inputs=[transcript_output, model_selection],
-         outputs=transcript_output
-     )
-
-     gr.Markdown(
-         """
-         \n\n
-         ---
-
-         Powered by NVIDIA’s **NeMo Fast Conformer**, this tool is optimized for high-quality **Persian ASR (Automatic Speech Recognition)**.
-
-         **📚 Trained on 800+ Hours of Speech Data:**
-         - Common Voice 17 (~300 hours)
-         - YouTube (~400 hours)
-         - NasleMana (~90 hours)
-         - In-house dataset (~70 hours)
-
-         ---
-
-         ## 📜 License & Business Inquiries
-
-         This application is licensed under **Creative Commons Attribution-NonCommercial 4.0 (CC BY-NC 4.0)**.
-         - **🛑 Non-Commercial Use Only** – Commercial use is not permitted without prior approval.
-         - **🔗 Attribution Required** – Credit must be given to FAIM Group, Sharif University of Technology.
-         - **❌ No Derivatives** – Modifications or adaptations of this work are not allowed.
-
-         📜 Full License Details: [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/)
-
-         📩 **Business Inquiries:**
-         If you're interested in commercial applications, please contact us at:
-         ✉️ **Email:** [[email protected]](mailto:[email protected])
-
-         ---
-         """
-     )
-     # Launch Gradio app
-     demo.launch()
 
+ import gradio as gr
+ from google import genai
+ import nemo.collections.asr as nemo_asr
+ from pydub import AudioSegment
+ import os
+ from huggingface_hub import login
+ from hazm import Normalizer
+ import numpy as np
+ import re
+
+ # Fetch API Keys from environment variables
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")
+
+ if not GEMINI_API_KEY:
+     raise ValueError("GEMINI_API_KEY environment variable not set. Please provide a valid GEMINI_API_KEY.")
+
+ # Authenticate with Hugging Face
+ login(HF_TOKEN)
+
+ # Load the NeMo ASR Model
+ try:
+     asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
+         model_name="faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30"
+     )
+ except Exception as e:
+     raise RuntimeError(f"Failed to load model: {str(e)}")
+
+ # Persian text normalizer
+ normalizer = Normalizer()
+
+ # Function to load audio
+ def load_audio(audio_path):
+     audio = AudioSegment.from_file(audio_path)
+     audio = audio.set_channels(1).set_frame_rate(16000)
+     audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
+     audio_samples /= np.max(np.abs(audio_samples))
+     return audio_samples, audio.frame_rate
+
+ # Transcribe audio chunk
+ def transcribe_chunk(audio_chunk, model):
+     transcription = model.transcribe([audio_chunk], batch_size=1, verbose=False)
+     return transcription[0].text
+
+ # Transcribe the full audio file
+ def transcribe_audio(file_path, model, chunk_size=30*16000):
+     waveform, _ = load_audio(file_path)
+     transcriptions = []
+     for start in range(0, len(waveform), chunk_size):
+         end = min(len(waveform), start + chunk_size)
+         transcription = transcribe_chunk(waveform[start:end], model)
+         transcriptions.append(transcription)
+
+     transcriptions = ' '.join(transcriptions)
+     transcriptions = re.sub(' +', ' ', transcriptions)
+     transcriptions = normalizer.normalize(transcriptions)
+
+     return transcriptions
+
+ # Main transcription function
+ def transcribe(audio):
+     if audio is None:
+         return "Please upload an audio file."
+
+     transcription = transcribe_audio(audio, asr_model)
+
+     return transcription
+
+ # List of languages for summarization
+ languages = [
+     "English", "Persian", "French", "Spanish", "German", "Italian", "Portuguese", "Dutch", "Swedish", "Danish",
+     "Finnish", "Norwegian", "Russian", "Polish", "Turkish", "Arabic", "Hindi", "Chinese", "Japanese", "Korean",
+     "Thai", "Vietnamese", "Indonesian", "Hebrew", "Greek", "Czech", "Hungarian", "Romanian", "Bulgarian", "Serbian",
+     "Croatian", "Slovak", "Slovenian", "Ukrainian", "Lithuanian", "Latvian", "Estonian", "Macedonian", "Albanian",
+     "Basque", "Catalan", "Maltese", "Icelandic", "Georgian", "Armenian", "Belarusian", "Yiddish", "Pashto", "Urdu",
+     "Bengali", "Punjabi", "Tamil", "Telugu", "Malayalam", "Sinhala", "Burmese", "Lao", "Khmer", "Mongolian",
+     "Nepali", "Marathi", "Gujarati", "Kannada", "Odia", "Assamese", "Maithili", "Kurdish", "Azerbaijani", "Kazakh",
+     "Uzbek", "Turkmen", "Tajik", "Kyrgyz", "Uighur", "Tatar", "Haitian Creole", "Swahili", "Hausa", "Yoruba",
+     "Zulu", "Xhosa", "Amharic", "Somali", "Tigrinya", "Shona", "Igbo", "Malagasy", "Quechua", "Aymara", "Guarani",
+     "Sundanese", "Javanese", "Filipino", "Hmong", "Fijian", "Tongan", "Samoan", "Chamorro", "Hawaiian"
+ ]
+ languages = sorted(languages)
+
+ # List of available AI models
+ model_selections = [
+     'gemini-2.0-flash',
+     'gemini-2.0-pro-exp-02-05',
+     'gemini-2.0-flash-lite'
+ ]
+
+ def translate(text, target_language, model_sel):
+     client = genai.Client(api_key=GEMINI_API_KEY)
+     prompt = f"Translate the following text into {target_language}. Only reply with the translation.\n'{text}'."
+
+     response = client.models.generate_content(
+         model=model_sel,
+         contents=[prompt]
+     )
+
+     return response.text
+
+ # Function to summarize transcribed text
+ def summarize(transcript_text, word_count, model_sel, lang_sel):
+     client = genai.Client(api_key=GEMINI_API_KEY)
+     prompt = f"Summarize the following text in {word_count} words in {lang_sel}: '{transcript_text}'."
+
+     response = client.models.generate_content(
+         model=model_sel,
+         contents=[prompt]
+     )
+
+     return response.text
+
+ def punctuate(transcript, model_sel):
+     client = genai.Client(api_key=GEMINI_API_KEY)
+     prompt = f"Restore the punctuation of the given ASR transcript. Maintain the original content. Only reply with the output. text: \n{transcript}"
+
+     response = client.models.generate_content(
+         model=model_sel,
+         contents=[prompt]
+     )
+
+     return response.text
+
+ # Gradio Interface
+ with gr.Blocks(theme="huggingface") as demo:
+     gr.Markdown(
+         """
+         # 📝 Persian ASR, Translation & Summarization
+
+         Welcome to the **Persian Speech-to-Text & NLP** platform! This app allows you to upload an audio file,
+         get an accurate **transcription**, and enhance the output with **translation**, **summarization**,
+         and **punctuation restoration**.
+
+         ## 🎯 How It Works
+         1️⃣ Upload an **audio file** containing Persian speech. To transcribe YouTube videos, open this [Colab Notebook](https://colab.research.google.com/github/saeedzou/NeMo-Gradio/blob/main/yt_transcribe_gradio.ipynb).
+         2️⃣ Click **"Transcribe"** to generate the text output.
+         3️⃣ Use additional features: **Translate**, **Summarize**, or **Restore Punctuation**.
+         4️⃣ Customize settings: Select a **language**, **AI model**, and **summary length**.
+         5️⃣ View and copy the processed text for your use!
+
+         """
+     )
+
+     with gr.Row():
+         audio_input = gr.Audio(type="filepath", label="🎡 Upload Audio File")
+         transcript_output = gr.Textbox(label="📝 Transcription", interactive=True)
+         translation_output = gr.Textbox(label="🌍 Translation", interactive=False)
+         summarized_output = gr.Textbox(label="📖 Summarized Text", interactive=False)
+
+     transcribe_button = gr.Button("🎙️ Transcribe")
+     translate_button = gr.Button("🌐 Translate", interactive=True)
+     summarize_button = gr.Button("✂️ Summarize", interactive=True)
+     punctuate_button = gr.Button("🔀 Restore Punctuation", interactive=True)
+
+     with gr.Row():
+         word_count_input = gr.Number(value=50, label="📏 Word Count for Summary")
+         lang_selection = gr.Dropdown(choices=languages, value="English", label="🌎 Select Language")
+         model_selection = gr.Dropdown(choices=model_selections, value="gemini-2.0-flash", label="🤖 Select AI Model")
+
+     # Button Click Actions
+     transcribe_button.click(
+         transcribe,
+         inputs=audio_input,
+         outputs=transcript_output
+     )
+
+     translate_button.click(
+         translate,
+         inputs=[transcript_output, lang_selection, model_selection],
+         outputs=translation_output
+     )
+
+     summarize_button.click(
+         summarize,
+         inputs=[transcript_output, word_count_input, model_selection, lang_selection],
+         outputs=summarized_output
+     )
+
+     punctuate_button.click(
+         punctuate,
+         inputs=[transcript_output, model_selection],
+         outputs=transcript_output
+     )
+
+     gr.Markdown(
+         """
+         \n\n
+         ---
+
+         Powered by NVIDIA’s **NeMo Fast Conformer**, this tool is optimized for high-quality **Persian ASR (Automatic Speech Recognition)**.
+
+         **📚 Trained on 800+ Hours of Speech Data:**
+         - Common Voice 17 (~300 hours)
+         - YouTube (~400 hours)
+         - NasleMana (~90 hours)
+         - In-house dataset (~70 hours)
+
+         ---
+
+         ## 📜 License & Business Inquiries
+
+         This application is licensed under **Creative Commons Attribution-NonCommercial 4.0 (CC BY-NC 4.0)**.
+         - **🛑 Non-Commercial Use Only** – Commercial use is not permitted without prior approval.
+         - **🔗 Attribution Required** – Credit must be given to FAIM Group, Sharif University of Technology.
+         - **❌ No Derivatives** – Modifications or adaptations of this work are not allowed.
+
+         📜 Full License Details: [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/)
+
+         📩 **Business Inquiries:**
+         If you're interested in commercial applications, please contact us at:
+         ✉️ **Email:** [[email protected]](mailto:[email protected])
+
+         ---
+         """
+     )
+     # Launch Gradio app
+     demo.launch()
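The three Gemini helpers in app.py (translate, summarize, punctuate) share one request shape: build a task-specific prompt, call generate_content once, and return the plain text of the reply. Below is a minimal standalone sketch of that pattern, assuming the google-genai SDK and a GEMINI_API_KEY in the environment as app.py does; the helper name ask_gemini and the sample prompt are illustrative assumptions, not part of this commit.

import os
from google import genai

# Same client construction as app.py; assumes GEMINI_API_KEY is exported.
client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))

def ask_gemini(prompt: str, model: str = "gemini-2.0-flash") -> str:
    # translate/summarize/punctuate each build a task-specific prompt,
    # call generate_content once, and return response.text.
    response = client.models.generate_content(model=model, contents=[prompt])
    return response.text

if __name__ == "__main__":
    # Illustrative prompt mirroring the translate() template in app.py.
    print(ask_gemini("Translate the following text into English. Only reply with the translation.\n'سلام دنیا'."))

On the ASR side, the analogous fixed choice is chunk_size=30*16000 in transcribe_audio: 30 seconds of mono samples at the 16 kHz rate set in load_audio, so each call to transcribe_chunk processes at most half a minute of audio.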