mohan696matlab committed on
Commit
65a7d00
·
1 Parent(s): b3b17d2
Files changed (1) hide show
  1. app.py +56 -39
app.py CHANGED
@@ -8,7 +8,7 @@ from pydub import AudioSegment
8
 
9
 
10
 
11
- def translate_audio(audio, SARVAM_API_KEY):
12
 
13
  # API endpoint for speech-to-text translation
14
  api_url = "https://api.sarvam.ai/speech-to-text-translate"
@@ -21,7 +21,8 @@ def translate_audio(audio, SARVAM_API_KEY):
21
  # Data payload for the translation request
22
  model_data = {
23
  "model": "saaras:v2", # Specify the model to be used
24
- "with_diarization": False # Set to True for speaker diarization
 
25
  }
26
 
27
 
@@ -38,6 +39,7 @@ def translate_audio(audio, SARVAM_API_KEY):
38
  if response.status_code == 200 or response.status_code == 201:
39
  response_data = response.json()
40
  transcript = response_data.get("transcript", "")
 
41
  elif response.status_code == 401 or response.status_code == 403:
42
  raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
43
  else:
@@ -48,10 +50,9 @@ def translate_audio(audio, SARVAM_API_KEY):
48
  finally:
49
  chunk_buffer.close()
50
 
51
- return transcript
52
 
53
- def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
54
- start_time = time.time()
55
 
56
  if history is None:
57
  history = ""
@@ -59,6 +60,8 @@ def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
59
  try:
60
  sr, y = new_chunk
61
 
 
 
62
  # Convert to mono if stereo
63
  if y.ndim > 1:
64
  y = y.mean(axis=1)
@@ -74,17 +77,16 @@ def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
74
  channels=1
75
  )
76
 
77
- transcription = translate_audio(audio_segment, SARVAM_API_KEY)
78
- end_time = time.time()
79
- latency = end_time - start_time
80
- history = history + '\n' + transcription
81
 
82
- return history, history, f"{latency:.2f}"
83
  except ValueError as ve:
84
- return history, str(ve), "Invalid Key"
85
  except Exception as e:
86
  print(f"Error during Transcription: {e}")
87
- return history, str(e), "Error"
88
 
89
 
90
 
@@ -99,40 +101,53 @@ def clear_api_key():
99
  return ""
100
 
101
 
102
- with gr.Blocks(theme=gr.themes.Citrus) as microphone:
103
  with gr.Column():
104
 
105
  gr.Markdown(
106
- """
107
- ### This app is designed to **transcribe and translate simultaneously from multiple Indian languages**. It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. It can **translate the transcribed text in real-time to English**, making it incredibly useful for multilingual audio processing.
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- ### 🔑 Sarvam AI API Key Required
110
- To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
111
-
112
- 👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
113
- 👉 **Step 2:** Sign up or log in
114
- 👉 **Step 3:** Generate your API key and paste it below
115
 
116
- Your key stays on your device and is not stored.
117
- """
118
- )
119
  api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
120
 
121
- with gr.Row():
122
- input_audio_microphone = gr.Audio(streaming=True)
123
- output = gr.Textbox(label="Transcription", value="")
124
- latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
 
 
 
 
 
 
 
 
 
 
125
  with gr.Row():
126
  clear_button = gr.Button("Clear Output")
127
  clear_api_key_button = gr.Button("Clear API Key")
128
  state = gr.State(value="")
129
- def wrapped_stream_transcribe(history, new_chunk, api_key):
130
- return stream_transcribe(history, new_chunk, api_key)
131
 
132
  input_audio_microphone.stream(
133
  wrapped_stream_transcribe,
134
- [state, input_audio_microphone, api_key_box],
135
- [state, output, latency_textbox],
136
  time_limit=30,
137
  stream_every=5,
138
  concurrency_limit=None,
@@ -143,17 +158,19 @@ with gr.Blocks(theme=gr.themes.Citrus) as microphone:
143
 
144
  gr.Markdown(
145
  """
146
- ---
 
 
147
 
148
- ### 👋 Who am I?
 
149
 
150
- I'm **Dr. Mohan Dash**, a PhD in Industrial Computer Science and an AI Research Engineer.
151
- I run a YouTube channel called **[Intelligent Machines](https://www.youtube.com/@Mohankumardash)** where I share practical tutorials and insights on building real-world AI applications.
152
 
153
- If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
154
- [![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)]
155
  """
156
- )
157
 
158
  demo = microphone
159
  demo.launch()
 
8
 
9
 
10
 
11
+ def translate_audio(audio, language_code, SARVAM_API_KEY):
12
 
13
  # API endpoint for speech-to-text translation
14
  api_url = "https://api.sarvam.ai/speech-to-text-translate"
 
21
  # Data payload for the translation request
22
  model_data = {
23
  "model": "saaras:v2", # Specify the model to be used
24
+ "with_diarization": False, # Set to True for speaker diarization
25
+ "language_code": language_code
26
  }
27
 
28
 
 
39
  if response.status_code == 200 or response.status_code == 201:
40
  response_data = response.json()
41
  transcript = response_data.get("transcript", "")
42
+ detected_language = response_data.get("language_code", "")
43
  elif response.status_code == 401 or response.status_code == 403:
44
  raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
45
  else:
 
50
  finally:
51
  chunk_buffer.close()
52
 
53
+ return transcript,detected_language
54
 
55
+ def stream_transcribe(history, new_chunk, language_code, SARVAM_API_KEY):
 
56
 
57
  if history is None:
58
  history = ""
 
60
  try:
61
  sr, y = new_chunk
62
 
63
+ print(y.max(), y.min())
64
+
65
  # Convert to mono if stereo
66
  if y.ndim > 1:
67
  y = y.mean(axis=1)
 
77
  channels=1
78
  )
79
 
80
+ transcription,detected_language = translate_audio(audio_segment, language_code, SARVAM_API_KEY)
81
+
82
+ history = history + '\n' + f'({detected_language})==> ' +transcription
 
83
 
84
+ return history, history
85
  except ValueError as ve:
86
+ return history, str(ve)
87
  except Exception as e:
88
  print(f"Error during Transcription: {e}")
89
+ return history, str(e)
90
 
91
 
92
 
 
101
  return ""
102
 
103
 
104
+ with gr.Blocks(theme=gr.themes.Soft()) as microphone:
105
  with gr.Column():
106
 
107
  gr.Markdown(
108
+ """
109
+ ## Translate simultaneously from multiple Indian languages to **English**.
110
+ ### It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more.
111
+
112
+ ### 🔑 Sarvam AI API Key Required
113
+ To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
114
+
115
+ 👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
116
+ 👉 **Step 2:** Sign up or log in
117
+ 👉 **Step 3:** Generate your API key and paste it below
118
+
119
+ Your key stays on your device and is not stored.
120
+ """
121
+ )
122
 
 
 
 
 
 
 
123
 
 
 
 
124
  api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
125
 
126
+ language_options = [
127
+ "hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN",
128
+ "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN", "unknown"
129
+ ]
130
+ language_code_box = gr.Dropdown(
131
+ choices=language_options,
132
+ label="Select Language Code",
133
+ value="unknown" # optional: default selected value
134
+ )
135
+
136
+
137
+ input_audio_microphone = gr.Audio(streaming=True)
138
+ output = gr.Textbox(label="Transcription", lines=10,max_lines=100, show_copy_button=True, value="")
139
+
140
  with gr.Row():
141
  clear_button = gr.Button("Clear Output")
142
  clear_api_key_button = gr.Button("Clear API Key")
143
  state = gr.State(value="")
144
+ def wrapped_stream_transcribe(history, new_chunk,language_code, api_key):
145
+ return stream_transcribe(history, new_chunk,language_code, api_key)
146
 
147
  input_audio_microphone.stream(
148
  wrapped_stream_transcribe,
149
+ [state, input_audio_microphone,language_code_box, api_key_box],
150
+ [state, output],
151
  time_limit=30,
152
  stream_every=5,
153
  concurrency_limit=None,
 
158
 
159
  gr.Markdown(
160
  """
161
+ ---
162
+
163
+ ### 👋 Who am I?
164
 
165
+ I am **Dr. Mohan Dash**, a PhD in Industrial Computer Science and an AI Research Engineer.
166
+ I run a YouTube channel called **[Intelligent Machines](https://www.youtube.com/@Mohankumardash)** where I share practical tutorials and insights on building real-world AI applications.
167
 
168
+ If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
169
+ ![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)
170
 
171
+ ---
 
172
  """
173
+ )
174
 
175
  demo = microphone
176
  demo.launch()