Spaces:

Mohan-diffuser
/

indian-asr-with-sarvam

Running

App Files Files Community

mohan696matlab committed 26 days ago

Commit

65a7d00

1 Parent(s): b3b17d2

ui update

Browse files

Files changed (1) hide show

app.py +56 -39

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from pydub import AudioSegment
-def translate_audio(audio, SARVAM_API_KEY):
     # API endpoint for speech-to-text translation
     api_url = "https://api.sarvam.ai/speech-to-text-translate"
@@ -21,7 +21,8 @@ def translate_audio(audio, SARVAM_API_KEY):
     # Data payload for the translation request
     model_data = {
         "model": "saaras:v2",  # Specify the model to be used
-        "with_diarization": False  # Set to True for speaker diarization
     }
@@ -38,6 +39,7 @@ def translate_audio(audio, SARVAM_API_KEY):
         if response.status_code == 200 or response.status_code == 201:
             response_data = response.json()
             transcript = response_data.get("transcript", "")
         elif response.status_code == 401 or response.status_code == 403:
             raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
         else:
@@ -48,10 +50,9 @@ def translate_audio(audio, SARVAM_API_KEY):
     finally:
         chunk_buffer.close()
-    return transcript
-def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
-    start_time = time.time()
     if history is None:
         history = ""
@@ -59,6 +60,8 @@ def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
     try:
         sr, y = new_chunk
         # Convert to mono if stereo
         if y.ndim > 1:
             y = y.mean(axis=1)
@@ -74,17 +77,16 @@ def stream_transcribe(history, new_chunk, SARVAM_API_KEY):
             channels=1
         )
-        transcription = translate_audio(audio_segment, SARVAM_API_KEY)
-        end_time = time.time()
-        latency = end_time - start_time
-        history = history + '\n' + transcription
-        return history, history, f"{latency:.2f}"
     except ValueError as ve:
-        return history, str(ve), "Invalid Key"
     except Exception as e:
         print(f"Error during Transcription: {e}")
-        return history, str(e), "Error"
@@ -99,40 +101,53 @@ def clear_api_key():
     return ""
-with gr.Blocks(theme=gr.themes.Citrus) as microphone:
     with gr.Column():
         gr.Markdown(
-    """
-    ### This app is designed to **transcribe and translate simultaneously from multiple Indian languages**. It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more. It can **translate the transcribed text in real-time to English**, making it incredibly useful for multilingual audio processing.
-    ### 🔑 Sarvam AI API Key Required
-    To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
-    👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
-    👉 **Step 2:** Sign up or log in
-    👉 **Step 3:** Generate your API key and paste it below
-    Your key stays on your device and is not stored.
-    """
-        )
         api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
-        with gr.Row():
-            input_audio_microphone = gr.Audio(streaming=True)
-            output = gr.Textbox(label="Transcription", value="")
-            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
         with gr.Row():
             clear_button = gr.Button("Clear Output")
             clear_api_key_button = gr.Button("Clear API Key")
         state = gr.State(value="")
-        def wrapped_stream_transcribe(history, new_chunk, api_key):
-            return stream_transcribe(history, new_chunk, api_key)
         input_audio_microphone.stream(
             wrapped_stream_transcribe,
-            [state, input_audio_microphone, api_key_box],
-            [state, output, latency_textbox],
             time_limit=30,
             stream_every=5,
             concurrency_limit=None,
@@ -143,17 +158,19 @@ with gr.Blocks(theme=gr.themes.Citrus) as microphone:
         gr.Markdown(
                     """
-                ---
-                ### 👋 Who am I?
-                I'm **Dr. Mohan Dash**, a PhD in Industrial Computer Science and an AI Research Engineer.
-                I run a YouTube channel called **[Intelligent Machines](https://www.youtube.com/@Mohankumardash)** where I share practical tutorials and insights on building real-world AI applications.
-                If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
-                [![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)]
                     """
-                        )
 demo = microphone
 demo.launch()

+def translate_audio(audio, language_code, SARVAM_API_KEY):
     # API endpoint for speech-to-text translation
     api_url = "https://api.sarvam.ai/speech-to-text-translate"
     # Data payload for the translation request
     model_data = {
         "model": "saaras:v2",  # Specify the model to be used
+        "with_diarization": False,  # Set to True for speaker diarization
+        "language_code": language_code
     }
         if response.status_code == 200 or response.status_code == 201:
             response_data = response.json()
             transcript = response_data.get("transcript", "")
+            detected_language = response_data.get("language_code", "")
         elif response.status_code == 401 or response.status_code == 403:
             raise ValueError("❌ Invalid API key. Please check your Sarvam AI key.")
         else:
     finally:
         chunk_buffer.close()
+    return transcript,detected_language
+def stream_transcribe(history, new_chunk, language_code, SARVAM_API_KEY):
     if history is None:
         history = ""
     try:
         sr, y = new_chunk
+        print(y.max(), y.min())
         # Convert to mono if stereo
         if y.ndim > 1:
             y = y.mean(axis=1)
             channels=1
         )
+        transcription,detected_language = translate_audio(audio_segment, language_code, SARVAM_API_KEY)
+        history = history + '\n' + f'({detected_language})==> ' +transcription
+        return history, history
     except ValueError as ve:
+        return history, str(ve)
     except Exception as e:
         print(f"Error during Transcription: {e}")
+        return history, str(e)
     return ""
+with gr.Blocks(theme=gr.themes.Soft()) as microphone:
     with gr.Column():
         gr.Markdown(
+                    """
+                    ## Translate simultaneously from multiple Indian languages to **English**.
+                    ### It supports **22 Indian languages**, including **Hindi, Oriya, Tamil, Telugu, Gujarati**, and more.
+                    ### 🔑 Sarvam AI API Key Required
+                    To use this app, you need a free API key from [Sarvam AI](https://sarvam.ai).
+                    👉 **Step 1:** Visit [https://sarvam.ai](https://sarvam.ai)
+                    👉 **Step 2:** Sign up or log in
+                    👉 **Step 3:** Generate your API key and paste it below
+                    Your key stays on your device and is not stored.
+                    """
+                        )
         api_key_box = gr.Textbox(label="Enter SARVAM AI API Key", type="password")
+        language_options = [
+                                "hi-IN", "bn-IN", "kn-IN", "ml-IN", "mr-IN", "od-IN",
+                                "pa-IN", "ta-IN", "te-IN", "en-IN", "gu-IN", "unknown"
+                            ]
+        language_code_box = gr.Dropdown(
+                                    choices=language_options,
+                                    label="Select Language Code",
+                                    value="unknown"  # optional: default selected value
+                                )
+        input_audio_microphone = gr.Audio(streaming=True)
+        output = gr.Textbox(label="Transcription", lines=10,max_lines=100, show_copy_button=True, value="")
         with gr.Row():
             clear_button = gr.Button("Clear Output")
             clear_api_key_button = gr.Button("Clear API Key")
         state = gr.State(value="")
+        def wrapped_stream_transcribe(history, new_chunk,language_code, api_key):
+            return stream_transcribe(history, new_chunk,language_code, api_key)
         input_audio_microphone.stream(
             wrapped_stream_transcribe,
+            [state, input_audio_microphone,language_code_box, api_key_box],
+            [state, output],
             time_limit=30,
             stream_every=5,
             concurrency_limit=None,
         gr.Markdown(
                     """
+                    ---
+                    ### 👋 Who am I?
+                    I am **Dr. Mohan Dash**, a PhD in Industrial Computer Science and an AI Research Engineer.
+                    I run a YouTube channel called **[Intelligent Machines](https://www.youtube.com/@Mohankumardash)** where I share practical tutorials and insights on building real-world AI applications.
+                    If you find this app useful, you'll definitely enjoy the tutorials and breakdowns I post there.
+                    ![YouTube Channel](https://yt3.googleusercontent.com/UYcIFCkqev-zwJemtbOPmmOzRU26gk-hetSSU18GWO-1wBbGHd7pjx5oTsz4x1sJ8riWg35TQw=w1707-fcrop64=1,00005a57ffffa5a8-k-c0xffffffff-no-nd-rj)
+                    ---
                     """
+                        )
 demo = microphone
 demo.launch()