iisadia committed
Commit 6ddfbf8 · verified · 1 Parent(s): 9ae10a0

Update app.py

Files changed (1)
  1. app.py +39 -20
app.py CHANGED
@@ -1,9 +1,10 @@
 import streamlit as st
 from transformers import pipeline
 import numpy as np
-import soundfile as sf
+import torchaudio
 from io import BytesIO
 from audio_recorder_streamlit import audio_recorder
+import torch
 
 # Load Whisper model
 @st.cache_resource
@@ -25,33 +26,51 @@ with col1:
     text_input = st.text_area("Type your text here:", height=200)
 
 with col2:
-    # Audio input using alternative recorder
+    # Audio input
     st.write("Record your voice:")
     audio_bytes = audio_recorder()
+    if audio_bytes:
+        st.audio(audio_bytes, format="audio/wav")
 
-    # Process audio when recording is available
-    if audio_bytes:
+    def process_audio(audio_bytes):
         try:
-            # Convert bytes to audio array
-            with BytesIO(audio_bytes) as audio_file:
-                audio_data, sample_rate = sf.read(audio_file)
-
+            # Convert bytes to numpy array
+            waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
+
             # Convert stereo to mono if needed
-            if len(audio_data.shape) > 1:
-                audio_data = np.mean(audio_data, axis=1)
+            if waveform.shape[0] > 1:
+                waveform = torch.mean(waveform, dim=0, keepdim=True)
 
-            # Create input for Whisper
-            audio_dict = {"raw": audio_data, "sampling_rate": sample_rate}
-
-            # Transcribe audio
-            whisper = load_model()
-            transcribed_text = whisper(audio_dict)["text"]
-
-            # Update session state
-            st.session_state.combined_text = f"{text_input}\n{transcribed_text}".strip()
+            # Resample to 16kHz if needed (Whisper's expected sample rate)
+            if sample_rate != 16000:
+                resampler = torchaudio.transforms.Resample(
+                    orig_freq=sample_rate,
+                    new_freq=16000
+                )
+                waveform = resampler(waveform)
+                sample_rate = 16000
+
+            # Convert to numpy array
+            audio_np = waveform.numpy().squeeze()
 
+            return {"raw": audio_np, "sampling_rate": sample_rate}
         except Exception as e:
-            st.error(f"Error processing audio: {str(e)}")
+            st.error(f"Audio processing error: {str(e)}")
+            return None
+
+    # Process audio when recording is available
+    if audio_bytes:
+        audio_input = process_audio(audio_bytes)
+        if audio_input:
+            try:
+                # Transcribe audio
+                whisper = load_model()
+                transcribed_text = whisper(audio_input)["text"]
+
+                # Update session state
+                st.session_state.combined_text = f"{text_input}\n{transcribed_text}".strip()
+            except Exception as e:
+                st.error(f"Transcription error: {str(e)}")
 
 # Combine inputs when button is clicked
 if st.button("Submit"):
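
Note on the change: the commit swaps soundfile for torchaudio so the recorded WAV bytes can be downmixed to mono and resampled to the 16 kHz rate Whisper's feature extractor expects, then handed to the transformers pipeline as a {"raw": ..., "sampling_rate": ...} dict. Below is a minimal sketch of that preprocessing path outside Streamlit; the file name sample.wav and the checkpoint openai/whisper-small are assumptions (load_model()'s body lies outside this diff), so treat it as an illustration rather than the app's exact code.

# Minimal sketch (not part of the commit) of the new torchaudio path.
# Assumptions: a local "sample.wav" exists, and load_model() in app.py
# wraps a transformers ASR pipeline; the checkpoint below is hypothetical.
import torch
import torchaudio
from transformers import pipeline

waveform, sample_rate = torchaudio.load("sample.wav")  # shape: (channels, samples)

# Stereo -> mono: average channels, keeping the channel dim
# so the tensor stays 2-D for the resampler
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Whisper models are trained on 16 kHz audio, hence the resample step
if sample_rate != 16000:
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    sample_rate = 16000

asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
print(asr({"raw": waveform.numpy().squeeze(), "sampling_rate": sample_rate})["text"])

Averaging with keepdim=True preserves the (channels, samples) layout that torchaudio's Resample operates on; the final squeeze() then hands the pipeline the 1-D NumPy array it expects under the "raw" key.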