iisadia committed
Commit 6ddfbf8 · verified · 1 Parent(s): 9ae10a0

Update app.py

Files changed (1)
  1. app.py +39 -20
app.py CHANGED
@@ -1,9 +1,10 @@
 import streamlit as st
 from transformers import pipeline
 import numpy as np
-import soundfile as sf
+import torchaudio
 from io import BytesIO
 from audio_recorder_streamlit import audio_recorder
+import torch
 
 # Load Whisper model
 @st.cache_resource
@@ -25,33 +26,51 @@ with col1:
     text_input = st.text_area("Type your text here:", height=200)
 
 with col2:
-    # Audio input using alternative recorder
+    # Audio input
     st.write("Record your voice:")
     audio_bytes = audio_recorder()
+    if audio_bytes:
+        st.audio(audio_bytes, format="audio/wav")
 
-    # Process audio when recording is available
-    if audio_bytes:
+    def process_audio(audio_bytes):
         try:
-            # Convert bytes to audio array
-            with BytesIO(audio_bytes) as audio_file:
-                audio_data, sample_rate = sf.read(audio_file)
-
+            # Convert bytes to numpy array
+            waveform, sample_rate = torchaudio.load(BytesIO(audio_bytes))
+
             # Convert stereo to mono if needed
-            if len(audio_data.shape) > 1:
-                audio_data = np.mean(audio_data, axis=1)
+            if waveform.shape[0] > 1:
+                waveform = torch.mean(waveform, dim=0, keepdim=True)
 
-            # Create input for Whisper
-            audio_dict = {"raw": audio_data, "sampling_rate": sample_rate}
-
-            # Transcribe audio
-            whisper = load_model()
-            transcribed_text = whisper(audio_dict)["text"]
-
-            # Update session state
-            st.session_state.combined_text = f"{text_input}\n{transcribed_text}".strip()
+            # Resample to 16kHz if needed (Whisper's expected sample rate)
+            if sample_rate != 16000:
+                resampler = torchaudio.transforms.Resample(
+                    orig_freq=sample_rate,
+                    new_freq=16000
+                )
+                waveform = resampler(waveform)
+                sample_rate = 16000
+
+            # Convert to numpy array
+            audio_np = waveform.numpy().squeeze()
 
+            return {"raw": audio_np, "sampling_rate": sample_rate}
         except Exception as e:
-            st.error(f"Error processing audio: {str(e)}")
+            st.error(f"Audio processing error: {str(e)}")
+            return None
+
+    # Process audio when recording is available
+    if audio_bytes:
+        audio_input = process_audio(audio_bytes)
+        if audio_input:
+            try:
+                # Transcribe audio
+                whisper = load_model()
+                transcribed_text = whisper(audio_input)["text"]
+
+                # Update session state
+                st.session_state.combined_text = f"{text_input}\n{transcribed_text}".strip()
+            except Exception as e:
+                st.error(f"Transcription error: {str(e)}")
 
 # Combine inputs when button is clicked
 if st.button("Submit"):
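
Note on the change: the commit swaps soundfile for torchaudio so the recorded WAV bytes can be downmixed to mono and resampled to the 16 kHz rate Whisper's feature extractor expects, then handed to the transformers pipeline as a {"raw": ..., "sampling_rate": ...} dict. Below is a minimal sketch of that preprocessing path outside Streamlit; the file name sample.wav and the checkpoint openai/whisper-small are assumptions (load_model()'s body lies outside this diff), so treat it as an illustration rather than the app's exact code.

# Minimal sketch (not part of the commit) of the new torchaudio path.
# Assumptions: a local "sample.wav" exists, and load_model() in app.py
# wraps a transformers ASR pipeline; the checkpoint below is hypothetical.
import torch
import torchaudio
from transformers import pipeline

waveform, sample_rate = torchaudio.load("sample.wav")  # shape: (channels, samples)

# Stereo -> mono: average channels, keeping the channel dim
# so the tensor stays 2-D for the resampler
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Whisper models are trained on 16 kHz audio, hence the resample step
if sample_rate != 16000:
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
    sample_rate = 16000

asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")
print(asr({"raw": waveform.numpy().squeeze(), "sampling_rate": sample_rate})["text"])

Averaging with keepdim=True preserves the (channels, samples) layout that torchaudio's Resample operates on; the final squeeze() then hands the pipeline the 1-D NumPy array it expects under the "raw" key.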