helvekami committed on
Commit
9c37c06
·
1 Parent(s): f2fe7e2

Updated Gradio App

Browse files
Files changed (1) hide show
  1. app.py +26 -10
app.py CHANGED
@@ -15,26 +15,42 @@ def transcribe_and_respond(audio_file):
15
  torch_dtype=torch.bfloat16
16
  )
17
 
18
- # Load the audio file
19
  audio, sr = librosa.load(audio_file, sr=16000)
20
-
21
- # Print audio properties for debugging
 
 
 
 
 
 
 
22
  print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
 
 
 
 
 
 
 
 
23
 
 
24
  turns = [
25
- {'role': 'system', 'content': 'Repeat the following text exactly, without any changes'},
26
  {'role': 'user', 'content': '<|audio|>'}
27
  ]
28
-
29
  # Debug: Print the initial turns
30
  print(f"Initial turns: {turns}")
31
-
32
  # Call the model with the audio and prompt
33
  output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
34
-
35
  # Debug: Print the final output from the model
36
  print(f"Model output: {output}")
37
-
38
  return output
39
 
40
  except Exception as e:
@@ -45,9 +61,9 @@ iface = gr.Interface(
45
  inputs=gr.Audio(sources="microphone", type="filepath"),
46
  outputs="text",
47
  title="Live Transcription and Response",
48
- description="Speak into your microphone, and the model will respond naturally and informatively.",
49
  live=True
50
  )
51
 
52
  if __name__ == "__main__":
53
- iface.launch()
 
15
  torch_dtype=torch.bfloat16
16
  )
17
 
18
+ # Load the audio file, requesting a sample rate of 16000
19
  audio, sr = librosa.load(audio_file, sr=16000)
20
+
21
+ # Convert the loaded audio to a contiguous float32 array
22
+ audio = np.ascontiguousarray(audio, dtype=np.float32)
23
+
24
+ # If audio has more than one channel, convert to mono by averaging channels
25
+ if audio.ndim > 1:
26
+ audio = np.mean(audio, axis=-1)
27
+
28
+ # Debug: Print audio properties
29
  print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
30
+
31
+ # Although we requested 16000 Hz, double-check the sample rate.
32
+ # If not 16000, force conversion:
33
+ if sr != 16000:
34
+ # Ensure the audio is float32 before resampling
35
+ audio = audio.astype(np.float32)
36
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
37
+ sr = 16000
38
 
39
+ # Set up the transcription prompt to get exact transcription
40
  turns = [
41
+ {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
42
  {'role': 'user', 'content': '<|audio|>'}
43
  ]
44
+
45
  # Debug: Print the initial turns
46
  print(f"Initial turns: {turns}")
47
+
48
  # Call the model with the audio and prompt
49
  output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
50
+
51
  # Debug: Print the final output from the model
52
  print(f"Model output: {output}")
53
+
54
  return output
55
 
56
  except Exception as e:
 
61
  inputs=gr.Audio(sources="microphone", type="filepath"),
62
  outputs="text",
63
  title="Live Transcription and Response",
64
+ description="Speak into your microphone, and the model will transcribe your speech.",
65
  live=True
66
  )
67
 
68
  if __name__ == "__main__":
69
+ iface.launch()