Spaces:

helvekami
/

ShukaNote

Running

App Files Community

helvekami committed on Mar 6

Commit

b5f86ee

verified ·

1 Parent(s): 9c37c06

Update app.py

Browse files

Files changed (1) hide show

app.py +10 -26

app.py CHANGED Viewed

@@ -15,42 +15,26 @@ def transcribe_and_respond(audio_file):
             torch_dtype=torch.bfloat16
         )
-        # Load the audio file, requesting a sample rate of 16000
         audio, sr = librosa.load(audio_file, sr=16000)
-        # Convert the loaded audio to a contiguous float32 array
-        audio = np.ascontiguousarray(audio, dtype=np.float32)
-        # If audio has more than one channel, convert to mono by averaging channels
-        if audio.ndim > 1:
-            audio = np.mean(audio, axis=-1)
-        # Debug: Print audio properties
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
-        # Although we requested 16000 Hz, double-check the sample rate.
-        # If not 16000, force conversion:
-        if sr != 16000:
-            # Ensure the audio is float32 before resampling
-            audio = audio.astype(np.float32)
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-            sr = 16000
-        # Set up the transcription prompt to get exact transcription
         turns = [
-            {'role': 'system', 'content': 'Please transcribe the following audio exactly.'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
         # Debug: Print the initial turns
         print(f"Initial turns: {turns}")
         # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
         # Debug: Print the final output from the model
         print(f"Model output: {output}")
         return output
     except Exception as e:
@@ -61,9 +45,9 @@ iface = gr.Interface(
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
     title="Live Transcription and Response",
-    description="Speak into your microphone, and the model will transcribe your speech.",
     live=True
 )
 if __name__ == "__main__":
-    iface.launch()

             torch_dtype=torch.bfloat16
         )
+        # Load the audio file
         audio, sr = librosa.load(audio_file, sr=16000)
+        # Print audio properties for debugging
         print(f"Audio dtype: {audio.dtype}, Audio shape: {audio.shape}, Sample rate: {sr}")
         turns = [
+            {'role': 'system', 'content': 'repeat the previous text exactly with no changes'},
             {'role': 'user', 'content': '<|audio|>'}
         ]
         # Debug: Print the initial turns
         print(f"Initial turns: {turns}")
         # Call the model with the audio and prompt
         output = pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=512)
         # Debug: Print the final output from the model
         print(f"Model output: {output}")
         return output
     except Exception as e:
     inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
     title="Live Transcription and Response",
+    description="Speak into your microphone, and the model will respond naturally and informatively.",
     live=True
 )
 if __name__ == "__main__":
+    iface.launch()