vumichien committed on
Commit
982003f
·
1 Parent(s): e8e18ec

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +60 -0
utils.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ import base64
3
+ import numpy as np
4
+ import subprocess
5
+ import soundfile as sf
6
+ from speech_recognition import AudioFile, Recognizer
7
+
8
+
9
def ffmpeg_read(bpayload: bytes, sampling_rate: int, output_path: str = 'temp.wav') -> str:
    """
    Decode an in-memory audio payload through ffmpeg and save it as a WAV file.

    The payload is piped to ffmpeg, which converts it to mono, 32-bit float
    little-endian raw samples at ``sampling_rate``. The decoded samples are
    then written to ``output_path`` as a 16-bit PCM WAV file.

    Args:
        bpayload: raw bytes of the encoded audio, fed to ffmpeg on stdin.
        sampling_rate: target sampling rate, used both for the ffmpeg
            conversion and for the WAV file that is written.
        output_path: where the WAV file is written. Defaults to
            ``'temp.wav'`` in the current working directory.

    Returns:
        The path of the WAV file that was written (``output_path``).

    Raises:
        ValueError: if the ffmpeg executable cannot be found, or if ffmpeg
            produced no audio samples for the given payload.
    """
    ar = f"{sampling_rate}"
    ac = "1"
    format_for_conversion = "f32le"
    ffmpeg_command = [
        "ffmpeg",
        "-i",
        "pipe:0",
        "-ac",
        ac,
        "-ar",
        ar,
        "-f",
        format_for_conversion,
        "-hide_banner",
        "-loglevel",
        "quiet",
        "pipe:1",
    ]

    try:
        ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    except FileNotFoundError as err:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from err
    # communicate() feeds the payload on stdin, drains stdout and waits for exit.
    out_bytes, _ = ffmpeg_process.communicate(bpayload)
    audio = np.frombuffer(out_bytes, np.float32)
    # ffmpeg runs with "-loglevel quiet", so an undecodable payload only shows
    # up as empty output; fail loudly instead of writing an empty WAV file.
    if audio.shape[0] == 0:
        raise ValueError("Malformed soundfile")
    sf.write(output_path, audio, sampling_rate, subtype='PCM_16')
    return output_path
41
+
42
+
43
def stt(audio: object, language='ja') -> str:
    """Transcribe recorded speech into text.

    Args:
        audio: recording of the user's speech, in any form ``AudioFile``
            accepts.
        language (str): language code handed to the recognizer.

    Returns:
        str: the text recognized from the recording.
    """
    recognizer = Recognizer()
    # Read the whole recording into memory before transcribing it.
    with AudioFile(audio) as recording:
        captured = recognizer.record(recording)
    # Send the captured audio to Google's speech-to-text API.
    return recognizer.recognize_google(captured, language=language)
60
+