# vocal_ai/utils/stt_processor.py
# Uploaded by dhruv2842 ("Upload 29 files", commit 643a619, verified).
import os
import re
import tempfile

import speech_recognition as sr
from pydub import AudioSegment
def convert_to_wav_pydub(input_path: str, output_path: str = "converted_temp.wav") -> str:
    """Re-encode an audio file as a 16 kHz, mono, 16-bit WAV file.

    Args:
        input_path: Path of the audio file, handed to ``AudioSegment.from_file``.
        output_path: Destination path for the WAV file. The default is a fixed
            name relative to the current working directory, so callers that
            may run concurrently should pass their own path.

    Returns:
        ``output_path``, exactly as given.
    """
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)  # 16-bit PCM

    # export() returns the (still open) output file object. Close it here
    # instead of relying on garbage collection, so the file is fully flushed
    # and can be reopened or deleted by the caller right away.
    exported = audio.export(output_path, format="wav")
    exported.close()
    return output_path
def transcribe_audio(audio_file_path: str) -> str:
    """Transcribe an audio file to text using ``Recognizer.recognize_google``.

    The input is always re-encoded through ``convert_to_wav_pydub`` first,
    even when it already has a ``.wav`` extension, so the recognizer receives
    a consistent PCM WAV file. The re-encoded copy is written to a unique
    temporary file and removed before returning.

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        The recognized text on success,
        ``"Could not understand audio"`` when the recognizer raises
        ``sr.UnknownValueError``, or ``"Request failed: <error>"`` when it
        raises ``sr.RequestError``. Any other exception (for example from the
        conversion step) propagates to the caller.
    """
    recognizer = sr.Recognizer()

    # Use a unique scratch file instead of a fixed name in the working
    # directory: a shared name lets concurrent calls overwrite each other and
    # would overwrite-then-delete an input that happens to have that name.
    fd, converted_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # only the path is needed; the converter reopens it itself

    try:
        # Conversion happens inside the try so the scratch file is cleaned up
        # even when re-encoding fails.
        convert_to_wav_pydub(audio_file_path, converted_path)

        with sr.AudioFile(converted_path) as source:
            audio_data = recognizer.record(source)

        text = recognizer.recognize_google(audio_data)
        print("✅ Transcribed Text:", text)
        return text
    except sr.UnknownValueError:
        print("⚠️ Could not understand audio")
        return "Could not understand audio"
    except sr.RequestError as e:
        print("❌ API Request Error:", e)
        return f"Request failed: {e}"
    finally:
        # Best-effort cleanup: never let a cleanup problem mask the result.
        try:
            os.remove(converted_path)
        except FileNotFoundError:
            pass
        except PermissionError:
            print("⚠️ Warning: File could not be deleted, still in use.")
def simulate_stt(audio_file_path: str) -> dict:
    """Transcribe an audio file and pull patient details out of the text.

    The transcript returned by ``transcribe_audio`` is searched,
    case-insensitively, for three phrases:

    * ``my name is <name>``        -> ``patient_name``
    * ``i am <digits> years old``  -> ``age``
    * ``suffering from <rest>``    -> ``symptoms`` (everything to end of text)

    Args:
        audio_file_path: Path of the audio file to transcribe.

    Returns:
        A dict with the keys ``patient_name`` (``"Unknown"`` if not found),
        ``age`` (``0`` if not found), ``symptoms`` (``"Not mentioned"`` if not
        found) and ``preferred_doctor`` (always ``"Not specified"``).
    """
    raw_text = transcribe_audio(audio_file_path)

    # The name ends at " i am", " and", a comma, a period, or end of text.
    # The \b after "i am" / "and" keeps those terminators from matching the
    # start of a longer word, so "John Anderson" is not cut down to "John".
    name_match = re.search(
        r"my name is ([a-zA-Z ]+?)(?= i am\b| and\b|,|\.|$)",
        raw_text,
        re.IGNORECASE,
    )
    age_match = re.search(r"i am (\d+) years old", raw_text, re.IGNORECASE)
    symptoms_match = re.search(r"suffering from (.+)", raw_text, re.IGNORECASE)

    return {
        "patient_name": name_match.group(1).strip() if name_match else "Unknown",
        "age": int(age_match.group(1)) if age_match else 0,
        "symptoms": symptoms_match.group(1).strip() if symptoms_match else "Not mentioned",
        "preferred_doctor": "Not specified",
    }