# main.py
"""FastAPI service exposing a Persian speech-to-text endpoint.

Backed by a NeMo FastConformer hybrid model, with pydub for audio decoding
and hazm for Persian text normalization.
"""

import os
import re
import tempfile

import numpy as np
import uvicorn
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from hazm import Normalizer
from huggingface_hub import login
from pydub import AudioSegment
import nemo.collections.asr as nemo_asr

# Authenticate with Hugging Face (required to download the model weights).
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")
login(HF_TOKEN)

# Load the model once at import time; it is reused by every request.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30"
)
normalizer = Normalizer()

app = FastAPI()

# The model expects 16 kHz mono input.
TARGET_SAMPLE_RATE = 16000


def load_audio(audio_file_path):
    """Decode an audio file into mono 16 kHz float32 samples, peak-normalized.

    Args:
        audio_file_path: path to any format pydub/ffmpeg can decode.

    Returns:
        (samples, sample_rate) where samples is a 1-D float32 numpy array
        scaled to [-1, 1] (left untouched if the audio is silent).
    """
    audio = AudioSegment.from_file(audio_file_path)
    audio = audio.set_channels(1).set_frame_rate(TARGET_SAMPLE_RATE)
    samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    # Guard against division by zero on empty or all-silence input
    # (the original divided unconditionally, producing NaNs/inf).
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples /= peak
    return samples, audio.frame_rate


def transcribe_chunk(audio_chunk, model):
    """Run the ASR model on one chunk of samples and return its text."""
    transcription = model.transcribe([audio_chunk], batch_size=1, verbose=False)
    return transcription[0].text


def transcribe_audio(file_path, model, chunk_size=30 * TARGET_SAMPLE_RATE):
    """Transcribe a whole file by splitting it into fixed-size chunks.

    Args:
        file_path: path to the audio file on disk.
        model: a loaded NeMo ASR model.
        chunk_size: chunk length in samples (default: 30 s at 16 kHz).

    Returns:
        The normalized transcription as a single string.
    """
    waveform, _ = load_audio(file_path)
    transcriptions = []
    for start in range(0, len(waveform), chunk_size):
        end = min(len(waveform), start + chunk_size)
        transcriptions.append(transcribe_chunk(waveform[start:end], model))
    final_transcription = ' '.join(transcriptions)
    # Collapse any runs of spaces introduced by the join.
    final_transcription = re.sub(r' +', ' ', final_transcription)
    return normalizer.normalize(final_transcription)


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Accept an uploaded audio file and return its transcription.

    Returns {"transcription": str} on success, or a 500 JSON error payload.
    """
    temp_path = None
    try:
        # Use a generated temp-file name instead of the client-supplied
        # filename: writing f"/tmp/{file.filename}" was a path-traversal
        # risk and collided across concurrent uploads of the same name.
        suffix = os.path.splitext(file.filename or "")[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(await file.read())
            temp_path = tmp.name
        result = transcribe_audio(temp_path, asr_model)
        return {"transcription": result}
    except Exception as e:
        # Top-level boundary: report the failure to the client as JSON.
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Always remove the temp file (it was previously leaked per request).
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)


if __name__ == "__main__":
    # uvicorn was imported but never invoked in the original; serve the app
    # when this module is executed directly.
    uvicorn.run(app, host="0.0.0.0", port=8000)