File size: 2,118 Bytes
d337705 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# main.py
import os
import re
import tempfile

import numpy as np
import nemo.collections.asr as nemo_asr
import uvicorn
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from hazm import Normalizer
from huggingface_hub import login
from pydub import AudioSegment
# Authenticate against the Hugging Face Hub. The token is read from the
# environment only; startup is aborted when it is missing or empty.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")
login(HF_TOKEN)

# Module-level singletons: created once at import time and reused by the
# request handler and the transcription helpers below.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30"
)
normalizer = Normalizer()
app = FastAPI()
def load_audio(audio_file_path):
    """Decode an audio file into a peak-normalized mono waveform.

    The file is decoded with pydub, down-mixed to one channel and resampled
    to 16000 Hz. Samples are converted to float32 and divided by the largest
    absolute sample value.

    Args:
        audio_file_path: Path of the audio file to decode.

    Returns:
        A ``(samples, frame_rate)`` tuple: a 1-D float32 NumPy array and the
        frame rate reported by pydub after resampling.
    """
    audio = AudioSegment.from_file(audio_file_path)
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

    # np.max raises on an empty array, and dividing by a zero peak (pure
    # silence) would fill the waveform with NaN. In both cases there is
    # nothing to scale, so the samples are returned untouched.
    peak = np.max(np.abs(audio_samples)) if audio_samples.size else 0.0
    if peak > 0:
        audio_samples /= peak
    return audio_samples, audio.frame_rate
def transcribe_chunk(audio_chunk, model):
    """Transcribe a single audio chunk and return its text.

    Args:
        audio_chunk: The audio data for one chunk; it is passed to the model
            as a one-element batch.
        model: Object exposing ``transcribe(list, batch_size=..., verbose=...)``
            whose first result has a ``text`` attribute.

    Returns:
        The ``text`` attribute of the first (and only) result.
    """
    results = model.transcribe(
        [audio_chunk],
        batch_size=1,
        verbose=False,
    )
    first_result = results[0]
    return first_result.text
def transcribe_audio(file_path, model, chunk_size=30 * 16000):
    """Transcribe an audio file chunk by chunk and return normalized text.

    The waveform returned by ``load_audio`` is cut into consecutive windows
    of ``chunk_size`` samples (the last one may be shorter). Each window is
    transcribed separately, and the pieces are joined with single spaces and
    passed through the module-level ``normalizer``.

    Args:
        file_path: Path of the audio file to transcribe.
        model: ASR model handed to ``transcribe_chunk``.
        chunk_size: Window length in samples. The default is 30 * 16000.

    Returns:
        The normalized transcription of the whole file as one string.
    """
    samples, _ = load_audio(file_path)
    total = len(samples)
    pieces = [
        transcribe_chunk(samples[offset:min(total, offset + chunk_size)], model)
        for offset in range(0, total, chunk_size)
    ]
    # Collapse runs of spaces left by empty or padded chunk transcriptions.
    joined = re.sub(' +', ' ', ' '.join(pieces))
    return normalizer.normalize(joined)
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    The upload is written to a uniquely named temporary file, transcribed
    with the module-level ``asr_model``, and the temporary file is removed
    afterwards whether or not transcription succeeded.

    Args:
        file: The uploaded audio file.

    Returns:
        ``{"transcription": <text>}`` on success, or a 500 ``JSONResponse``
        with ``{"error": <message>}`` if anything fails.
    """
    temp_path = None
    try:
        # The client controls file.filename, so it must never become part of
        # the path: a name like "../../x" would escape the temp directory and
        # equal names from concurrent requests would overwrite each other.
        # Only the extension is kept, for decoders that rely on it.
        suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            temp_path = tmp.name
            tmp.write(await file.read())
        result = transcribe_audio(temp_path, asr_model)
        return {"transcription": result}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Always remove the upload so the temp directory does not fill up.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # response that was already chosen above.
                pass
|