File size: 2,118 Bytes
d337705
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# main.py

import os
import re
import tempfile

import numpy as np
import nemo.collections.asr as nemo_asr
import uvicorn
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from hazm import Normalizer
from huggingface_hub import login
from pydub import AudioSegment

# Load Hugging Face token.
# Fail fast at import time: the model download below needs Hub credentials,
# so a missing token would otherwise surface later as a confusing error.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")

login(HF_TOKEN)

# Load model once at module import so every request reuses the same weights
# (FastConformer hybrid RNNT/CTC checkpoint; "fa" suggests Persian — confirm).
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30")

normalizer = Normalizer()  # hazm text normalizer, applied to the final transcript
app = FastAPI()


def load_audio(audio_file_path):
    """Decode an audio file and return (samples, sample_rate).

    The audio is downmixed to mono, resampled to 16 kHz, and peak-normalized
    to float32 samples in [-1.0, 1.0].

    Parameters
    ----------
    audio_file_path : str
        Path to any audio format pydub/ffmpeg can decode.

    Returns
    -------
    tuple[np.ndarray, int]
        1-D float32 waveform and its frame rate (16000 after resampling).
    """
    audio = AudioSegment.from_file(audio_file_path)
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
    # Peak-normalize, guarding against silent/empty audio: the unguarded
    # division produced NaNs for all-zero input, and np.max raises on an
    # empty array.
    peak = np.max(np.abs(audio_samples)) if audio_samples.size else 0.0
    if peak > 0:
        audio_samples /= peak
    return audio_samples, audio.frame_rate


def transcribe_chunk(audio_chunk, model):
    """Transcribe a single waveform chunk with the given ASR model.

    NeMo's ``transcribe`` operates on batches, so the chunk is submitted as a
    batch of one and the single hypothesis's text is returned.
    """
    return model.transcribe([audio_chunk], batch_size=1, verbose=False)[0].text


def transcribe_audio(file_path, model, chunk_size=30 * 16000):
    """Transcribe an audio file by running the model over fixed-size chunks.

    The waveform is cut into windows of ``chunk_size`` samples (30 s at
    16 kHz by default); each window is transcribed independently, and the
    pieces are joined, whitespace-collapsed, and normalized.
    """
    waveform, _ = load_audio(file_path)
    # Slicing past the end of the array clamps automatically, so the last
    # (possibly shorter) chunk needs no special handling.
    pieces = [
        transcribe_chunk(waveform[offset:offset + chunk_size], model)
        for offset in range(0, len(waveform), chunk_size)
    ]
    text = ' '.join(pieces)
    text = re.sub(' +', ' ', text)
    return normalizer.normalize(text)


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Accept an uploaded audio file and return its transcription as JSON.

    Returns ``{"transcription": ...}`` on success, or a 500 JSON error
    payload on failure.
    """
    tmp_path = None
    try:
        # Use a server-generated temp file rather than trusting
        # file.filename, which is client-controlled and could contain path
        # separators (path traversal / overwrite risk). Keep the extension
        # so pydub/ffmpeg can pick the right decoder.
        suffix = os.path.splitext(file.filename or "")[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(await file.read())
            tmp_path = tmp.name

        result = transcribe_audio(tmp_path, asr_model)
        return {"transcription": result}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Always delete the temp file — the original leaked one file per
        # request into /tmp.
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)