saeedzou committed (verified)
Commit b42c7b6 · Parent: 6e1fba5

Upload 3 files

Files changed (3):
  1. Dockerfile +14 -0
  2. main.py +67 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.10-slim
+
+
+ # Install ffmpeg for pydub
+ RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /app
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ ENTRYPOINT ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
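
Editor's note: as a quick local sanity check (not part of the commit), the image could be built and run roughly as below. The tag farsi-asr is an arbitrary placeholder, and HF_TOKEN must be supplied at runtime because main.py reads it from the environment:

    docker build -t farsi-asr .
    docker run -e HF_TOKEN=<your_hf_token> -p 8000:8000 farsi-asr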
main.py ADDED
@@ -0,0 +1,67 @@
+ # main.py
+
+ import os
+ import re
+ import numpy as np
+ from pydub import AudioSegment
+ from fastapi import FastAPI, UploadFile, File
+ from fastapi.responses import JSONResponse
+ from huggingface_hub import login
+ from hazm import Normalizer
+ import nemo.collections.asr as nemo_asr
+ import uvicorn
+
+ # Load Hugging Face token
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")
+
+ login(HF_TOKEN)
+
+ # Load model once
+ asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_farsi_fastconformer")
+
+ normalizer = Normalizer()
+ app = FastAPI()
+
+
+ def load_audio(audio_file_path):
+     audio = AudioSegment.from_file(audio_file_path)
+     audio = audio.set_channels(1).set_frame_rate(16000)
+     audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
+     audio_samples /= np.max(np.abs(audio_samples))
+     return audio_samples, audio.frame_rate
+
+
+ def transcribe_chunk(audio_chunk, model):
+     transcription = model.transcribe([audio_chunk], batch_size=1, verbose=False)
+     return transcription[0].text
+
+
+ def transcribe_audio(file_path, model, chunk_size=30 * 16000):
+     waveform, _ = load_audio(file_path)
+     transcriptions = []
+     for start in range(0, len(waveform), chunk_size):
+         end = min(len(waveform), start + chunk_size)
+         transcription = transcribe_chunk(waveform[start:end], model)
+         transcriptions.append(transcription)
+
+     final_transcription = ' '.join(transcriptions)
+     final_transcription = re.sub(' +', ' ', final_transcription)
+     final_transcription = normalizer.normalize(final_transcription)
+
+     return final_transcription
+
+
+ @app.post("/transcribe")
+ async def transcribe(file: UploadFile = File(...)):
+     try:
+         temp_path = f"/tmp/{file.filename}"
+         with open(temp_path, "wb") as f:
+             f.write(await file.read())
+
+         result = transcribe_audio(temp_path, asr_model)
+         return {"transcription": result}
+     except Exception as e:
+         return JSONResponse(status_code=500, content={"error": str(e)})
+
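
Editor's note: a minimal client sketch for the /transcribe endpoint added above, assuming the service is listening on localhost:8000 and that sample.wav is some local audio file (both are placeholders, not part of the commit). The multipart field name "file" matches the UploadFile parameter in main.py.

    import requests  # assumed to be available client-side; not in requirements.txt

    # POST the audio as multipart form data under the field name "file".
    with open("sample.wav", "rb") as f:
        response = requests.post(
            "http://localhost:8000/transcribe",
            files={"file": ("sample.wav", f, "audio/wav")},
        )
    print(response.json())  # {"transcription": "..."} on success, {"error": "..."} on failure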
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi
+ uvicorn[standard]
+ nemo_toolkit[asr]
+ pydub
+ hazm
+ huggingface_hub