saeedzou committed on
Commit
d337705
·
verified ·
1 Parent(s): e387351

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +67 -67
main.py CHANGED
@@ -1,67 +1,67 @@
1
- # main.py
2
-
3
- import os
4
- import re
5
- import numpy as np
6
- from pydub import AudioSegment
7
- from fastapi import FastAPI, UploadFile, File
8
- from fastapi.responses import JSONResponse
9
- from huggingface_hub import login
10
- from hazm import Normalizer
11
- import nemo.collections.asr as nemo_asr
12
- import uvicorn
13
-
14
- # Load Hugging Face token
15
- HF_TOKEN = os.getenv("HF_TOKEN")
16
- if not HF_TOKEN:
17
- raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")
18
-
19
- login(HF_TOKEN)
20
-
21
- # Load model once
22
- asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("nvidia/stt_farsi_fastconformer")
23
-
24
- normalizer = Normalizer()
25
- app = FastAPI()
26
-
27
-
28
- def load_audio(audio_file_path):
29
- audio = AudioSegment.from_file(audio_file_path)
30
- audio = audio.set_channels(1).set_frame_rate(16000)
31
- audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)
32
- audio_samples /= np.max(np.abs(audio_samples))
33
- return audio_samples, audio.frame_rate
34
-
35
-
36
- def transcribe_chunk(audio_chunk, model):
37
- transcription = model.transcribe([audio_chunk], batch_size=1, verbose=False)
38
- return transcription[0].text
39
-
40
-
41
- def transcribe_audio(file_path, model, chunk_size=30 * 16000):
42
- waveform, _ = load_audio(file_path)
43
- transcriptions = []
44
- for start in range(0, len(waveform), chunk_size):
45
- end = min(len(waveform), start + chunk_size)
46
- transcription = transcribe_chunk(waveform[start:end], model)
47
- transcriptions.append(transcription)
48
-
49
- final_transcription = ' '.join(transcriptions)
50
- final_transcription = re.sub(' +', ' ', final_transcription)
51
- final_transcription = normalizer.normalize(final_transcription)
52
-
53
- return final_transcription
54
-
55
-
56
- @app.post("/transcribe")
57
- async def transcribe(file: UploadFile = File(...)):
58
- try:
59
- temp_path = f"/tmp/{file.filename}"
60
- with open(temp_path, "wb") as f:
61
- f.write(await file.read())
62
-
63
- result = transcribe_audio(temp_path, asr_model)
64
- return {"transcription": result}
65
- except Exception as e:
66
- return JSONResponse(status_code=500, content={"error": str(e)})
67
-
 
1
+ # main.py
2
+
3
+ import os
4
+ import re
5
+ import numpy as np
6
+ from pydub import AudioSegment
7
+ from fastapi import FastAPI, UploadFile, File
8
+ from fastapi.responses import JSONResponse
9
+ from huggingface_hub import login
10
+ from hazm import Normalizer
11
+ import nemo.collections.asr as nemo_asr
12
+ import uvicorn
13
+
14
# Read the Hugging Face access token from the environment and fail fast at
# import time when it is missing.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")

# Authenticate with the Hugging Face Hub before fetching the checkpoint.
login(HF_TOKEN)

# Load the ASR model once, at module import, so it is shared by all requests.
# NOTE(review): the checkpoint id contains "hybrid" while the loader class is
# EncDecRNNTBPEModel -- confirm this class matches the checkpoint.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30")

# Text normalizer applied to the final transcription, and the FastAPI app
# object the route below is registered on.
normalizer = Normalizer()
app = FastAPI()
26
+
27
+
28
def load_audio(audio_file_path):
    """Decode an audio file into mono, 16 kHz, peak-normalized float32 samples.

    Args:
        audio_file_path: Location of the audio file; passed unchanged to
            ``AudioSegment.from_file``.

    Returns:
        A ``(samples, frame_rate)`` tuple. ``samples`` is a ``float32`` NumPy
        array scaled so that its largest absolute value is 1.0; it is returned
        unscaled when the clip is empty or completely silent. ``frame_rate``
        is the frame rate of the converted segment.
    """
    audio = AudioSegment.from_file(audio_file_path)
    audio = audio.set_channels(1).set_frame_rate(16000)
    audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

    # np.max raises on a zero-length array, and a silent clip has a peak of 0,
    # which would turn every sample into NaN; skip scaling in both cases.
    peak = np.max(np.abs(audio_samples)) if audio_samples.size else 0.0
    if peak > 0:
        audio_samples /= peak

    return audio_samples, audio.frame_rate
34
+
35
+
36
def transcribe_chunk(audio_chunk, model):
    """Transcribe a single audio chunk and return its text.

    Args:
        audio_chunk: The audio for one chunk; wrapped in a one-element list
            and handed to ``model.transcribe``.
        model: Object exposing ``transcribe(audio, batch_size=..., verbose=...)``.

    Returns:
        The transcription text of the chunk as a string.
    """
    results = model.transcribe([audio_chunk], batch_size=1, verbose=False)

    # NOTE(review): depending on the NeMo release, transcribe() appears to
    # return hypothesis objects, plain strings, or (for RNNT models) a
    # (best_hypotheses, all_hypotheses) tuple -- accept all three shapes.
    if isinstance(results, tuple):
        results = results[0]

    first = results[0]
    return first if isinstance(first, str) else first.text
39
+
40
+
41
def transcribe_audio(file_path, model, chunk_size=30 * 16000):
    """Transcribe an audio file chunk by chunk and return the normalized text.

    Args:
        file_path: Audio file to transcribe; decoded with ``load_audio``.
        model: ASR model forwarded to ``transcribe_chunk`` for every chunk.
        chunk_size: Number of samples per chunk. The default equals 30 seconds
            at the 16000 Hz rate that ``load_audio`` resamples to.

    Returns:
        The chunk transcriptions joined with single spaces, with runs of
        spaces collapsed, passed through the module-level ``normalizer``.
    """
    samples, _frame_rate = load_audio(file_path)

    # Slicing past the end of the array is clamped automatically, so the last
    # chunk is simply shorter than chunk_size.
    pieces = [
        transcribe_chunk(samples[offset:offset + chunk_size], model)
        for offset in range(0, len(samples), chunk_size)
    ]

    merged = re.sub(' +', ' ', ' '.join(pieces))
    return normalizer.normalize(merged)
54
+
55
+
56
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    The upload is written to a uniquely named temporary file, transcribed with
    the module-level ``asr_model``, and the temporary file is removed before
    the response is returned.

    Args:
        file: The uploaded audio file.

    Returns:
        ``{"transcription": <text>}`` on success, or a ``JSONResponse`` with
        status 500 and ``{"error": <message>}`` if any step raises.
    """
    import tempfile

    temp_path = None
    try:
        # The client controls file.filename, so never use it as a path: a name
        # such as "../../x" would escape /tmp, and two uploads with the same
        # name would overwrite each other. Keep only the extension, in case
        # the audio decoder uses it as a format hint.
        suffix = os.path.splitext(os.path.basename(file.filename or ""))[1]
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            temp_path = tmp.name
            tmp.write(await file.read())

        result = transcribe_audio(temp_path, asr_model)
        return {"transcription": result}
    except Exception as e:
        return JSONResponse(status_code=500, content={"error": str(e)})
    finally:
        # Always delete the temporary copy so uploads do not accumulate on disk.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
67
+