Michael Natanael committed on
Commit
d9cc6c9
·
1 Parent(s): c44e2e3

Change transcription mechanism when uploading audio

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -1
  2. app.py +27 -41
  3. requirements.txt +0 -1
Dockerfile CHANGED
@@ -29,4 +29,4 @@ RUN pip install --no-cache-dir --upgrade -r requirements.txt
29
 
30
  COPY --chown=user . /app
31
  # CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
32
- CMD ["gunicorn", "--timeout", "120", "--workers", "2", "-b", "0.0.0.0:7860", "app:app"]
 
29
 
30
  COPY --chown=user . /app
31
  # CMD ["gunicorn", "-b", "0.0.0.0:7860", "app:app"]
32
+ CMD ["gunicorn", "--timeout", "120", "-b", "0.0.0.0:7860", "app:app"]
app.py CHANGED
@@ -9,7 +9,6 @@ import requests
9
  from tqdm import tqdm
10
  from transformers import BertTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
11
  from model.multi_class_model import MultiClassModel # Adjust if needed
12
- from optimum.intel import OVModelForSpeechSeq2Seq
13
 
14
  app = Flask(__name__)
15
 
@@ -50,48 +49,35 @@ model = MultiClassModel.load_from_checkpoint(
50
  )
51
  model.eval()
52
 
53
- # === OPENVINO WHISPER INIT ===
54
- def init_whisper_openvino():
55
- device = "cpu" # Force CPU for OpenVINO
56
- model_id = "openai/whisper-large-v3"
57
-
58
- # Load OpenVINO-optimized model
59
- ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
60
- model_id,
61
- export=True, # Auto-convert to OpenVINO format
62
- compile=False,
63
- trust_remote_code=True,
64
- )
65
-
66
- # Compile for specific input shapes
67
- ov_model.compile(use_auto=True)
68
-
69
- processor = AutoProcessor.from_pretrained(model_id)
70
-
71
- return pipeline(
72
- "automatic-speech-recognition",
73
- model=ov_model,
74
- feature_extractor=processor.feature_extractor,
75
- tokenizer=processor.tokenizer,
76
- max_new_tokens=128,
77
- chunk_length_s=30,
78
- batch_size=1, # Optimized for OpenVINO on CPU
79
- device=device,
80
- torch_dtype="float32",
81
- )
82
-
83
- # Initialize once at startup
84
- whisper_pipe = init_whisper_openvino()
85
 
86
  def whisper_api(temp_audio_path):
87
- result = whisper_pipe(
88
- temp_audio_path,
89
- return_timestamps=False,
90
- generate_kwargs={
91
- "language": "indonesian",
92
- "task": "transcribe",
93
- }
94
- )
95
  return result
96
 
97
 
 
9
  from tqdm import tqdm
10
  from transformers import BertTokenizer, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
11
  from model.multi_class_model import MultiClassModel # Adjust if needed
 
12
 
13
  app = Flask(__name__)
14
 
 
49
  )
50
  model.eval()
51
 
52
+ # === INITIAL SETUP: Whisper Pipeline ===
53
+ # https://huggingface.co/openai/whisper-large-v3
54
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
55
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
56
+
57
+ model_id = "openai/whisper-large-v3"
58
+
59
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
60
+ model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
61
+ )
62
+ whisper_model.to(device)
63
+
64
+ processor = AutoProcessor.from_pretrained(model_id)
65
+
66
+ pipe = pipeline(
67
+ "automatic-speech-recognition",
68
+ model=whisper_model,
69
+ tokenizer=processor.tokenizer,
70
+ feature_extractor=processor.feature_extractor,
71
+ chunk_length_s=10,
72
+ batch_size=4, # batch size for inference - set based on your device
73
+ torch_dtype=torch_dtype,
74
+ device=device,
75
+ )
76
+
 
 
 
 
 
 
 
77
 
78
  def whisper_api(temp_audio_path):
79
+ result = pipe(temp_audio_path, return_timestamps=False, generate_kwargs={"language": "indonesian"})
80
+ print(result["text"])
 
 
 
 
 
 
81
  return result
82
 
83
 
requirements.txt CHANGED
@@ -12,7 +12,6 @@ setuptools-rust
12
  # ffmpeg
13
  # ffmpeg-python
14
  # imageio[ffmpeg]
15
- torchaudio
16
  accelerate
17
  pytorch-lightning==2.2.1
18
  lightning==2.4.0
 
12
  # ffmpeg
13
  # ffmpeg-python
14
  # imageio[ffmpeg]
 
15
  accelerate
16
  pytorch-lightning==2.2.1
17
  lightning==2.4.0