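# --------------------------------------------------------------------------
# Previous implementation (kept commented out for reference): Whisper-medium
# ASR combined with pyannote speaker diarization, which needs an HF_TOKEN and
# heavier dependencies.
# --------------------------------------------------------------------------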
# import torch
# import torchaudio
# from transformers import (
#     WhisperProcessor, 
#     WhisperForConditionalGeneration,
#     pipeline
# )
# from pyannote.audio import Pipeline
# import librosa
# import numpy as np
# from pydub import AudioSegment
# import tempfile
# import os  # needed for reading HF_TOKEN from the environment below

# class SpeechProcessor:
#     def __init__(self):
#         # Load Whisper for ASR
#         self.whisper_processor = WhisperProcessor.from_pretrained(
#             "openai/whisper-medium"
#         )
#         self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
#             "openai/whisper-medium"
#         )
        
#         # Load speaker diarization
#         try:
#             self.diarization_pipeline = Pipeline.from_pretrained(
#                 "pyannote/speaker-diarization-3.1",
#                 use_auth_token=os.environ.get("HF_TOKEN")  # HF token read from the environment
#             )
#         except Exception as e:
#             print(f"Warning: Could not load diarization model: {e}")
#             self.diarization_pipeline = None
    
#     def process_audio(self, audio_path, language="id"):
#         """
#         Process audio file for ASR and speaker diarization
#         """
#         # Convert to WAV if needed
#         audio_path = self._ensure_wav_format(audio_path)
        
#         # Load audio
#         waveform, sample_rate = torchaudio.load(audio_path)
        
#         # Speaker diarization
#         if self.diarization_pipeline:
#             try:
#                 diarization = self.diarization_pipeline(audio_path)
                
#                 # Process each speaker segment
#                 transcript_segments = []
                
#                 for turn, _, speaker in diarization.itertracks(yield_label=True):
#                     # Extract segment audio
#                     start_sample = int(turn.start * sample_rate)
#                     end_sample = int(turn.end * sample_rate)
#                     segment_waveform = waveform[:, start_sample:end_sample]
                    
#                     # ASR on segment
#                     text = self._transcribe_segment(
#                         segment_waveform, 
#                         sample_rate, 
#                         language
#                     )
                    
#                     transcript_segments.append({
#                         "start": round(turn.start, 2),
#                         "end": round(turn.end, 2),
#                         "speaker": speaker,
#                         "text": text
#                     })
                
#                 return self._merge_consecutive_segments(transcript_segments)
#             except Exception as e:
#                 print(f"Diarization failed, falling back to simple transcription: {e}")
        
#         # Fallback: simple transcription without diarization
#         return self._simple_transcription(waveform, sample_rate, language)
    
#     def _simple_transcription(self, waveform, sample_rate, language):
#         """Fallback transcription without speaker diarization"""
#         # Process in 30-second chunks
#         chunk_length = 30 * sample_rate
#         segments = []
        
#         for i in range(0, waveform.shape[1], chunk_length):
#             chunk = waveform[:, i:i + chunk_length]
#             text = self._transcribe_segment(chunk, sample_rate, language)
            
#             if text.strip():
#                 segments.append({
#                     "start": i / sample_rate,
#                     "end": min((i + chunk_length) / sample_rate, waveform.shape[1] / sample_rate),
#                     "speaker": "SPEAKER_01",
#                     "text": text
#                 })
        
#         return segments
    
#     def _transcribe_segment(self, waveform, sample_rate, language):
#         """
#         Transcribe an audio segment using Whisper
#         """
#         # Resample if needed
#         if sample_rate != 16000:
#             resampler = torchaudio.transforms.Resample(sample_rate, 16000)
#             waveform = resampler(waveform)
        
#         # Prepare input
#         input_features = self.whisper_processor(
#             waveform.squeeze().numpy(),
#             sampling_rate=16000,
#             return_tensors="pt"
#         ).input_features
        
#         # Generate transcription
#         forced_decoder_ids = self.whisper_processor.get_decoder_prompt_ids(
#             language=language,
#             task="transcribe"
#         )
        
#         predicted_ids = self.whisper_model.generate(
#             input_features,
#             forced_decoder_ids=forced_decoder_ids,
#             max_length=448
#         )
        
#         transcription = self.whisper_processor.batch_decode(
#             predicted_ids,
#             skip_special_tokens=True
#         )[0]
        
#         return transcription.strip()
    
#     def _ensure_wav_format(self, audio_path):
#         """
#         Convert audio to WAV format if needed
#         """
#         if not audio_path.endswith('.wav'):
#             audio = AudioSegment.from_file(audio_path)
#             wav_path = tempfile.mktemp(suffix='.wav')
#             audio.export(wav_path, format='wav')
#             return wav_path
#         return audio_path
    
#     def _merge_consecutive_segments(self, segments):
#         """
#         Merge consecutive segments from same speaker
#         """
#         if not segments:
#             return segments
            
#         merged = [segments[0]]
        
#         for current in segments[1:]:
#             last = merged[-1]
            
#             # Merge if same speaker and close in time
#             if (last['speaker'] == current['speaker'] and 
#                 current['start'] - last['end'] < 1.0):
#                 last['end'] = current['end']
#                 last['text'] += ' ' + current['text']
#             else:
#                 merged.append(current)
        
#         return merged
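
# --- Active implementation ---
# ASR-only pipeline built on Whisper-small (a lighter model chosen to fit HF Spaces);
# the diarization-enabled version above is kept commented out for reference.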


import torch
import torchaudio
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration
)
from pydub import AudioSegment
import tempfile
import os

class SpeechProcessor:
    def __init__(self):
        # Load Whisper for ASR
        print("Loading Whisper model...")
        self.whisper_processor = WhisperProcessor.from_pretrained(
            "openai/whisper-small"  # Use small for HF Spaces
        )
        self.whisper_model = WhisperForConditionalGeneration.from_pretrained(
            "openai/whisper-small"
        )
        
        # No diarization in this version
        self.diarization_pipeline = None
        print("Speech processor initialized (without speaker diarization)")
    
    def process_audio(self, audio_path, language="id"):
        """
        Process audio file for ASR (without speaker diarization)
        """
        # Convert to WAV if needed
        audio_path = self._ensure_wav_format(audio_path)
        
        # Load audio
        waveform, sample_rate = torchaudio.load(audio_path)
        
        # Process audio in chunks
        return self._process_audio_chunks(waveform, sample_rate, language)
    
    def _process_audio_chunks(self, waveform, sample_rate, language):
        """Process audio in manageable chunks"""
        chunk_length = 30 * sample_rate  # 30-second chunks
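        # Whisper's feature extractor operates on windows of up to 30 seconds,
        # so longer recordings are transcribed window by window.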
        segments = []
        
        total_chunks = (waveform.shape[1] + chunk_length - 1) // chunk_length
        
        for i in range(0, waveform.shape[1], chunk_length):
            chunk_num = i // chunk_length + 1
            print(f"Processing chunk {chunk_num}/{total_chunks}...")
            
            chunk = waveform[:, i:i + chunk_length]
            
            # Skip very short chunks
            if chunk.shape[1] < sample_rate * 0.5:
                continue
            
            text = self._transcribe_segment(chunk, sample_rate, language)
            
            if text.strip():
                segments.append({
                    "start": round(i / sample_rate, 2),
                    "end": round(min((i + chunk_length) / sample_rate, 
                                   waveform.shape[1] / sample_rate), 2),
                    "speaker": "SPEAKER_01",
                    "text": text
                })
        
        return segments
    
    def _transcribe_segment(self, waveform, sample_rate, language):
        """
        Transcribe audio segment using Whisper
        """
        # Resample if needed
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
        
        # Whisper expects mono audio; average the channels if the clip is stereo
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        
        # Prepare input
        input_features = self.whisper_processor(
            waveform.squeeze(0).numpy(),
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features
        
        # Generate transcription
        forced_decoder_ids = self.whisper_processor.get_decoder_prompt_ids(
            language=language,
            task="transcribe"
        )
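        # Note: recent transformers releases also accept language/task directly as
        # generate() keyword arguments; forced_decoder_ids is the older, widely used path.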
        
        with torch.no_grad():
            predicted_ids = self.whisper_model.generate(
                input_features,
                forced_decoder_ids=forced_decoder_ids,
                max_length=448
            )
        
        transcription = self.whisper_processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]
        
        return transcription.strip()
    
    def _ensure_wav_format(self, audio_path):
        """
        Convert audio to WAV format if needed
        """
        if not audio_path.lower().endswith('.wav'):
            print("Converting audio to WAV format...")
            audio = AudioSegment.from_file(audio_path)
            # mkstemp avoids the race condition of the deprecated tempfile.mktemp
            fd, wav_path = tempfile.mkstemp(suffix='.wav')
            os.close(fd)
            audio.export(wav_path, format='wav')
            return wav_path
        return audio_path
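

# ------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original module):
# how this class might be driven from a script. The path "meeting.wav"
# and the printed output format are assumptions for demonstration.
# ------------------------------------------------------------------
if __name__ == "__main__":
    processor = SpeechProcessor()
    segments = processor.process_audio("meeting.wav", language="id")
    for seg in segments:
        print(f"[{seg['start']:7.2f}s - {seg['end']:7.2f}s] {seg['speaker']}: {seg['text']}")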