asr-inference

Running on Zero

App Files Files Community

ssolito committed 9 days ago

Commit

0373cec

verified ·

1 Parent(s): 038bca7

Update whisper_cs.py

Browse files

Files changed (1) hide show

whisper_cs.py +38 -102

whisper_cs.py CHANGED Viewed

@@ -4,17 +4,15 @@ import os
 import torchaudio
 import torch
 import re
-from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor, GenerationConfig
-from pyannote.audio import Pipeline as DiarizationPipeline
-import whisperx
 import whisper_timestamped as whisper_ts
 from typing import Dict
 device = 0 if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float32
-MODEL_PATH_1 = "projecte-aina/whisper-large-v3-tiny-caesar"
-MODEL_PATH_2 = "langtech-veu/whisper-timestamped-cs"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 def clean_text(input_text):
@@ -42,19 +40,6 @@ def split_stereo_channels(audio_path):
     channels[1].export(f"temp_mono_speaker2.wav", format="wav")  # Left
-def convert_to_mono(input_path):
-    audio = AudioSegment.from_file(input_path)
-    base, ext = os.path.splitext(input_path)
-    output_path = f"{base}_merged.wav"
-    print('output_path',output_path)
-    mono = audio.set_channels(1)
-    mono.export(output_path, format="wav")
-    return output_path
-def save_temp_audio(waveform, sample_rate, path):
-    waveform = waveform.unsqueeze(0) if waveform.dim() == 1 else waveform
-    torchaudio.save(path, waveform, sample_rate)
 def format_audio(audio_path):
     input_audio, sample_rate = torchaudio.load(audio_path)
     if input_audio.shape[0] == 2:
@@ -63,52 +48,6 @@ def format_audio(audio_path):
     input_audio = resampler(input_audio)
     print('resampled')
     return input_audio.squeeze(), 16000
-def assign_timestamps(asr_segments, audio_path):
-    waveform, sr = format_audio(audio_path)
-    total_duration = waveform.shape[-1] / sr
-    total_words = sum(len(seg["text"].split()) for seg in asr_segments)
-    if total_words == 0:
-        raise ValueError("Total number of words in ASR segments is zero. Cannot assign timestamps.")
-    avg_word_duration = total_duration / total_words
-    current_time = 0.0
-    for segment in asr_segments:
-        word_count = len(segment["text"].split())
-        segment_duration = word_count * avg_word_duration
-        segment["start"] = round(current_time, 3)
-        segment["end"] = round(current_time + segment_duration, 3)
-        current_time += segment_duration
-    return asr_segments
-def hf_chunks_to_whisperx_segments(chunks):
-    return [
-        {
-            "text": chunk["text"],
-            "start": chunk["timestamp"][0],
-            "end": chunk["timestamp"][1],
-        }
-        for chunk in chunks
-        if chunk["timestamp"] and isinstance(chunk["timestamp"], (list, tuple))
-    ]
-def align_words_to_segments(words, segments, window=5.0):
-    aligned = []
-    seg_idx = 0
-    for word in words:
-        while seg_idx < len(segments) and segments[seg_idx]["end"] < word["start"] - window:
-            seg_idx += 1
-        for j in range(seg_idx, len(segments)):
-            seg = segments[j]
-            if seg["start"] > word["end"] + window:
-                break
-            if seg["start"] <= word["start"] < seg["end"]:
-                aligned.append((word, seg))
-                break
-    return aligned
 def post_process_transcription(transcription, max_repeats=2):
     tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
@@ -166,7 +105,7 @@ def cleanup_temp_files(*file_paths):
         if path and os.path.exists(path):
             os.remove(path)
 def load_whisper_model(model_path: str):
     device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -241,33 +180,47 @@ def asr(audio_path):
     asr_segments = assign_timestamps(asr_segments, audio_path)
     return asr_segments
-def align_asr_to_diarization(asr_segments, diarized_segments, audio_path):
-    waveform, sample_rate = format_audio(audio_path)
-    word_segments = whisperx.align(asr_segments, align_model, metadata, waveform, DEVICE)
-    words = word_segments['word_segments']
-    diarized = [{"start": segment.start,"end": segment.end,"speaker": speaker} for segment, _, speaker in diarized_segments]
-    aligned_pairs = align_words_to_segments(words, diarized)
-    output = []
-    segment_map = {}
-    for word, segment in aligned_pairs:
-        key = (segment["start"], segment["end"], segment["speaker"])
-        if key not in segment_map:
-            segment_map[key] = []
-        segment_map[key].append(word["word"])
-    for (start, end, speaker), words in sorted(segment_map.items()):
-        output.append(f"[{speaker}] {' '.join(words)}")
-    return output
-def generate(audio_path, use_v2):
-    if use_v2:
-        model = load_whisper_model(MODEL_PATH_2)
         split_stereo_channels(audio_path)
         left_channel_path = "temp_mono_speaker2.wav"
@@ -300,23 +253,6 @@ def generate(audio_path, use_v2):
             output += f"[{speaker}]: {text}\n"
         clean_output = output.strip()
-    else:
-        mono_audio_path = convert_to_mono(audio_path)
-        waveform, sr = format_audio(mono_audio_path)
-        tmp_full_path = "tmp_full.wav"
-        save_temp_audio(waveform, sr, tmp_full_path)
-        diarized_segments = diarization(tmp_full_path)
-        asr_segments = asr(tmp_full_path)
-        for segment in asr_segments:
-            segment["text"] = post_process_transcription(segment["text"])
-        aligned_text = align_asr_to_diarization(asr_segments, diarized_segments, tmp_full_path)
-        clean_output = ""
-        for line in aligned_text:
-            clean_output += f"{line}\n"
-        clean_output = post_merge_consecutive_segments_from_text(clean_output)
-        cleanup_temp_files(mono_audio_path,tmp_full_path)
     cleanup_temp_files(
         "temp_mono_speaker1.wav",

 import torchaudio
 import torch
 import re
 import whisper_timestamped as whisper_ts
 from typing import Dict
+from faster_whisper import WhisperModel
 device = 0 if torch.cuda.is_available() else "cpu"
 torch_dtype = torch.float32
+MODEL_PATH_V2 = "langtech-veu/whisper-timestamped-cs"
+MODEL_PATH_V2_FAST = "langtech-veu/faster-whisper-timestamped-cs"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 def clean_text(input_text):
     channels[1].export(f"temp_mono_speaker2.wav", format="wav")  # Left
 def format_audio(audio_path):
     input_audio, sample_rate = torchaudio.load(audio_path)
     if input_audio.shape[0] == 2:
     input_audio = resampler(input_audio)
     print('resampled')
     return input_audio.squeeze(), 16000
 def post_process_transcription(transcription, max_repeats=2):
     tokens = re.findall(r'\b\w+\'?\w*\b[.,!?]?', transcription)
         if path and os.path.exists(path):
             os.remove(path)
+faster_model = WhisperModel(MODEL_PATH_V2_FAST, device=DEVICE, compute_type="int8")
 def load_whisper_model(model_path: str):
     device = "cuda" if torch.cuda.is_available() else "cpu"
     asr_segments = assign_timestamps(asr_segments, audio_path)
     return asr_segments
+def generate(audio_path, use_v2_fast):
+    if use_v2_fast:
+        left_channel_path = "temp_mono_speaker2.wav"
+        right_channel_path = "temp_mono_speaker1.wav"
+        left_waveform, left_sr = format_audio(left_channel_path)
+        right_waveform, right_sr = format_audio(right_channel_path)
+        left_waveform = left_waveform.numpy().astype("float32")
+        right_waveform = right_waveform.numpy().astype("float32")
+        left_result, info = faster_model.transcribe(left_waveform, beam_size=5, task="transcribe")
+        right_result, info = faster_model.transcribe(right_waveform, beam_size=5, task="transcribe")
+        left_result = list(left_result)
+        right_result = list(right_result)
+        def get_faster_segments(segments, speaker_label):
+            return [
+                (seg.start, seg.end, speaker_label, post_process_transcription(seg.text.strip()))
+                for seg in segments if seg.text
+            ]
+        left_segs = get_faster_segments(left_result, "Speaker 1")
+        right_segs = get_faster_segments(right_result, "Speaker 2")
+        merged_transcript = sorted(
+            left_segs + right_segs,
+            key=lambda x: float(x[0]) if x[0] is not None else float("inf")
+        )
+        clean_output = ""
+        for start, end, speaker, text in merged_transcript:
+            clean_output += f"[{speaker}]: {text}\n"
+        clean_output = post_merge_consecutive_segments_from_text(clean_output)
+    else:
+        model = load_whisper_model(MODEL_PATH_V2)
         split_stereo_channels(audio_path)
         left_channel_path = "temp_mono_speaker2.wav"
             output += f"[{speaker}]: {text}\n"
         clean_output = output.strip()
     cleanup_temp_files(
         "temp_mono_speaker1.wav",