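# WhisperCap: a Gradio app that extracts the audio track from an uploaded
# video, transcribes it in 10-second chunks with openai/whisper-large-v3, and
# can emit the result as a downloadable SRT subtitle file.
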
import os
import math
import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from moviepy.editor import VideoFileClip

def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
    # Validate the input before paying the cost of loading the model.
    if video_file is None:
        yield "Error: No video file provided.", None
        return

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model_id = "openai/whisper-large-v3"
    # The model is reloaded on every request; a long-running app could load it
    # once at module level instead.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=10,  # Shorter chunk length to prevent overly long captions
        batch_size=2,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )
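    # chunk_length_s lets the pipeline window long audio internally; the loop
    # below additionally splits the soundtrack into 10-second files so the UI
    # can report incremental progress.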

    video_path = video_file.name if hasattr(video_file, 'name') else video_file
    try:
        video = VideoFileClip(video_path)
    except Exception as e:
        yield f"Error processing video file: {str(e)}", None
        return

    audio = video.audio
    if audio is None:
        video.close()
        yield "Error: the video has no audio track to transcribe.", None
        return

    duration = video.duration
    n_chunks = math.ceil(duration / 10)  # Split into 10-second chunks
    transcription_txt = ""
    transcription_srt = []
    
    # Walk the soundtrack in 10-second windows: write each window to a
    # temporary WAV file, transcribe it, then yield a progress update.
    for i in range(n_chunks):
        start = i * 10
        end = min((i + 1) * 10, duration)
        audio_chunk = audio.subclip(start, end)
        
        temp_file_path = f"temp_audio_{i}.wav"
        audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
        
        # The ASR pipeline accepts a file path directly, so the temporary WAV
        # does not need to be opened here.
        result = pipe(temp_file_path, generate_kwargs={"language": language})
        transcription_txt += result["text"]
        if transcribe_to_srt:
            for chunk in result["chunks"]:
                start_time, end_time = chunk["timestamp"]
                if start_time is not None and end_time is not None:
                    # Timestamps are relative to this 10-second window, so
                    # rebase them onto the full-video timeline.
                    transcription_srt.append({
                        "start": start_time + i * 10,
                        "end": end_time + i * 10,
                        "text": chunk["text"]
                    })
                else:
                    print(f"Warning: missing timestamp for chunk: {chunk}")
        
        os.remove(temp_file_path)
        yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%", None

    video.close()  # Release moviepy's handles on the video and its audio.

    output = ""
    srt_file_path = None
    if transcribe_to_text:
        output += "Text Transcription:\n" + transcription_txt + "\n\n"
    if transcribe_to_srt:
        output += "SRT Transcription:\n"
        srt_content = ""
        for i, sub in enumerate(transcription_srt, 1):
            srt_entry = f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
            output += srt_entry
            srt_content += srt_entry
        
        srt_file_path = "transcription.srt"
        with open(srt_file_path, "w", encoding="utf-8") as srt_file:
            srt_file.write(srt_content)
        
        output += f"\nSRT file saved as: {srt_file_path}"
    
    yield output, srt_file_path

def format_time(seconds):
    # SRT timestamps use the form HH:MM:SS,mmm (comma as decimal separator).
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return f"{int(h):02d}:{int(m):02d}:{s:06.3f}".replace('.', ',')
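# Example: format_time(3661.5) -> "01:01:01,500"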

iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Video(),
        gr.Checkbox(label="Transcribe to Text"),
        gr.Checkbox(label="Transcribe to SRT"),
        gr.Dropdown(choices=['en', 'he', 'it', 'es', 'fr', 'de', 'zh', 'ar'], label="Language")
    ],
    outputs=[
        gr.Textbox(label="Transcription Output"),
        gr.File(label="Download SRT")
    ],
    title="WhisperCap Video Transcription",
    description="Upload a video file to transcribe its audio using Whisper. You can download the SRT file if generated.",
)
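
# transcribe is a generator, so Gradio streams each yielded (status, file)
# pair to the outputs while the job runs; share=True also serves the app
# through a temporary public link.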

iface.launch(share=True)