|
|
|
import os, tempfile, subprocess, gradio as gr |
|
from dotenv import load_dotenv |
|
import whisper |
|
import pvfalcon |
|
|
|
|
|
|
|
|
|
# Merge any variables defined in a local .env file into the process environment
# before the access key is looked up.
load_dotenv()

# The key is handed to pvfalcon.create() below; abort at import time with an
# actionable message when it is unset or empty.
FALCON_ACCESS_KEY = os.environ.get("FALCON_ACCESS_KEY")
if FALCON_ACCESS_KEY in (None, ""):
    raise RuntimeError(
        "Set FALCON_ACCESS_KEY in your environment or .env file "
        "(get one free at https://console.picovoice.ai)."
    )

# Both engines are created once at module level and shared by every call to
# process_video().
whisper_model = whisper.load_model(name="base")
falcon = pvfalcon.create(access_key=FALCON_ACCESS_KEY)
|
|
|
|
|
|
|
|
|
def _extract_audio(src_path, wav_path):
    """Transcode *src_path* into a 16 kHz mono 16-bit PCM WAV at *wav_path*.

    Returns True only when ffmpeg could be started, exited with status 0 and
    left a non-empty output file; returns False otherwise.
    """
    command = [
        "ffmpeg", "-y", "-i", src_path,
        "-ar", "16000", "-ac", "1", "-acodec", "pcm_s16le", wav_path,
    ]
    try:
        completed = subprocess.run(
            command,
            # Detach stdin so ffmpeg never tries to read interactive input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # ffmpeg is not installed / not executable / could not be spawned.
        return False
    if completed.returncode != 0:
        return False
    return os.path.exists(wav_path) and os.path.getsize(wav_path) > 0


def _label_speakers(segments):
    """Turn diarization segments into dicts with "Speaker N" labels.

    Each segment must expose ``speaker_tag``, ``start_sec`` and ``end_sec``.
    Labels are numbered from 1 in order of first appearance of each tag.
    """
    labels = {}
    timeline = []
    for seg in segments:
        # The default is computed before insertion, so a new tag gets the next
        # free number and a known tag keeps its existing label.
        label = labels.setdefault(seg.speaker_tag, f"Speaker {len(labels) + 1}")
        timeline.append(dict(start=seg.start_sec, end=seg.end_sec, speaker=label))
    return timeline


def _speaker_at(timeline, moment):
    """Return the label of the first span containing *moment*, else "Unknown"."""
    for span in timeline:
        if span["start"] <= moment <= span["end"]:
            return span["speaker"]
    return "Unknown"


def process_video(file, language="Auto"):
    """Transcribe an uploaded media file and attribute each line to a speaker.

    Args:
        file: The upload to process. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object exposing the path as ``.name``.
            ``None`` means nothing was uploaded.
        language: "Auto" (or an empty/``None`` value) to let Whisper detect the
            language; otherwise a language name that is lower-cased and
            forwarded to Whisper.

    Returns:
        A ``(speaker_transcript, paragraph_transcript)`` tuple of strings.
        ``speaker_transcript`` has one "<speaker>: <text>" line per Whisper
        segment; ``paragraph_transcript`` is Whisper's full text. On failure
        the first element is an error message and the second is "".
    """
    if file is None:
        return "Please upload a video file.", ""

    # The UI may hand over a plain path or a file-like wrapper; accept both.
    if isinstance(file, (str, os.PathLike)):
        src_path = os.fspath(file)
    else:
        src_path = file.name

    lang_code = None if not language or language == "Auto" else language.lower()

    # Only the unique name is needed; close the handle right away so ffmpeg can
    # overwrite the file on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav:
        wav_path = wav.name

    try:
        if not _extract_audio(src_path, wav_path):
            return "Audio extraction failed.", ""

        # Speaker diarization: who is talking during which time span.
        timeline = _label_speakers(falcon.process_file(wav_path))

        # Speech-to-text over the same audio.
        res = whisper_model.transcribe(wav_path, language=lang_code)
        paragraph_transcript = res["text"]

        # A transcript segment is attributed to whichever diarization span
        # contains its start time.
        speaker_lines = [
            f"{_speaker_at(timeline, s['start'])}: {s['text']}"
            for s in res.get("segments", [])
        ]
        speaker_transcript = "\n".join(speaker_lines)

        return speaker_transcript, paragraph_transcript
    finally:
        # delete=False above means the temp file must be removed explicitly,
        # on success and on every failure path alike.
        try:
            os.remove(wav_path)
        except FileNotFoundError:
            pass
|
|
|
|
|
|
|
|
|
# Web UI definition: process_video receives the uploaded file and the chosen
# language, and its two return values fill the two text boxes in order.
demo = gr.Interface(
    title="Transcription + Speaker Segmentation",
    description="Whisper + Picovoice Falcon running fully on CPU.",
    fn=process_video,
    inputs=[
        gr.File(type="filepath", label="Upload Video"),
        gr.Dropdown(
            choices=["Auto", "English", "Hindi", "Urdu"],
            label="Language",
        ),
    ],
    outputs=[
        gr.Textbox(show_copy_button=True, label="Speaker-wise Transcript"),
        gr.Textbox(show_copy_button=True, label=" Transcription"),
    ],
)


# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()
|
|