import { useRef, useState, useEffect } from "react"; interface AlignedSegment { duration: number; end: number; start: number; text: string; } interface TranscriptionResponse { aligned_segments: AlignedSegment[]; alignment_available: boolean; device: string; model: string; num_segments: number; status: string; total_duration: number; transcription: string; } const SCROLL_INTERVAL = 5; const USE_MOCK_DATA = false; // Helper function to encode audio buffer as WAV const encodeWAV = (audioBuffer: AudioBuffer): ArrayBuffer => { const length = audioBuffer.length; const numberOfChannels = audioBuffer.numberOfChannels; const sampleRate = audioBuffer.sampleRate; const arrayBuffer = new ArrayBuffer(44 + length * numberOfChannels * 2); const view = new DataView(arrayBuffer); // WAV header const writeString = (offset: number, string: string) => { for (let i = 0; i < string.length; i++) { view.setUint8(offset + i, string.charCodeAt(i)); } }; writeString(0, "RIFF"); view.setUint32(4, 36 + length * numberOfChannels * 2, true); writeString(8, "WAVE"); writeString(12, "fmt "); view.setUint32(16, 16, true); view.setUint16(20, 1, true); view.setUint16(22, numberOfChannels, true); view.setUint32(24, sampleRate, true); view.setUint32(28, sampleRate * numberOfChannels * 2, true); view.setUint16(32, numberOfChannels * 2, true); view.setUint16(34, 16, true); writeString(36, "data"); view.setUint32(40, length * numberOfChannels * 2, true); // Convert float32 audio data to int16 const channels = []; for (let i = 0; i < numberOfChannels; i++) { channels.push(audioBuffer.getChannelData(i)); } let offset = 44; for (let i = 0; i < length; i++) { for (let channel = 0; channel < numberOfChannels; channel++) { const sample = Math.max(-1, Math.min(1, channels[channel][i])); view.setInt16( offset, sample < 0 ? sample * 0x8000 : sample * 0x7fff, true, ); offset += 2; } } return arrayBuffer; }; // Audio extraction function using Web Audio API with WAV output const extractAudioFromVideo = async (videoFile: File): Promise => { console.log( "Extracting audio from video using Web Audio API (WAV format):", videoFile.name, ); try { // Create a video element to load the video file const video = document.createElement("video"); const videoUrl = URL.createObjectURL(videoFile); video.src = videoUrl; video.crossOrigin = "anonymous"; // Wait for video metadata to load await new Promise((resolve, reject) => { video.onloadedmetadata = resolve; video.onerror = reject; video.load(); }); // Create audio context const audioContext = new (window.AudioContext || (window as any).webkitAudioContext)(); // Create buffer to store audio data const source = audioContext.createMediaElementSource(video); // Create a script processor to capture audio data const processor = audioContext.createScriptProcessor(4096, 2, 2); const audioData: number[][] = [[], []]; let isRecording = false; processor.onaudioprocess = (event) => { if (!isRecording) return; const inputBuffer = event.inputBuffer; const leftChannel = inputBuffer.getChannelData(0); const rightChannel = inputBuffer.numberOfChannels > 1 ? inputBuffer.getChannelData(1) : leftChannel; // Append audio data by converting to arrays and concatenating audioData[0] = audioData[0].concat(Array.from(leftChannel)); audioData[1] = audioData[1].concat(Array.from(rightChannel)); }; // Connect audio processing chain source.connect(processor); processor.connect(audioContext.destination); return new Promise((resolve, reject) => { // Start recording when video plays video.onplay = () => { isRecording = true; }; video.onended = () => { isRecording = false; // Create audio buffer from captured data const audioBuffer = audioContext.createBuffer( 2, audioData[0].length, audioContext.sampleRate, ); // Convert number arrays to Float32Array and copy to channels const leftChannelData = new Float32Array(audioData[0]); const rightChannelData = new Float32Array(audioData[1]); audioBuffer.copyToChannel(leftChannelData, 0); audioBuffer.copyToChannel(rightChannelData, 1); // Encode as WAV const wavArrayBuffer = encodeWAV(audioBuffer); const audioBlob = new Blob([wavArrayBuffer], { type: "audio/wav" }); const audioFile = new File( [audioBlob], videoFile.name.replace(/\.[^/.]+$/, ".wav"), { type: "audio/wav" }, ); // Clean up URL.revokeObjectURL(videoUrl); processor.disconnect(); source.disconnect(); audioContext.close(); resolve(audioFile); }; video.onerror = (error) => { console.error("Video error:", error); URL.revokeObjectURL(videoUrl); processor.disconnect(); source.disconnect(); audioContext.close(); reject(new Error("Failed to process video")); }; // Start playing the video video.currentTime = 0; video.play().catch(reject); // Set a timeout as fallback setTimeout( () => { if (isRecording) { video.pause(); video.dispatchEvent(new Event("ended")); } }, (video.duration + 2) * 1000, ); }); } catch (error) { console.error("Error extracting audio from video:", error); throw new Error( `Failed to extract audio: ${error instanceof Error ? error.message : "Unknown error"}`, ); } }; // Generate SRT subtitle format const generateSRT = (segments: AlignedSegment[]): string => { let srt = ""; segments.forEach((segment, index) => { const startTime = formatTimeForSRT(segment.start); const endTime = formatTimeForSRT(segment.end); srt += `${index + 1}\n`; srt += `${startTime} --> ${endTime}\n`; srt += `${segment.text}\n\n`; }); return srt; }; // Generate WebVTT subtitle format const generateWebVTT = (segments: AlignedSegment[]): string => { let vtt = "WEBVTT\n\n"; segments.forEach((segment, index) => { const startTime = formatTimeForVTT(segment.start); const endTime = formatTimeForVTT(segment.end); vtt += `${index + 1}\n`; vtt += `${startTime} --> ${endTime}\n`; vtt += `${segment.text}\n\n`; }); return vtt; }; // Format time for SRT (HH:MM:SS,mmm) const formatTimeForSRT = (seconds: number): string => { const hours = Math.floor(seconds / 3600); const minutes = Math.floor((seconds % 3600) / 60); const secs = Math.floor(seconds % 60); const milliseconds = Math.floor((seconds % 1) * 1000); return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")},${milliseconds.toString().padStart(3, "0")}`; }; // Format time for WebVTT (HH:MM:SS.mmm) const formatTimeForVTT = (seconds: number): string => { const hours = Math.floor(seconds / 3600); const minutes = Math.floor((seconds % 3600) / 60); const secs = Math.floor(seconds % 60); const milliseconds = Math.floor((seconds % 1) * 1000); return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")}.${milliseconds.toString().padStart(3, "0")}`; }; // Download subtitle file const downloadSubtitles = (content: string, filename: string) => { const blob = new Blob([content], { type: "text/plain;charset=utf-8" }); const url = URL.createObjectURL(blob); const link = document.createElement("a"); link.href = url; link.download = filename; document.body.appendChild(link); link.click(); document.body.removeChild(link); URL.revokeObjectURL(url); }; // Download video with embedded subtitles const downloadVideoWithSubtitles = async ( videoFile: File, subtitleContent: string, filename: string, ) => { try { // Create a canvas to render the video with subtitles const video = document.createElement("video"); const canvas = document.createElement("canvas"); const ctx = canvas.getContext("2d")!; // Load the video const videoUrl = URL.createObjectURL(videoFile); video.src = videoUrl; video.crossOrigin = "anonymous"; await new Promise((resolve, reject) => { video.onloadedmetadata = resolve; video.onerror = reject; video.load(); }); // Set canvas dimensions to match video canvas.width = video.videoWidth; canvas.height = video.videoHeight; // Parse WebVTT subtitles const subtitleLines = subtitleContent.split("\n"); const subtitles: Array<{ start: number; end: number; text: string }> = []; for (let i = 0; i < subtitleLines.length; i++) { const line = subtitleLines[i].trim(); if (line.includes(" --> ")) { const [startStr, endStr] = line.split(" --> "); const start = parseVTTTime(startStr); const end = parseVTTTime(endStr); const text = subtitleLines[i + 1]?.trim() || ""; if (text) { subtitles.push({ start, end, text }); } } } // Create MediaRecorder to capture the canvas const stream = canvas.captureStream(30); // 30 FPS // Add audio from original video const audioContext = new AudioContext(); const source = audioContext.createMediaElementSource(video); const dest = audioContext.createMediaStreamDestination(); source.connect(dest); // Combine video and audio streams const audioTrack = dest.stream.getAudioTracks()[0]; if (audioTrack) { stream.addTrack(audioTrack); } const mediaRecorder = new MediaRecorder(stream, { mimeType: "video/webm;codecs=vp8,opus", }); const chunks: Blob[] = []; mediaRecorder.ondataavailable = (event) => { if (event.data.size > 0) { chunks.push(event.data); } }; mediaRecorder.onstop = () => { const blob = new Blob(chunks, { type: "video/webm" }); const url = URL.createObjectURL(blob); const link = document.createElement("a"); link.href = url; link.download = filename; document.body.appendChild(link); link.click(); document.body.removeChild(link); URL.revokeObjectURL(url); URL.revokeObjectURL(videoUrl); audioContext.close(); }; // Start recording mediaRecorder.start(); // Play video and render frames with subtitles const renderFrame = () => { if (video.ended) { mediaRecorder.stop(); return; } // Draw video frame ctx.drawImage(video, 0, 0, canvas.width, canvas.height); // Find current subtitle const currentTime = video.currentTime; const currentSubtitle = subtitles.find( (sub) => currentTime >= sub.start && currentTime <= sub.end, ); // Draw subtitle if exists if (currentSubtitle) { ctx.fillStyle = "rgba(0, 0, 0, 0.7)"; ctx.font = `${Math.max(16, canvas.height / 25)}px Arial`; ctx.textAlign = "center"; ctx.textBaseline = "bottom"; const textWidth = ctx.measureText(currentSubtitle.text).width; const padding = 10; const textX = canvas.width / 2; const textY = canvas.height - 30; // Draw background rectangle ctx.fillRect( textX - textWidth / 2 - padding, textY - parseInt(ctx.font) - padding, textWidth + padding * 2, parseInt(ctx.font) + padding * 2, ); // Draw text ctx.fillStyle = "white"; ctx.fillText(currentSubtitle.text, textX, textY); } requestAnimationFrame(renderFrame); }; // Start playback and rendering video.play(); renderFrame(); } catch (error) { console.error("Error creating video with subtitles:", error); throw new Error("Failed to create video with subtitles"); } }; // Helper function to parse VTT time format const parseVTTTime = (timeStr: string): number => { const parts = timeStr.split(":"); const seconds = parts[parts.length - 1].split("."); return ( parseInt(parts[0]) * 3600 + // hours parseInt(parts[1]) * 60 + // minutes parseInt(seconds[0]) + // seconds parseInt(seconds[1] || "0") / 1000 // milliseconds ); }; interface TranscriptionPlayerProps {} export default function TranscriptionPlayer({}: TranscriptionPlayerProps) { const [file, setFile] = useState(null); const [transcription, setTranscription] = useState(null); const [isLoading, setIsLoading] = useState(false); const [error, setError] = useState(null); const [activeSegmentIndex, setActiveSegmentIndex] = useState( null, ); const [mediaUrl, setMediaUrl] = useState(null); const [currentTime, setCurrentTime] = useState(0); const [lastScrollTime, setLastScrollTime] = useState(0); const [isProcessingVideo, setIsProcessingVideo] = useState(false); const [isDownloadingVideo, setIsDownloadingVideo] = useState(false); const audioRef = useRef(null); const videoRef = useRef(null); const fileInputRef = useRef(null); const activeSegmentRef = useRef(null); const apiEndpoint = import.meta.env.VITE_TRANSCRIPTION_API_URL || "/transcribe"; const handleFileSelect = (event: React.ChangeEvent) => { const selectedFile = event.target.files?.[0]; if (selectedFile) { // Clean up previous URL if it exists if (mediaUrl) { URL.revokeObjectURL(mediaUrl); } // Create new URL for the selected file const newMediaUrl = URL.createObjectURL(selectedFile); setFile(selectedFile); setMediaUrl(newMediaUrl); setTranscription(null); setError(null); setActiveSegmentIndex(null); } }; const handleTranscribe = async () => { if (!file) return; setIsLoading(true); setError(null); try { let audioFileToProcess = file; // If it's a video file, extract audio first if (file.type.startsWith("video/")) { setIsProcessingVideo(true); try { console.log("Processing video file for audio extraction..."); audioFileToProcess = await extractAudioFromVideo(file); console.log("Audio extraction completed:", audioFileToProcess.name); } catch (videoError) { console.warn( "Video processing failed, using original file:", videoError, ); // Fallback to sending the video file directly audioFileToProcess = file; } finally { setIsProcessingVideo(false); } } let result: TranscriptionResponse = { transcription: "the birch canoe slid on the smooth plants glue the sheet to the dark blue background it is easy to tell the depth of a well", aligned_segments: [ { text: "the", start: 0.0, end: 0.62124248496994, duration: 0.62124248496994, }, { text: "birch", start: 0.7214428857715431, end: 1.122244488977956, duration: 0.4008016032064128, }, { text: "canoe", start: 1.2024048096192383, end: 1.743486973947896, duration: 0.5410821643286576, }, { text: "slid", start: 1.8236472945891784, end: 2.084168336673347, duration: 0.2605210420841686, }, { text: "on", start: 2.1442885771543088, end: 2.284569138276553, duration: 0.1402805611222444, }, { text: "the", start: 2.364729458917836, end: 2.5450901803607215, duration: 0.18036072144288573, }, { text: "smooth", start: 2.625250501002004, end: 3.687374749498998, duration: 1.062124248496994, }, { text: "plants", start: 4.328657314629258, end: 4.749498997995992, duration: 0.4208416833667332, }, { text: "glue", start: 4.829659318637275, end: 5.01002004008016, duration: 0.18036072144288529, }, { text: "the", start: 5.070140280561122, end: 5.170340681362725, duration: 0.10020040080160264, }, { text: "sheet", start: 5.2304609218436875, end: 5.591182364729459, duration: 0.36072144288577146, }, { text: "to", start: 5.631262525050101, end: 5.771543086172345, duration: 0.1402805611222444, }, { text: "the", start: 5.851703406813627, end: 6.012024048096193, duration: 0.16032064128256618, }, { text: "dark", start: 6.072144288577155, end: 6.332665330661323, duration: 0.26052104208416793, }, { text: "blue", start: 7.114228456913828, end: 8.056112224448897, duration: 0.9418837675350691, }, { text: "background", start: 8.136272545090181, end: 8.737474949899799, duration: 0.6012024048096176, }, { text: "it", start: 8.77755511022044, end: 8.897795591182364, duration: 0.12024048096192352, }, { text: "is", start: 8.977955911823647, end: 9.058116232464931, duration: 0.08016032064128353, }, { text: "easy", start: 9.118236472945892, end: 9.438877755511022, duration: 0.3206412825651306, }, { text: "to", start: 9.498997995991983, end: 9.97995991983968, duration: 0.48096192384769587, }, { text: "tell", start: 2.1042084168336674, end: 2.124248496993988, duration: 0.02004008016032044, }, { text: "the", start: 2.1442885771543088, end: 2.1843687374749496, duration: 0.04008016032064088, }, { text: "depth", start: 2.1843687374749496, end: 2.284569138276553, duration: 0.10020040080160353, }, { text: "of", start: 2.284569138276553, end: 2.364729458917836, duration: 0.08016032064128265, }, { text: "a", start: 2.364729458917836, end: 2.4448897795591185, duration: 0.08016032064128265, }, { text: "well", start: 2.4448897795591185, end: 2.50501002004008, duration: 0.06012024048096176, }, ], total_duration: 2.50501002004008, num_segments: 26, status: "success", alignment_available: true, device: "cpu", model: "mock-whisper-model", }; if (!USE_MOCK_DATA) { // Use real API const formData = new FormData(); formData.append("audio", audioFileToProcess); const response = await fetch(apiEndpoint, { method: "POST", body: formData, }); if (!response.ok) { throw new Error(`HTTP error! status: ${response.status}`); } result = await response.json(); } if (result.status === "success") { setTranscription(result); } else { throw new Error("Transcription failed"); } } catch (err) { setError( err instanceof Error ? err.message : "An error occurred during transcription", ); } finally { setIsLoading(false); setIsProcessingVideo(false); } }; const handleTimeUpdate = () => { const mediaElement = audioRef.current || videoRef.current; if (mediaElement && transcription?.aligned_segments) { const mediaCurrentTime = mediaElement.currentTime; setCurrentTime(mediaCurrentTime); // Find the active segment with a small tolerance for timing precision const activeIndex = transcription.aligned_segments.findIndex( (segment) => mediaCurrentTime >= segment.start && mediaCurrentTime <= segment.end, ); // If no exact match, find the closest segment if (activeIndex === -1) { let closestIndex = -1; let minDistance = Infinity; transcription.aligned_segments.forEach((segment, index) => { const distance = Math.min( Math.abs(mediaCurrentTime - segment.start), Math.abs(mediaCurrentTime - segment.end), ); if (distance < minDistance && distance < 0.5) { // 0.5 second tolerance minDistance = distance; closestIndex = index; } }); setActiveSegmentIndex(closestIndex >= 0 ? closestIndex : null); } else { setActiveSegmentIndex(activeIndex); } } }; const handleSeekToSegment = (segment: AlignedSegment) => { const mediaElement = audioRef.current || videoRef.current; if (mediaElement) { mediaElement.currentTime = segment.start; } }; const handleDownloadVideoWithSubtitles = async () => { if (!file || !transcription || !isVideoFile) return; setIsDownloadingVideo(true); try { const vttContent = generateWebVTT(transcription.aligned_segments); const filename = file.name.replace(/\.[^/.]+$/, "_with_subtitles.webm"); await downloadVideoWithSubtitles(file, vttContent, filename); } catch (error) { console.error("Error downloading video with subtitles:", error); setError("Failed to download video with subtitles"); } finally { setIsDownloadingVideo(false); } }; // Auto-scroll timeline to follow playback with throttling useEffect(() => { if (transcription && currentTime >= 0) { const now = Date.now(); // Throttle scroll updates to every 100ms for smoother performance if (now - lastScrollTime < SCROLL_INTERVAL) { return; } const timelineContainer = document.getElementById("timeline-container"); if (timelineContainer) { const containerWidth = timelineContainer.clientWidth; const timelineWidth = Math.max(transcription.total_duration * 200, 1200) - 32; const currentPosition = 4 + (currentTime / transcription.total_duration) * timelineWidth; // Only scroll if the current position is near the edges of the visible area const currentScrollLeft = timelineContainer.scrollLeft; const leftBoundary = currentScrollLeft + containerWidth * 0.15; // 15% from left edge const rightBoundary = currentScrollLeft + containerWidth * 0.85; // 15% from right edge if (currentPosition < leftBoundary || currentPosition > rightBoundary) { // Auto-scroll to keep the progress indicator centered const scrollPosition = Math.max( 0, currentPosition - containerWidth / 2, ); timelineContainer.scrollTo({ left: scrollPosition, behavior: "smooth", }); setLastScrollTime(now); } } } }, [currentTime, transcription, lastScrollTime]); // Also scroll to active segment when clicked useEffect(() => { if ( activeSegmentIndex !== null && activeSegmentRef.current && transcription ) { const timelineContainer = document.getElementById("timeline-container"); if (timelineContainer) { const segment = transcription.aligned_segments[activeSegmentIndex]; const containerWidth = timelineContainer.clientWidth; const timelineWidth = Math.max(transcription.total_duration * 200, 1200) - 32; const segmentPosition = 4 + (segment.start / transcription.total_duration) * timelineWidth; // Scroll to center the active segment const scrollPosition = Math.max( 0, segmentPosition - containerWidth / 2, ); timelineContainer.scrollTo({ left: scrollPosition, behavior: "smooth", }); } } }, [activeSegmentIndex, transcription]); // Cleanup media URL on unmount useEffect(() => { return () => { if (mediaUrl) { URL.revokeObjectURL(mediaUrl); } }; }, [mediaUrl]); const formatTime = (seconds: number) => { const mins = Math.floor(seconds / 60); const secs = Math.floor(seconds % 60); return `${mins}:${secs.toString().padStart(2, "0")}`; }; const isVideoFile = file?.type.startsWith("video/"); return (
{/* Side Panel */}

Audio/Video Transcription

{/* File Upload */}

Upload Media

{file && (
{file.name}
{(file.size / 1024 / 1024).toFixed(2)} MB
)}
{/* Transcribe Button */} {file && !transcription && (
{isVideoFile && (
Video files will be processed to extract audio for transcription.
)}
)} {/* Error Display */} {error && (
Error
{error}
)} {/* Transcription Info */} {transcription && (

Transcription Info

Model: {transcription.model}
Segments: {transcription.num_segments}
Duration: {formatTime(transcription.total_duration)}
Device: {transcription.device}
)} {/* Subtitle Download Section */} {transcription && (

Download Subtitles

{isVideoFile && ( )}
SRT format works with most video players. WebVTT is ideal for web browsers. {isVideoFile && " Video with embedded subtitles will be in WebM format."}
)} {/* Full Transcription */} {transcription && (

Full Transcription

{transcription.transcription}
)} {/* Instructions */}

How to Use

• Upload an audio or video file
• Click "Transcribe" to process
• For videos, audio will be extracted automatically
• Play media to see synchronized text
• Click on segments to jump to that time
• Download subtitles in SRT or WebVTT format
• For videos, download with embedded subtitles
• Active segments are highlighted in blue
{/* Main Content */}
{/* Media Player */} {file && (
{isVideoFile ? (
{transcription && (
Subtitles: {transcription.aligned_segments.length}{" "} segments
)}
) : (
)}
)} {/* Transcription Timeline */} {transcription && (

Synchronized Transcription Timeline

Click on segments to jump to that time • Active segment highlighted in blue
{/* Timeline Container */}
{/* Timeline Base Line */}
{/* Progress Indicator */}
{/* Segment Blocks */} {transcription.aligned_segments.map((segment, index) => { const timelineWidth = Math.max(transcription.total_duration * 200, 1200) - 32; // Account for padding const leftPosition = 4 + (segment.start / transcription.total_duration) * timelineWidth; const blockWidth = Math.max( (segment.duration / transcription.total_duration) * timelineWidth, 80, ); const isActive = activeSegmentIndex === index; return (
handleSeekToSegment(segment)} className={`timeline-segment absolute cursor-pointer transition-all duration-200 ${ isActive ? "bg-blue-600 text-white scale-105 z-10" : "bg-gray-700 text-gray-300 hover:bg-gray-600 hover:scale-102" }`} style={{ left: `${leftPosition}px`, width: `${blockWidth}px`, top: "35%", height: "30%", }} >
{segment.text}
{formatTime(segment.start)}
{/* Hover tooltip for longer text */}
{segment.text}
{formatTime(segment.start)} -{" "} {formatTime(segment.end)} ( {segment.duration.toFixed(1)}s)
); })} {/* Time markers */} {Array.from( { length: Math.ceil(transcription.total_duration / 5) + 1, }, (_, i) => i * 5, ) .filter((time) => time <= transcription.total_duration) .map((time) => { const timelineWidth = Math.max(transcription.total_duration * 200, 1200) - 32; const position = 4 + (time / transcription.total_duration) * timelineWidth; return (
{formatTime(time)}
); })}
{/* Timeline Controls */}
Timeline View | {activeSegmentIndex !== null ? `Segment ${activeSegmentIndex + 1}/${transcription.aligned_segments.length}` : "No active segment"} | {formatTime(currentTime)} /{" "} {formatTime(transcription.total_duration)}
)} {/* Loading State */} {isLoading && (
{isProcessingVideo ? "Extracting audio from video..." : "Transcribing your media..."}
{isProcessingVideo ? "Converting video to audio format for transcription" : "This may take a few moments"}
)} {/* Empty State */} {!file && !isLoading && (
🎵
Upload Audio or Video
Choose a media file to get started with transcription
)}
); }