// mms-transcription/frontend/src/components/TranscriptionPlayer.tsx
import { useRef, useState, useEffect } from "react";
interface AlignedSegment {
duration: number;
end: number;
start: number;
text: string;
}
interface TranscriptionResponse {
aligned_segments: AlignedSegment[];
alignment_available: boolean;
device: string;
model: string;
num_segments: number;
status: string;
total_duration: number;
transcription: string;
}
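// SCROLL_INTERVAL: minimum milliseconds between timeline auto-scroll updates.
// USE_MOCK_DATA: when true, skip the API call and use the canned response in handleTranscribe.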
const SCROLL_INTERVAL = 5;
const USE_MOCK_DATA = false;
// Helper function to encode audio buffer as WAV
const encodeWAV = (audioBuffer: AudioBuffer): ArrayBuffer => {
const length = audioBuffer.length;
const numberOfChannels = audioBuffer.numberOfChannels;
const sampleRate = audioBuffer.sampleRate;
const arrayBuffer = new ArrayBuffer(44 + length * numberOfChannels * 2);
const view = new DataView(arrayBuffer);
// WAV header
const writeString = (offset: number, string: string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
};
writeString(0, "RIFF");
view.setUint32(4, 36 + length * numberOfChannels * 2, true); // RIFF chunk size (file size minus 8 bytes)
writeString(8, "WAVE");
writeString(12, "fmt ");
view.setUint32(16, 16, true); // fmt chunk size
view.setUint16(20, 1, true); // audio format: PCM
view.setUint16(22, numberOfChannels, true); // channel count
view.setUint32(24, sampleRate, true); // sample rate
view.setUint32(28, sampleRate * numberOfChannels * 2, true); // byte rate
view.setUint16(32, numberOfChannels * 2, true); // block align
view.setUint16(34, 16, true); // bits per sample
writeString(36, "data");
view.setUint32(40, length * numberOfChannels * 2, true); // data chunk size
// Convert float32 audio data to int16
const channels = [];
for (let i = 0; i < numberOfChannels; i++) {
channels.push(audioBuffer.getChannelData(i));
}
let offset = 44;
for (let i = 0; i < length; i++) {
for (let channel = 0; channel < numberOfChannels; channel++) {
const sample = Math.max(-1, Math.min(1, channels[channel][i]));
view.setInt16(
offset,
sample < 0 ? sample * 0x8000 : sample * 0x7fff,
true,
);
offset += 2;
}
}
return arrayBuffer;
};
// Audio extraction function using Web Audio API with WAV output
const extractAudioFromVideo = async (videoFile: File): Promise<File> => {
console.log(
"Extracting audio from video using Web Audio API (WAV format):",
videoFile.name,
);
try {
// Create a video element to load the video file
const video = document.createElement("video");
const videoUrl = URL.createObjectURL(videoFile);
video.src = videoUrl;
video.crossOrigin = "anonymous";
// Wait for video metadata to load
await new Promise((resolve, reject) => {
video.onloadedmetadata = resolve;
video.onerror = reject;
video.load();
});
// Create audio context
const audioContext = new (window.AudioContext ||
(window as any).webkitAudioContext)();
// Create buffer to store audio data
const source = audioContext.createMediaElementSource(video);
// Create a script processor to capture audio data
const processor = audioContext.createScriptProcessor(4096, 2, 2);
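// Note: ScriptProcessorNode is deprecated (AudioWorklet is the modern replacement),
// and capture happens in real time, so extraction takes roughly as long as the video plays.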
const audioData: number[][] = [[], []];
let isRecording = false;
processor.onaudioprocess = (event) => {
if (!isRecording) return;
const inputBuffer = event.inputBuffer;
const leftChannel = inputBuffer.getChannelData(0);
const rightChannel =
inputBuffer.numberOfChannels > 1
? inputBuffer.getChannelData(1)
: leftChannel;
// Append captured samples by converting to plain arrays and concatenating (simple, though O(n^2) over a long video)
audioData[0] = audioData[0].concat(Array.from(leftChannel));
audioData[1] = audioData[1].concat(Array.from(rightChannel));
};
// Connect audio processing chain
source.connect(processor);
processor.connect(audioContext.destination);
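// Connecting the processor to the destination keeps onaudioprocess firing in some browsers;
// its output buffer is never written, so nothing is audible during capture.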
return new Promise((resolve, reject) => {
// Start recording when video plays
video.onplay = () => {
isRecording = true;
};
video.onended = () => {
isRecording = false;
// Create audio buffer from captured data
const audioBuffer = audioContext.createBuffer(
2,
audioData[0].length,
audioContext.sampleRate,
);
// Convert number arrays to Float32Array and copy to channels
const leftChannelData = new Float32Array(audioData[0]);
const rightChannelData = new Float32Array(audioData[1]);
audioBuffer.copyToChannel(leftChannelData, 0);
audioBuffer.copyToChannel(rightChannelData, 1);
// Encode as WAV
const wavArrayBuffer = encodeWAV(audioBuffer);
const audioBlob = new Blob([wavArrayBuffer], { type: "audio/wav" });
const audioFile = new File(
[audioBlob],
videoFile.name.replace(/\.[^/.]+$/, ".wav"),
{ type: "audio/wav" },
);
// Clean up
URL.revokeObjectURL(videoUrl);
processor.disconnect();
source.disconnect();
audioContext.close();
resolve(audioFile);
};
video.onerror = (error) => {
console.error("Video error:", error);
URL.revokeObjectURL(videoUrl);
processor.disconnect();
source.disconnect();
audioContext.close();
reject(new Error("Failed to process video"));
};
// Start playing the video
video.currentTime = 0;
video.play().catch(reject);
// Set a timeout as fallback
setTimeout(
() => {
if (isRecording) {
video.pause();
video.dispatchEvent(new Event("ended"));
}
},
(video.duration + 2) * 1000,
);
});
} catch (error) {
console.error("Error extracting audio from video:", error);
throw new Error(
`Failed to extract audio: ${error instanceof Error ? error.message : "Unknown error"}`,
);
}
};
// Generate SRT subtitle format
const generateSRT = (segments: AlignedSegment[]): string => {
let srt = "";
segments.forEach((segment, index) => {
const startTime = formatTimeForSRT(segment.start);
const endTime = formatTimeForSRT(segment.end);
srt += `${index + 1}\n`;
srt += `${startTime} --> ${endTime}\n`;
srt += `${segment.text}\n\n`;
});
return srt;
};
// Generate WebVTT subtitle format
const generateWebVTT = (segments: AlignedSegment[]): string => {
let vtt = "WEBVTT\n\n";
segments.forEach((segment, index) => {
const startTime = formatTimeForVTT(segment.start);
const endTime = formatTimeForVTT(segment.end);
vtt += `${index + 1}\n`;
vtt += `${startTime} --> ${endTime}\n`;
vtt += `${segment.text}\n\n`;
});
return vtt;
};
// Format time for SRT (HH:MM:SS,mmm)
const formatTimeForSRT = (seconds: number): string => {
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
const secs = Math.floor(seconds % 60);
const milliseconds = Math.floor((seconds % 1) * 1000);
return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")},${milliseconds.toString().padStart(3, "0")}`;
};
// Format time for WebVTT (HH:MM:SS.mmm)
const formatTimeForVTT = (seconds: number): string => {
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
const secs = Math.floor(seconds % 60);
const milliseconds = Math.floor((seconds % 1) * 1000);
return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")}.${milliseconds.toString().padStart(3, "0")}`;
};
// Download subtitle file
const downloadSubtitles = (content: string, filename: string) => {
const blob = new Blob([content], { type: "text/plain;charset=utf-8" });
const url = URL.createObjectURL(blob);
const link = document.createElement("a");
link.href = url;
link.download = filename;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
URL.revokeObjectURL(url);
};
// Download video with embedded subtitles
const downloadVideoWithSubtitles = async (
videoFile: File,
subtitleContent: string,
filename: string,
) => {
try {
// Create a canvas to render the video with subtitles
const video = document.createElement("video");
const canvas = document.createElement("canvas");
const ctx = canvas.getContext("2d")!;
// Load the video
const videoUrl = URL.createObjectURL(videoFile);
video.src = videoUrl;
video.crossOrigin = "anonymous";
await new Promise((resolve, reject) => {
video.onloadedmetadata = resolve;
video.onerror = reject;
video.load();
});
// Set canvas dimensions to match video
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
// Parse WebVTT subtitles
const subtitleLines = subtitleContent.split("\n");
const subtitles: Array<{ start: number; end: number; text: string }> = [];
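// Assumes each cue's text is the single line that follows its timing line,
// which matches the output of generateWebVTT above.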
for (let i = 0; i < subtitleLines.length; i++) {
const line = subtitleLines[i].trim();
if (line.includes(" --> ")) {
const [startStr, endStr] = line.split(" --> ");
const start = parseVTTTime(startStr);
const end = parseVTTTime(endStr);
const text = subtitleLines[i + 1]?.trim() || "";
if (text) {
subtitles.push({ start, end, text });
}
}
}
// Create MediaRecorder to capture the canvas
const stream = canvas.captureStream(30); // 30 FPS
// Add audio from original video
const audioContext = new AudioContext();
const source = audioContext.createMediaElementSource(video);
const dest = audioContext.createMediaStreamDestination();
source.connect(dest);
// Combine video and audio streams
const audioTrack = dest.stream.getAudioTracks()[0];
if (audioTrack) {
stream.addTrack(audioTrack);
}
const mediaRecorder = new MediaRecorder(stream, {
mimeType: "video/webm;codecs=vp8,opus",
});
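// Assumes vp8/opus WebM support (Chromium); MediaRecorder.isTypeSupported() could guard this
// in browsers that prefer other containers.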
const chunks: Blob[] = [];
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
chunks.push(event.data);
}
};
mediaRecorder.onstop = () => {
const blob = new Blob(chunks, { type: "video/webm" });
const url = URL.createObjectURL(blob);
const link = document.createElement("a");
link.href = url;
link.download = filename;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
URL.revokeObjectURL(url);
URL.revokeObjectURL(videoUrl);
audioContext.close();
};
// Start recording
mediaRecorder.start();
// Play video and render frames with subtitles
const renderFrame = () => {
if (video.ended) {
mediaRecorder.stop();
return;
}
// Draw video frame
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
// Find current subtitle
const currentTime = video.currentTime;
const currentSubtitle = subtitles.find(
(sub) => currentTime >= sub.start && currentTime <= sub.end,
);
// Draw subtitle if exists
if (currentSubtitle) {
ctx.fillStyle = "rgba(0, 0, 0, 0.7)";
ctx.font = `${Math.max(16, canvas.height / 25)}px Arial`;
ctx.textAlign = "center";
ctx.textBaseline = "bottom";
const textWidth = ctx.measureText(currentSubtitle.text).width;
const padding = 10;
const textX = canvas.width / 2;
const textY = canvas.height - 30;
// Draw background rectangle
ctx.fillRect(
textX - textWidth / 2 - padding,
textY - parseInt(ctx.font) - padding,
textWidth + padding * 2,
parseInt(ctx.font) + padding * 2,
);
// Draw text
ctx.fillStyle = "white";
ctx.fillText(currentSubtitle.text, textX, textY);
}
requestAnimationFrame(renderFrame);
};
// Start playback and rendering
video.play();
renderFrame();
} catch (error) {
console.error("Error creating video with subtitles:", error);
throw new Error("Failed to create video with subtitles");
}
};
// Parse the HH:MM:SS.mmm time format produced by formatTimeForVTT
const parseVTTTime = (timeStr: string): number => {
const parts = timeStr.split(":");
const seconds = parts[parts.length - 1].split(".");
return (
parseInt(parts[0]) * 3600 + // hours
parseInt(parts[1]) * 60 + // minutes
parseInt(seconds[0]) + // seconds
parseInt(seconds[1] || "0") / 1000 // milliseconds
);
};
export default function TranscriptionPlayer() {
const [file, setFile] = useState<File | null>(null);
const [transcription, setTranscription] =
useState<TranscriptionResponse | null>(null);
const [isLoading, setIsLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const [activeSegmentIndex, setActiveSegmentIndex] = useState<number | null>(
null,
);
const [mediaUrl, setMediaUrl] = useState<string | null>(null);
const [currentTime, setCurrentTime] = useState<number>(0);
const [lastScrollTime, setLastScrollTime] = useState<number>(0);
const [isProcessingVideo, setIsProcessingVideo] = useState<boolean>(false);
const [isDownloadingVideo, setIsDownloadingVideo] = useState<boolean>(false);
const audioRef = useRef<HTMLAudioElement>(null);
const videoRef = useRef<HTMLVideoElement>(null);
const fileInputRef = useRef<HTMLInputElement>(null);
const activeSegmentRef = useRef<HTMLDivElement>(null);
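// Transcription endpoint: configurable via Vite env, falling back to a same-origin /transcribe route.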
const apiEndpoint =
import.meta.env.VITE_TRANSCRIPTION_API_URL || "/transcribe";
const handleFileSelect = (event: React.ChangeEvent<HTMLInputElement>) => {
const selectedFile = event.target.files?.[0];
if (selectedFile) {
// Clean up previous URL if it exists
if (mediaUrl) {
URL.revokeObjectURL(mediaUrl);
}
// Create new URL for the selected file
const newMediaUrl = URL.createObjectURL(selectedFile);
setFile(selectedFile);
setMediaUrl(newMediaUrl);
setTranscription(null);
setError(null);
setActiveSegmentIndex(null);
}
};
const handleTranscribe = async () => {
if (!file) return;
setIsLoading(true);
setError(null);
try {
let audioFileToProcess = file;
// If it's a video file, extract audio first
if (file.type.startsWith("video/")) {
setIsProcessingVideo(true);
try {
console.log("Processing video file for audio extraction...");
audioFileToProcess = await extractAudioFromVideo(file);
console.log("Audio extraction completed:", audioFileToProcess.name);
} catch (videoError) {
console.warn(
"Video processing failed, using original file:",
videoError,
);
// Fallback to sending the video file directly
audioFileToProcess = file;
} finally {
setIsProcessingVideo(false);
}
}
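// Canned response used when USE_MOCK_DATA is true; overwritten by the real API result below otherwise.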
let result: TranscriptionResponse = {
transcription:
"the birch canoe slid on the smooth plants glue the sheet to the dark blue background it is easy to tell the depth of a well",
aligned_segments: [
{
text: "the",
start: 0.0,
end: 0.62124248496994,
duration: 0.62124248496994,
},
{
text: "birch",
start: 0.7214428857715431,
end: 1.122244488977956,
duration: 0.4008016032064128,
},
{
text: "canoe",
start: 1.2024048096192383,
end: 1.743486973947896,
duration: 0.5410821643286576,
},
{
text: "slid",
start: 1.8236472945891784,
end: 2.084168336673347,
duration: 0.2605210420841686,
},
{
text: "on",
start: 2.1442885771543088,
end: 2.284569138276553,
duration: 0.1402805611222444,
},
{
text: "the",
start: 2.364729458917836,
end: 2.5450901803607215,
duration: 0.18036072144288573,
},
{
text: "smooth",
start: 2.625250501002004,
end: 3.687374749498998,
duration: 1.062124248496994,
},
{
text: "plants",
start: 4.328657314629258,
end: 4.749498997995992,
duration: 0.4208416833667332,
},
{
text: "glue",
start: 4.829659318637275,
end: 5.01002004008016,
duration: 0.18036072144288529,
},
{
text: "the",
start: 5.070140280561122,
end: 5.170340681362725,
duration: 0.10020040080160264,
},
{
text: "sheet",
start: 5.2304609218436875,
end: 5.591182364729459,
duration: 0.36072144288577146,
},
{
text: "to",
start: 5.631262525050101,
end: 5.771543086172345,
duration: 0.1402805611222444,
},
{
text: "the",
start: 5.851703406813627,
end: 6.012024048096193,
duration: 0.16032064128256618,
},
{
text: "dark",
start: 6.072144288577155,
end: 6.332665330661323,
duration: 0.26052104208416793,
},
{
text: "blue",
start: 7.114228456913828,
end: 8.056112224448897,
duration: 0.9418837675350691,
},
{
text: "background",
start: 8.136272545090181,
end: 8.737474949899799,
duration: 0.6012024048096176,
},
{
text: "it",
start: 8.77755511022044,
end: 8.897795591182364,
duration: 0.12024048096192352,
},
{
text: "is",
start: 8.977955911823647,
end: 9.058116232464931,
duration: 0.08016032064128353,
},
{
text: "easy",
start: 9.118236472945892,
end: 9.438877755511022,
duration: 0.3206412825651306,
},
{
text: "to",
start: 9.498997995991983,
end: 9.97995991983968,
duration: 0.48096192384769587,
},
{
text: "tell",
start: 2.1042084168336674,
end: 2.124248496993988,
duration: 0.02004008016032044,
},
{
text: "the",
start: 2.1442885771543088,
end: 2.1843687374749496,
duration: 0.04008016032064088,
},
{
text: "depth",
start: 2.1843687374749496,
end: 2.284569138276553,
duration: 0.10020040080160353,
},
{
text: "of",
start: 2.284569138276553,
end: 2.364729458917836,
duration: 0.08016032064128265,
},
{
text: "a",
start: 2.364729458917836,
end: 2.4448897795591185,
duration: 0.08016032064128265,
},
{
text: "well",
start: 2.4448897795591185,
end: 2.50501002004008,
duration: 0.06012024048096176,
},
],
total_duration: 2.50501002004008,
num_segments: 26,
status: "success",
alignment_available: true,
device: "cpu",
model: "mock-whisper-model",
};
if (!USE_MOCK_DATA) {
// Use real API
const formData = new FormData();
formData.append("audio", audioFileToProcess);
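// The backend is expected to read the upload from the multipart field named "audio".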
const response = await fetch(apiEndpoint, {
method: "POST",
body: formData,
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
result = await response.json();
}
if (result.status === "success") {
setTranscription(result);
} else {
throw new Error("Transcription failed");
}
} catch (err) {
setError(
err instanceof Error
? err.message
: "An error occurred during transcription",
);
} finally {
setIsLoading(false);
setIsProcessingVideo(false);
}
};
const handleTimeUpdate = () => {
const mediaElement = audioRef.current || videoRef.current;
if (mediaElement && transcription?.aligned_segments) {
const mediaCurrentTime = mediaElement.currentTime;
setCurrentTime(mediaCurrentTime);
// Find the segment whose time range contains the current playback position
const activeIndex = transcription.aligned_segments.findIndex(
(segment) =>
mediaCurrentTime >= segment.start && mediaCurrentTime <= segment.end,
);
// If no exact match, find the closest segment
if (activeIndex === -1) {
let closestIndex = -1;
let minDistance = Infinity;
transcription.aligned_segments.forEach((segment, index) => {
const distance = Math.min(
Math.abs(mediaCurrentTime - segment.start),
Math.abs(mediaCurrentTime - segment.end),
);
if (distance < minDistance && distance < 0.5) {
// 0.5 second tolerance
minDistance = distance;
closestIndex = index;
}
});
setActiveSegmentIndex(closestIndex >= 0 ? closestIndex : null);
} else {
setActiveSegmentIndex(activeIndex);
}
}
};
const handleSeekToSegment = (segment: AlignedSegment) => {
const mediaElement = audioRef.current || videoRef.current;
if (mediaElement) {
mediaElement.currentTime = segment.start;
}
};
const handleDownloadVideoWithSubtitles = async () => {
if (!file || !transcription || !isVideoFile) return;
setIsDownloadingVideo(true);
try {
const vttContent = generateWebVTT(transcription.aligned_segments);
const filename = file.name.replace(/\.[^/.]+$/, "_with_subtitles.webm");
await downloadVideoWithSubtitles(file, vttContent, filename);
} catch (error) {
console.error("Error downloading video with subtitles:", error);
setError("Failed to download video with subtitles");
} finally {
setIsDownloadingVideo(false);
}
};
// Auto-scroll timeline to follow playback with throttling
useEffect(() => {
if (transcription && currentTime >= 0) {
const now = Date.now();
// Throttle auto-scroll so it runs at most once every SCROLL_INTERVAL ms
if (now - lastScrollTime < SCROLL_INTERVAL) {
return;
}
const timelineContainer = document.getElementById("timeline-container");
if (timelineContainer) {
const containerWidth = timelineContainer.clientWidth;
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32;
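// Mirrors the timeline sizing used in render: 200px per second (min 1200px) minus 32px of horizontal padding.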
const currentPosition =
4 + (currentTime / transcription.total_duration) * timelineWidth;
// Only scroll if the current position is near the edges of the visible area
const currentScrollLeft = timelineContainer.scrollLeft;
const leftBoundary = currentScrollLeft + containerWidth * 0.15; // 15% from left edge
const rightBoundary = currentScrollLeft + containerWidth * 0.85; // 15% from right edge
if (currentPosition < leftBoundary || currentPosition > rightBoundary) {
// Auto-scroll to keep the progress indicator centered
const scrollPosition = Math.max(
0,
currentPosition - containerWidth / 2,
);
timelineContainer.scrollTo({
left: scrollPosition,
behavior: "smooth",
});
setLastScrollTime(now);
}
}
}
}, [currentTime, transcription, lastScrollTime]);
// Also scroll to active segment when clicked
useEffect(() => {
if (
activeSegmentIndex !== null &&
activeSegmentRef.current &&
transcription
) {
const timelineContainer = document.getElementById("timeline-container");
if (timelineContainer) {
const segment = transcription.aligned_segments[activeSegmentIndex];
const containerWidth = timelineContainer.clientWidth;
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32;
const segmentPosition =
4 + (segment.start / transcription.total_duration) * timelineWidth;
// Scroll to center the active segment
const scrollPosition = Math.max(
0,
segmentPosition - containerWidth / 2,
);
timelineContainer.scrollTo({
left: scrollPosition,
behavior: "smooth",
});
}
}
}, [activeSegmentIndex, transcription]);
// Cleanup media URL on unmount
useEffect(() => {
return () => {
if (mediaUrl) {
URL.revokeObjectURL(mediaUrl);
}
};
}, [mediaUrl]);
const formatTime = (seconds: number) => {
const mins = Math.floor(seconds / 60);
const secs = Math.floor(seconds % 60);
return `${mins}:${secs.toString().padStart(2, "0")}`;
};
const isVideoFile = file?.type.startsWith("video/");
return (
<div className="flex h-screen bg-gray-900">
<style>{`
.line-clamp-2 {
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
overflow: hidden;
}
.hover\\:scale-102:hover {
transform: scale(1.02);
}
.timeline-segment:hover .timeline-tooltip {
opacity: 1;
}
#timeline-container {
scroll-behavior: smooth;
-webkit-overflow-scrolling: touch;
}
#timeline-container::-webkit-scrollbar {
height: 8px;
}
#timeline-container::-webkit-scrollbar-track {
background: #374151;
border-radius: 4px;
}
#timeline-container::-webkit-scrollbar-thumb {
background: #6b7280;
border-radius: 4px;
}
#timeline-container::-webkit-scrollbar-thumb:hover {
background: #9ca3af;
}
`}</style>
{/* Side Panel */}
<div className="w-80 bg-gray-800 text-white p-6 overflow-y-auto">
<h2 className="text-2xl font-bold mb-6">Audio/Video Transcription</h2>
{/* File Upload */}
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Upload Media</h3>
<input
ref={fileInputRef}
type="file"
accept="audio/*,video/*"
onChange={handleFileSelect}
className="hidden"
/>
<button
onClick={() => fileInputRef.current?.click()}
className="w-full p-3 bg-blue-600 hover:bg-blue-700 rounded-lg transition-colors"
>
Choose Audio/Video File
</button>
{file && (
<div className="mt-3 p-3 bg-gray-700 rounded">
<div className="text-sm font-medium">{file.name}</div>
<div className="text-xs text-gray-400">
{(file.size / 1024 / 1024).toFixed(2)} MB
</div>
</div>
)}
</div>
{/* Transcribe Button */}
{file && !transcription && (
<div className="mb-6">
<button
onClick={handleTranscribe}
disabled={isLoading}
className="w-full p-3 bg-green-600 hover:bg-green-700 disabled:bg-gray-600 rounded-lg transition-colors"
>
{isLoading
? isProcessingVideo
? "Processing Video..."
: "Transcribing..."
: "Transcribe"}
</button>
{isVideoFile && (
<div className="mt-2 text-xs text-gray-400">
Video files will be processed to extract audio for
transcription.
</div>
)}
</div>
)}
{/* Error Display */}
{error && (
<div className="mb-6 p-3 bg-red-600 rounded">
<div className="text-sm font-medium">Error</div>
<div className="text-xs">{error}</div>
</div>
)}
{/* Transcription Info */}
{transcription && (
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Transcription Info</h3>
<div className="space-y-2 text-sm">
<div>Model: {transcription.model}</div>
<div>Segments: {transcription.num_segments}</div>
<div>Duration: {formatTime(transcription.total_duration)}</div>
<div>Device: {transcription.device}</div>
</div>
</div>
)}
{/* Subtitle Download Section */}
{transcription && (
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Download Subtitles</h3>
<div className="space-y-2">
<button
onClick={() => {
const srtContent = generateSRT(
transcription.aligned_segments,
);
const filename =
file?.name?.replace(/\.[^/.]+$/, ".srt") || "subtitles.srt";
downloadSubtitles(srtContent, filename);
}}
className="w-full p-2 bg-purple-600 hover:bg-purple-700 rounded-lg transition-colors text-sm"
>
Download SRT Subtitles
</button>
<button
onClick={() => {
const vttContent = generateWebVTT(
transcription.aligned_segments,
);
const filename =
file?.name?.replace(/\.[^/.]+$/, ".vtt") || "subtitles.vtt";
downloadSubtitles(vttContent, filename);
}}
className="w-full p-2 bg-indigo-600 hover:bg-indigo-700 rounded-lg transition-colors text-sm"
>
Download WebVTT Subtitles
</button>
{isVideoFile && (
<button
onClick={handleDownloadVideoWithSubtitles}
disabled={isDownloadingVideo}
className="w-full p-2 bg-orange-600 hover:bg-orange-700 disabled:bg-gray-600 rounded-lg transition-colors text-sm"
>
{isDownloadingVideo
? "Creating Video..."
: "Download Video with Embedded Subtitles"}
</button>
)}
</div>
<div className="mt-2 text-xs text-gray-400">
SRT format works with most video players. WebVTT is ideal for web
browsers.
{isVideoFile &&
" Video with embedded subtitles will be in WebM format."}
</div>
</div>
)}
{/* Full Transcription */}
{transcription && (
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Full Transcription</h3>
<div className="p-3 bg-gray-700 rounded text-sm">
{transcription.transcription}
</div>
</div>
)}
{/* Instructions */}
<div className="border-t border-gray-700 pt-4">
<h3 className="text-sm font-semibold mb-2">How to Use</h3>
<div className="text-xs text-gray-400 space-y-1">
<div>• Upload an audio or video file</div>
<div>• Click "Transcribe" to process</div>
<div>• For videos, audio will be extracted automatically</div>
<div>• Play media to see synchronized text</div>
<div>• Click on segments to jump to that time</div>
<div>• Download subtitles in SRT or WebVTT format</div>
<div>• For videos, download with embedded subtitles</div>
<div>• Active segments are highlighted in blue</div>
</div>
</div>
</div>
{/* Main Content */}
<div className="flex-1 flex flex-col bg-black">
{/* Media Player */}
{file && (
<div className="p-6 bg-gray-800">
<div className="max-w-4xl mx-auto">
{isVideoFile ? (
<div className="relative">
<video
ref={videoRef}
src={mediaUrl || ""}
controls
onTimeUpdate={handleTimeUpdate}
className="w-full max-h-96 rounded-lg"
>
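{/* btoa throws on non-Latin-1 subtitle text; a Blob object URL would be a safer track source for non-ASCII transcripts. */}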
{transcription && (
<track
kind="subtitles"
src={`data:text/vtt;base64,${btoa(generateWebVTT(transcription.aligned_segments))}`}
srcLang="en"
label="English"
default
/>
)}
</video>
{transcription && (
<div className="absolute bottom-2 right-2 bg-black bg-opacity-75 text-white px-2 py-1 rounded text-xs">
Subtitles: {transcription.aligned_segments.length}{" "}
segments
</div>
)}
</div>
) : (
<div className="bg-gray-700 p-8 rounded-lg">
<audio
ref={audioRef}
src={mediaUrl || ""}
controls
onTimeUpdate={handleTimeUpdate}
className="w-full"
/>
<div className="mt-4 text-center text-gray-300">
<div className="text-lg font-medium">Audio File</div>
<div className="text-sm">{file.name}</div>
</div>
</div>
)}
</div>
</div>
)}
{/* Transcription Timeline */}
{transcription && (
<div className="flex-1 flex flex-col bg-gray-900">
<div className="p-4 bg-gray-800">
<h3 className="text-xl font-bold text-white mb-2">
Synchronized Transcription Timeline
</h3>
<div className="text-sm text-gray-400">
Click on segments to jump to that time • Active segment
highlighted in blue
</div>
</div>
<div className="flex-1 relative overflow-hidden">
{/* Timeline Container */}
<div
className="h-full overflow-x-auto overflow-y-hidden"
id="timeline-container"
>
<div
className="relative h-full py-8 px-4"
style={{
width: `${Math.max(transcription.total_duration * 200, 1200)}px`,
minWidth: "100%",
}}
>
{/* Timeline Base Line */}
<div className="absolute top-1/2 left-4 right-4 h-0.5 bg-gray-600 transform -translate-y-1/2"></div>
{/* Progress Indicator */}
<div
className="absolute top-0 bottom-0 w-0.5 bg-red-500 z-20 transition-all duration-75"
style={{
left: `${4 + (currentTime / transcription.total_duration) * (Math.max(transcription.total_duration * 200, 1200) - 32)}px`,
}}
></div>
{/* Segment Blocks */}
{transcription.aligned_segments.map((segment, index) => {
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32; // Account for padding
const leftPosition =
4 +
(segment.start / transcription.total_duration) *
timelineWidth;
const blockWidth = Math.max(
(segment.duration / transcription.total_duration) *
timelineWidth,
80,
);
const isActive = activeSegmentIndex === index;
return (
<div
key={index}
ref={isActive ? activeSegmentRef : null}
onClick={() => handleSeekToSegment(segment)}
className={`timeline-segment absolute cursor-pointer transition-all duration-200 ${
isActive
? "bg-blue-600 text-white scale-105 z-10"
: "bg-gray-700 text-gray-300 hover:bg-gray-600 hover:scale-102"
}`}
style={{
left: `${leftPosition}px`,
width: `${blockWidth}px`,
top: "35%",
height: "30%",
}}
>
<div className="h-full flex flex-col justify-center p-2 rounded-lg shadow-lg">
<div className="text-xs font-medium leading-tight line-clamp-2">
{segment.text}
</div>
<div className="text-xs opacity-75 mt-1">
{formatTime(segment.start)}
</div>
</div>
{/* Hover tooltip for longer text */}
<div className="timeline-tooltip absolute bottom-full left-1/2 transform -translate-x-1/2 mb-2 px-2 py-1 bg-gray-800 text-white text-xs rounded opacity-0 transition-opacity duration-200 pointer-events-none z-30 max-w-xs">
<div className="whitespace-normal break-words">
{segment.text}
</div>
<div className="text-gray-400 mt-1">
{formatTime(segment.start)} -{" "}
{formatTime(segment.end)} (
{segment.duration.toFixed(1)}s)
</div>
</div>
</div>
);
})}
{/* Time markers */}
{Array.from(
{
length: Math.ceil(transcription.total_duration / 5) + 1,
},
(_, i) => i * 5,
)
.filter((time) => time <= transcription.total_duration)
.map((time) => {
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32;
const position =
4 +
(time / transcription.total_duration) * timelineWidth;
return (
<div
key={time}
className="absolute text-xs text-gray-500"
style={{
left: `${position}px`,
top: "75%",
transform: "translateX(-50%)",
}}
>
<div className="w-0.5 h-4 bg-gray-500 mx-auto mb-1"></div>
{formatTime(time)}
</div>
);
})}
</div>
</div>
{/* Timeline Controls */}
<div className="absolute bottom-4 left-1/2 transform -translate-x-1/2 bg-gray-800 rounded-lg p-2 shadow-lg">
<div className="flex items-center space-x-4 text-white text-sm">
<span>Timeline View</span>
<span className="text-gray-400">|</span>
<span className="text-blue-400">
{activeSegmentIndex !== null
? `Segment ${activeSegmentIndex + 1}/${transcription.aligned_segments.length}`
: "No active segment"}
</span>
<span className="text-gray-400">|</span>
<span className="text-green-400">
{formatTime(currentTime)} /{" "}
{formatTime(transcription.total_duration)}
</span>
</div>
</div>
</div>
</div>
)}
{/* Loading State */}
{isLoading && (
<div className="flex-1 flex items-center justify-center">
<div className="text-center text-white">
<div className="animate-spin rounded-full h-12 w-12 border-b-2 border-white mx-auto mb-4"></div>
<div className="text-lg">
{isProcessingVideo
? "Extracting audio from video..."
: "Transcribing your media..."}
</div>
<div className="text-sm text-gray-400 mt-2">
{isProcessingVideo
? "Converting video to audio format for transcription"
: "This may take a few moments"}
</div>
</div>
</div>
)}
{/* Empty State */}
{!file && !isLoading && (
<div className="flex-1 flex items-center justify-center">
<div className="text-center text-gray-400">
<div className="text-6xl mb-4">🎵</div>
<div className="text-xl mb-2">Upload Audio or Video</div>
<div className="text-sm">
Choose a media file to get started with transcription
</div>
</div>
</div>
)}
</div>
</div>
);
}