// mms-transcription/frontend/src/components/TranscriptionPlayer.tsx
import { useRef, useState, useEffect } from "react";
interface AlignedSegment {
duration: number;
end: number;
start: number;
text: string;
}
interface TranscriptionResponse {
aligned_segments: AlignedSegment[];
alignment_available: boolean;
device: string;
model: string;
num_segments: number;
status: string;
total_duration: number;
transcription: string;
}
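// SCROLL_INTERVAL: minimum milliseconds between timeline auto-scroll updates.
// USE_MOCK_DATA: when true, skip the API call and use the canned response in handleTranscribe.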
const SCROLL_INTERVAL = 5;
const USE_MOCK_DATA = false;
// Helper function to encode audio buffer as WAV
const encodeWAV = (audioBuffer: AudioBuffer): ArrayBuffer => {
const length = audioBuffer.length;
const numberOfChannels = audioBuffer.numberOfChannels;
const sampleRate = audioBuffer.sampleRate;
const arrayBuffer = new ArrayBuffer(44 + length * numberOfChannels * 2);
const view = new DataView(arrayBuffer);
// WAV header
const writeString = (offset: number, string: string) => {
for (let i = 0; i < string.length; i++) {
view.setUint8(offset + i, string.charCodeAt(i));
}
};
writeString(0, "RIFF");
view.setUint32(4, 36 + length * numberOfChannels * 2, true); // RIFF chunk size (file size minus 8 bytes)
writeString(8, "WAVE");
writeString(12, "fmt ");
view.setUint32(16, 16, true); // fmt chunk size
view.setUint16(20, 1, true); // audio format: PCM
view.setUint16(22, numberOfChannels, true); // channel count
view.setUint32(24, sampleRate, true); // sample rate
view.setUint32(28, sampleRate * numberOfChannels * 2, true); // byte rate
view.setUint16(32, numberOfChannels * 2, true); // block align
view.setUint16(34, 16, true); // bits per sample
writeString(36, "data");
view.setUint32(40, length * numberOfChannels * 2, true); // data chunk size
// Convert float32 audio data to int16
const channels = [];
for (let i = 0; i < numberOfChannels; i++) {
channels.push(audioBuffer.getChannelData(i));
}
let offset = 44;
for (let i = 0; i < length; i++) {
for (let channel = 0; channel < numberOfChannels; channel++) {
const sample = Math.max(-1, Math.min(1, channels[channel][i]));
view.setInt16(
offset,
sample < 0 ? sample * 0x8000 : sample * 0x7fff,
true,
);
offset += 2;
}
}
return arrayBuffer;
};
// Audio extraction function using Web Audio API with WAV output
const extractAudioFromVideo = async (videoFile: File): Promise<File> => {
console.log(
"Extracting audio from video using Web Audio API (WAV format):",
videoFile.name,
);
try {
// Create a video element to load the video file
const video = document.createElement("video");
const videoUrl = URL.createObjectURL(videoFile);
video.src = videoUrl;
video.crossOrigin = "anonymous";
// Wait for video metadata to load
await new Promise((resolve, reject) => {
video.onloadedmetadata = resolve;
video.onerror = reject;
video.load();
});
// Create audio context
const audioContext = new (window.AudioContext ||
(window as any).webkitAudioContext)();
// Create buffer to store audio data
const source = audioContext.createMediaElementSource(video);
// Create a script processor to capture audio data
const processor = audioContext.createScriptProcessor(4096, 2, 2);
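// Note: ScriptProcessorNode is deprecated (AudioWorklet is the modern replacement),
// and capture happens in real time, so extraction takes roughly as long as the video plays.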
const audioData: number[][] = [[], []];
let isRecording = false;
processor.onaudioprocess = (event) => {
if (!isRecording) return;
const inputBuffer = event.inputBuffer;
const leftChannel = inputBuffer.getChannelData(0);
const rightChannel =
inputBuffer.numberOfChannels > 1
? inputBuffer.getChannelData(1)
: leftChannel;
// Append captured samples by converting to plain arrays and concatenating (simple, though O(n^2) over a long video)
audioData[0] = audioData[0].concat(Array.from(leftChannel));
audioData[1] = audioData[1].concat(Array.from(rightChannel));
};
// Connect audio processing chain
source.connect(processor);
processor.connect(audioContext.destination);
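// Connecting the processor to the destination keeps onaudioprocess firing in some browsers;
// its output buffer is never written, so nothing is audible during capture.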
return new Promise((resolve, reject) => {
// Start recording when video plays
video.onplay = () => {
isRecording = true;
};
video.onended = () => {
isRecording = false;
// Create audio buffer from captured data
const audioBuffer = audioContext.createBuffer(
2,
audioData[0].length,
audioContext.sampleRate,
);
// Convert number arrays to Float32Array and copy to channels
const leftChannelData = new Float32Array(audioData[0]);
const rightChannelData = new Float32Array(audioData[1]);
audioBuffer.copyToChannel(leftChannelData, 0);
audioBuffer.copyToChannel(rightChannelData, 1);
// Encode as WAV
const wavArrayBuffer = encodeWAV(audioBuffer);
const audioBlob = new Blob([wavArrayBuffer], { type: "audio/wav" });
const audioFile = new File(
[audioBlob],
videoFile.name.replace(/\.[^/.]+$/, ".wav"),
{ type: "audio/wav" },
);
// Clean up
URL.revokeObjectURL(videoUrl);
processor.disconnect();
source.disconnect();
audioContext.close();
resolve(audioFile);
};
video.onerror = (error) => {
console.error("Video error:", error);
URL.revokeObjectURL(videoUrl);
processor.disconnect();
source.disconnect();
audioContext.close();
reject(new Error("Failed to process video"));
};
// Start playing the video
video.currentTime = 0;
video.play().catch(reject);
// Set a timeout as fallback
setTimeout(
() => {
if (isRecording) {
video.pause();
video.dispatchEvent(new Event("ended"));
}
},
(video.duration + 2) * 1000,
);
});
} catch (error) {
console.error("Error extracting audio from video:", error);
throw new Error(
`Failed to extract audio: ${error instanceof Error ? error.message : "Unknown error"}`,
);
}
};
// Generate SRT subtitle format
const generateSRT = (segments: AlignedSegment[]): string => {
let srt = "";
segments.forEach((segment, index) => {
const startTime = formatTimeForSRT(segment.start);
const endTime = formatTimeForSRT(segment.end);
srt += `${index + 1}\n`;
srt += `${startTime} --> ${endTime}\n`;
srt += `${segment.text}\n\n`;
});
return srt;
};
// Generate WebVTT subtitle format
const generateWebVTT = (segments: AlignedSegment[]): string => {
let vtt = "WEBVTT\n\n";
segments.forEach((segment, index) => {
const startTime = formatTimeForVTT(segment.start);
const endTime = formatTimeForVTT(segment.end);
vtt += `${index + 1}\n`;
vtt += `${startTime} --> ${endTime}\n`;
vtt += `${segment.text}\n\n`;
});
return vtt;
};
// Format time for SRT (HH:MM:SS,mmm)
const formatTimeForSRT = (seconds: number): string => {
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
const secs = Math.floor(seconds % 60);
const milliseconds = Math.floor((seconds % 1) * 1000);
return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")},${milliseconds.toString().padStart(3, "0")}`;
};
// Format time for WebVTT (HH:MM:SS.mmm)
const formatTimeForVTT = (seconds: number): string => {
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
const secs = Math.floor(seconds % 60);
const milliseconds = Math.floor((seconds % 1) * 1000);
return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")}.${milliseconds.toString().padStart(3, "0")}`;
};
// Download subtitle file
const downloadSubtitles = (content: string, filename: string) => {
const blob = new Blob([content], { type: "text/plain;charset=utf-8" });
const url = URL.createObjectURL(blob);
const link = document.createElement("a");
link.href = url;
link.download = filename;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
URL.revokeObjectURL(url);
};
// Download video with embedded subtitles
const downloadVideoWithSubtitles = async (
videoFile: File,
subtitleContent: string,
filename: string,
) => {
try {
// Create a canvas to render the video with subtitles
const video = document.createElement("video");
const canvas = document.createElement("canvas");
const ctx = canvas.getContext("2d")!;
// Load the video
const videoUrl = URL.createObjectURL(videoFile);
video.src = videoUrl;
video.crossOrigin = "anonymous";
await new Promise((resolve, reject) => {
video.onloadedmetadata = resolve;
video.onerror = reject;
video.load();
});
// Set canvas dimensions to match video
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
// Parse WebVTT subtitles
const subtitleLines = subtitleContent.split("\n");
const subtitles: Array<{ start: number; end: number; text: string }> = [];
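// Assumes each cue's text is the single line that follows its timing line,
// which matches the output of generateWebVTT above.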
for (let i = 0; i < subtitleLines.length; i++) {
const line = subtitleLines[i].trim();
if (line.includes(" --> ")) {
const [startStr, endStr] = line.split(" --> ");
const start = parseVTTTime(startStr);
const end = parseVTTTime(endStr);
const text = subtitleLines[i + 1]?.trim() || "";
if (text) {
subtitles.push({ start, end, text });
}
}
}
// Create MediaRecorder to capture the canvas
const stream = canvas.captureStream(30); // 30 FPS
// Add audio from original video
const audioContext = new AudioContext();
const source = audioContext.createMediaElementSource(video);
const dest = audioContext.createMediaStreamDestination();
source.connect(dest);
// Combine video and audio streams
const audioTrack = dest.stream.getAudioTracks()[0];
if (audioTrack) {
stream.addTrack(audioTrack);
}
const mediaRecorder = new MediaRecorder(stream, {
mimeType: "video/webm;codecs=vp8,opus",
});
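// Assumes vp8/opus WebM support (Chromium); MediaRecorder.isTypeSupported() could guard this
// in browsers that prefer other containers.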
const chunks: Blob[] = [];
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 0) {
chunks.push(event.data);
}
};
mediaRecorder.onstop = () => {
const blob = new Blob(chunks, { type: "video/webm" });
const url = URL.createObjectURL(blob);
const link = document.createElement("a");
link.href = url;
link.download = filename;
document.body.appendChild(link);
link.click();
document.body.removeChild(link);
URL.revokeObjectURL(url);
URL.revokeObjectURL(videoUrl);
audioContext.close();
};
// Start recording
mediaRecorder.start();
// Play video and render frames with subtitles
const renderFrame = () => {
if (video.ended) {
mediaRecorder.stop();
return;
}
// Draw video frame
ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
// Find current subtitle
const currentTime = video.currentTime;
const currentSubtitle = subtitles.find(
(sub) => currentTime >= sub.start && currentTime <= sub.end,
);
// Draw subtitle if exists
if (currentSubtitle) {
ctx.fillStyle = "rgba(0, 0, 0, 0.7)";
ctx.font = `${Math.max(16, canvas.height / 25)}px Arial`;
ctx.textAlign = "center";
ctx.textBaseline = "bottom";
const textWidth = ctx.measureText(currentSubtitle.text).width;
const padding = 10;
const textX = canvas.width / 2;
const textY = canvas.height - 30;
// Draw background rectangle
ctx.fillRect(
textX - textWidth / 2 - padding,
textY - parseInt(ctx.font) - padding,
textWidth + padding * 2,
parseInt(ctx.font) + padding * 2,
);
// Draw text
ctx.fillStyle = "white";
ctx.fillText(currentSubtitle.text, textX, textY);
}
requestAnimationFrame(renderFrame);
};
// Start playback and rendering
video.play();
renderFrame();
} catch (error) {
console.error("Error creating video with subtitles:", error);
throw new Error("Failed to create video with subtitles");
}
};
// Parse the HH:MM:SS.mmm time format produced by formatTimeForVTT
const parseVTTTime = (timeStr: string): number => {
const parts = timeStr.split(":");
const seconds = parts[parts.length - 1].split(".");
return (
parseInt(parts[0]) * 3600 + // hours
parseInt(parts[1]) * 60 + // minutes
parseInt(seconds[0]) + // seconds
parseInt(seconds[1] || "0") / 1000 // milliseconds
);
};
export default function TranscriptionPlayer() {
const [file, setFile] = useState<File | null>(null);
const [transcription, setTranscription] =
useState<TranscriptionResponse | null>(null);
const [isLoading, setIsLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const [activeSegmentIndex, setActiveSegmentIndex] = useState<number | null>(
null,
);
const [mediaUrl, setMediaUrl] = useState<string | null>(null);
const [currentTime, setCurrentTime] = useState<number>(0);
const [lastScrollTime, setLastScrollTime] = useState<number>(0);
const [isProcessingVideo, setIsProcessingVideo] = useState<boolean>(false);
const [isDownloadingVideo, setIsDownloadingVideo] = useState<boolean>(false);
const audioRef = useRef<HTMLAudioElement>(null);
const videoRef = useRef<HTMLVideoElement>(null);
const fileInputRef = useRef<HTMLInputElement>(null);
const activeSegmentRef = useRef<HTMLDivElement>(null);
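// Transcription endpoint: configurable via Vite env, falling back to a same-origin /transcribe route.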
const apiEndpoint =
import.meta.env.VITE_TRANSCRIPTION_API_URL || "/transcribe";
const handleFileSelect = (event: React.ChangeEvent<HTMLInputElement>) => {
const selectedFile = event.target.files?.[0];
if (selectedFile) {
// Clean up previous URL if it exists
if (mediaUrl) {
URL.revokeObjectURL(mediaUrl);
}
// Create new URL for the selected file
const newMediaUrl = URL.createObjectURL(selectedFile);
setFile(selectedFile);
setMediaUrl(newMediaUrl);
setTranscription(null);
setError(null);
setActiveSegmentIndex(null);
}
};
const handleTranscribe = async () => {
if (!file) return;
setIsLoading(true);
setError(null);
try {
let audioFileToProcess = file;
// If it's a video file, extract audio first
if (file.type.startsWith("video/")) {
setIsProcessingVideo(true);
try {
console.log("Processing video file for audio extraction...");
audioFileToProcess = await extractAudioFromVideo(file);
console.log("Audio extraction completed:", audioFileToProcess.name);
} catch (videoError) {
console.warn(
"Video processing failed, using original file:",
videoError,
);
// Fallback to sending the video file directly
audioFileToProcess = file;
} finally {
setIsProcessingVideo(false);
}
}
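// Canned response used when USE_MOCK_DATA is true; overwritten by the real API result below otherwise.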
let result: TranscriptionResponse = {
transcription:
"the birch canoe slid on the smooth plants glue the sheet to the dark blue background it is easy to tell the depth of a well",
aligned_segments: [
{
text: "the",
start: 0.0,
end: 0.62124248496994,
duration: 0.62124248496994,
},
{
text: "birch",
start: 0.7214428857715431,
end: 1.122244488977956,
duration: 0.4008016032064128,
},
{
text: "canoe",
start: 1.2024048096192383,
end: 1.743486973947896,
duration: 0.5410821643286576,
},
{
text: "slid",
start: 1.8236472945891784,
end: 2.084168336673347,
duration: 0.2605210420841686,
},
{
text: "on",
start: 2.1442885771543088,
end: 2.284569138276553,
duration: 0.1402805611222444,
},
{
text: "the",
start: 2.364729458917836,
end: 2.5450901803607215,
duration: 0.18036072144288573,
},
{
text: "smooth",
start: 2.625250501002004,
end: 3.687374749498998,
duration: 1.062124248496994,
},
{
text: "plants",
start: 4.328657314629258,
end: 4.749498997995992,
duration: 0.4208416833667332,
},
{
text: "glue",
start: 4.829659318637275,
end: 5.01002004008016,
duration: 0.18036072144288529,
},
{
text: "the",
start: 5.070140280561122,
end: 5.170340681362725,
duration: 0.10020040080160264,
},
{
text: "sheet",
start: 5.2304609218436875,
end: 5.591182364729459,
duration: 0.36072144288577146,
},
{
text: "to",
start: 5.631262525050101,
end: 5.771543086172345,
duration: 0.1402805611222444,
},
{
text: "the",
start: 5.851703406813627,
end: 6.012024048096193,
duration: 0.16032064128256618,
},
{
text: "dark",
start: 6.072144288577155,
end: 6.332665330661323,
duration: 0.26052104208416793,
},
{
text: "blue",
start: 7.114228456913828,
end: 8.056112224448897,
duration: 0.9418837675350691,
},
{
text: "background",
start: 8.136272545090181,
end: 8.737474949899799,
duration: 0.6012024048096176,
},
{
text: "it",
start: 8.77755511022044,
end: 8.897795591182364,
duration: 0.12024048096192352,
},
{
text: "is",
start: 8.977955911823647,
end: 9.058116232464931,
duration: 0.08016032064128353,
},
{
text: "easy",
start: 9.118236472945892,
end: 9.438877755511022,
duration: 0.3206412825651306,
},
{
text: "to",
start: 9.498997995991983,
end: 9.97995991983968,
duration: 0.48096192384769587,
},
{
text: "tell",
start: 2.1042084168336674,
end: 2.124248496993988,
duration: 0.02004008016032044,
},
{
text: "the",
start: 2.1442885771543088,
end: 2.1843687374749496,
duration: 0.04008016032064088,
},
{
text: "depth",
start: 2.1843687374749496,
end: 2.284569138276553,
duration: 0.10020040080160353,
},
{
text: "of",
start: 2.284569138276553,
end: 2.364729458917836,
duration: 0.08016032064128265,
},
{
text: "a",
start: 2.364729458917836,
end: 2.4448897795591185,
duration: 0.08016032064128265,
},
{
text: "well",
start: 2.4448897795591185,
end: 2.50501002004008,
duration: 0.06012024048096176,
},
],
total_duration: 2.50501002004008,
num_segments: 26,
status: "success",
alignment_available: true,
device: "cpu",
model: "mock-whisper-model",
};
if (!USE_MOCK_DATA) {
// Use real API
const formData = new FormData();
formData.append("audio", audioFileToProcess);
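// The backend is expected to read the upload from the multipart field named "audio".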
const response = await fetch(apiEndpoint, {
method: "POST",
body: formData,
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
result = await response.json();
}
if (result.status === "success") {
setTranscription(result);
} else {
throw new Error("Transcription failed");
}
} catch (err) {
setError(
err instanceof Error
? err.message
: "An error occurred during transcription",
);
} finally {
setIsLoading(false);
setIsProcessingVideo(false);
}
};
const handleTimeUpdate = () => {
const mediaElement = audioRef.current || videoRef.current;
if (mediaElement && transcription?.aligned_segments) {
const mediaCurrentTime = mediaElement.currentTime;
setCurrentTime(mediaCurrentTime);
// Find the segment whose time range contains the current playback position
const activeIndex = transcription.aligned_segments.findIndex(
(segment) =>
mediaCurrentTime >= segment.start && mediaCurrentTime <= segment.end,
);
// If no exact match, find the closest segment
if (activeIndex === -1) {
let closestIndex = -1;
let minDistance = Infinity;
transcription.aligned_segments.forEach((segment, index) => {
const distance = Math.min(
Math.abs(mediaCurrentTime - segment.start),
Math.abs(mediaCurrentTime - segment.end),
);
if (distance < minDistance && distance < 0.5) {
// 0.5 second tolerance
minDistance = distance;
closestIndex = index;
}
});
setActiveSegmentIndex(closestIndex >= 0 ? closestIndex : null);
} else {
setActiveSegmentIndex(activeIndex);
}
}
};
const handleSeekToSegment = (segment: AlignedSegment) => {
const mediaElement = audioRef.current || videoRef.current;
if (mediaElement) {
mediaElement.currentTime = segment.start;
}
};
const handleDownloadVideoWithSubtitles = async () => {
if (!file || !transcription || !isVideoFile) return;
setIsDownloadingVideo(true);
try {
const vttContent = generateWebVTT(transcription.aligned_segments);
const filename = file.name.replace(/\.[^/.]+$/, "_with_subtitles.webm");
await downloadVideoWithSubtitles(file, vttContent, filename);
} catch (error) {
console.error("Error downloading video with subtitles:", error);
setError("Failed to download video with subtitles");
} finally {
setIsDownloadingVideo(false);
}
};
// Auto-scroll timeline to follow playback with throttling
useEffect(() => {
if (transcription && currentTime >= 0) {
const now = Date.now();
// Throttle auto-scroll so it runs at most once every SCROLL_INTERVAL ms
if (now - lastScrollTime < SCROLL_INTERVAL) {
return;
}
const timelineContainer = document.getElementById("timeline-container");
if (timelineContainer) {
const containerWidth = timelineContainer.clientWidth;
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32;
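// Mirrors the timeline sizing used in render: 200px per second (min 1200px) minus 32px of horizontal padding.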
const currentPosition =
4 + (currentTime / transcription.total_duration) * timelineWidth;
// Only scroll if the current position is near the edges of the visible area
const currentScrollLeft = timelineContainer.scrollLeft;
const leftBoundary = currentScrollLeft + containerWidth * 0.15; // 15% from left edge
const rightBoundary = currentScrollLeft + containerWidth * 0.85; // 15% from right edge
if (currentPosition < leftBoundary || currentPosition > rightBoundary) {
// Auto-scroll to keep the progress indicator centered
const scrollPosition = Math.max(
0,
currentPosition - containerWidth / 2,
);
timelineContainer.scrollTo({
left: scrollPosition,
behavior: "smooth",
});
setLastScrollTime(now);
}
}
}
}, [currentTime, transcription, lastScrollTime]);
// Also scroll to active segment when clicked
useEffect(() => {
if (
activeSegmentIndex !== null &&
activeSegmentRef.current &&
transcription
) {
const timelineContainer = document.getElementById("timeline-container");
if (timelineContainer) {
const segment = transcription.aligned_segments[activeSegmentIndex];
const containerWidth = timelineContainer.clientWidth;
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32;
const segmentPosition =
4 + (segment.start / transcription.total_duration) * timelineWidth;
// Scroll to center the active segment
const scrollPosition = Math.max(
0,
segmentPosition - containerWidth / 2,
);
timelineContainer.scrollTo({
left: scrollPosition,
behavior: "smooth",
});
}
}
}, [activeSegmentIndex, transcription]);
// Cleanup media URL on unmount
useEffect(() => {
return () => {
if (mediaUrl) {
URL.revokeObjectURL(mediaUrl);
}
};
}, [mediaUrl]);
const formatTime = (seconds: number) => {
const mins = Math.floor(seconds / 60);
const secs = Math.floor(seconds % 60);
return `${mins}:${secs.toString().padStart(2, "0")}`;
};
const isVideoFile = file?.type.startsWith("video/");
return (
<div className="flex h-screen bg-gray-900">
<style>{`
.line-clamp-2 {
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
overflow: hidden;
}
.hover\\:scale-102:hover {
transform: scale(1.02);
}
.timeline-segment:hover .timeline-tooltip {
opacity: 1;
}
#timeline-container {
scroll-behavior: smooth;
-webkit-overflow-scrolling: touch;
}
#timeline-container::-webkit-scrollbar {
height: 8px;
}
#timeline-container::-webkit-scrollbar-track {
background: #374151;
border-radius: 4px;
}
#timeline-container::-webkit-scrollbar-thumb {
background: #6b7280;
border-radius: 4px;
}
#timeline-container::-webkit-scrollbar-thumb:hover {
background: #9ca3af;
}
`}</style>
{/* Side Panel */}
<div className="w-80 bg-gray-800 text-white p-6 overflow-y-auto">
<h2 className="text-2xl font-bold mb-6">Audio/Video Transcription</h2>
{/* File Upload */}
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Upload Media</h3>
<input
ref={fileInputRef}
type="file"
accept="audio/*,video/*"
onChange={handleFileSelect}
className="hidden"
/>
<button
onClick={() => fileInputRef.current?.click()}
className="w-full p-3 bg-blue-600 hover:bg-blue-700 rounded-lg transition-colors"
>
Choose Audio/Video File
</button>
{file && (
<div className="mt-3 p-3 bg-gray-700 rounded">
<div className="text-sm font-medium">{file.name}</div>
<div className="text-xs text-gray-400">
{(file.size / 1024 / 1024).toFixed(2)} MB
</div>
</div>
)}
</div>
{/* Transcribe Button */}
{file && !transcription && (
<div className="mb-6">
<button
onClick={handleTranscribe}
disabled={isLoading}
className="w-full p-3 bg-green-600 hover:bg-green-700 disabled:bg-gray-600 rounded-lg transition-colors"
>
{isLoading
? isProcessingVideo
? "Processing Video..."
: "Transcribing..."
: "Transcribe"}
</button>
{isVideoFile && (
<div className="mt-2 text-xs text-gray-400">
Video files will be processed to extract audio for
transcription.
</div>
)}
</div>
)}
{/* Error Display */}
{error && (
<div className="mb-6 p-3 bg-red-600 rounded">
<div className="text-sm font-medium">Error</div>
<div className="text-xs">{error}</div>
</div>
)}
{/* Transcription Info */}
{transcription && (
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Transcription Info</h3>
<div className="space-y-2 text-sm">
<div>Model: {transcription.model}</div>
<div>Segments: {transcription.num_segments}</div>
<div>Duration: {formatTime(transcription.total_duration)}</div>
<div>Device: {transcription.device}</div>
</div>
</div>
)}
{/* Subtitle Download Section */}
{transcription && (
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Download Subtitles</h3>
<div className="space-y-2">
<button
onClick={() => {
const srtContent = generateSRT(
transcription.aligned_segments,
);
const filename =
file?.name?.replace(/\.[^/.]+$/, ".srt") || "subtitles.srt";
downloadSubtitles(srtContent, filename);
}}
className="w-full p-2 bg-purple-600 hover:bg-purple-700 rounded-lg transition-colors text-sm"
>
Download SRT Subtitles
</button>
<button
onClick={() => {
const vttContent = generateWebVTT(
transcription.aligned_segments,
);
const filename =
file?.name?.replace(/\.[^/.]+$/, ".vtt") || "subtitles.vtt";
downloadSubtitles(vttContent, filename);
}}
className="w-full p-2 bg-indigo-600 hover:bg-indigo-700 rounded-lg transition-colors text-sm"
>
Download WebVTT Subtitles
</button>
{isVideoFile && (
<button
onClick={handleDownloadVideoWithSubtitles}
disabled={isDownloadingVideo}
className="w-full p-2 bg-orange-600 hover:bg-orange-700 disabled:bg-gray-600 rounded-lg transition-colors text-sm"
>
{isDownloadingVideo
? "Creating Video..."
: "Download Video with Embedded Subtitles"}
</button>
)}
</div>
<div className="mt-2 text-xs text-gray-400">
SRT format works with most video players. WebVTT is ideal for web
browsers.
{isVideoFile &&
" Video with embedded subtitles will be in WebM format."}
</div>
</div>
)}
{/* Full Transcription */}
{transcription && (
<div className="mb-6">
<h3 className="text-lg font-semibold mb-3">Full Transcription</h3>
<div className="p-3 bg-gray-700 rounded text-sm">
{transcription.transcription}
</div>
</div>
)}
{/* Instructions */}
<div className="border-t border-gray-700 pt-4">
<h3 className="text-sm font-semibold mb-2">How to Use</h3>
<div className="text-xs text-gray-400 space-y-1">
<div>• Upload an audio or video file</div>
<div>• Click "Transcribe" to process</div>
<div>• For videos, audio will be extracted automatically</div>
<div>• Play media to see synchronized text</div>
<div>• Click on segments to jump to that time</div>
<div>• Download subtitles in SRT or WebVTT format</div>
<div>• For videos, download with embedded subtitles</div>
<div>• Active segments are highlighted in blue</div>
</div>
</div>
</div>
{/* Main Content */}
<div className="flex-1 flex flex-col bg-black">
{/* Media Player */}
{file && (
<div className="p-6 bg-gray-800">
<div className="max-w-4xl mx-auto">
{isVideoFile ? (
<div className="relative">
<video
ref={videoRef}
src={mediaUrl || ""}
controls
onTimeUpdate={handleTimeUpdate}
className="w-full max-h-96 rounded-lg"
>
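{/* btoa throws on non-Latin-1 subtitle text; a Blob object URL would be a safer track source for non-ASCII transcripts. */}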
{transcription && (
<track
kind="subtitles"
src={`data:text/vtt;base64,${btoa(generateWebVTT(transcription.aligned_segments))}`}
srcLang="en"
label="English"
default
/>
)}
</video>
{transcription && (
<div className="absolute bottom-2 right-2 bg-black bg-opacity-75 text-white px-2 py-1 rounded text-xs">
Subtitles: {transcription.aligned_segments.length}{" "}
segments
</div>
)}
</div>
) : (
<div className="bg-gray-700 p-8 rounded-lg">
<audio
ref={audioRef}
src={mediaUrl || ""}
controls
onTimeUpdate={handleTimeUpdate}
className="w-full"
/>
<div className="mt-4 text-center text-gray-300">
<div className="text-lg font-medium">Audio File</div>
<div className="text-sm">{file.name}</div>
</div>
</div>
)}
</div>
</div>
)}
{/* Transcription Timeline */}
{transcription && (
<div className="flex-1 flex flex-col bg-gray-900">
<div className="p-4 bg-gray-800">
<h3 className="text-xl font-bold text-white mb-2">
Synchronized Transcription Timeline
</h3>
<div className="text-sm text-gray-400">
Click on segments to jump to that time • Active segment
highlighted in blue
</div>
</div>
<div className="flex-1 relative overflow-hidden">
{/* Timeline Container */}
<div
className="h-full overflow-x-auto overflow-y-hidden"
id="timeline-container"
>
<div
className="relative h-full py-8 px-4"
style={{
width: `${Math.max(transcription.total_duration * 200, 1200)}px`,
minWidth: "100%",
}}
>
{/* Timeline Base Line */}
<div className="absolute top-1/2 left-4 right-4 h-0.5 bg-gray-600 transform -translate-y-1/2"></div>
{/* Progress Indicator */}
<div
className="absolute top-0 bottom-0 w-0.5 bg-red-500 z-20 transition-all duration-75"
style={{
left: `${4 + (currentTime / transcription.total_duration) * (Math.max(transcription.total_duration * 200, 1200) - 32)}px`,
}}
></div>
{/* Segment Blocks */}
{transcription.aligned_segments.map((segment, index) => {
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32; // Account for padding
const leftPosition =
4 +
(segment.start / transcription.total_duration) *
timelineWidth;
const blockWidth = Math.max(
(segment.duration / transcription.total_duration) *
timelineWidth,
80,
);
const isActive = activeSegmentIndex === index;
return (
<div
key={index}
ref={isActive ? activeSegmentRef : null}
onClick={() => handleSeekToSegment(segment)}
className={`timeline-segment absolute cursor-pointer transition-all duration-200 ${
isActive
? "bg-blue-600 text-white scale-105 z-10"
: "bg-gray-700 text-gray-300 hover:bg-gray-600 hover:scale-102"
}`}
style={{
left: `${leftPosition}px`,
width: `${blockWidth}px`,
top: "35%",
height: "30%",
}}
>
<div className="h-full flex flex-col justify-center p-2 rounded-lg shadow-lg">
<div className="text-xs font-medium leading-tight line-clamp-2">
{segment.text}
</div>
<div className="text-xs opacity-75 mt-1">
{formatTime(segment.start)}
</div>
</div>
{/* Hover tooltip for longer text */}
<div className="timeline-tooltip absolute bottom-full left-1/2 transform -translate-x-1/2 mb-2 px-2 py-1 bg-gray-800 text-white text-xs rounded opacity-0 transition-opacity duration-200 pointer-events-none z-30 max-w-xs">
<div className="whitespace-normal break-words">
{segment.text}
</div>
<div className="text-gray-400 mt-1">
{formatTime(segment.start)} -{" "}
{formatTime(segment.end)} (
{segment.duration.toFixed(1)}s)
</div>
</div>
</div>
);
})}
{/* Time markers */}
{Array.from(
{
length: Math.ceil(transcription.total_duration / 5) + 1,
},
(_, i) => i * 5,
)
.filter((time) => time <= transcription.total_duration)
.map((time) => {
const timelineWidth =
Math.max(transcription.total_duration * 200, 1200) - 32;
const position =
4 +
(time / transcription.total_duration) * timelineWidth;
return (
<div
key={time}
className="absolute text-xs text-gray-500"
style={{
left: `${position}px`,
top: "75%",
transform: "translateX(-50%)",
}}
>
<div className="w-0.5 h-4 bg-gray-500 mx-auto mb-1"></div>
{formatTime(time)}
</div>
);
})}
</div>
</div>
{/* Timeline Controls */}
<div className="absolute bottom-4 left-1/2 transform -translate-x-1/2 bg-gray-800 rounded-lg p-2 shadow-lg">
<div className="flex items-center space-x-4 text-white text-sm">
<span>Timeline View</span>
<span className="text-gray-400">|</span>
<span className="text-blue-400">
{activeSegmentIndex !== null
? `Segment ${activeSegmentIndex + 1}/${transcription.aligned_segments.length}`
: "No active segment"}
</span>
<span className="text-gray-400">|</span>
<span className="text-green-400">
{formatTime(currentTime)} /{" "}
{formatTime(transcription.total_duration)}
</span>
</div>
</div>
</div>
</div>
)}
{/* Loading State */}
{isLoading && (
<div className="flex-1 flex items-center justify-center">
<div className="text-center text-white">
<div className="animate-spin rounded-full h-12 w-12 border-b-2 border-white mx-auto mb-4"></div>
<div className="text-lg">
{isProcessingVideo
? "Extracting audio from video..."
: "Transcribing your media..."}
</div>
<div className="text-sm text-gray-400 mt-2">
{isProcessingVideo
? "Converting video to audio format for transcription"
: "This may take a few moments"}
</div>
</div>
</div>
)}
{/* Empty State */}
{!file && !isLoading && (
<div className="flex-1 flex items-center justify-center">
<div className="text-center text-gray-400">
<div className="text-6xl mb-4">🎵</div>
<div className="text-xl mb-2">Upload Audio or Video</div>
<div className="text-sm">
Choose a media file to get started with transcription
</div>
</div>
</div>
)}
</div>
</div>
);
}