import { useRef, useState, useEffect } from "react";

interface AlignedSegment {
  duration: number;
  end: number;
  start: number;
  text: string;
}

interface TranscriptionResponse {
  aligned_segments: AlignedSegment[];
  alignment_available: boolean;
  device: string;
  model: string;
  num_segments: number;
  status: string;
  total_duration: number;
  transcription: string;
}

// Minimum interval in milliseconds between timeline auto-scroll updates
const SCROLL_INTERVAL = 100;
const USE_MOCK_DATA = false;

// Helper function to encode an AudioBuffer as a 16-bit PCM WAV file
const encodeWAV = (audioBuffer: AudioBuffer): ArrayBuffer => {
  const length = audioBuffer.length;
  const numberOfChannels = audioBuffer.numberOfChannels;
  const sampleRate = audioBuffer.sampleRate;
  // 44-byte RIFF/WAVE header followed by interleaved 16-bit samples
  const arrayBuffer = new ArrayBuffer(44 + length * numberOfChannels * 2);
  const view = new DataView(arrayBuffer);

  const writeString = (offset: number, string: string) => {
    for (let i = 0; i < string.length; i++) {
      view.setUint8(offset + i, string.charCodeAt(i));
    }
  };

  // WAV header (multi-byte fields are little-endian)
  writeString(0, "RIFF");
  view.setUint32(4, 36 + length * numberOfChannels * 2, true); // RIFF chunk size
  writeString(8, "WAVE");
  writeString(12, "fmt ");
  view.setUint32(16, 16, true); // fmt chunk size
  view.setUint16(20, 1, true); // audio format: PCM
  view.setUint16(22, numberOfChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * numberOfChannels * 2, true); // byte rate
  view.setUint16(32, numberOfChannels * 2, true); // block align
  view.setUint16(34, 16, true); // bits per sample
  writeString(36, "data");
  view.setUint32(40, length * numberOfChannels * 2, true); // data chunk size

  // Convert float32 audio data to interleaved int16
  const channels: Float32Array[] = [];
  for (let i = 0; i < numberOfChannels; i++) {
    channels.push(audioBuffer.getChannelData(i));
  }
  let offset = 44;
  for (let i = 0; i < length; i++) {
    for (let channel = 0; channel < numberOfChannels; channel++) {
      const sample = Math.max(-1, Math.min(1, channels[channel][i]));
      view.setInt16(
        offset,
        sample < 0 ? sample * 0x8000 : sample * 0x7fff,
        true,
      );
      offset += 2;
    }
  }
  return arrayBuffer;
};
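
// A minimal alternative sketch (an assumption, not wired into the component
// below): AudioContext.decodeAudioData can often decode the audio track of a
// video container directly, avoiding the realtime ScriptProcessor capture in
// extractAudioFromVideo. Exported for illustration; the helper name is ours.
export const decodeFileToWav = async (mediaFile: File): Promise<File> => {
  const ctx = new AudioContext();
  try {
    // Decode the file's audio track into an AudioBuffer, then re-encode as WAV
    const buffer = await ctx.decodeAudioData(await mediaFile.arrayBuffer());
    const wav = new Blob([encodeWAV(buffer)], { type: "audio/wav" });
    return new File([wav], mediaFile.name.replace(/\.[^/.]+$/, ".wav"), {
      type: "audio/wav",
    });
  } finally {
    await ctx.close();
  }
};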

// Audio extraction function using the Web Audio API, producing a WAV file.
// Note: this captures in realtime while the video plays (audibly), via the
// deprecated ScriptProcessorNode; it is a pragmatic fallback rather than a
// fast offline path.
const extractAudioFromVideo = async (videoFile: File): Promise<File> => {
  console.log(
    "Extracting audio from video using Web Audio API (WAV format):",
    videoFile.name,
  );
  try {
    // Create a video element to load the video file
    const video = document.createElement("video");
    const videoUrl = URL.createObjectURL(videoFile);
    video.src = videoUrl;
    video.crossOrigin = "anonymous";
    // Wait for video metadata to load
    await new Promise((resolve, reject) => {
      video.onloadedmetadata = resolve;
      video.onerror = reject;
      video.load();
    });
    // Create an audio context (webkit prefix for older Safari)
    const audioContext = new (window.AudioContext ||
      (window as any).webkitAudioContext)();
    const source = audioContext.createMediaElementSource(video);
    // Script processor to capture raw PCM while the video plays
    const processor = audioContext.createScriptProcessor(4096, 2, 2);
    const audioData: number[][] = [[], []];
    let isRecording = false;
    processor.onaudioprocess = (event) => {
      if (!isRecording) return;
      const inputBuffer = event.inputBuffer;
      const leftChannel = inputBuffer.getChannelData(0);
      // Duplicate the left channel if the source is mono
      const rightChannel =
        inputBuffer.numberOfChannels > 1
          ? inputBuffer.getChannelData(1)
          : leftChannel;
      // Append captured samples (O(n^2) concat; acceptable for short clips)
      audioData[0] = audioData[0].concat(Array.from(leftChannel));
      audioData[1] = audioData[1].concat(Array.from(rightChannel));
    };
    // Connect the audio processing chain
    source.connect(processor);
    processor.connect(audioContext.destination);
    return new Promise<File>((resolve, reject) => {
      // Start recording when the video plays
      video.onplay = () => {
        isRecording = true;
      };
      video.onended = () => {
        isRecording = false;
        // Create an AudioBuffer from the captured data
        const audioBuffer = audioContext.createBuffer(
          2,
          audioData[0].length,
          audioContext.sampleRate,
        );
        // Convert the number arrays to Float32Arrays and copy to channels
        const leftChannelData = new Float32Array(audioData[0]);
        const rightChannelData = new Float32Array(audioData[1]);
        audioBuffer.copyToChannel(leftChannelData, 0);
        audioBuffer.copyToChannel(rightChannelData, 1);
        // Encode as WAV
        const wavArrayBuffer = encodeWAV(audioBuffer);
        const audioBlob = new Blob([wavArrayBuffer], { type: "audio/wav" });
        const audioFile = new File(
          [audioBlob],
          videoFile.name.replace(/\.[^/.]+$/, ".wav"),
          { type: "audio/wav" },
        );
        // Clean up
        URL.revokeObjectURL(videoUrl);
        processor.disconnect();
        source.disconnect();
        audioContext.close();
        resolve(audioFile);
      };
      video.onerror = (error) => {
        console.error("Video error:", error);
        URL.revokeObjectURL(videoUrl);
        processor.disconnect();
        source.disconnect();
        audioContext.close();
        reject(new Error("Failed to process video"));
      };
      // Start playing the video from the beginning
      video.currentTime = 0;
      video.play().catch(reject);
      // Fallback: force the "ended" path if the ended event never fires
      setTimeout(
        () => {
          if (isRecording) {
            video.pause();
            video.dispatchEvent(new Event("ended"));
          }
        },
        (video.duration + 2) * 1000,
      );
    });
  } catch (error) {
    console.error("Error extracting audio from video:", error);
    throw new Error(
      `Failed to extract audio: ${error instanceof Error ? error.message : "Unknown error"}`,
    );
  }
};

// Generate SRT subtitle format
const generateSRT = (segments: AlignedSegment[]): string => {
  let srt = "";
  segments.forEach((segment, index) => {
    const startTime = formatTimeForSRT(segment.start);
    const endTime = formatTimeForSRT(segment.end);
    srt += `${index + 1}\n`;
    srt += `${startTime} --> ${endTime}\n`;
    srt += `${segment.text}\n\n`;
  });
  return srt;
};
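
// Example (illustrative): a segment { text: "canoe", start: 1.2024,
// end: 1.7434 } at index 2 would render as:
//
//   3
//   00:00:01,202 --> 00:00:01,743
//   canoe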

// Generate WebVTT subtitle format
const generateWebVTT = (segments: AlignedSegment[]): string => {
  let vtt = "WEBVTT\n\n";
  segments.forEach((segment, index) => {
    const startTime = formatTimeForVTT(segment.start);
    const endTime = formatTimeForVTT(segment.end);
    vtt += `${index + 1}\n`;
    vtt += `${startTime} --> ${endTime}\n`;
    vtt += `${segment.text}\n\n`;
  });
  return vtt;
};

// Format time for SRT (HH:MM:SS,mmm)
const formatTimeForSRT = (seconds: number): string => {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const milliseconds = Math.floor((seconds % 1) * 1000);
  return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")},${milliseconds.toString().padStart(3, "0")}`;
};

// Format time for WebVTT (HH:MM:SS.mmm)
const formatTimeForVTT = (seconds: number): string => {
  const hours = Math.floor(seconds / 3600);
  const minutes = Math.floor((seconds % 3600) / 60);
  const secs = Math.floor(seconds % 60);
  const milliseconds = Math.floor((seconds % 1) * 1000);
  return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${secs.toString().padStart(2, "0")}.${milliseconds.toString().padStart(3, "0")}`;
};

// Download subtitle file
const downloadSubtitles = (content: string, filename: string) => {
  const blob = new Blob([content], { type: "text/plain;charset=utf-8" });
  const url = URL.createObjectURL(blob);
  const link = document.createElement("a");
  link.href = url;
  link.download = filename;
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  URL.revokeObjectURL(url);
};
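
// Usage (illustrative): downloadSubtitles(generateSRT(segments), "clip.srt")
// triggers a client-side save via a temporary object URL and anchor click.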

// Download the video with burned-in subtitles.
// Note: this replays the video in realtime, compositing frames plus subtitle
// text onto a canvas and capturing the result with MediaRecorder as WebM.
const downloadVideoWithSubtitles = async (
  videoFile: File,
  subtitleContent: string,
  filename: string,
) => {
  try {
    // Canvas used to render the video frames with subtitles
    const video = document.createElement("video");
    const canvas = document.createElement("canvas");
    const ctx = canvas.getContext("2d")!;
    // Load the video
    const videoUrl = URL.createObjectURL(videoFile);
    video.src = videoUrl;
    video.crossOrigin = "anonymous";
    await new Promise((resolve, reject) => {
      video.onloadedmetadata = resolve;
      video.onerror = reject;
      video.load();
    });
    // Set canvas dimensions to match the video
    canvas.width = video.videoWidth;
    canvas.height = video.videoHeight;
    // Parse WebVTT cues (assumes single-line cue text, as generated above)
    const subtitleLines = subtitleContent.split("\n");
    const subtitles: Array<{ start: number; end: number; text: string }> = [];
    for (let i = 0; i < subtitleLines.length; i++) {
      const line = subtitleLines[i].trim();
      if (line.includes(" --> ")) {
        const [startStr, endStr] = line.split(" --> ");
        const start = parseVTTTime(startStr);
        const end = parseVTTTime(endStr);
        const text = subtitleLines[i + 1]?.trim() || "";
        if (text) {
          subtitles.push({ start, end, text });
        }
      }
    }
    // Create a MediaRecorder to capture the canvas
    const stream = canvas.captureStream(30); // 30 FPS
    // Route the original video's audio into the captured stream
    const audioContext = new AudioContext();
    const source = audioContext.createMediaElementSource(video);
    const dest = audioContext.createMediaStreamDestination();
    source.connect(dest);
    // Combine the video and audio streams
    const audioTrack = dest.stream.getAudioTracks()[0];
    if (audioTrack) {
      stream.addTrack(audioTrack);
    }
    const mediaRecorder = new MediaRecorder(stream, {
      mimeType: "video/webm;codecs=vp8,opus",
    });
    const chunks: Blob[] = [];
    mediaRecorder.ondataavailable = (event) => {
      if (event.data.size > 0) {
        chunks.push(event.data);
      }
    };
    mediaRecorder.onstop = () => {
      // Assemble the recording and trigger a download
      const blob = new Blob(chunks, { type: "video/webm" });
      const url = URL.createObjectURL(blob);
      const link = document.createElement("a");
      link.href = url;
      link.download = filename;
      document.body.appendChild(link);
      link.click();
      document.body.removeChild(link);
      URL.revokeObjectURL(url);
      URL.revokeObjectURL(videoUrl);
      audioContext.close();
    };
    // Start recording
    mediaRecorder.start();
    // Play the video and render each frame with its current subtitle
    const renderFrame = () => {
      if (video.ended) {
        mediaRecorder.stop();
        return;
      }
      // Draw the video frame
      ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
      // Find the cue covering the current playback time
      const currentTime = video.currentTime;
      const currentSubtitle = subtitles.find(
        (sub) => currentTime >= sub.start && currentTime <= sub.end,
      );
      // Draw the subtitle, if any, on a translucent background
      if (currentSubtitle) {
        ctx.fillStyle = "rgba(0, 0, 0, 0.7)";
        ctx.font = `${Math.max(16, canvas.height / 25)}px Arial`;
        ctx.textAlign = "center";
        ctx.textBaseline = "bottom";
        const textWidth = ctx.measureText(currentSubtitle.text).width;
        const padding = 10;
        const textX = canvas.width / 2;
        const textY = canvas.height - 30;
        // Draw the background rectangle
        ctx.fillRect(
          textX - textWidth / 2 - padding,
          textY - parseInt(ctx.font) - padding,
          textWidth + padding * 2,
          parseInt(ctx.font) + padding * 2,
        );
        // Draw the text
        ctx.fillStyle = "white";
        ctx.fillText(currentSubtitle.text, textX, textY);
      }
      requestAnimationFrame(renderFrame);
    };
    // Start playback and rendering
    video.play();
    renderFrame();
  } catch (error) {
    console.error("Error creating video with subtitles:", error);
    throw new Error("Failed to create video with subtitles");
  }
};
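
// Note: because the capture above runs in realtime and re-encodes to WebM,
// output quality depends on playback staying smooth; muxing subtitles
// server-side (e.g. with ffmpeg) would be a more robust alternative.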

// Parse a VTT timestamp back to seconds. Assumes the HH:MM:SS.mmm shape
// produced by formatTimeForVTT (the milliseconds field must be 3 digits).
const parseVTTTime = (timeStr: string): number => {
  const parts = timeStr.split(":");
  const seconds = parts[parts.length - 1].split(".");
  return (
    parseInt(parts[0]) * 3600 + // hours
    parseInt(parts[1]) * 60 + // minutes
    parseInt(seconds[0]) + // seconds
    parseInt(seconds[1] || "0") / 1000 // milliseconds
  );
};
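
// Sanity check (illustrative): round-tripping through the formatter recovers
// the time to within 1 ms, e.g. parseVTTTime(formatTimeForVTT(9.498)) ~ 9.498.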

export default function TranscriptionPlayer() {
  const [file, setFile] = useState<File | null>(null);
  const [transcription, setTranscription] =
    useState<TranscriptionResponse | null>(null);
  const [isLoading, setIsLoading] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [activeSegmentIndex, setActiveSegmentIndex] = useState<number | null>(
    null,
  );
  const [mediaUrl, setMediaUrl] = useState<string | null>(null);
  const [currentTime, setCurrentTime] = useState<number>(0);
  const [lastScrollTime, setLastScrollTime] = useState<number>(0);
  const [isProcessingVideo, setIsProcessingVideo] = useState<boolean>(false);
  const [isDownloadingVideo, setIsDownloadingVideo] = useState<boolean>(false);
  const audioRef = useRef<HTMLAudioElement>(null);
  const videoRef = useRef<HTMLVideoElement>(null);
  const fileInputRef = useRef<HTMLInputElement>(null);
  const activeSegmentRef = useRef<HTMLDivElement>(null);
  const apiEndpoint =
    import.meta.env.VITE_TRANSCRIPTION_API_URL || "/transcribe";
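  // The endpoint can be overridden at build time via Vite env config, e.g. in
  // .env (illustrative): VITE_TRANSCRIPTION_API_URL=https://api.example.com/transcribe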

  const handleFileSelect = (event: React.ChangeEvent<HTMLInputElement>) => {
    const selectedFile = event.target.files?.[0];
    if (selectedFile) {
      // Clean up the previous URL if it exists
      if (mediaUrl) {
        URL.revokeObjectURL(mediaUrl);
      }
      // Create a new URL for the selected file
      const newMediaUrl = URL.createObjectURL(selectedFile);
      setFile(selectedFile);
      setMediaUrl(newMediaUrl);
      setTranscription(null);
      setError(null);
      setActiveSegmentIndex(null);
    }
  };

  const handleTranscribe = async () => {
    if (!file) return;
    setIsLoading(true);
    setError(null);
    try {
      let audioFileToProcess = file;
      // If it's a video file, extract the audio first
      if (file.type.startsWith("video/")) {
        setIsProcessingVideo(true);
        try {
          console.log("Processing video file for audio extraction...");
          audioFileToProcess = await extractAudioFromVideo(file);
          console.log("Audio extraction completed:", audioFileToProcess.name);
        } catch (videoError) {
          console.warn(
            "Video processing failed, using original file:",
            videoError,
          );
          // Fall back to sending the video file directly
          audioFileToProcess = file;
        } finally {
          setIsProcessingVideo(false);
        }
      }
      // Mock response, used only when USE_MOCK_DATA is true; otherwise it is
      // overwritten by the API response below
      let result: TranscriptionResponse = {
        transcription:
          "the birch canoe slid on the smooth plants glue the sheet to the dark blue background it is easy to tell the depth of a well",
        aligned_segments: [
          {
            text: "the",
            start: 0.0,
            end: 0.62124248496994,
            duration: 0.62124248496994,
          },
          {
            text: "birch",
            start: 0.7214428857715431,
            end: 1.122244488977956,
            duration: 0.4008016032064128,
          },
          {
            text: "canoe",
            start: 1.2024048096192383,
            end: 1.743486973947896,
            duration: 0.5410821643286576,
          },
          {
            text: "slid",
            start: 1.8236472945891784,
            end: 2.084168336673347,
            duration: 0.2605210420841686,
          },
          {
            text: "on",
            start: 2.1442885771543088,
            end: 2.284569138276553,
            duration: 0.1402805611222444,
          },
          {
            text: "the",
            start: 2.364729458917836,
            end: 2.5450901803607215,
            duration: 0.18036072144288573,
          },
          {
            text: "smooth",
            start: 2.625250501002004,
            end: 3.687374749498998,
            duration: 1.062124248496994,
          },
          {
            text: "plants",
            start: 4.328657314629258,
            end: 4.749498997995992,
            duration: 0.4208416833667332,
          },
          {
            text: "glue",
            start: 4.829659318637275,
            end: 5.01002004008016,
            duration: 0.18036072144288529,
          },
          {
            text: "the",
            start: 5.070140280561122,
            end: 5.170340681362725,
            duration: 0.10020040080160264,
          },
          {
            text: "sheet",
            start: 5.2304609218436875,
            end: 5.591182364729459,
            duration: 0.36072144288577146,
          },
          {
            text: "to",
            start: 5.631262525050101,
            end: 5.771543086172345,
            duration: 0.1402805611222444,
          },
          {
            text: "the",
            start: 5.851703406813627,
            end: 6.012024048096193,
            duration: 0.16032064128256618,
          },
          {
            text: "dark",
            start: 6.072144288577155,
            end: 6.332665330661323,
            duration: 0.26052104208416793,
          },
          {
            text: "blue",
            start: 7.114228456913828,
            end: 8.056112224448897,
            duration: 0.9418837675350691,
          },
          {
            text: "background",
            start: 8.136272545090181,
            end: 8.737474949899799,
            duration: 0.6012024048096176,
          },
          {
            text: "it",
            start: 8.77755511022044,
            end: 8.897795591182364,
            duration: 0.12024048096192352,
          },
          {
            text: "is",
            start: 8.977955911823647,
            end: 9.058116232464931,
            duration: 0.08016032064128353,
          },
          {
            text: "easy",
            start: 9.118236472945892,
            end: 9.438877755511022,
            duration: 0.3206412825651306,
          },
          {
            text: "to",
            start: 9.498997995991983,
            end: 9.97995991983968,
            duration: 0.48096192384769587,
          },
          {
            text: "tell",
            start: 2.1042084168336674,
            end: 2.124248496993988,
            duration: 0.02004008016032044,
          },
          {
            text: "the",
            start: 2.1442885771543088,
            end: 2.1843687374749496,
            duration: 0.04008016032064088,
          },
          {
            text: "depth",
            start: 2.1843687374749496,
            end: 2.284569138276553,
            duration: 0.10020040080160353,
          },
          {
            text: "of",
            start: 2.284569138276553,
            end: 2.364729458917836,
            duration: 0.08016032064128265,
          },
          {
            text: "a",
            start: 2.364729458917836,
            end: 2.4448897795591185,
            duration: 0.08016032064128265,
          },
          {
            text: "well",
            start: 2.4448897795591185,
            end: 2.50501002004008,
            duration: 0.06012024048096176,
          },
        ],
        total_duration: 2.50501002004008,
        num_segments: 26,
        status: "success",
        alignment_available: true,
        device: "cpu",
        model: "mock-whisper-model",
      };
      if (!USE_MOCK_DATA) {
        // Use the real API
        const formData = new FormData();
        formData.append("audio", audioFileToProcess);
        const response = await fetch(apiEndpoint, {
          method: "POST",
          body: formData,
        });
        if (!response.ok) {
          throw new Error(`HTTP error! status: ${response.status}`);
        }
        result = await response.json();
      }
      if (result.status === "success") {
        setTranscription(result);
      } else {
        throw new Error("Transcription failed");
      }
    } catch (err) {
      setError(
        err instanceof Error
          ? err.message
          : "An error occurred during transcription",
      );
    } finally {
      setIsLoading(false);
      setIsProcessingVideo(false);
    }
  };

  const handleTimeUpdate = () => {
    const mediaElement = audioRef.current || videoRef.current;
    if (mediaElement && transcription?.aligned_segments) {
      const mediaCurrentTime = mediaElement.currentTime;
      setCurrentTime(mediaCurrentTime);
      // Find the segment whose time range contains the current time
      const activeIndex = transcription.aligned_segments.findIndex(
        (segment) =>
          mediaCurrentTime >= segment.start && mediaCurrentTime <= segment.end,
      );
      // If there is no exact match, fall back to the closest segment boundary
      if (activeIndex === -1) {
        let closestIndex = -1;
        let minDistance = Infinity;
        transcription.aligned_segments.forEach((segment, index) => {
          const distance = Math.min(
            Math.abs(mediaCurrentTime - segment.start),
            Math.abs(mediaCurrentTime - segment.end),
          );
          if (distance < minDistance && distance < 0.5) {
            // 0.5 second tolerance
            minDistance = distance;
            closestIndex = index;
          }
        });
        setActiveSegmentIndex(closestIndex >= 0 ? closestIndex : null);
      } else {
        setActiveSegmentIndex(activeIndex);
      }
    }
  };

  const handleSeekToSegment = (segment: AlignedSegment) => {
    const mediaElement = audioRef.current || videoRef.current;
    if (mediaElement) {
      mediaElement.currentTime = segment.start;
    }
  };

  const handleDownloadVideoWithSubtitles = async () => {
    if (!file || !transcription || !isVideoFile) return;
    setIsDownloadingVideo(true);
    try {
      const vttContent = generateWebVTT(transcription.aligned_segments);
      const filename = file.name.replace(/\.[^/.]+$/, "_with_subtitles.webm");
      await downloadVideoWithSubtitles(file, vttContent, filename);
    } catch (error) {
      console.error("Error downloading video with subtitles:", error);
      setError("Failed to download video with subtitles");
    } finally {
      setIsDownloadingVideo(false);
    }
  };

  // Auto-scroll the timeline to follow playback, with throttling
  useEffect(() => {
    if (transcription && currentTime >= 0) {
      const now = Date.now();
      // Throttle scroll updates to every SCROLL_INTERVAL ms for smoother
      // performance
      if (now - lastScrollTime < SCROLL_INTERVAL) {
        return;
      }
      const timelineContainer = document.getElementById("timeline-container");
      if (timelineContainer) {
        const containerWidth = timelineContainer.clientWidth;
        const timelineWidth =
          Math.max(transcription.total_duration * 200, 1200) - 32;
        const currentPosition =
          4 + (currentTime / transcription.total_duration) * timelineWidth;
        // Only scroll when the indicator nears the edge of the visible area
        const currentScrollLeft = timelineContainer.scrollLeft;
        const leftBoundary = currentScrollLeft + containerWidth * 0.15; // 15% from left edge
        const rightBoundary = currentScrollLeft + containerWidth * 0.85; // 15% from right edge
        if (currentPosition < leftBoundary || currentPosition > rightBoundary) {
          // Auto-scroll to keep the progress indicator centered
          const scrollPosition = Math.max(
            0,
            currentPosition - containerWidth / 2,
          );
          timelineContainer.scrollTo({
            left: scrollPosition,
            behavior: "smooth",
          });
          setLastScrollTime(now);
        }
      }
    }
  }, [currentTime, transcription, lastScrollTime]);

  // Also scroll to center the active segment whenever it changes
  useEffect(() => {
    if (
      activeSegmentIndex !== null &&
      activeSegmentRef.current &&
      transcription
    ) {
      const timelineContainer = document.getElementById("timeline-container");
      if (timelineContainer) {
        const segment = transcription.aligned_segments[activeSegmentIndex];
        const containerWidth = timelineContainer.clientWidth;
        const timelineWidth =
          Math.max(transcription.total_duration * 200, 1200) - 32;
        const segmentPosition =
          4 + (segment.start / transcription.total_duration) * timelineWidth;
        // Scroll to center the active segment
        const scrollPosition = Math.max(
          0,
          segmentPosition - containerWidth / 2,
        );
        timelineContainer.scrollTo({
          left: scrollPosition,
          behavior: "smooth",
        });
      }
    }
  }, [activeSegmentIndex, transcription]);

  // Clean up the media object URL on unmount or when it changes
  useEffect(() => {
    return () => {
      if (mediaUrl) {
        URL.revokeObjectURL(mediaUrl);
      }
    };
  }, [mediaUrl]);

  const formatTime = (seconds: number) => {
    const mins = Math.floor(seconds / 60);
    const secs = Math.floor(seconds % 60);
    return `${mins}:${secs.toString().padStart(2, "0")}`;
  };

  const isVideoFile = file?.type.startsWith("video/");

  return (
    <div className="flex h-screen bg-gray-900">
      <style>{`
        .line-clamp-2 {
          display: -webkit-box;
          -webkit-line-clamp: 2;
          -webkit-box-orient: vertical;
          overflow: hidden;
        }
        .hover\\:scale-102:hover {
          transform: scale(1.02);
        }
        .timeline-segment:hover .timeline-tooltip {
          opacity: 1;
        }
        #timeline-container {
          scroll-behavior: smooth;
          -webkit-overflow-scrolling: touch;
        }
        #timeline-container::-webkit-scrollbar {
          height: 8px;
        }
        #timeline-container::-webkit-scrollbar-track {
          background: #374151;
          border-radius: 4px;
        }
        #timeline-container::-webkit-scrollbar-thumb {
          background: #6b7280;
          border-radius: 4px;
        }
        #timeline-container::-webkit-scrollbar-thumb:hover {
          background: #9ca3af;
        }
      `}</style>
      {/* Side Panel */}
      <div className="w-80 bg-gray-800 text-white p-6 overflow-y-auto">
        <h2 className="text-2xl font-bold mb-6">Audio/Video Transcription</h2>
        {/* File Upload */}
        <div className="mb-6">
          <h3 className="text-lg font-semibold mb-3">Upload Media</h3>
          <input
            ref={fileInputRef}
            type="file"
            accept="audio/*,video/*"
            onChange={handleFileSelect}
            className="hidden"
          />
          <button
            onClick={() => fileInputRef.current?.click()}
            className="w-full p-3 bg-blue-600 hover:bg-blue-700 rounded-lg transition-colors"
          >
            Choose Audio/Video File
          </button>
          {file && (
            <div className="mt-3 p-3 bg-gray-700 rounded">
              <div className="text-sm font-medium">{file.name}</div>
              <div className="text-xs text-gray-400">
                {(file.size / 1024 / 1024).toFixed(2)} MB
              </div>
            </div>
          )}
        </div>
        {/* Transcribe Button */}
        {file && !transcription && (
          <div className="mb-6">
            <button
              onClick={handleTranscribe}
              disabled={isLoading}
              className="w-full p-3 bg-green-600 hover:bg-green-700 disabled:bg-gray-600 rounded-lg transition-colors"
            >
              {isLoading
                ? isProcessingVideo
                  ? "Processing Video..."
                  : "Transcribing..."
                : "Transcribe"}
            </button>
            {isVideoFile && (
              <div className="mt-2 text-xs text-gray-400">
                Video files will be processed to extract audio for
                transcription.
              </div>
            )}
          </div>
        )}
        {/* Error Display */}
        {error && (
          <div className="mb-6 p-3 bg-red-600 rounded">
            <div className="text-sm font-medium">Error</div>
            <div className="text-xs">{error}</div>
          </div>
        )}
        {/* Transcription Info */}
        {transcription && (
          <div className="mb-6">
            <h3 className="text-lg font-semibold mb-3">Transcription Info</h3>
            <div className="space-y-2 text-sm">
              <div>Model: {transcription.model}</div>
              <div>Segments: {transcription.num_segments}</div>
              <div>Duration: {formatTime(transcription.total_duration)}</div>
              <div>Device: {transcription.device}</div>
            </div>
          </div>
        )}
        {/* Subtitle Download Section */}
        {transcription && (
          <div className="mb-6">
            <h3 className="text-lg font-semibold mb-3">Download Subtitles</h3>
            <div className="space-y-2">
              <button
                onClick={() => {
                  const srtContent = generateSRT(
                    transcription.aligned_segments,
                  );
                  const filename =
                    file?.name?.replace(/\.[^/.]+$/, ".srt") || "subtitles.srt";
                  downloadSubtitles(srtContent, filename);
                }}
                className="w-full p-2 bg-purple-600 hover:bg-purple-700 rounded-lg transition-colors text-sm"
              >
                Download SRT Subtitles
              </button>
              <button
                onClick={() => {
                  const vttContent = generateWebVTT(
                    transcription.aligned_segments,
                  );
                  const filename =
                    file?.name?.replace(/\.[^/.]+$/, ".vtt") || "subtitles.vtt";
                  downloadSubtitles(vttContent, filename);
                }}
                className="w-full p-2 bg-indigo-600 hover:bg-indigo-700 rounded-lg transition-colors text-sm"
              >
                Download WebVTT Subtitles
              </button>
              {isVideoFile && (
                <button
                  onClick={handleDownloadVideoWithSubtitles}
                  disabled={isDownloadingVideo}
                  className="w-full p-2 bg-orange-600 hover:bg-orange-700 disabled:bg-gray-600 rounded-lg transition-colors text-sm"
                >
                  {isDownloadingVideo
                    ? "Creating Video..."
                    : "Download Video with Embedded Subtitles"}
                </button>
              )}
            </div>
            <div className="mt-2 text-xs text-gray-400">
              SRT format works with most video players. WebVTT is ideal for web
              browsers.
              {isVideoFile &&
                " Video with embedded subtitles will be in WebM format."}
            </div>
          </div>
        )}
        {/* Full Transcription */}
        {transcription && (
          <div className="mb-6">
            <h3 className="text-lg font-semibold mb-3">Full Transcription</h3>
            <div className="p-3 bg-gray-700 rounded text-sm">
              {transcription.transcription}
            </div>
          </div>
        )}
        {/* Instructions */}
        <div className="border-t border-gray-700 pt-4">
          <h3 className="text-sm font-semibold mb-2">How to Use</h3>
          <div className="text-xs text-gray-400 space-y-1">
            <div>• Upload an audio or video file</div>
            <div>• Click "Transcribe" to process</div>
            <div>• For videos, audio will be extracted automatically</div>
            <div>• Play media to see synchronized text</div>
            <div>• Click on segments to jump to that time</div>
            <div>• Download subtitles in SRT or WebVTT format</div>
            <div>• For videos, download with embedded subtitles</div>
            <div>• Active segments are highlighted in blue</div>
          </div>
        </div>
      </div>
      {/* Main Content */}
      <div className="flex-1 flex flex-col bg-black">
        {/* Media Player */}
        {file && (
          <div className="p-6 bg-gray-800">
            <div className="max-w-4xl mx-auto">
              {isVideoFile ? (
                <div className="relative">
                  <video
                    ref={videoRef}
                    src={mediaUrl || ""}
                    controls
                    onTimeUpdate={handleTimeUpdate}
                    className="w-full max-h-96 rounded-lg"
                  >
                    {transcription && (
                      /* Note: btoa only handles Latin-1; non-ASCII cue text
                         would need a UTF-8-safe encoding for this data URL */
                      <track
                        kind="subtitles"
                        src={`data:text/vtt;base64,${btoa(generateWebVTT(transcription.aligned_segments))}`}
                        srcLang="en"
                        label="English"
                        default
                      />
                    )}
                  </video>
                  {transcription && (
                    <div className="absolute bottom-2 right-2 bg-black bg-opacity-75 text-white px-2 py-1 rounded text-xs">
                      Subtitles: {transcription.aligned_segments.length}{" "}
                      segments
                    </div>
                  )}
                </div>
              ) : (
                <div className="bg-gray-700 p-8 rounded-lg">
                  <audio
                    ref={audioRef}
                    src={mediaUrl || ""}
                    controls
                    onTimeUpdate={handleTimeUpdate}
                    className="w-full"
                  />
                  <div className="mt-4 text-center text-gray-300">
                    <div className="text-lg font-medium">Audio File</div>
                    <div className="text-sm">{file.name}</div>
                  </div>
                </div>
              )}
            </div>
          </div>
        )}
        {/* Transcription Timeline */}
        {transcription && (
          <div className="flex-1 flex flex-col bg-gray-900">
            <div className="p-4 bg-gray-800">
              <h3 className="text-xl font-bold text-white mb-2">
                Synchronized Transcription Timeline
              </h3>
              <div className="text-sm text-gray-400">
                Click on segments to jump to that time • Active segment
                highlighted in blue
              </div>
            </div>
            <div className="flex-1 relative overflow-hidden">
              {/* Timeline Container */}
              <div
                className="h-full overflow-x-auto overflow-y-hidden"
                id="timeline-container"
              >
                <div
                  className="relative h-full py-8 px-4"
                  style={{
                    width: `${Math.max(transcription.total_duration * 200, 1200)}px`,
                    minWidth: "100%",
                  }}
                >
                  {/* Timeline Base Line */}
                  <div className="absolute top-1/2 left-4 right-4 h-0.5 bg-gray-600 transform -translate-y-1/2"></div>
                  {/* Progress Indicator */}
                  <div
                    className="absolute top-0 bottom-0 w-0.5 bg-red-500 z-20 transition-all duration-75"
                    style={{
                      left: `${4 + (currentTime / transcription.total_duration) * (Math.max(transcription.total_duration * 200, 1200) - 32)}px`,
                    }}
                  ></div>
                  {/* Segment Blocks */}
                  {transcription.aligned_segments.map((segment, index) => {
                    const timelineWidth =
                      Math.max(transcription.total_duration * 200, 1200) - 32; // Account for padding
                    const leftPosition =
                      4 +
                      (segment.start / transcription.total_duration) *
                        timelineWidth;
                    const blockWidth = Math.max(
                      (segment.duration / transcription.total_duration) *
                        timelineWidth,
                      80,
                    );
                    const isActive = activeSegmentIndex === index;
                    return (
                      <div
                        key={index}
                        ref={isActive ? activeSegmentRef : null}
                        onClick={() => handleSeekToSegment(segment)}
                        className={`timeline-segment absolute cursor-pointer transition-all duration-200 ${
                          isActive
                            ? "bg-blue-600 text-white scale-105 z-10"
                            : "bg-gray-700 text-gray-300 hover:bg-gray-600 hover:scale-102"
                        }`}
                        style={{
                          left: `${leftPosition}px`,
                          width: `${blockWidth}px`,
                          top: "35%",
                          height: "30%",
                        }}
                      >
                        <div className="h-full flex flex-col justify-center p-2 rounded-lg shadow-lg">
                          <div className="text-xs font-medium leading-tight line-clamp-2">
                            {segment.text}
                          </div>
                          <div className="text-xs opacity-75 mt-1">
                            {formatTime(segment.start)}
                          </div>
                        </div>
                        {/* Hover tooltip for longer text */}
                        <div className="timeline-tooltip absolute bottom-full left-1/2 transform -translate-x-1/2 mb-2 px-2 py-1 bg-gray-800 text-white text-xs rounded opacity-0 transition-opacity duration-200 pointer-events-none z-30 max-w-xs">
                          <div className="whitespace-normal break-words">
                            {segment.text}
                          </div>
                          <div className="text-gray-400 mt-1">
                            {formatTime(segment.start)} -{" "}
                            {formatTime(segment.end)} (
                            {segment.duration.toFixed(1)}s)
                          </div>
                        </div>
                      </div>
                    );
                  })}
                  {/* Time markers every 5 seconds */}
                  {Array.from(
                    {
                      length: Math.ceil(transcription.total_duration / 5) + 1,
                    },
                    (_, i) => i * 5,
                  )
                    .filter((time) => time <= transcription.total_duration)
                    .map((time) => {
                      const timelineWidth =
                        Math.max(transcription.total_duration * 200, 1200) - 32;
                      const position =
                        4 +
                        (time / transcription.total_duration) * timelineWidth;
                      return (
                        <div
                          key={time}
                          className="absolute text-xs text-gray-500"
                          style={{
                            left: `${position}px`,
                            top: "75%",
                            transform: "translateX(-50%)",
                          }}
                        >
                          <div className="w-0.5 h-4 bg-gray-500 mx-auto mb-1"></div>
                          {formatTime(time)}
                        </div>
                      );
                    })}
                </div>
              </div>
              {/* Timeline Controls */}
              <div className="absolute bottom-4 left-1/2 transform -translate-x-1/2 bg-gray-800 rounded-lg p-2 shadow-lg">
                <div className="flex items-center space-x-4 text-white text-sm">
                  <span>Timeline View</span>
                  <span className="text-gray-400">|</span>
                  <span className="text-blue-400">
                    {activeSegmentIndex !== null
                      ? `Segment ${activeSegmentIndex + 1}/${transcription.aligned_segments.length}`
                      : "No active segment"}
                  </span>
                  <span className="text-gray-400">|</span>
                  <span className="text-green-400">
                    {formatTime(currentTime)} /{" "}
                    {formatTime(transcription.total_duration)}
                  </span>
                </div>
              </div>
            </div>
          </div>
        )}
        {/* Loading State */}
        {isLoading && (
          <div className="flex-1 flex items-center justify-center">
            <div className="text-center text-white">
              <div className="animate-spin rounded-full h-12 w-12 border-b-2 border-white mx-auto mb-4"></div>
              <div className="text-lg">
                {isProcessingVideo
                  ? "Extracting audio from video..."
                  : "Transcribing your media..."}
              </div>
              <div className="text-sm text-gray-400 mt-2">
                {isProcessingVideo
                  ? "Converting video to audio format for transcription"
                  : "This may take a few moments"}
              </div>
            </div>
          </div>
        )}
        {/* Empty State */}
        {!file && !isLoading && (
          <div className="flex-1 flex items-center justify-center">
            <div className="text-center text-gray-400">
              <div className="text-6xl mb-4">🎵</div>
              <div className="text-xl mb-2">Upload Audio or Video</div>
              <div className="text-sm">
                Choose a media file to get started with transcription
              </div>
            </div>
          </div>
        )}
      </div>
    </div>
  );
}