Spaces:
Running
Running
import { debounce_timeout } from '../../constants.js'; | |
import { debounceAsync, splitRecursive } from '../../utils.js'; | |
import { getPreviewString, saveTtsProviderSettings } from './index.js'; | |
/**
 * TTS provider that runs the Kokoro model inside a dedicated Web Worker.
 *
 * The provider talks to the worker through messages. Every outstanding request
 * (the worker initialization and each generated chunk) is tracked in
 * `pendingRequests` so the reply can settle the promise of the caller.
 */
export class KokoroTtsProvider {
    constructor() {
        this.settings = {
            modelId: 'onnx-community/Kokoro-82M-v1.0-ONNX',
            dtype: 'q8',
            device: 'wasm',
            voiceMap: {},
            defaultVoice: 'af_heart',
            speakingRate: 1.0,
        };
        this.ready = false;
        this.voices = [
            'af_heart',
            'af_alloy',
            'af_aoede',
            'af_bella',
            'af_jessica',
            'af_kore',
            'af_nicole',
            'af_nova',
            'af_river',
            'af_sarah',
            'af_sky',
            'am_adam',
            'am_echo',
            'am_eric',
            'am_fenrir',
            'am_liam',
            'am_michael',
            'am_onyx',
            'am_puck',
            'am_santa',
            'bf_emma',
            'bf_isabella',
            'bm_george',
            'bm_lewis',
            'bf_alice',
            'bf_lily',
            'bm_daniel',
            'bm_fable',
        ];
        this.worker = null;
        this.separator = ' ... ... ... ';
        // Maps a request key ('initialization' or a numeric request id) to { resolve, reject }.
        this.pendingRequests = new Map();
        this.nextRequestId = 1;
        // Promise of the initialization currently in flight, or null when none is running.
        this.initPromise = null;
        // Update display values immediately but only reinitialize TTS after a delay
        this.initTtsDebounced = debounceAsync(this.initializeWorker.bind(this), debounce_timeout.relaxed);
    }

    /**
     * Perform any text processing before passing to TTS engine.
     * @param {string} text Input text
     * @returns {string} Processed text
     */
    processText(text) {
        // Tildes are replaced with full stops before synthesis.
        return text.replace(/~/g, '.');
    }

    /**
     * Merge saved settings into the provider and bind the settings controls.
     * Only keys that are present (not `undefined`) override the current values.
     * @param {object} settings Previously saved provider settings
     * @returns {Promise<void>}
     */
    async loadSettings(settings) {
        const saved = settings ?? {};
        for (const key of ['modelId', 'dtype', 'device', 'voiceMap', 'defaultVoice', 'speakingRate']) {
            if (saved[key] !== undefined) {
                this.settings[key] = saved[key];
            }
        }

        const onChange = this.onSettingsChange.bind(this);
        $('#kokoro_model_id').val(this.settings.modelId).on('input', onChange);
        $('#kokoro_dtype').val(this.settings.dtype).on('change', onChange);
        $('#kokoro_device').val(this.settings.device).on('change', onChange);
        $('#kokoro_speaking_rate').val(this.settings.speakingRate).on('input', onChange);
        $('#kokoro_speaking_rate_output').text(this.settings.speakingRate + 'x');
    }

    /**
     * Reject every outstanding request and forget it.
     * Used whenever the worker that would have answered them goes away.
     * @param {Error} reason Error passed to each pending `reject` callback
     */
    rejectPendingRequests(reason) {
        // Snapshot first: reject callbacks may touch the map themselves.
        const pending = [...this.pendingRequests.values()];
        this.pendingRequests.clear();
        for (const request of pending) {
            request.reject(reason);
        }
    }

    /**
     * (Re)create the worker and ask it to load the model with the current settings.
     *
     * Requests still waiting on a previous worker are rejected immediately, because a
     * terminated worker can never answer them.
     * @returns {Promise<boolean>} Resolves with the success flag reported by the worker;
     * rejects when the worker cannot be created, reports a failure, errors, is replaced
     * or disposed before finishing, or does not answer within 600 seconds.
     */
    initializeWorker() {
        this.rejectPendingRequests(new Error('TTS worker was restarted'));

        // Terminate the existing worker if it exists
        if (this.worker) {
            this.worker.terminate();
            this.worker = null;
            $('#kokoro_status_text').text('Initializing...').removeAttr('style');
        }
        // Nothing is usable until the new worker reports back.
        this.ready = false;

        let worker = null;
        let creationFailed = false;

        const initialization = new Promise((resolve, reject) => {
            let timeoutId;
            const settle = (finish, value) => {
                clearTimeout(timeoutId);
                if (this.pendingRequests.get('initialization') === entry) {
                    this.pendingRequests.delete('initialization');
                }
                finish(value);
            };
            const entry = {
                resolve: (result) => settle(resolve, result),
                reject: (error) => settle(reject, error),
            };

            try {
                worker = new Worker(new URL('./kokoro-worker.js', import.meta.url), { type: 'module' });
                this.worker = worker;
                worker.onmessage = this.handleWorkerMessage.bind(this);
                // Without this a worker that fails to load would only be noticed by the timeout.
                worker.onerror = (event) => {
                    entry.reject(new Error(event?.message || 'TTS worker encountered an error'));
                };
                worker.postMessage({
                    action: 'initialize',
                    data: {
                        modelId: this.settings.modelId,
                        dtype: this.settings.dtype,
                        device: this.settings.device,
                    },
                });
            } catch (error) {
                creationFailed = true;
                console.error('Failed to create worker:', error);
                reject(error);
                return;
            }

            // 600 second timeout
            timeoutId = setTimeout(() => entry.reject(new Error('Worker initialization timed out')), 600000);
            this.pendingRequests.set('initialization', entry);
        });

        // Only the initialization that owns the current worker may touch the shared state;
        // a superseded or disposed one must not overwrite the status of its successor.
        const ownsCurrentWorker = () => this.worker === worker;

        const tracked = initialization.then(
            (success) => {
                if (ownsCurrentWorker()) {
                    this.ready = success;
                    this.updateStatusDisplay();
                }
                return success;
            },
            (error) => {
                if (ownsCurrentWorker()) {
                    if (!creationFailed) {
                        console.error('Worker initialization failed:', error);
                    }
                    this.ready = false;
                    this.updateStatusDisplay();
                }
                throw error;
            },
        ).finally(() => {
            if (this.initPromise === tracked) {
                this.initPromise = null;
            }
        });

        this.initPromise = tracked;
        return tracked;
    }

    /**
     * Dispatch a message received from the worker.
     * @param {MessageEvent} event Worker message; `event.data.action` selects the handler
     */
    handleWorkerMessage(event) {
        const { action, success, ready, error, requestId, blobUrl } = event.data;
        switch (action) {
            case 'initialized': {
                const initRequest = this.pendingRequests.get('initialization');
                if (!initRequest) {
                    break;
                }
                this.pendingRequests.delete('initialization');
                if (success) {
                    initRequest.resolve(true);
                } else {
                    initRequest.reject(new Error(error || 'Initialization failed'));
                }
                break;
            }
            case 'generatedTts': {
                const request = this.pendingRequests.get(requestId);
                if (!request) {
                    break;
                }
                this.pendingRequests.delete(requestId);
                if (!success) {
                    request.reject(new Error(error || 'TTS generation failed'));
                    break;
                }
                fetch(blobUrl)
                    .then((response) => response.blob())
                    .then((audioBlob) => {
                        request.resolve(new Response(audioBlob, {
                            headers: {
                                'Content-Type': 'audio/wav',
                            },
                        }));
                    })
                    .catch((fetchError) => {
                        request.reject(new Error('Failed to fetch TTS audio blob: ' + fetchError));
                    })
                    // Clean up the blob URL whether or not the fetch succeeded
                    .finally(() => URL.revokeObjectURL(blobUrl));
                break;
            }
            case 'readyStatus':
                this.ready = ready;
                this.updateStatusDisplay();
                break;
        }
    }

    /** Reflect `this.ready` in the status label ('Ready' in green, 'Failed' in red). */
    updateStatusDisplay() {
        const [statusText, statusColor] = this.ready ? ['Ready', 'green'] : ['Failed', 'red'];
        $('#kokoro_status_text').text(statusText).css('color', statusColor);
    }

    /**
     * Start the worker if there is none, otherwise ask it for its status.
     * @returns {Promise<boolean>} Initialization result, or the last known readiness;
     * the answer to the status query arrives later as a 'readyStatus' message.
     */
    async checkReady() {
        if (!this.worker) {
            return await this.initializeWorker();
        }
        this.worker.postMessage({ action: 'checkReady' });
        return this.ready;
    }

    /**
     * Restart the worker on explicit user request.
     * @returns {Promise<boolean>} Result of the new initialization
     */
    async onRefreshClick() {
        return await this.initializeWorker();
    }

    /**
     * Markup of the provider settings panel, pre-filled with the current settings.
     * @returns {string} HTML string
     */
    get settingsHtml() {
        // Settings come from persisted user data, so they are escaped before being embedded in markup.
        const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', '\'': '&#39;' };
        const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (character) => htmlEntities[character]);
        const selectedIf = (isSelected) => (isSelected ? 'selected' : '');
        const { dtype, device } = this.settings;
        const modelId = escapeHtml(this.settings.modelId);
        const speakingRate = escapeHtml(this.settings.speakingRate);

        return `
        <div class="kokoro_tts_settings">
            <label for="kokoro_model_id">Model ID:</label>
            <input id="kokoro_model_id" type="text" class="text_pole" value="${modelId}" />
            <label for="kokoro_dtype">Data Type:</label>
            <select id="kokoro_dtype" class="text_pole">
                <option value="q8" ${selectedIf(dtype === 'q8')}>q8 (Recommended)</option>
                <option value="fp32" ${selectedIf(dtype === 'fp32')}>fp32 (High Precision)</option>
                <option value="fp16" ${selectedIf(dtype === 'fp16')}>fp16</option>
                <option value="q4" ${selectedIf(dtype === 'q4')}>q4 (Low Memory)</option>
                <option value="q4f16" ${selectedIf(dtype === 'q4f16')}>q4f16</option>
            </select>
            <label for="kokoro_device">Device:</label>
            <select id="kokoro_device" class="text_pole">
                <option value="wasm" ${selectedIf(device === 'wasm')}>WebAssembly (CPU)</option>
                <option value="webgpu" ${selectedIf(device === 'webgpu')}>WebGPU (GPU Acceleration)</option>
            </select>
            <label for="kokoro_speaking_rate">Speaking Rate: <span id="kokoro_speaking_rate_output">${speakingRate}x</span></label>
            <input id="kokoro_speaking_rate" type="range" value="${speakingRate}" min="0.5" max="2.0" step="0.1" />
            <hr>
            <div>
                Status: <span id="kokoro_status_text">Initializing...</span>
            </div>
        </div>
        `;
    }

    /**
     * Read the settings controls back into `this.settings` and persist them.
     * The worker is restarted (debounced) only when a setting that is sent with the
     * 'initialize' message changed; the speaking rate travels with every request.
     * @returns {Promise<void>}
     */
    async onSettingsChange() {
        const { modelId: oldModelId, dtype: oldDtype, device: oldDevice } = this.settings;

        this.settings.modelId = $('#kokoro_model_id').val().toString();
        this.settings.dtype = $('#kokoro_dtype').val().toString();
        this.settings.device = $('#kokoro_device').val().toString();
        const rate = parseFloat($('#kokoro_speaking_rate').val().toString());
        // Keep the previous rate rather than storing NaN.
        if (Number.isFinite(rate)) {
            this.settings.speakingRate = rate;
        }

        // Update UI display
        $('#kokoro_speaking_rate_output').text(this.settings.speakingRate + 'x');

        const engineChanged = oldModelId !== this.settings.modelId
            || oldDtype !== this.settings.dtype
            || oldDevice !== this.settings.device;
        if (engineChanged) {
            // Reinitialize TTS engine with debounce; nobody awaits this, so handle the rejection here.
            Promise.resolve(this.initTtsDebounced()).catch((error) => {
                console.warn('Kokoro TTS reinitialization did not complete:', error);
            });
        }

        saveTtsProviderSettings();
    }

    /**
     * List the built-in voices.
     * @returns {Promise<{name: string, voice_id: string, preview_url: null, lang: string}[]>} Voice descriptors
     */
    async fetchTtsVoiceObjects() {
        if (!this.ready) {
            await this.checkReady();
        }
        return this.voices.map((voice) => this.getVoice(voice));
    }

    /**
     * Synthesize and play the preview sentence for a voice, chunk by chunk.
     * @param {string} voiceId Voice ID
     * @returns {Promise<void>} Settles after the last chunk finished playing; rejects if playback fails
     */
    async previewTtsVoice(voiceId) {
        if (!this.ready) {
            await this.checkReady();
        }
        const voice = this.getVoice(voiceId);
        const previewText = getPreviewString(voice.lang);
        for await (const response of this.generateTts(previewText, voiceId)) {
            const audio = await response.blob();
            const url = URL.createObjectURL(audio);
            try {
                await new Promise((resolve, reject) => {
                    const audioElement = new Audio();
                    audioElement.onended = () => resolve();
                    // A blocked or broken playback never fires 'ended'; surface it instead of hanging.
                    audioElement.onerror = () => reject(new Error('Failed to play TTS preview audio'));
                    audioElement.src = url;
                    Promise.resolve(audioElement.play()).catch(reject);
                });
            } finally {
                URL.revokeObjectURL(url);
            }
        }
    }

    /**
     * @param {string} voiceId Voice ID
     * @returns {string} Display name, identical to the ID
     */
    getVoiceDisplayName(voiceId) {
        return voiceId;
    }

    /**
     * Build the descriptor for a voice, falling back to the default voice for unknown names.
     * @param {string} voiceName Requested voice name
     * @returns {{name: string, voice_id: string, preview_url: null, lang: string}} Voice descriptor;
     * `lang` is 'en-GB' for names starting with 'b' and 'en-US' otherwise
     */
    getVoice(voiceName) {
        const fallbackVoice = this.settings.defaultVoice || 'af_heart';
        const resolvedName = this.voices.includes(voiceName) ? voiceName : fallbackVoice;
        return {
            name: resolvedName,
            voice_id: resolvedName,
            preview_url: null,
            lang: resolvedName.startsWith('b') ? 'en-GB' : 'en-US',
        };
    }

    /**
     * Generate TTS audio for the given text using the specified voice.
     * The text is split into chunks of at most 400 characters that are synthesized one after another.
     * @param {string} text Text to generate
     * @param {string} voiceId Voice ID
     * @returns {AsyncGenerator<Response>} Audio response generator
     */
    async* generateTts(text, voiceId) {
        if (!this.ready || !this.worker) {
            console.log('TTS not ready, initializing...');
            // Join an initialization that is already running instead of restarting it.
            await (this.initPromise ?? this.initializeWorker());
        }
        if (!this.ready || !this.worker) {
            throw new Error('Failed to initialize TTS engine');
        }
        if (text.trim().length === 0) {
            throw new Error('Empty text');
        }

        const voice = this.getVoice(voiceId);
        const chunkSize = 400;
        const chunks = splitRecursive(text, chunkSize, ['\n\n', '\n', '.', '?', '!', ',', ' ', '']);

        for (const chunk of chunks) {
            // Each chunk gets its own id so a reply can never be matched to the wrong chunk.
            const requestId = this.nextRequestId++;
            yield await new Promise((resolve, reject) => {
                // Store the promise callbacks
                this.pendingRequests.set(requestId, { resolve, reject });
                try {
                    // Send the request to the worker
                    this.worker.postMessage({
                        action: 'generateTts',
                        data: {
                            text: chunk,
                            voice: voice.voice_id,
                            speakingRate: this.settings.speakingRate || 1.0,
                            requestId,
                        },
                    });
                } catch (error) {
                    this.pendingRequests.delete(requestId);
                    reject(error);
                }
            });
        }
    }

    /**
     * Clean up the worker when the provider is disposed.
     * Outstanding requests are rejected so their callers do not wait forever.
     */
    dispose() {
        if (this.worker) {
            this.worker.terminate();
            this.worker = null;
        }
        this.ready = false;
        this.rejectPendingRequests(new Error('TTS provider was disposed'));
    }
}