diff --git "a/assets/worker-60e26223.js" "b/assets/worker-60e26223.js" --- "a/assets/worker-60e26223.js" +++ "b/assets/worker-60e26223.js" @@ -2372,4 +2372,4 @@ If a question does not make any sense, or is not factually coherent, explain why ' }}{% endfor %}{% if add_generation_prompt %}{{'model '}}{% endif %}`)}}class Mv extends Ce{}function jo(t,e,r,n){if(!("language_codes"in t)||!Array.isArray(t.language_codes))throw new Error("Tokenizer must have `language_codes` attribute set and it should be an array of language ids.");if(!("languageRegex"in t)||!(t.languageRegex instanceof RegExp))throw new Error("Tokenizer must have `languageRegex` attribute set and it should be a regular expression.");if(!("lang_to_token"in t)||typeof t.lang_to_token!="function")throw new Error("Tokenizer must have `lang_to_token` attribute set and it should be a function.");const a=n.src_lang,s=n.tgt_lang;if(!t.language_codes.includes(s))throw new Error(`Target language code "${s}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);if(a!==void 0){if(!t.language_codes.includes(a))throw new Error(`Source language code "${a}" is not valid. Must be one of: {${t.language_codes.join(", ")}}`);for(const i of t.post_processor.config.single)if("SpecialToken"in i&&t.languageRegex.test(i.SpecialToken.id)){i.SpecialToken.id=t.lang_to_token(a);break}}return n.forced_bos_token_id=t.model.convert_tokens_to_ids([t.lang_to_token(s)])[0],t._call(e,r)}class Ov extends Ce{constructor(e,r){super(e,r),this.languageRegex=/^[a-z]{3}_[A-Z][a-z]{3}$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)),this.lang_to_token=n=>n}_build_translation_inputs(e,r,n){return jo(this,e,r,n)}}class zv extends Ce{constructor(e,r){super(e,r),this.languageRegex=/^__[a-z]{2,3}__$/,this.language_codes=this.special_tokens.filter(n=>this.languageRegex.test(n)).map(n=>n.slice(2,-2)),this.lang_to_token=n=>`__${n}__`}_build_translation_inputs(e,r,n){return jo(this,e,r,n)}}class Pv extends Ce{constructor(){super(...arguments);D(this,"_default_chat_template",'{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}')}get timestamp_begin(){return this.model.convert_tokens_to_ids(["<|notimestamps|>"])[0]+1}_decode_asr(r,{return_timestamps:n=!1,return_language:a=!1,time_precision:s=null,force_full_sequences:i=!0}={}){if(s===null)throw Error("Must specify time_precision");let o=null;const l=n==="word";function u(){return{language:o,timestamp:[null,null],text:""}}const d=[];let h=u(),m=0;const g=this.timestamp_begin;let p=[],w=[],v=!1,x=null;const $=new Set(this.all_special_ids);for(const A of r){const P=A.tokens,B=l?A.token_timestamps:null;let L=null,j=g;if("stride"in A){const[ae,ne,ie]=A.stride;if(m-=ne,x=ae-ie,ne&&(j=ne/s+g),ie)for(let N=P.length-1;N>=0;--N){const M=Number(P[N]);if(M>=g){if(L!==null&&(M-g)*s=g){const ie=(ne-g)*s+m,N=ri(ie,2);if(L!==null&&ne>=L)v=!0;else if(v||p.length>0&&ne0?(p.push(q),l&&w.push(ue)):p.every(ae=>ae.length===0)&&(h=u(),p=[],q=[],w=[],ue=[])}if(p.length>0){if(i&&n)throw new Error("Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.");const[A,P]=this.findLongestCommonSequence(p,w),B=this.decode(A);h.text=B,l&&(h.words=this.collateWordTimestamps(A,P,o)),d.push(h)}let C=Object.create(null);const T=d.map(A=>A.text).join("");if(n||a){for(let A=0;A0;let l=o?[]:null,u=o?n[0]:null;for(let d=1;dN===ae[M]).length,ie=ne/A+P;ne>1&&ie>m&&(m=ie,g=[B,L,q,ue])}const[w,v,x,$]=g,C=Math.floor((v+w)/2),T=Math.floor(($+x)/2);i.push(...a.slice(0,C)),a=h.slice(T),s=a.length,o&&(l.push(...u.slice(0,C)),u=n[d].slice(T))}return i.push(...a),o?(l.push(...u),[i,l]):[i,[]]}collateWordTimestamps(r,n,a){const[s,i,o]=this.combineTokensIntoWords(r,a),l=[];for(let u=0;u=s){const l=((o-s)*a).toFixed(2);i.push(`<|${l}|>`),i.push([])}else i[i.length-1].push(o);return i=i.map(o=>typeof o=="string"?o:super.decode(o,n)),i.join("")}splitTokensOnUnicode(r){const n=this.decode(r,{decode_with_timestamps:!0}),a="�",s=[],i=[],o=[];let l=[],u=[],d=0;for(let h=0;h=this.model.tokens_to_ids.get("<|endoftext|>"),w=h.startsWith(" "),v=h.trim(),x=u.test(v);if(p||w||x||i.length===0)i.push(h),o.push(m),l.push(g);else{const $=i.length-1;i[$]+=h,o[$].push(...m),l[$].push(...g)}}return[i,o,l]}mergePunctuations(r,n,a,s,i){const o=structuredClone(r),l=structuredClone(n),u=structuredClone(a);let d=o.length-2,h=o.length-1;for(;d>=0;)o[d].startsWith(" ")&&s.includes(o[d].trim())?(o[h]=o[d]+o[h],l[h]=ct(l[d],l[h]),u[h]=ct(u[d],u[h]),o[d]="",l[d]=[],u[d]=[]):h=d,--d;for(d=0,h=1;hm),l.filter(m=>m.length>0),u.filter(m=>m.length>0)]}get_decoder_prompt_ids({language:r=null,task:n=null,no_timestamps:a=!0}={}){const s=[];if(r){const i=Am(r),o=this.model.tokens_to_ids.get(`<|${i}|>`);if(o===void 0)throw new Error(`Unable to find language "${i}" in model vocabulary. Please report this issue at ${Vo}.`);s.push(o)}else s.push(null);if(n){if(n=n.toLowerCase(),n!=="transcribe"&&n!=="translate")throw new Error(`Task "${n}" is not supported. Must be one of: ["transcribe", "translate"]`);const i=this.model.tokens_to_ids.get(`<|${n}|>`);if(i===void 0)throw new Error(`Unable to find task "${n}" in model vocabulary. Please report this issue at ${Vo}.`);s.push(i)}else s.push(null);if(a){const i=this.model.tokens_to_ids.get("<|notimestamps|>");if(i===void 0)throw new Error(`Unable to find "<|notimestamps|>" in model vocabulary. Please report this issue at ${Vo}.`);s.push(i)}return s.map((i,o)=>[o+1,i]).filter(i=>i[1]!==null)}}class Rv extends Ce{}class Bv extends Ce{}class Dv extends Ce{}class Nv extends Ce{constructor(e,r){super(e,r),this.languageRegex=/^(>>\w+<<)\s*/g,this.supported_language_codes=this.model.vocab.filter(n=>this.languageRegex.test(n)),console.warn('WARNING: `MarianTokenizer` is not yet supported by Hugging Face\'s "fast" tokenizers library. Therefore, you may experience slightly inaccurate results.')}_encode_text(e){if(e===null)return null;const[r,...n]=e.trim().split(this.languageRegex);if(n.length===0)return super._encode_text(r);if(n.length===2){const[a,s]=n;return this.supported_language_codes.includes(a)||console.warn(`Unsupported language code "${a}" detected, which may lead to unexpected behavior. Should be one of: ${JSON.stringify(this.supported_language_codes)}`),ct([a],super._encode_text(s))}}}class Fv extends Ce{}class Wm extends Ce{constructor(){super(...arguments);D(this,"_default_chat_template","{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}")}}class Lv extends Wm{}class Uv extends Ce{}class Wv extends Ce{}class Vv extends Ce{constructor(e,r){super(e,r),this.decoder=new Yb({})}}class Gv extends Ce{}class ht{static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:s=!1,revision:i="main",legacy:o=null}={}){var m;const[l,u]=await Im(e,{progress_callback:r,config:n,cache_dir:a,local_files_only:s,revision:i,legacy:o}),d=((m=u.tokenizer_class)==null?void 0:m.replace(/Fast$/,""))??"PreTrainedTokenizer";let h=this.TOKENIZER_CLASS_MAPPING[d];return h||(console.warn(`Unknown tokenizer class "${d}", attempting to construct from base class.`),h=Ce),new h(l,u)}}D(ht,"TOKENIZER_CLASS_MAPPING",{T5Tokenizer:yv,DistilBertTokenizer:fv,CamembertTokenizer:mv,DebertaTokenizer:uv,DebertaV2Tokenizer:dv,BertTokenizer:iv,HerbertTokenizer:cv,ConvBertTokenizer:pv,RoFormerTokenizer:hv,XLMTokenizer:gv,ElectraTokenizer:_v,MobileBertTokenizer:ov,SqueezeBertTokenizer:lv,AlbertTokenizer:sv,GPT2Tokenizer:Fm,BartTokenizer:wv,MBartTokenizer:Lm,MBart50Tokenizer:bv,RobertaTokenizer:vv,WhisperTokenizer:Pv,CodeGenTokenizer:Rv,CLIPTokenizer:Bv,SiglipTokenizer:Dv,MarianTokenizer:Nv,BloomTokenizer:$v,NllbTokenizer:Ov,M2M100Tokenizer:zv,LlamaTokenizer:Um,CodeLlamaTokenizer:xv,XLMRobertaTokenizer:Sv,MPNetTokenizer:kv,FalconTokenizer:Ev,GPTNeoXTokenizer:Cv,EsmTokenizer:Tv,Wav2Vec2CTCTokenizer:Fv,BlenderbotTokenizer:Wm,BlenderbotSmallTokenizer:Lv,SpeechT5Tokenizer:Uv,NougatTokenizer:Wv,VitsTokenizer:Vv,Qwen2Tokenizer:Av,GemmaTokenizer:Iv,Grok1Tokenizer:Mv,CohereTokenizer:Gv,PreTrainedTokenizer:Ce});async function Hv(t,e){return await zr(t,"config.json",!0,e)}function In(t){const e={};let r={};switch(t.model_type){case"llava":case"paligemma":r=In(t.text_config);break;case"moondream1":r=In(t.phi_config);break;case"musicgen":r=In(t.decoder);break;case"gpt2":case"gptj":case"codegen":case"gpt_bigcode":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="n_embd";break;case"gpt_neox":case"stablelm":case"opt":case"phi":case"phi3":case"falcon":e.num_heads="num_attention_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size";break;case"llama":case"mistral":case"starcoder2":case"qwen2":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.hidden_size="hidden_size",e.num_attention_heads="num_attention_heads";break;case"gemma":e.num_heads="num_key_value_heads",e.num_layers="num_hidden_layers",e.dim_kv="head_dim";break;case"openelm":e.num_heads="num_kv_heads",e.num_layers="num_transformer_layers",e.dim_kv="head_dim";break;case"gpt_neo":e.num_heads="num_heads",e.num_layers="num_layers",e.hidden_size="hidden_size";break;case"bloom":e.num_heads="n_head",e.num_layers="n_layer",e.hidden_size="hidden_size";break;case"mpt":e.num_heads="n_heads",e.num_layers="n_layers",e.hidden_size="d_model";break;case"t5":case"mt5":case"longt5":e.num_decoder_layers="num_decoder_layers",e.num_decoder_heads="num_heads",e.decoder_dim_kv="d_kv",e.num_encoder_layers="num_layers",e.num_encoder_heads="num_heads",e.encoder_dim_kv="d_kv";break;case"bart":case"mbart":case"marian":case"whisper":case"m2m_100":case"blenderbot":case"blenderbot-small":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="d_model",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="d_model";break;case"speecht5":e.num_decoder_layers="decoder_layers",e.num_decoder_heads="decoder_attention_heads",e.decoder_hidden_size="hidden_size",e.num_encoder_layers="encoder_layers",e.num_encoder_heads="encoder_attention_heads",e.encoder_hidden_size="hidden_size";break;case"trocr":e.num_encoder_layers=e.num_decoder_layers="decoder_layers",e.num_encoder_heads=e.num_decoder_heads="decoder_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="d_model";break;case"musicgen_decoder":e.num_encoder_layers=e.num_decoder_layers="num_hidden_layers",e.num_encoder_heads=e.num_decoder_heads="num_attention_heads",e.encoder_hidden_size=e.decoder_hidden_size="hidden_size";break;case"vision-encoder-decoder":const a=In(t.encoder),s=In(t.decoder),i="num_decoder_layers"in s,o={};return i?(o.num_decoder_layers=s.num_layers,o.num_decoder_heads=s.num_heads,o.decoder_hidden_size=s.hidden_size,o.num_encoder_layers=a.num_layers,o.num_encoder_heads=a.num_heads,o.encoder_hidden_size=a.hidden_size):(o.num_layers=s.num_layers,o.num_heads=s.num_heads,o.hidden_size=s.hidden_size),o}const n={...r,...Hr(t,["model_type","multi_query","is_encoder_decoder"])};for(const a in e)n[a]=t[e[a]];return n}function Vm(t,{prefix:e="past_key_values",encoder_add_pkv:r=!0}={}){const n={},a=t.normalized_config,s=1;if(a.is_encoder_decoder&&r){const i=a.encoder_dim_kv??a.encoder_hidden_size/a.num_encoder_heads,o=a.decoder_dim_kv??a.decoder_hidden_size/a.num_decoder_heads,l=[s,a.num_encoder_heads,0,i],u=[s,a.num_decoder_heads,0,o];for(let d=0;d=1&&i[i.length-1]>=this.timestamp_begin,l=i.length<2||i[i.length-2]>=this.timestamp_begin;if(o&&(l?s.subarray(this.timestamp_begin).fill(-1/0):s.subarray(0,this.eos_token_id).fill(-1/0)),e[n].length===this.begin_index&&this.max_initial_timestamp_index!==null){const m=this.timestamp_begin+this.max_initial_timestamp_index;s.subarray(m+1).fill(-1/0)}const u=V0(s),d=Math.log(u.subarray(this.timestamp_begin).map(Math.exp).reduce((m,g)=>m+g)),h=jt(u.subarray(0,this.timestamp_begin))[0];d>h&&s.subarray(0,this.timestamp_begin).fill(-1/0)}return r}}class Zv extends yr{constructor(e){super(),this.no_repeat_ngram_size=e}getNgrams(e){const r=e.length,n=[];for(let s=0;s1 to use the classifier free guidance processor, got guidance scale ${e}.`);this.guidance_scale=e}_call(e,r){if(r.dims[0]!==2*e.length)throw new Error(`Logits should have twice the batch size of the input ids, the first half of batches corresponding to the conditional inputs, and the second half of batches corresponding to the unconditional inputs. Got batch size ${r.dims[0]} for the logits and ${e.length} for the input ids.`);const n=e.length,a=r.slice([0,n],null),s=r.slice([n,r.dims[0]],null);for(let i=0;i1)throw new Error(`\`top_p\` must be a float > 0 and < 1, but is ${e}`);if(!Number.isInteger(n)||n<1)throw new Error(`\`min_tokens_to_keep\` must be a positive integer, but is ${n}`);this.top_p=e,this.filter_value=r,this.min_tokens_to_keep=n}}class s2 extends qo{constructor(e,{filter_value:r=-1/0,min_tokens_to_keep:n=1}={}){if(super(),!Number.isInteger(e)||e<0)throw new Error(`\`top_k\` must be a positive integer, but is ${e}`);this.top_k=Math.max(e,n),this.filter_value=r}}class jm{constructor(e){D(this,"max_length",20);D(this,"max_new_tokens",null);D(this,"min_length",0);D(this,"min_new_tokens",null);D(this,"early_stopping",!1);D(this,"max_time",null);D(this,"do_sample",!1);D(this,"num_beams",1);D(this,"num_beam_groups",1);D(this,"penalty_alpha",null);D(this,"use_cache",!0);D(this,"temperature",1);D(this,"top_k",50);D(this,"top_p",1);D(this,"typical_p",1);D(this,"epsilon_cutoff",0);D(this,"eta_cutoff",0);D(this,"diversity_penalty",0);D(this,"repetition_penalty",1);D(this,"encoder_repetition_penalty",1);D(this,"length_penalty",1);D(this,"no_repeat_ngram_size",0);D(this,"bad_words_ids",null);D(this,"force_words_ids",null);D(this,"renormalize_logits",!1);D(this,"constraints",null);D(this,"forced_bos_token_id",null);D(this,"forced_eos_token_id",null);D(this,"remove_invalid_values",!1);D(this,"exponential_decay_length_penalty",null);D(this,"suppress_tokens",null);D(this,"begin_suppress_tokens",null);D(this,"forced_decoder_ids",null);D(this,"guidance_scale",null);D(this,"num_return_sequences",1);D(this,"output_attentions",!1);D(this,"output_hidden_states",!1);D(this,"output_scores",!1);D(this,"return_dict_in_generate",!1);D(this,"pad_token_id",null);D(this,"bos_token_id",null);D(this,"eos_token_id",null);D(this,"encoder_no_repeat_ngram_size",0);D(this,"decoder_start_token_id",null);D(this,"generation_kwargs",{});Object.assign(this,Hr(e,Object.getOwnPropertyNames(this)))}}class Yo extends wt{_call(e,r){throw Error("StoppingCriteria needs to be subclassed")}}class Xo extends wt{constructor(){super(),this.criteria=[]}push(e){this.criteria.push(e)}extend(e){e instanceof Xo?e=e.criteria:e instanceof Yo&&(e=[e]),this.criteria.push(...e)}_call(e,r){const n=new Array(e.length).fill(!1);for(const a of this.criteria){const s=a(e,r);for(let i=0;ir.length>=this.max_length)}}class l2 extends Yo{constructor(e){super(),Array.isArray(e)||(e=[e]),this.eos_token_id=e}_call(e,r){return e.map(n=>{const a=n.at(-1);return this.eos_token_id.some(s=>a==s)})}}class Ri extends wt{constructor(e){super(),this.generation_config=e}_call(e,r=-1){return this.sample(e,r)}sample(e,r){throw Error("sample should be implemented in subclasses.")}getLogits(e,r){let n=e.dims.at(-1),a=e.data;if(r===-1)a=a.slice(-n);else{let s=r*n;a=a.slice(s,s+n)}return a}randomSelect(e){let r=e.reduce((a,s)=>a+s,0),n=Math.random()*r;for(let a=0;a1)return new c2(e);if(e.num_return_sequences>1)throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${e.num_return_sequences}.`);return new u2(e)}}class u2 extends Ri{sample(e,r=-1){let n=this.getLogits(e,r);return[[jt(n)[1],0]]}}class d2 extends Ri{sample(e,r=-1){let n=e.dims.at(-1);this.generation_config.top_k>0&&(n=Math.min(this.generation_config.top_k,n));const a=this.getLogits(e,r),s=wn(a,n),i=bt(s.map(o=>o[1]));return Array.from({length:this.generation_config.num_beams},()=>{const o=this.randomSelect(i);return[s[o][0],Math.log(i[o])]})}}class c2 extends Ri{sample(e,r=-1){let n=e.dims.at(-1);this.generation_config.top_k>0&&(n=Math.min(this.generation_config.top_k,n));const a=this.getLogits(e,r),s=wn(a,n),i=bt(s.map(o=>o[1]));return Array.from({length:this.generation_config.num_beams},(o,l)=>[s[l][0],Math.log(i[l])])}}class p2 extends jm{constructor(){super(...arguments);D(this,"return_timestamps",null);D(this,"return_token_timestamps",null);D(this,"num_frames",null);D(this,"alignment_heads",null);D(this,"task",null);D(this,"language",null);D(this,"no_timestamps_token_id",null);D(this,"prompt_ids",null);D(this,"is_multilingual",null);D(this,"lang_to_id",null);D(this,"task_to_id",null);D(this,"max_initial_timestamp_index",1)}}const $e={EncoderOnly:0,EncoderDecoder:1,Seq2Seq:2,Vision2Seq:3,DecoderOnly:4,MaskGeneration:5,ImageTextToText:6,Musicgen:7},Bi=new Map,qm=new Map,ma=new Map;async function h2(t,e,r){let n=r.device;n&&typeof n!="string"&&(n.hasOwnProperty(e)?n=n[e]:(console.warn(`Device not specified for ${e}. Using the default device.`),n=null));const a=Ew(n);let s=r.dtype;if(typeof s!="string"&&(s&&s.hasOwnProperty(e)?s=s[e]:(s=qv[a[0]],console.warn(`Dtype not specified for ${e}. Using the default dtype: ${s}.`))),Hm.hasOwnProperty(s)){if(s===Pt.fp16&&!await jv())throw new Error("The device does not support fp16.")}else throw new Error(`Invalid dtype: ${s}. Should be one of: ${Object.keys(Pt).join(", ")}`);const i=Hm[s],o=`${r.subfolder??""}/${e}${i}.onnx`,l={...r.session_options};l.executionProviders??(l.executionProviders=a);const u=ei(t,o,!0,r);let d=[];if(r.use_external_data_format){if(Gr.IS_NODE_ENV)throw new Error("External data format is not yet supported in Node.js");const m=`${e}${i}.onnx_data`,g=`${r.subfolder??""}/${m}`;d.push(new Promise(async(p,w)=>{const v=await ei(t,g,!0,r);p({path:m,data:v})}))}else l.externalData!==void 0&&(d=l.externalData.map(async m=>{if(typeof m.data=="string"){const g=await ei(t,m.data,!0,r);return{...m,data:g}}return m}));if(d.length>0&&(l.externalData=await Promise.all(d)),n==="webgpu"){const m=Vm(r.config,{prefix:"present"});if(Object.keys(m).length>0){const g={};for(const p in m)g[p]="gpu-buffer";l.preferredOutputLocation=g}}return{buffer:await u,session_options:l}}async function sn(t,e,r){const n=Object.keys(e),a=await Promise.all(n.map(async i=>h2(t,e[i],r))),s={};for(let i=0;i0)throw new Error(`An error occurred during model execution: "Missing the following inputs: ${n.join(", ")}.`);const a=Object.keys(e).length,s=t.inputNames.length;if(a>s){let i=Object.keys(e).filter(o=>!t.inputNames.includes(o));console.warn(`WARNING: Too many inputs were provided (${a} > ${s}). The following inputs will be ignored: "${i.join(", ")}".`)}return r}async function Nr(t,e){const r=f2(t,e);try{const n=Object.fromEntries(Object.entries(r).map(([s,i])=>[s,i.ort_tensor]));let a=await t.run(n);return a=Km(a),a}catch(n){throw console.error(`An error occurred during model execution: "${n}".`),console.error("Inputs given to model:",r),n}}function Km(t){for(let e in t)mm(t[e])?t[e]=new fe(t[e]):typeof t[e]=="object"&&Km(t[e]);return t}function Ym(t){if(t instanceof fe)return t;if(t.length===0)throw Error("items must be non-empty");if(Array.isArray(t[0])){if(t.some(e=>e.length!==t[0].length))throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");return new fe("int64",BigInt64Array.from(t.flat().map(e=>BigInt(e))),[t.length,t[0].length])}else return new fe("int64",BigInt64Array.from(t.map(e=>BigInt(e))),[1,t.length])}function Xm(t){return new fe("bool",[t],[1])}async function Qm(t,e){let{encoder_outputs:r,past_key_values:n}=e;if(!r){const l=Hr(e,t.sessions.model.inputNames);r=(await ga(t,l)).last_hidden_state}const{input_ids:a,decoder_input_ids:s,...i}=e;return i.input_ids=s,i.encoder_hidden_states=r,t.sessions.decoder_model_merged.inputNames.includes("encoder_attention_mask")&&(i.encoder_attention_mask=e.attention_mask),await Qo(t,i,!0)}async function ga(t,e){const r=t.sessions.model,n=Object.create(null);for(const a of r.inputNames)n[a]=e[a];return r.inputNames.includes("token_type_ids")&&!n.token_type_ids&&(n.token_type_ids=new fe("int64",new BigInt64Array(n.input_ids.data.length),n.input_ids.dims)),await Nr(r,n)}async function Qo(t,e,r=!1){const n=t.sessions[r?"decoder_model_merged":"model"],{past_key_values:a,...s}=e;n.inputNames.includes("use_cache_branch")&&(s.use_cache_branch=Xm(!!a)),n.inputNames.includes("position_ids")&&s.attention_mask&&!s.position_ids&&(s.position_ids=g2(s,a)),t.addPastKeyValues(s,a);const i=Hr(s,n.inputNames);return await Nr(n,i)}async function m2(t,{input_ids:e=null,attention_mask:r=null,pixel_values:n=null,position_ids:a=null,inputs_embeds:s=null,past_key_values:i=null,generation_config:o=null,logits_processor:l=null,...u}){if(!s){if(s=await t.encode_text({input_ids:e}),n&&e.dims[1]!==1){const h=await t.encode_image({pixel_values:n});({inputs_embeds:s,attention_mask:r}=t._merge_input_ids_with_image_features({image_features:h,inputs_embeds:s,input_ids:e,attention_mask:r}))}else if(i&&n&&e.dims[1]===1){const h=e.dims[1],m=Object.values(i)[0].dims.at(-2);r=gr([la([e.dims[0],m]),r.slice(null,[r.dims[1]-h,r.dims[1]])],1)}}return await Qo(t,{inputs_embeds:s,past_key_values:i,attention_mask:r,position_ids:a,generation_config:o,logits_processor:l},!0)}function g2(t,e=null){const{input_ids:r,inputs_embeds:n,attention_mask:a}=t,[s,i]=a.dims,o=new BigInt64Array(a.data.length);for(let u=0;us.dims[1])){if(ao==t.config.image_token_index)){const o=t.config.num_image_tokens;if(!o)throw new Error("`num_image_tokens` is missing in the model configuration.");const l=s.dims[1]-(a-o);r.input_ids=s.slice(null,[-l,null]),r.attention_mask=la([1,a+l])}}}return r}function _2(t,e,r,n){const{...a}=r;return r.past_key_values&&(e=e.map(i=>[i.at(-1)])),a.decoder_input_ids=Ym(e),a}class re extends wt{constructor(r,n){super();D(this,"main_input_name","input_ids");D(this,"forward_params",["input_ids","attention_mask"]);this.config=r,this.sessions=n;const a=ma.get(this.constructor),s=Bi.get(a);this.can_generate=!1,this._forward=null,this._prepare_inputs_for_generation=null,s===$e.DecoderOnly?(this.can_generate=!0,this._forward=Qo,this._prepare_inputs_for_generation=Zm):s===$e.Seq2Seq||s===$e.Vision2Seq||s===$e.Musicgen?(this.can_generate=!0,this._forward=Qm,this._prepare_inputs_for_generation=_2):s===$e.EncoderDecoder?this._forward=Qm:s===$e.ImageTextToText?(this.can_generate=!0,this._forward=m2,this._prepare_inputs_for_generation=Zm):this._forward=ga,this.can_generate&&this.forward_params.push("past_key_values"),this.custom_config=this.config["transformers.js_config"]??{}}async dispose(){var n;const r=[];for(const a of Object.values(this.sessions))(n=a==null?void 0:a.handler)!=null&&n.dispose&&r.push(a.handler.dispose());return await Promise.all(r)}static async from_pretrained(r,{progress_callback:n=null,config:a=null,cache_dir:s=null,local_files_only:i=!1,revision:o="main",model_file_name:l=null,subfolder:u="onnx",device:d=null,dtype:h=null,use_external_data_format:m=null,session_options:g={}}={}){let p={progress_callback:n,config:a,cache_dir:s,local_files_only:i,revision:o,model_file_name:l,subfolder:u,device:d,dtype:h,use_external_data_format:m,session_options:g};const w=ma.get(this),v=Bi.get(w);p.config=await Gm.from_pretrained(r,p);let x;return v===$e.DecoderOnly?x=await Promise.all([sn(r,{model:p.model_file_name??"model"},p),zr(r,"generation_config.json",!1,p)]):v===$e.Seq2Seq||v===$e.Vision2Seq?x=await Promise.all([sn(r,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},p),zr(r,"generation_config.json",!1,p)]):v===$e.MaskGeneration?x=await Promise.all([sn(r,{model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"},p)]):v===$e.EncoderDecoder?x=await Promise.all([sn(r,{model:"encoder_model",decoder_model_merged:"decoder_model_merged"},p)]):v===$e.ImageTextToText?x=await Promise.all([sn(r,{embed_tokens:"embed_tokens",vision_encoder:"vision_encoder",decoder_model_merged:"decoder_model_merged"},p),zr(r,"generation_config.json",!1,p)]):v===$e.Musicgen?x=await Promise.all([sn(r,{model:"text_encoder",decoder_model_merged:"decoder_model_merged",encodec_decode:"encodec_decode"},p),zr(r,"generation_config.json",!1,p)]):(v!==$e.EncoderOnly&&console.warn(`Model type for '${w??(a==null?void 0:a.model_type)}' not found, assuming encoder-only architecture. Please report this at https://github.com/xenova/transformers.js/issues/new/choose.`),x=await Promise.all([sn(r,{model:p.model_file_name??"model"},p)])),new this(p.config,...x)}async _call(r){return await this.forward(r)}async forward(r){return await this._forward(this,r)}_get_logits_warper(r){const n=new Ko;return r.temperature!==null&&r.temperature!==1&&n.push(new a2(r.temperature)),r.top_k!==null&&r.top_k!==0&&n.push(new s2(r.top_k)),r.top_p!==null&&r.top_p<1&&n.push(new i2(r.top_p)),n}_get_logits_processor(r,n,a=null){const s=new Ko;if(r.repetition_penalty!==null&&r.repetition_penalty!==1&&s.push(new Jv(r.repetition_penalty)),r.no_repeat_ngram_size!==null&&r.no_repeat_ngram_size>0&&s.push(new Zv(r.no_repeat_ngram_size)),r.bad_words_ids!==null&&s.push(new r2(r.bad_words_ids,r.eos_token_id)),r.min_length!==null&&r.eos_token_id!==null&&r.min_length>0&&s.push(new e2(r.min_length,r.eos_token_id)),r.min_new_tokens!==null&&r.eos_token_id!==null&&r.min_new_tokens>0&&s.push(new t2(n,r.min_new_tokens,r.eos_token_id)),r.forced_bos_token_id!==null&&s.push(new Kv(r.forced_bos_token_id)),r.forced_eos_token_id!==null&&s.push(new Yv(r.max_length,r.forced_eos_token_id)),r.begin_suppress_tokens!==null){const i=n>1||r.forced_bos_token_id===null?n:n+1;s.push(new Xv(r.begin_suppress_tokens,i))}return r.guidance_scale!==null&&r.guidance_scale>1&&s.push(new n2(r.guidance_scale)),a!==null&&s.extend(a),s}_prepare_generation_config(r,n,a=jm){const s={...this.config};for(const o of["decoder","generator","text_config"])o in s&&Object.assign(s,s[o]);const i=new a(s);return"generation_config"in this&&Object.assign(i,this.generation_config),r&&Object.assign(i,r),n&&Object.assign(i,Hr(n,Object.getOwnPropertyNames(i))),i}_get_stopping_criteria(r,n=null){const a=new Xo;return r.max_length!==null&&a.push(new o2(r.max_length,this.config.max_position_embeddings??null)),r.eos_token_id!==null&&a.push(new l2(r.eos_token_id)),n&&a.extend(n),a}_validate_model_class(){if(!this.can_generate){const r=[il,sl,al,nl],n=ma.get(this.constructor),a=new Set,s=this.config.model_type;for(const o of r){const l=o.get(s);l&&a.add(l[0])}let i=`The current model class (${n}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw a.size>0&&(i+=` Please use the following class instead: ${[...a].join(", ")}`),Error(i)}}prepare_inputs_for_generation(...r){return this._prepare_inputs_for_generation(this,...r)}_update_model_kwargs_for_generation({generated_input_ids:r,outputs:n,model_inputs:a,is_encoder_decoder:s}){return a.past_key_values=this.getPastKeyValues(n,a.past_key_values),a.input_ids=new fe("int64",r.flat(),[r.length,1]),s||(a.attention_mask=gr([a.attention_mask,la([a.attention_mask.dims[0],1])],1)),a.position_ids=null,a}_prepare_model_inputs({inputs:r,bos_token_id:n,model_kwargs:a}){const s=Hr(a,this.forward_params),i=this.main_input_name;if(i in s){if(r)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else s[i]=r;return{inputs_tensor:s[i],model_inputs:s,model_input_name:i}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:r,model_inputs:n,model_input_name:a,generation_config:s}){const i=Hr(n,this.sessions.model.inputNames);let{last_hidden_state:o}=await ga(this,i);return s.guidance_scale!==null&&s.guidance_scale>1&&(o=gr([o,Rw(o,0)],0),"attention_mask"in n&&(n.attention_mask=gr([n.attention_mask,Nw(n.attention_mask)],0))),n.encoder_outputs=o,n}_prepare_decoder_input_ids_for_generation({batch_size:r,model_input_name:n,model_kwargs:a,decoder_start_token_id:s,bos_token_id:i,generation_config:o}){let{decoder_input_ids:l,...u}=a;if(!l)if(s??(s=i),this.config.model_type==="musicgen")l=Array.from({length:r*this.config.decoder.num_codebooks},()=>[s]);else if(Array.isArray(s)){if(s.length!==r)throw new Error(`\`decoder_start_token_id\` expcted to have length ${r} but got ${s.length}`);l=s}else l=Array.from({length:r},()=>[s]);return l=Ym(l),a.decoder_attention_mask=Bw(l),{input_ids:l,model_inputs:u}}async generate({inputs:r=null,generation_config:n=null,logits_processor:a=null,stopping_criteria:s=null,streamer:i=null,...o}){this._validate_model_class(),n=this._prepare_generation_config(n,o);let{inputs_tensor:l,model_inputs:u,model_input_name:d}=this._prepare_model_inputs({inputs:r,model_kwargs:o});const h=this.config.is_encoder_decoder;h&&("encoder_outputs"in u||(u=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:l,model_inputs:u,model_input_name:d,generation_config:n})));let m;h?{input_ids:m,model_inputs:u}=this._prepare_decoder_input_ids_for_generation({batch_size:u[d].dims.at(0),model_input_name:d,model_kwargs:u,decoder_start_token_id:n.decoder_start_token_id,bos_token_id:n.bos_token_id,generation_config:n}):m=u[d];let g=m.dims.at(-1);n.max_new_tokens!==null&&(n.max_length=g+n.max_new_tokens);const p=this._get_logits_processor(n,g,a),w=this._get_stopping_criteria(n,s),v=u[d].dims.at(0),x=Ri.getSampler(n),$=new Array(v).fill(0),C=m.tolist();i&&i.put(C);let T=null;for(;;){u=this.prepare_inputs_for_generation(C,u,n);const P=await this.forward(u),B=P.logits.slice(null,-1,null),L=p(C,B),j=[];for(let ue=0;ueue)){n.return_dict_in_generate&&(T=this.getPastKeyValues(P,u.past_key_values,!1));break}u=this._update_model_kwargs_for_generation({generated_input_ids:j,outputs:P,model_inputs:u,is_encoder_decoder:h})}i&&i.end();const A=new fe("int64",C.flat(),[C.length,C[0].length]);return n.return_dict_in_generate?{sequences:A,past_key_values:T}:A}addAttentionsToBeam(r,n){if(this.config.is_encoder_decoder){if(!n.cross_attentions||n.cross_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce cross-attentions. This is most likely because the model was not exported with `output_attentions=True`.");r.cross_attentions||(r.cross_attentions=[]),r.cross_attentions.push(n.cross_attentions)}if(!n.decoder_attentions||n.decoder_attentions.length===0)throw Error("`output_attentions` is true, but the model did not produce decoder-attentions. This is most likely because the model was not exported with `output_attentions=True`.");r.decoder_attentions||(r.decoder_attentions=[]),r.decoder_attentions.push(n.decoder_attentions)}groupBeams(r){const n=Object.create(null);for(const a of r)n[a.id]===void 0?n[a.id]=[a]:n[a.id].push(a);return Object.values(n)}getPastKeyValues(r,n,a=!0){const s=Object.create(null);for(const i in r)if(i.startsWith("present")){let o=i.replace("present","past_key_values");if(n&&i.includes("encoder"))s[o]=n[o];else{if(a&&n){const l=n[o];l.location==="gpu-buffer"&&l.dispose()}s[o]=r[i]}}return s}getAttentions(r){const n=Object.create(null);for(const a of["cross_attentions","decoder_attentions"]){const s=[];for(const i in r)if(i.startsWith(a)){const o=i.split(".").pop();s[o]=r[i]}n[a]=s}return n}addPastKeyValues(r,n){if(n)Object.assign(r,n);else{const a=this.custom_config.kv_cache_dtype??"float32",s=a==="float16"?new Uint16Array:[],i=Vm(this.config);for(const o in i)r[o]=new fe(a,s,i[o])}}}class Xt{}class _a extends re{}class y2 extends _a{}class w2 extends _a{async _call(e){return new $t(await super._call(e))}}class b2 extends _a{async _call(e){return new ze(await super._call(e))}}class v2 extends _a{async _call(e){return new vt(await super._call(e))}}class $2 extends _a{async _call(e){return new Tt(await super._call(e))}}class x2 extends re{}class S2 extends x2{}class ya extends re{}class k2 extends ya{}class E2 extends ya{async _call(e){return new $t(await super._call(e))}}class C2 extends ya{async _call(e){return new ze(await super._call(e))}}class T2 extends ya{async _call(e){return new vt(await super._call(e))}}class A2 extends ya{async _call(e){return new Tt(await super._call(e))}}class wa extends re{}class I2 extends wa{}class M2 extends wa{async _call(e){return new $t(await super._call(e))}}class O2 extends wa{async _call(e){return new ze(await super._call(e))}}class z2 extends wa{async _call(e){return new vt(await super._call(e))}}class P2 extends wa{async _call(e){return new Tt(await super._call(e))}}class ba extends re{}class R2 extends ba{}class B2 extends ba{async _call(e){return new $t(await super._call(e))}}class D2 extends ba{async _call(e){return new ze(await super._call(e))}}class N2 extends ba{async _call(e){return new vt(await super._call(e))}}class F2 extends ba{async _call(e){return new Tt(await super._call(e))}}class va extends re{}class L2 extends va{}class U2 extends va{async _call(e){return new $t(await super._call(e))}}class W2 extends va{async _call(e){return new ze(await super._call(e))}}class V2 extends va{async _call(e){return new vt(await super._call(e))}}class G2 extends va{async _call(e){return new Tt(await super._call(e))}}class $a extends re{}class H2 extends $a{}class j2 extends $a{async _call(e){return new $t(await super._call(e))}}class q2 extends $a{async _call(e){return new ze(await super._call(e))}}class K2 extends $a{async _call(e){return new vt(await super._call(e))}}class Y2 extends $a{async _call(e){return new Tt(await super._call(e))}}class xa extends re{}class X2 extends xa{}class Q2 extends xa{async _call(e){return new $t(await super._call(e))}}class Z2 extends xa{async _call(e){return new ze(await super._call(e))}}class J2 extends xa{async _call(e){return new vt(await super._call(e))}}class e1 extends xa{async _call(e){return new Tt(await super._call(e))}}class Sa extends re{}class t1 extends Sa{}class r1 extends Sa{async _call(e){return new ze(await super._call(e))}}class n1 extends Sa{async _call(e){return new vt(await super._call(e))}}class a1 extends Sa{async _call(e){return new Tt(await super._call(e))}}class i1 extends Sa{async _call(e){return new $t(await super._call(e))}}class Di extends re{}class s1 extends Di{}class o1 extends Di{async _call(e){return new $t(await super._call(e))}}class l1 extends Di{async _call(e){return new ze(await super._call(e))}}class u1 extends Di{async _call(e){return new vt(await super._call(e))}}class Ni extends re{}class d1 extends Ni{}class c1 extends Ni{async _call(e){return new $t(await super._call(e))}}class p1 extends Ni{async _call(e){return new ze(await super._call(e))}}class h1 extends Ni{async _call(e){return new Tt(await super._call(e))}}class ka extends re{}class f1 extends ka{}class m1 extends ka{async _call(e){return new $t(await super._call(e))}}class g1 extends ka{async _call(e){return new ze(await super._call(e))}}class _1 extends ka{async _call(e){return new vt(await super._call(e))}}class y1 extends ka{async _call(e){return new Tt(await super._call(e))}}class Fi extends re{}class w1 extends Fi{}class b1 extends Fi{async _call(e){return new $t(await super._call(e))}}class v1 extends Fi{async _call(e){return new ze(await super._call(e))}}class $1 extends Fi{async _call(e){return new Tt(await super._call(e))}}class Li extends re{}class x1 extends Li{}class S1 extends Li{async _call(e){return new ze(await super._call(e))}}class k1 extends Li{async _call(e){return new Tt(await super._call(e))}}class E1 extends Li{async _call(e){return new $t(await super._call(e))}}class Jm extends re{constructor(r,n,a){super(r,n);D(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"]);this.generation_config=a}}class C1 extends Jm{}class T1 extends Jm{}class eg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class A1 extends eg{}class I1 extends eg{}class tg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class M1 extends tg{}class O1 extends tg{}class Zo extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class z1 extends Zo{}class P1 extends Zo{}class R1 extends Zo{async _call(e){return new ze(await super._call(e))}}class Ui extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class B1 extends Ui{}class D1 extends Ui{}class N1 extends Ui{async _call(e){return new ze(await super._call(e))}}class F1 extends Ui{}class rg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class L1 extends rg{}class U1 extends rg{}class ng extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class W1 extends ng{}class V1 extends ng{}class Ea extends re{}class G1 extends Ea{}class H1 extends Ea{async _call(e){return new $t(await super._call(e))}}class j1 extends Ea{async _call(e){return new ze(await super._call(e))}}class q1 extends Ea{async _call(e){return new vt(await super._call(e))}}class K1 extends Ea{async _call(e){return new Tt(await super._call(e))}}class Ca extends re{}class Y1 extends Ca{}class X1 extends Ca{async _call(e){return new $t(await super._call(e))}}class Q1 extends Ca{async _call(e){return new ze(await super._call(e))}}class Z1 extends Ca{async _call(e){return new vt(await super._call(e))}}class J1 extends Ca{async _call(e){return new Tt(await super._call(e))}}class Ta extends re{}class e$ extends Ta{}class t$ extends Ta{async _call(e){return new $t(await super._call(e))}}class r$ extends Ta{async _call(e){return new ze(await super._call(e))}}class n$ extends Ta{async _call(e){return new vt(await super._call(e))}}class a$ extends Ta{async _call(e){return new Tt(await super._call(e))}}class ag extends re{}class i$ extends ag{}class s$ extends ag{}class ig extends re{constructor(r,n,a){super(r,n);D(this,"requires_attention_mask",!1);D(this,"main_input_name","input_features");D(this,"forward_params",["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"]);this.generation_config=a}}class o$ extends ig{}class l$ extends ig{_prepare_generation_config(e,r){return super._prepare_generation_config(e,r,p2)}_retrieve_init_tokens(e){const r=[e.decoder_start_token_id];let n=e.language;const a=e.task;if(e.is_multilingual){n||(console.warn("No language specified - defaulting to English (en)."),n="en");const i=`<|${Am(n)}|>`;r.push(e.lang_to_id[i]),r.push(e.task_to_id[a??"transcribe"])}else if(n||a)throw new Error("Cannot specify `task` or `language` for an English-only model. If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");return!e.return_timestamps&&e.no_timestamps_token_id&&r.at(-1)!==e.no_timestamps_token_id?r.push(e.no_timestamps_token_id):e.return_timestamps&&r.at(-1)===e.no_timestamps_token_id&&(console.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),r.pop()),r.filter(s=>s!=null)}async generate({inputs:e=null,generation_config:r=null,logits_processor:n=null,stopping_criteria:a=null,...s}){r=this._prepare_generation_config(r,s);const i=this._retrieve_init_tokens(r);return r.return_timestamps&&(n??(n=new Ko),n.push(new Qv(r,i))),await super.generate({inputs:e,generation_config:r,logits_processor:n,decoder_input_ids:i,...s})}_extract_token_timestamps(e,r,n=null,a=.02){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");let s=this.config.median_filter_width;s===void 0&&(console.warn("Model config has no `median_filter_width`, using default value of 7."),s=7);const i=e.cross_attentions.map(u=>{let d=Array.from({length:this.config.decoder_layers},(v,x)=>gr(u.map($=>$[x]),2)),h=oa(r.map(([v,x])=>n?d[v].slice(null,x,null,[0,n]):d[v].slice(null,x)));h=h.transpose(1,0,2,3);let[m,g]=Mw(h,-2,0,!0),p=h.clone();for(let v=0;vh[x+1]-h[x]),p=ct([1],g).map(v=>!!v),w=[];for(let v=0;vm.findIndex(g=>g==s)),l=o.every(m=>m===-1),u=o.every(m=>m!==-1);if(!l&&!u)throw new Error("Every input should contain either 0 or 1 image token.");if(l)return{inputs_embeds:e,attention_mask:a};const d=[],h=[];for(let m=0;ms*i,1);e.input_labels=new fe("int64",new BigInt64Array(a).fill(1n),n)}const r={image_embeddings:e.image_embeddings,image_positional_embeddings:e.image_positional_embeddings};return e.input_points&&(r.input_points=e.input_points),e.input_labels&&(r.input_labels=e.input_labels),e.input_boxes&&(r.input_boxes=e.input_boxes),await Nr(this.sessions.prompt_encoder_mask_decoder,r)}async _call(e){return new qx(await super._call(e))}}class qx extends Xt{constructor({iou_scores:e,pred_masks:r}){super(),this.iou_scores=e,this.pred_masks=r}}class Gg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Kx extends Gg{}class Yx extends Gg{}class Hg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class Xx extends Hg{}class Qx extends Hg{}class on extends re{}class Zx extends on{}class Jx extends on{async _call(e){return new Mn(await super._call(e))}}class eS extends on{async _call(e){return new ze(await super._call(e))}}class tS extends on{async _call(e){return new vt(await super._call(e))}}class el extends re{}class rS extends el{}class nS extends el{async _call(e){return new Mn(await super._call(e))}}class aS extends el{async _call(e){return new ze(await super._call(e))}}class Vi extends re{}class iS extends Vi{}class sS extends Vi{async _call(e){return new Mn(await super._call(e))}}class oS extends Vi{async _call(e){return new ze(await super._call(e))}}class lS extends Vi{async _call(e){return new vt(await super._call(e))}}class tl extends re{}class uS extends tl{}class dS extends tl{async _call(e){return new Mn(await super._call(e))}}class cS extends tl{async _call(e){return new ze(await super._call(e))}}class pS extends on{}class hS extends on{async _call(e){return new Mn(await super._call(e))}}class fS extends on{async _call(e){return new ze(await super._call(e))}}class Aa extends re{}class mS extends Aa{}class gS extends Aa{async _call(e){return new Mn(await super._call(e))}}class _S extends Aa{async _call(e){return new ze(await super._call(e))}}class yS extends Aa{async _call(e){return new QS(await super._call(e))}}class wS extends Aa{async _call(e){return new vt(await super._call(e))}}class jg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class bS extends jg{}class vS extends jg{async generate_speech(e,r,{threshold:n=.5,minlenratio:a=0,maxlenratio:s=20,vocoder:i=null}={}){const o={input_ids:e},{encoder_outputs:l,encoder_attention_mask:u}=await ga(this,o),d=l.dims[1]/this.config.reduction_factor,h=Math.floor(d*s),m=Math.floor(d*a),g=this.config.num_mel_bins;let p=[],w=null,v=null,x=0;for(;;){++x;const T=Xm(!!v);let A;v?A=v.output_sequence_out:A=new fe("float32",new Float32Array(g),[1,1,g]);let P={use_cache_branch:T,output_sequence:A,encoder_attention_mask:u,speaker_embeddings:r,encoder_hidden_states:l};this.addPastKeyValues(P,w),v=await Nr(this.sessions.decoder_model_merged,P),w=this.getPastKeyValues(v,w);const{prob:B,spectrum:L}=v;if(p.push(L),x>=m&&(Array.from(B.data).filter(j=>j>=n).length>0||x>=h))break}const $=gr(p),{waveform:C}=await Nr(i.sessions.model,{spectrogram:$});return{spectrogram:$,waveform:C}}}class $S extends re{constructor(){super(...arguments);D(this,"main_input_name","spectrogram")}}class xS extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class SS extends xS{}class qg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class kS extends qg{}class ES extends qg{}class Kg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class CS extends Kg{}class TS extends Kg{}class Yg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class AS extends Yg{}class IS extends Yg{}class rl extends re{}class MS extends rl{}class OS extends rl{static async from_pretrained(e,r={}){return r.model_file_name??(r.model_file_name="text_model"),super.from_pretrained(e,r)}}class zS extends rl{static async from_pretrained(e,r={}){return r.model_file_name??(r.model_file_name="audio_model"),super.from_pretrained(e,r)}}class PS extends re{}class Xg extends PS{async _call(e){return new JS(await super._call(e))}}class Qg extends re{}class RS extends Qg{}class BS extends Qg{}class Zg extends re{constructor(e,r,n){super(e,r),this.generation_config=n}}class DS extends Zg{}class NS extends Zg{}class Jg extends re{}class FS extends Jg{}class LS extends Jg{async _call(e){return new ze(await super._call(e))}}class e_ extends re{constructor(r,n,a){super(r,n);D(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"]);this.generation_config=a}_apply_and_filter_by_delay_pattern_mask(r){const[n,a]=r.dims,s=this.config.decoder.num_codebooks,i=a-s;let o=0;for(let d=0;d0&&g<=i&&(r.data[o++]=r.data[d])}const l=Math.floor(n/s),u=o/(l*s);return new fe(r.type,r.data.slice(0,o),[l,s,u])}prepare_inputs_for_generation(r,n,a){let s=structuredClone(r);for(let o=0;o=l&&(s[o][l]=BigInt(this.config.decoder.pad_token_id));return a.guidance_scale!==null&&a.guidance_scale>1&&(s=s.concat(s)),super.prepare_inputs_for_generation(s,n,a)}async generate(r){const n=await super.generate(r),a=this._apply_and_filter_by_delay_pattern_mask(n).unsqueeze_(0),{audio_values:s}=await Nr(this.sessions.encodec_decode,{audio_codes:a});return s}}class Xe{static async from_pretrained(e,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:s=!1,revision:i="main",model_file_name:o=null,subfolder:l="onnx",device:u=null,dtype:d=null,use_external_data_format:h=null,session_options:m={}}={}){let g={progress_callback:r,config:n,cache_dir:a,local_files_only:s,revision:i,model_file_name:o,subfolder:l,device:u,dtype:d,use_external_data_format:h,session_options:m};if(g.config=await Gm.from_pretrained(e,g),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);for(let p of this.MODEL_CLASS_MAPPINGS){const w=p.get(g.config.model_type);if(w)return await w[1].from_pretrained(e,g)}if(this.BASE_IF_FAIL)return console.warn(`Unknown model class "${g.config.model_type}", attempting to construct from base class.`),await re.from_pretrained(e,g);throw Error(`Unsupported model type: ${g.config.model_type}`)}}D(Xe,"MODEL_CLASS_MAPPINGS",null),D(Xe,"BASE_IF_FAIL",!1);const US=new Map([["bert",["BertModel",y2]],["nomic_bert",["NomicBertModel",S2]],["roformer",["RoFormerModel",k2]],["electra",["ElectraModel",R2]],["esm",["EsmModel",s1]],["convbert",["ConvBertModel",I2]],["camembert",["CamembertModel",L2]],["deberta",["DebertaModel",H2]],["deberta-v2",["DebertaV2Model",X2]],["mpnet",["MPNetModel",f1]],["albert",["AlbertModel",x1]],["distilbert",["DistilBertModel",t1]],["roberta",["RobertaModel",G1]],["xlm",["XLMModel",Y1]],["xlm-roberta",["XLMRobertaModel",e$]],["clap",["ClapModel",MS]],["clip",["CLIPModel",c$]],["clipseg",["CLIPSegModel",w$]],["chinese_clip",["ChineseCLIPModel",y$]],["siglip",["SiglipModel",f$]],["mobilebert",["MobileBertModel",d1]],["squeezebert",["SqueezeBertModel",w1]],["wav2vec2",["Wav2Vec2Model",Zx]],["wav2vec2-bert",["Wav2Vec2BertModel",uS]],["unispeech",["UniSpeechModel",rS]],["unispeech-sat",["UniSpeechSatModel",iS]],["hubert",["HubertModel",pS]],["wavlm",["WavLMModel",mS]],["audio-spectrogram-transformer",["ASTModel",i$]],["vits",["VitsModel",Xg]],["detr",["DetrModel",hx]],["table-transformer",["TableTransformerModel",_x]],["vit",["ViTModel",Q$]],["fastvit",["FastViTModel",J$]],["mobilevit",["MobileViTModel",nx]],["mobilevitv2",["MobileViTV2Model",ix]],["owlvit",["OwlViTModel",ox]],["owlv2",["Owlv2Model",ux]],["beit",["BeitModel",cx]],["deit",["DeiTModel",bx]],["convnext",["ConvNextModel",Bx]],["convnextv2",["ConvNextV2Model",Nx]],["dinov2",["Dinov2Model",Lx]],["resnet",["ResNetModel",$x]],["swin",["SwinModel",Sx]],["swin2sr",["Swin2SRModel",Ex]],["donut-swin",["DonutSwinModel",Rx]],["yolos",["YolosModel",Wx]],["dpt",["DPTModel",Tx]],["glpn",["GLPNModel",Ox]],["hifigan",["SpeechT5HifiGan",$S]],["efficientnet",["EfficientNetModel",FS]]]),WS=new Map([["t5",["T5Model",C1]],["longt5",["LongT5Model",A1]],["mt5",["MT5Model",M1]],["bart",["BartModel",z1]],["mbart",["MBartModel",B1]],["marian",["MarianModel",Kx]],["whisper",["WhisperModel",o$]],["m2m_100",["M2M100Model",Xx]],["blenderbot",["BlenderbotModel",L1]],["blenderbot-small",["BlenderbotSmallModel",W1]]]),VS=new Map([["bloom",["BloomModel",H$]],["gpt2",["GPT2Model",v$]],["gptj",["GPTJModel",C$]],["gpt_bigcode",["GPTBigCodeModel",A$]],["gpt_neo",["GPTNeoModel",x$]],["gpt_neox",["GPTNeoXModel",k$]],["codegen",["CodeGenModel",M$]],["llama",["LlamaModel",z$]],["gemma",["GemmaModel",R$]],["openelm",["OpenELMModel",D$]],["qwen2",["Qwen2Model",F$]],["phi",["PhiModel",U$]],["phi3",["Phi3Model",V$]],["mpt",["MptModel",q$]],["opt",["OPTModel",Y$]],["mistral",["MistralModel",kS]],["starcoder2",["Starcoder2Model",CS]],["falcon",["FalconModel",AS]],["stablelm",["StableLmModel",DS]]]),nl=new Map([["speecht5",["SpeechT5ForSpeechToText",bS]],["whisper",["WhisperForConditionalGeneration",l$]]]),t_=new Map([["speecht5",["SpeechT5ForTextToSpeech",vS]]]),r_=new Map([["vits",["VitsModel",Xg]],["musicgen",["MusicgenForConditionalGeneration",e_]]]),n_=new Map([["bert",["BertForSequenceClassification",b2]],["roformer",["RoFormerForSequenceClassification",C2]],["electra",["ElectraForSequenceClassification",D2]],["esm",["EsmForSequenceClassification",l1]],["convbert",["ConvBertForSequenceClassification",O2]],["camembert",["CamembertForSequenceClassification",W2]],["deberta",["DebertaForSequenceClassification",q2]],["deberta-v2",["DebertaV2ForSequenceClassification",Z2]],["mpnet",["MPNetForSequenceClassification",g1]],["albert",["AlbertForSequenceClassification",S1]],["distilbert",["DistilBertForSequenceClassification",r1]],["roberta",["RobertaForSequenceClassification",j1]],["xlm",["XLMForSequenceClassification",Q1]],["xlm-roberta",["XLMRobertaForSequenceClassification",r$]],["bart",["BartForSequenceClassification",R1]],["mbart",["MBartForSequenceClassification",N1]],["mobilebert",["MobileBertForSequenceClassification",p1]],["squeezebert",["SqueezeBertForSequenceClassification",v1]]]),a_=new Map([["bert",["BertForTokenClassification",v2]],["roformer",["RoFormerForTokenClassification",T2]],["electra",["ElectraForTokenClassification",N2]],["esm",["EsmForTokenClassification",u1]],["convbert",["ConvBertForTokenClassification",z2]],["camembert",["CamembertForTokenClassification",V2]],["deberta",["DebertaForTokenClassification",K2]],["deberta-v2",["DebertaV2ForTokenClassification",J2]],["mpnet",["MPNetForTokenClassification",_1]],["distilbert",["DistilBertForTokenClassification",n1]],["roberta",["RobertaForTokenClassification",q1]],["xlm",["XLMForTokenClassification",Z1]],["xlm-roberta",["XLMRobertaForTokenClassification",n$]]]),al=new Map([["t5",["T5ForConditionalGeneration",T1]],["longt5",["LongT5ForConditionalGeneration",I1]],["mt5",["MT5ForConditionalGeneration",O1]],["bart",["BartForConditionalGeneration",P1]],["mbart",["MBartForConditionalGeneration",D1]],["marian",["MarianMTModel",Yx]],["m2m_100",["M2M100ForConditionalGeneration",Qx]],["blenderbot",["BlenderbotForConditionalGeneration",U1]],["blenderbot-small",["BlenderbotSmallForConditionalGeneration",V1]]]),il=new Map([["bloom",["BloomForCausalLM",j$]],["gpt2",["GPT2LMHeadModel",$$]],["gptj",["GPTJForCausalLM",T$]],["gpt_bigcode",["GPTBigCodeForCausalLM",I$]],["gpt_neo",["GPTNeoForCausalLM",S$]],["gpt_neox",["GPTNeoXForCausalLM",E$]],["codegen",["CodeGenForCausalLM",O$]],["llama",["LlamaForCausalLM",P$]],["gemma",["GemmaForCausalLM",B$]],["openelm",["OpenELMForCausalLM",N$]],["qwen2",["Qwen2ForCausalLM",L$]],["phi",["PhiForCausalLM",W$]],["phi3",["Phi3ForCausalLM",G$]],["mpt",["MptForCausalLM",K$]],["opt",["OPTForCausalLM",X$]],["mbart",["MBartForCausalLM",F1]],["mistral",["MistralForCausalLM",ES]],["starcoder2",["Starcoder2ForCausalLM",TS]],["falcon",["FalconForCausalLM",IS]],["trocr",["TrOCRForCausalLM",SS]],["stablelm",["StableLmForCausalLM",NS]]]),i_=new Map([["bert",["BertForMaskedLM",w2]],["roformer",["RoFormerForMaskedLM",E2]],["electra",["ElectraForMaskedLM",B2]],["esm",["EsmForMaskedLM",o1]],["convbert",["ConvBertForMaskedLM",M2]],["camembert",["CamembertForMaskedLM",U2]],["deberta",["DebertaForMaskedLM",j2]],["deberta-v2",["DebertaV2ForMaskedLM",Q2]],["mpnet",["MPNetForMaskedLM",m1]],["albert",["AlbertForMaskedLM",E1]],["distilbert",["DistilBertForMaskedLM",i1]],["roberta",["RobertaForMaskedLM",H1]],["xlm",["XLMWithLMHeadModel",X1]],["xlm-roberta",["XLMRobertaForMaskedLM",t$]],["mobilebert",["MobileBertForMaskedLM",c1]],["squeezebert",["SqueezeBertForMaskedLM",b1]]]),s_=new Map([["bert",["BertForQuestionAnswering",$2]],["roformer",["RoFormerForQuestionAnswering",A2]],["electra",["ElectraForQuestionAnswering",F2]],["convbert",["ConvBertForQuestionAnswering",P2]],["camembert",["CamembertForQuestionAnswering",G2]],["deberta",["DebertaForQuestionAnswering",Y2]],["deberta-v2",["DebertaV2ForQuestionAnswering",e1]],["mpnet",["MPNetForQuestionAnswering",y1]],["albert",["AlbertForQuestionAnswering",k1]],["distilbert",["DistilBertForQuestionAnswering",a1]],["roberta",["RobertaForQuestionAnswering",K1]],["xlm",["XLMForQuestionAnswering",J1]],["xlm-roberta",["XLMRobertaForQuestionAnswering",a$]],["mobilebert",["MobileBertForQuestionAnswering",h1]],["squeezebert",["SqueezeBertForQuestionAnswering",$1]]]),sl=new Map([["vision-encoder-decoder",["VisionEncoderDecoderModel",sg]]]),GS=new Map([["llava",["LlavaForConditionalGeneration",og]],["moondream1",["Moondream1ForConditionalGeneration",d$]]]),HS=new Map([["vision-encoder-decoder",["VisionEncoderDecoderModel",sg]]]),o_=new Map([["vit",["ViTForImageClassification",Z$]],["fastvit",["FastViTForImageClassification",ex]],["mobilevit",["MobileViTForImageClassification",ax]],["mobilevitv2",["MobileViTV2ForImageClassification",sx]],["beit",["BeitForImageClassification",px]],["deit",["DeiTForImageClassification",vx]],["convnext",["ConvNextForImageClassification",Dx]],["convnextv2",["ConvNextV2ForImageClassification",Fx]],["dinov2",["Dinov2ForImageClassification",Ux]],["resnet",["ResNetForImageClassification",xx]],["swin",["SwinForImageClassification",kx]],["segformer",["SegformerForImageClassification",RS]],["efficientnet",["EfficientNetForImageClassification",LS]]]),l_=new Map([["detr",["DetrForObjectDetection",fx]],["table-transformer",["TableTransformerForObjectDetection",yx]],["yolos",["YolosForObjectDetection",Vx]]]),u_=new Map([["owlvit",["OwlViTForObjectDetection",lx]],["owlv2",["Owlv2ForObjectDetection",dx]]]),d_=new Map([["detr",["DetrForSegmentation",mx]],["clipseg",["CLIPSegForImageSegmentation",b$]]]),c_=new Map([["segformer",["SegformerForSemanticSegmentation",BS]]]),jS=new Map([["sam",["SamModel",jx]]]),p_=new Map([["wav2vec2",["Wav2Vec2ForCTC",Jx]],["wav2vec2-bert",["Wav2Vec2BertForCTC",dS]],["unispeech",["UniSpeechForCTC",nS]],["unispeech-sat",["UniSpeechSatForCTC",sS]],["wavlm",["WavLMForCTC",gS]],["hubert",["HubertForCTC",hS]]]),h_=new Map([["wav2vec2",["Wav2Vec2ForSequenceClassification",eS]],["wav2vec2-bert",["Wav2Vec2BertForSequenceClassification",cS]],["unispeech",["UniSpeechForSequenceClassification",aS]],["unispeech-sat",["UniSpeechSatForSequenceClassification",oS]],["wavlm",["WavLMForSequenceClassification",_S]],["hubert",["HubertForSequenceClassification",fS]],["audio-spectrogram-transformer",["ASTForAudioClassification",s$]]]),qS=new Map([["wavlm",["WavLMForXVector",yS]]]),KS=new Map([["unispeech-sat",["UniSpeechSatForAudioFrameClassification",lS]],["wavlm",["WavLMForAudioFrameClassification",wS]],["wav2vec2",["Wav2Vec2ForAudioFrameClassification",tS]]]),YS=new Map([["vitmatte",["VitMatteForImageMatting",rx]]]),f_=new Map([["swin2sr",["Swin2SRForImageSuperResolution",Cx]]]),m_=new Map([["dpt",["DPTForDepthEstimation",Ax]],["depth_anything",["DepthAnythingForDepthEstimation",Mx]],["glpn",["GLPNForDepthEstimation",zx]]]),g_=new Map([["clip",["CLIPVisionModelWithProjection",h$]],["siglip",["SiglipVisionModel",g$]]]),__=[[US,$e.EncoderOnly],[WS,$e.EncoderDecoder],[VS,$e.DecoderOnly],[n_,$e.EncoderOnly],[a_,$e.EncoderOnly],[al,$e.Seq2Seq],[nl,$e.Seq2Seq],[il,$e.DecoderOnly],[i_,$e.EncoderOnly],[s_,$e.EncoderOnly],[sl,$e.Vision2Seq],[GS,$e.ImageTextToText],[o_,$e.EncoderOnly],[d_,$e.EncoderOnly],[c_,$e.EncoderOnly],[YS,$e.EncoderOnly],[f_,$e.EncoderOnly],[m_,$e.EncoderOnly],[l_,$e.EncoderOnly],[u_,$e.EncoderOnly],[jS,$e.MaskGeneration],[p_,$e.EncoderOnly],[h_,$e.EncoderOnly],[t_,$e.Seq2Seq],[r_,$e.EncoderOnly],[qS,$e.EncoderOnly],[KS,$e.EncoderOnly],[g_,$e.EncoderOnly]];for(const[t,e]of __)for(const[r,n]of t.values())Bi.set(r,e),ma.set(n,r),qm.set(r,n);const XS=[["MusicgenForConditionalGeneration",e_,$e.Musicgen],["CLIPTextModelWithProjection",p$,$e.EncoderOnly],["SiglipTextModel",m$,$e.EncoderOnly],["ClapTextModelWithProjection",OS,$e.EncoderOnly],["ClapAudioModelWithProjection",zS,$e.EncoderOnly]];for(const[t,e,r]of XS)Bi.set(t,r),ma.set(e,t),qm.set(t,e);class ln extends Xe{}D(ln,"MODEL_CLASS_MAPPINGS",__.map(e=>e[0])),D(ln,"BASE_IF_FAIL",!0);class ol extends Xe{}D(ol,"MODEL_CLASS_MAPPINGS",[n_]);class y_ extends Xe{}D(y_,"MODEL_CLASS_MAPPINGS",[a_]);class Gi extends Xe{}D(Gi,"MODEL_CLASS_MAPPINGS",[al]);class w_ extends Xe{}D(w_,"MODEL_CLASS_MAPPINGS",[nl]);class b_ extends Xe{}D(b_,"MODEL_CLASS_MAPPINGS",[t_]);class v_ extends Xe{}D(v_,"MODEL_CLASS_MAPPINGS",[r_]);class $_ extends Xe{}D($_,"MODEL_CLASS_MAPPINGS",[il]);class x_ extends Xe{}D(x_,"MODEL_CLASS_MAPPINGS",[i_]);class S_ extends Xe{}D(S_,"MODEL_CLASS_MAPPINGS",[s_]);class k_ extends Xe{}D(k_,"MODEL_CLASS_MAPPINGS",[sl]);class E_ extends Xe{}D(E_,"MODEL_CLASS_MAPPINGS",[o_]);class C_ extends Xe{}D(C_,"MODEL_CLASS_MAPPINGS",[d_]);class T_ extends Xe{}D(T_,"MODEL_CLASS_MAPPINGS",[c_]);class A_ extends Xe{}D(A_,"MODEL_CLASS_MAPPINGS",[l_]);class I_ extends Xe{}D(I_,"MODEL_CLASS_MAPPINGS",[u_]);class M_ extends Xe{}D(M_,"MODEL_CLASS_MAPPINGS",[p_]);class O_ extends Xe{}D(O_,"MODEL_CLASS_MAPPINGS",[h_]);class z_ extends Xe{}D(z_,"MODEL_CLASS_MAPPINGS",[HS]);class P_ extends Xe{}D(P_,"MODEL_CLASS_MAPPINGS",[f_]);class R_ extends Xe{}D(R_,"MODEL_CLASS_MAPPINGS",[m_]);class B_ extends Xe{}D(B_,"MODEL_CLASS_MAPPINGS",[g_]);class ze extends Xt{constructor({logits:e}){super(),this.logits=e}}class QS extends Xt{constructor({logits:e,embeddings:r}){super(),this.logits=e,this.embeddings=r}}class vt extends Xt{constructor({logits:e}){super(),this.logits=e}}class $t extends Xt{constructor({logits:e}){super(),this.logits=e}}class Tt extends Xt{constructor({start_logits:e,end_logits:r}){super(),this.start_logits=e,this.end_logits=r}}class Mn extends Xt{constructor({logits:e}){super(),this.logits=e}}class ZS extends Xt{constructor({alphas:e}){super(),this.alphas=e}}class JS extends Xt{constructor({waveform:e,spectrogram:r}){super(),this.waveform=e,this.spectrogram=r}}const Qt=typeof self<"u",ek=Qt&&self.constructor.name==="DedicatedWorkerGlobalScope";let un,D_,Fr;if(Qt)un=(t,e)=>{if(!self.OffscreenCanvas)throw new Error("OffscreenCanvas not supported by this browser.");return new self.OffscreenCanvas(t,e)},Fr=self.createImageBitmap,D_=self.ImageData;else if(Ve)Fr=async t=>{const r=(await t.metadata()).channels,{data:n,info:a}=await t.rotate().raw().toBuffer({resolveWithObject:!0}),s=new At(new Uint8ClampedArray(n),a.width,a.height,a.channels);return r!==void 0&&r!==a.channels&&s.convert(r),s};else throw new Error("Unable to load image processing library.");const tk={0:"nearest",1:"lanczos",2:"bilinear",3:"bicubic",4:"box",5:"hamming"},rk=new Map([["png","image/png"],["jpg","image/jpeg"],["jpeg","image/jpeg"],["gif","image/gif"]]);class At{constructor(e,r,n,a){this.data=e,this.width=r,this.height=n,this.channels=a}get size(){return[this.width,this.height]}static async read(e){if(e instanceof At)return e;if(typeof e=="string"||e instanceof URL)return await this.fromURL(e);throw new Error(`Unsupported input type: ${typeof e}`)}static fromCanvas(e){if(!Qt)throw new Error("fromCanvas() is only supported in browser environments.");const n=e.getContext("2d").getImageData(0,0,e.width,e.height).data;return new At(n,e.width,e.height,4)}static async fromURL(e){const r=await Ja(e);if(r.status!==200)throw new Error(`Unable to read image from "${e}" (${r.status} ${r.statusText})`);const n=await r.blob();return this.fromBlob(n)}static async fromBlob(e){if(Qt){const r=await Fr(e),n=un(r.width,r.height).getContext("2d");return n.drawImage(r,0,0),new this(n.getImageData(0,0,r.width,r.height).data,r.width,r.height,4)}else{const r=Ve(await e.arrayBuffer());return await Fr(r)}}static fromTensor(e,r="CHW"){if(e.dims.length!==3)throw new Error(`Tensor should have 3 dimensions, but has ${e.dims.length} dimensions.`);if(r==="CHW")e=e.transpose(1,2,0);else if(r!=="HWC")throw new Error(`Unsupported channel format: ${r}`);if(!(e.data instanceof Uint8ClampedArray||e.data instanceof Uint8Array))throw new Error(`Unsupported tensor type: ${e.type}`);switch(e.dims[2]){case 1:case 2:case 3:case 4:return new At(e.data,e.dims[1],e.dims[0],e.dims[2]);default:throw new Error(`Unsupported number of channels: ${e.dims[2]}`)}}grayscale(){if(this.channels===1)return this;const e=new Uint8ClampedArray(this.width*this.height*1);switch(this.channels){case 3:case 4:for(let r=0,n=0;r=0?l=n:d=-n,a>=0?u=a:h=-a,o.drawImage(i,l,u,e,r,d,h,e,r),new At(o.getImageData(0,0,e,r).data,e,r,4).convert(s)}else{let s=this.toSharp();if(n>=0&&a>=0)s=s.extract({left:Math.floor(n),top:Math.floor(a),width:e,height:r});else if(n<=0&&a<=0){const i=Math.floor(-a),o=Math.floor(-n);s=s.extend({top:i,left:o,right:e-this.width-o,bottom:r-this.height-i})}else{let i=[0,0],o=0;a<0?(i[0]=Math.floor(-a),i[1]=r-this.height-i[0]):o=Math.floor(a);let l=[0,0],u=0;n<0?(l[0]=Math.floor(-n),l[1]=e-this.width-l[0]):u=Math.floor(n),s=s.extend({top:i[0],bottom:i[1],left:l[0],right:l[1]}).extract({left:u,top:o,width:e,height:r})}return await Fr(s)}}async toBlob(e="image/png",r=1){if(!Qt)throw new Error("toBlob() is only supported in browser environments.");return await this.toCanvas().convertToBlob({type:e,quality:r})}toTensor(e="CHW"){let r=new fe("uint8",new Uint8Array(this.data),[this.height,this.width,this.channels]);if(e!=="HWC")if(e==="CHW")r=r.permute(2,0,1);else throw new Error(`Unsupported channel format: ${e}`);return r}toCanvas(){if(!Qt)throw new Error("toCanvas() is only supported in browser environments.");const e=this.clone().rgba(),r=un(e.width,e.height),n=new D_(e.data,e.width,e.height);return r.getContext("2d").putImageData(n,0,0),r}_update(e,r,n,a=null){return this.data=e,this.width=r,this.height=n,a!==null&&(this.channels=a),this}clone(){return new At(this.data.slice(),this.width,this.height,this.channels)}convert(e){if(this.channels===e)return this;switch(e){case 1:this.grayscale();break;case 3:this.rgb();break;case 4:this.rgba();break;default:throw new Error(`Conversion failed due to unsupported number of channels: ${this.channels}`)}return this}async save(e){if(Qt){if(ek)throw new Error("Unable to save an image from a Web Worker.");const r=e.split(".").pop().toLowerCase(),n=rk.get(r)??"image/png",a=await this.toBlob(n),s=URL.createObjectURL(a),i=document.createElement("a");i.href=s,i.download=e,i.click(),i.remove()}else{if(Mt.useFS)return await this.toSharp().toFile(e);throw new Error("Unable to save the image because filesystem is disabled in this environment.")}}toSharp(){if(Qt)throw new Error("toSharp() is only supported in server-side environments.");return Ve(this.data,{raw:{width:this.width,height:this.height,channels:this.channels}})}}async function nk(t,e){if(typeof AudioContext>"u")throw Error("Unable to load audio from path/URL since `AudioContext` is not available in your environment. Instead, audio data should be passed directly to the pipeline/processor. For more information and some example code, see https://huggingface.co/docs/transformers.js/guides/node-audio-processing.");const r=await(await Ja(t)).arrayBuffer(),n=new AudioContext({sampleRate:e});typeof e>"u"&&console.warn(`No sampling rate provided, using default of ${n.sampleRate}Hz.`);const a=await n.decodeAudioData(r);let s;if(a.numberOfChannels===2){const i=Math.sqrt(2),o=a.getChannelData(0),l=a.getChannelData(1);s=new Float32Array(o.length);for(let u=0;u2595*Math.log10(1+t/700),kaldi:t=>1127*Math.log(1+t/700),slaney:(t,e=1e3,r=15,n=27/Math.log(6.4))=>t>=e?r+Math.log(t/e)*n:3*t/200};function ll(t,e="htk"){const r=ak[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}const ik={htk:t=>700*(10**(t/2595)-1),kaldi:t=>700*(Math.exp(t/1127)-1),slaney:(t,e=1e3,r=15,n=Math.log(6.4)/27)=>t>=r?e*Math.exp(n*(t-r)):200*t/3};function sk(t,e="htk"){const r=ik[e];if(!r)throw new Error('mel_scale should be one of "htk", "slaney" or "kaldi".');return typeof t=="number"?r(t):t.map(n=>r(n))}function ok(t,e){const r=Float64Array.from({length:e.length-1},(i,o)=>e[o+1]-e[o]),n=Array.from({length:t.length},()=>new Array(e.length));for(let i=0;inew Array(t.length));for(let i=0;it+n*s)}function Ia(t,e,r,n,a,s=null,i="htk",o=!1){if(s!==null&&s!=="slaney")throw new Error('norm must be one of null or "slaney"');const l=ll(r,i),u=ll(n,i),d=F_(l,u,e+2);let h=sk(d,i),m;if(o){const p=a/(t*2);m=ll(Float64Array.from({length:t},(w,v)=>v*p),i),h=d}else m=F_(0,Math.floor(a/2),t);const g=ok(m,h);if(s!==null&&s==="slaney")for(let p=0;pa)throw Error(`frame_length (${r}) may not be larger than fft_length (${a})`);if(T!==r)throw new Error(`Length of the window (${T}) must equal frame_length (${r})`);if(n<=0)throw new Error("hop_length must be greater than zero");if(s===null&&d!==null)throw new Error("You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram. Specify `power` to fix this issue.");if(i){if(o!=="reflect")throw new Error(`pad_mode="${o}" not implemented yet.`);const M=Math.floor((a-1)/2)+1;t=lk(t,M,M)}const A=Math.floor(1+Math.floor((t.length-r)/n)),P=l?Math.floor(a/2)+1:a;let B=A,L=A;x!==null&&(x>A?$&&(L=x):L=B=x);const j=new H0(a),q=new Float64Array(a),ue=new Float64Array(j.outputBufferSize),ae=new Array(B);for(let M=0;M=1;--ee)q[ee]-=u*q[ee-1];q[0]*=1-u}for(let ee=0;eeMath.pow(o,.85));break;default:throw new Error(`Unknown window type ${e}.`)}if(r&&(i=i.subarray(0,t)),n===null)return i;if(t>n)throw new Error(`Length of the window (${t}) may not be larger than frame_length (${n})`);return i}function ck([t,e,r,n]){return[t-r/2,e-n/2,t+r/2,e+n/2]}function ul(t,e=.5,r=null,n=!1){const a=t.logits,s=t.pred_boxes,[i,o,l]=a.dims;if(r!==null&&r.length!==i)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");let u=[];for(let d=0;de&&x.push(C)}else{let C=jt(v.data)[1];if(C===l-1||($=bt(v.data),$[C]A*h[(P+1)%2])),m.boxes.push(T),m.classes.push(C),m.scores.push($[C])}}u.push(m)}return u}function Ma(t,e){var r;if(!(t instanceof Float32Array||t instanceof Float64Array))throw new Error(`${e} expects input to be a Float32Array or a Float64Array, but got ${((r=t==null?void 0:t.constructor)==null?void 0:r.name)??typeof t} instead. If using the feature extractor directly, remember to use \`read_audio(url, sampling_rate)\` to obtain the raw audio data of the file/url.`)}function U_(t,e,r=0,n=null){const a=t/e;let s=q0(a)*e;return n!==null&&s>n&&(s=Math.floor(a)*e),ss?u=Math.floor(s*l/a):s>a&&(l=Math.floor(a*u/s)),await e.resize(u,l,{resample:n}))}async crop_margin(e,r=200){const n=e.clone().grayscale(),a=Bl(n.data)[0],i=jt(n.data)[0]-a;if(i===0)return e;const o=r/255;let l=n.width,u=n.height,d=0,h=0;const m=n.data;for(let g=0;gthis.preprocess(s)));return{pixel_values:oa(n.map(s=>s.pixel_values),0),original_sizes:n.map(s=>s.original_size),reshaped_input_sizes:n.map(s=>s.reshaped_input_size)}}}class pk extends Qe{post_process_semantic_segmentation(e,r=null){const n=e.logits,a=n.dims[0];if(r!==null&&r.length!==a)throw Error("Make sure that you pass in as many target sizes as the batch dimension of the logits");const s=[];for(let i=0;im[C]&&(m[C]=$[C],g[C]=x)}const p=new Array(l.dims[0]),w=h.data;for(let x=0;xx!==void 0);s.push({segmentation:h,labels:v})}return s}}class W_ extends Qe{}class hk extends W_{}class fk extends Qe{}class mk extends Qe{}class V_ extends Qe{}class gk extends V_{}class _k extends Qe{}class yk extends Qe{}class G_ extends Qe{constructor(e){super(e),this.crop_pct=this.config.crop_pct??224/256}async resize(e){var n;const r=(n=this.size)==null?void 0:n.shortest_edge;if(r===void 0)throw new Error("Size dictionary must contain 'shortest_edge' key.");if(r<384){const a=Math.floor(r/this.crop_pct),[s,i]=this.get_resize_output_image_size(e,{shortest_edge:a});e=await e.resize(s,i,{resample:this.resample}),e=await e.center_crop(r,r)}else e=await e.resize(r,r,{resample:this.resample});return e}}class wk extends G_{}class bk extends Qe{}class vk extends Qe{}class $k extends Qe{constructor(e){super(e),this.include_top=this.config.include_top??!0,this.include_top&&(this.image_std=this.image_std.map(r=>r*r))}}class H_ extends Qe{}class xk extends H_{}class j_ extends Qe{post_process_object_detection(...e){return ul(...e)}}class Sk extends j_{}class kk extends Qe{}class Ek extends Qe{}class q_ extends Qe{pad_image(e,r,n,a={}){const[s,i,o]=r;let l=this.image_mean;Array.isArray(this.image_mean)||(l=new Array(o).fill(l));let u=this.image_std;Array.isArray(u)||(u=new Array(o).fill(l));const d=l.map((h,m)=>-h/u[m]);return super.pad_image(e,r,n,{center:!0,constant_values:d,...a})}}class Ck extends q_{}class Tk extends Qe{async _call(e){const r=await super._call(e),n=[r.pixel_values.dims[0],64,64],a=new fe("int64",new BigInt64Array(n.reduce((s,i)=>s*i)).fill(1n),n);return{...r,pixel_mask:a}}post_process_object_detection(...e){return ul(...e)}remove_low_and_no_objects(e,r,n,a){let s=[],i=[],o=[];for(let l=0;ln&&(s.push(d),i.push(g),o.push(h))}return[s,i,o]}check_segment_validity(e,r,n,a=.5,s=.8){let i=[],o=0,l=0;const u=r[n].data;for(let h=0;h=a&&++l;let d=o>0&&l>0;return d&&(d=o/l>s),[d,i]}compute_segments(e,r,n,a,s,i=null,o=null){let[l,u]=o??e[0].dims,d=new fe("int32",new Int32Array(l*u),[l,u]),h=[];if(o!==null)for(let v=0;vg[C]&&(m[C]=v,g[C]=$[C])}let p=0;const w=d.data;for(let v=0;va!==r.dims[s]))throw Error(`The first ${n.length} dimensions of 'input_points' and 'input_labels' must be the same.`);return new fe("int64",e.flat(1/0).map(BigInt),n)}async _call(e,{input_points:r=null,input_labels:n=null,input_boxes:a=null}={}){const s=await super._call(e);if(r&&(s.input_points=this.reshape_input_points(r,s.original_sizes,s.reshaped_input_sizes)),n){if(!s.input_points)throw Error("`input_points` must be provided if `input_labels` are provided.");s.input_labels=this.add_input_labels(n,s.input_points)}return a&&(s.input_boxes=this.reshape_input_points(a,s.original_sizes,s.reshaped_input_sizes,!0)),s}async post_process_masks(e,r,n,{mask_threshold:a=0,binarize:s=!0,pad_size:i=null}={}){const o=[];i=i??this.pad_size;const l=[i.height,i.width];for(let u=0;ua&&(p[w]=1);m=new fe("bool",p,m.dims)}o.push(m)}return o}generate_crop_boxes(e,r,{crop_n_layers:n=0,overlap_ratio:a=512/1500,points_per_crop:s=32,crop_n_points_downscale_factor:i=1}={}){}}class Mk extends Qe{pad_image(e,r,n,a={}){const[s,i,o]=r;return super.pad_image(e,r,{width:i+(n-i%n)%n,height:s+(n-s%n)%n},{mode:"symmetric",center:!1,constant_values:-1,...a})}}class Ok extends Qe{async _call(e,r){Array.isArray(e)||(e=[e]),Array.isArray(r)||(r=[r]);const n=await Promise.all(e.map(i=>this.preprocess(i))),a=await Promise.all(r.map(i=>this.preprocess(i,{do_normalize:!1,do_convert_rgb:!1,do_convert_grayscale:!0})));return{pixel_values:oa(n.map((i,o)=>gr([i.pixel_values,a[o].pixel_values],0)),0),original_sizes:n.map(i=>i.original_size),reshaped_input_sizes:n.map(i=>i.reshaped_input_size)}}}class zk extends dn{constructor(e){var r;super(e),(r=this.config).mel_filters??(r.mel_filters=Ia(Math.floor(1+this.config.n_fft/2),this.config.feature_size,0,8e3,this.config.sampling_rate,"slaney","slaney")),this.window=ji(this.config.n_fft,"hann")}_extract_fbank_features(e){const{data:r,dims:n}=Hi(e,this.window,this.config.n_fft,this.config.hop_length,{power:2,mel_filters:this.config.mel_filters,log_mel:"log10",max_num_frames:this.config.nb_max_frames}),a=jt(r)[0];for(let s=0;sthis.config.n_samples?(console.warn("Attempting to extract features for audio longer than 30 seconds. If using a pipeline to extract transcript from a long audio clip, remember to specify `chunk_length_s` and/or `stride_length_s`."),r=e.slice(0,this.config.n_samples)):(r=new Float32Array(this.config.n_samples),r.set(e));const{data:n,dims:a}=this._extract_fbank_features(r);return{input_features:new fe("float32",n,[1,...a])}}}class Pk extends dn{_zero_mean_unit_var_norm(e){const n=e.reduce((s,i)=>s+i,0)/e.length,a=e.reduce((s,i)=>s+(i-n)**2,0)/e.length;return e.map(s=>(s-n)/Math.sqrt(a+1e-7))}async _call(e){Ma(e,"Wav2Vec2FeatureExtractor"),e instanceof Float64Array&&(e=new Float32Array(e));let r=e;this.config.do_normalize&&(r=this._zero_mean_unit_var_norm(r));const n=[1,r.length];return{input_values:new fe("float32",r,n),attention_mask:new fe("int64",new BigInt64Array(r.length).fill(1n),n)}}}class Rk extends dn{constructor(e){super(e);const r=this.config.sampling_rate,n=Ia(256,this.config.num_mel_bins,20,Math.floor(r/2),r,null,"kaldi",!0);for(let a=0;an*32768),Hi(e,this.window,400,160,{fft_length:512,power:2,center:!1,preemphasis:.97,mel_filters:this.mel_filters,log_mel:"log",mel_floor:1192092955078125e-22,remove_dc_offset:!0,max_num_frames:r,transpose:!0})}async _call(e,{padding:r=!0,pad_to_multiple_of:n=2,do_normalize_per_mel_bins:a=!0,return_attention_mask:s=!0}={}){Ma(e,"SeamlessM4TFeatureExtractor");let{data:i,dims:o}=this._extract_fbank_features(e,this.config.max_length);if(a){const[w,v]=o;for(let x=0;x0){const $=new Float32Array(v*(w+x));$.set(i),$.fill(this.config.padding_value,i.length);const C=w+x;i=$,o=[C,v],s&&(l=new fe("int64",new BigInt64Array(C),[1,C]),l.data.fill(1n,0,w))}}const[u,d]=o,h=this.config.stride;if(u%h!==0)throw new Error(`The number of frames (${u}) must be a multiple of the stride (${h}).`);const g=new fe("float32",i,o).view(1,Math.floor(u/h),d*h),p={input_features:g};if(s){const w=g.dims[1],v=new BigInt64Array(w);if(l){const x=l.data;for(let $=1,C=0;$0)if(n==="rand_trunc"){i=!0;const l=Math.floor(Math.random()*(o+1));e=e.subarray(l,l+r),s=this._extract_fbank_features(e,this.mel_filters_slaney,this.config.nb_max_samples),s.dims=[1,...s.dims]}else throw new Error(`Truncation strategy "${n}" not implemented`);else{if(o<0){let l=new Float64Array(r);if(l.set(e),a==="repeat")for(let u=e.length;uAt.read(e)))}async function qi(t,e){return Array.isArray(t)||(t=[t]),await Promise.all(t.map(r=>typeof r=="string"||r instanceof URL?nk(r,e):r instanceof Float64Array?new Float32Array(r):r))}function K_(t,e){e&&(t=t.map(i=>i|0));const[r,n,a,s]=t;return{xmin:r,ymin:n,xmax:a,ymax:s}}class tt extends wt{constructor({task:e,model:r,tokenizer:n=null,processor:a=null}){super(),this.task=e,this.model=r,this.tokenizer=n,this.processor=a}async dispose(){await this.model.dispose()}}class Gk extends tt{constructor(e){super(e)}async _call(e,{topk:r=1}={}){const n=this.tokenizer(e,{padding:!0,truncation:!0}),a=await this.model(n),s=this.model.config.problem_type==="multi_label_classification"?l=>l.sigmoid().data:l=>bt(l.data),i=this.model.config.id2label,o=[];for(const l of a.logits){const u=s(l),h=wn(u,r).map(m=>({label:i[m[0]],score:m[1]}));r===1?o.push(...h):o.push(h)}return Array.isArray(e)||r===1?o:o[0]}}class Hk extends tt{constructor(e){super(e)}async _call(e,{ignore_labels:r=["O"]}={}){const n=Array.isArray(e),a=this.tokenizer(n?e:[e],{padding:!0,truncation:!0}),i=(await this.model(a)).logits,o=this.model.config.id2label,l=[];for(let u=0;u[g,p]).filter(g=>g[1]>u),h=Array.from(bt(s.end_logits[o].data)).map((g,p)=>[g,p]).filter(g=>g[1]>u),m=B0(d,h).filter(g=>g[0][1]<=g[1][1]).map(g=>[g[0][1],g[1][1],g[0][0]*g[1][0]]).sort((g,p)=>p[2]-g[2]);for(let g=0;g{const g=[...o];return g[l]=m[0],{score:m[1],token:m[0],token_str:this.tokenizer.model.vocab[m[0]],sequence:this.tokenizer.decode(g,{skip_special_tokens:!0})}}))}return Array.isArray(e)?s:s[0]}}class cl extends tt{constructor(r){super(r);D(this,"_key","generated_text")}async _call(r,n={}){Array.isArray(r)||(r=[r]),this.model.config.prefix&&(r=r.map(u=>this.model.config.prefix+u));const a=this.model.config.task_specific_params;a&&a[this.task]&&a[this.task].prefix&&(r=r.map(u=>a[this.task].prefix+u));const s=this.tokenizer,i={padding:!0,truncation:!0};let o;this instanceof Y_&&"_build_translation_inputs"in s?o=s._build_translation_inputs(r,i,n):o=s(r,i);const l=await this.model.generate({...o,...n});return s.batch_decode(l,{skip_special_tokens:!0}).map(u=>({[this._key]:u}))}}class Kk extends cl{constructor(r){super(r);D(this,"_key","summary_text")}}class Y_ extends cl{constructor(r){super(r);D(this,"_key","translation_text")}}class Yk extends tt{constructor(e){super(e)}async _call(e,r={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}}class Xk extends tt{constructor(e){super(e),this.label2id=Object.fromEntries(Object.entries(this.model.config.label2id).map(([r,n])=>[r.toLowerCase(),n])),this.entailment_id=this.label2id.entailment,this.entailment_id===void 0&&(console.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."),this.entailment_id=2),this.contradiction_id=this.label2id.contradiction??this.label2id.not_entailment,this.contradiction_id===void 0&&(console.warn("Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id."),this.contradiction_id=0)}async _call(e,r,{hypothesis_template:n="This example is {}.",multi_label:a=!1}={}){const s=Array.isArray(e);s||(e=[e]),Array.isArray(r)||(r=[r]);const i=r.map(u=>n.replace("{}",u)),o=a||r.length===1,l=[];for(const u of e){const d=[];for(const g of i){const p=this.tokenizer(u,{text_pair:g,padding:!0,truncation:!0}),w=await this.model(p);o?d.push([w.logits.data[this.contradiction_id],w.logits.data[this.entailment_id]]):d.push(w.logits.data[this.entailment_id])}const m=(o?d.map(g=>bt(g)[1]):bt(d)).map((g,p)=>[g,p]).sort((g,p)=>p[0]-g[0]);l.push({sequence:u,labels:m.map(g=>r[g[1]]),scores:m.map(g=>g[0])})}return s?l:l[0]}}class Qk extends tt{constructor(e){super(e)}async _call(e,{pooling:r="none",normalize:n=!1,quantize:a=!1,precision:s="binary"}={}){const i=this.tokenizer(e,{padding:!0,truncation:!0}),o=await this.model(i);let l=o.last_hidden_state??o.logits??o.token_embeddings;if(r!=="none")if(r==="mean")l=Iw(l,i.attention_mask);else if(r==="cls")l=l.slice(null,0);else throw Error(`Pooling method '${r}' not supported.`);return n&&(l=l.normalize(2,-1)),a&&(l=Fw(l,s)),l}}class Zk extends tt{constructor(e){super(e)}async _call(e,{pool:r=null}={}){const n=await Er(e),{pixel_values:a}=await this.processor(n),s=await this.model({pixel_values:a});let i;if(r){if(!("pooler_output"in s))throw Error("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");i=s.pooler_output}else i=s.last_hidden_state??s.logits??s.image_embeds;return i}}class Jk extends tt{constructor(e){super(e)}async _call(e,{topk:r=null}={}){const n=!Array.isArray(e),a=this.processor.feature_extractor.config.sampling_rate,s=await qi(e,a),i=this.model.config.id2label,o=[];for(const l of s){const u=await this.processor(l),h=(await this.model(u)).logits[0],g=wn(bt(h.data),r).map(p=>({label:i[p[0]],score:p[1]}));r===1?o.push(...g):o.push(g)}return!n||r===1?o:o[0]}}class e3 extends tt{constructor(e){super(e)}async _call(e,r,{hypothesis_template:n="This is a sound of {}."}={}){const a=!Array.isArray(e);a&&(e=[e]);const s=r.map(d=>n.replace("{}",d)),i=this.tokenizer(s,{padding:!0,truncation:!0}),o=this.processor.feature_extractor.config.sampling_rate,l=await qi(e,o),u=[];for(const d of l){const h=await this.processor(d),m=await this.model({...i,...h}),g=bt(m.logits_per_audio.data);u.push([...g].map((p,w)=>({score:p,label:r[w]})))}return a?u[0]:u}}class t3 extends tt{constructor(e){super(e)}async _call(e,r={}){switch(this.model.config.model_type){case"whisper":return this._call_whisper(e,r);case"wav2vec2":case"wav2vec2-bert":case"unispeech":case"unispeech-sat":case"hubert":return this._call_wav2vec2(e,r);default:throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)}}async _call_wav2vec2(e,r){r.language&&console.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'),r.task&&console.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');const n=!Array.isArray(e);n&&(e=[e]);const a=this.processor.feature_extractor.config.sampling_rate,s=await qi(e,a),i=[];for(const o of s){const l=await this.processor(o),d=(await this.model(l)).logits[0],h=[];for(const g of d)h.push(jt(g.data)[1]);const m=this.tokenizer.decode(h);i.push({text:m})}return n?i[0]:i}async _call_whisper(e,r){const n=r.return_timestamps??!1,a=r.chunk_length_s??0,s=r.force_full_sequences??!1;let i=r.stride_length_s??null;n==="word"&&(r.return_token_timestamps=!0);const o=!Array.isArray(e);o&&(e=[e]);const l=this.processor.feature_extractor.config.chunk_length/this.model.config.max_source_positions,u=this.processor.feature_extractor.config.hop_length,d=this.processor.feature_extractor.config.sampling_rate,h=await qi(e,d),m=[];for(const g of h){let p=[];if(a>0){if(i===null)i=a/6;else if(a<=i)throw Error("`chunk_length_s` must be larger than `stride_length_s`.");const x=d*a,$=d*i,C=x-2*$;let T=0;for(;T=g.length;p.push({stride:[A.length,B?0:$,L?0:$],input_features:P.input_features,is_last:L}),T+=C}}else p=[{stride:[g.length,0,0],input_features:(await this.processor(g)).input_features,is_last:!0}];for(const x of p){r.num_frames=Math.floor(x.stride[0]/u);const $=await this.model.generate({inputs:x.input_features,...r});n==="word"?(x.tokens=$.sequences[0].tolist(),x.token_timestamps=$.token_timestamps.tolist()[0].map(C=>ri(C,2))):x.tokens=$[0].tolist(),x.stride=x.stride.map(C=>C/d)}const[w,v]=this.tokenizer._decode_asr(p,{time_precision:l,return_timestamps:n,force_full_sequences:s});m.push({text:w,...v})}return o?m[0]:m}}class r3 extends tt{constructor(e){super(e)}async _call(e,r={}){const n=Array.isArray(e),a=await Er(e),{pixel_values:s}=await this.processor(a),i=[];for(const o of s){o.dims=[1,...o.dims];const l=await this.model.generate({inputs:o,...r}),u=this.tokenizer.batch_decode(l,{skip_special_tokens:!0}).map(d=>({generated_text:d.trim()}));i.push(u)}return n?i:i[0]}}class n3 extends tt{constructor(e){super(e)}async _call(e,{topk:r=1}={}){const n=Array.isArray(e),a=await Er(e),{pixel_values:s}=await this.processor(a),i=await this.model({pixel_values:s}),o=this.model.config.id2label,l=[];for(const u of i.logits){const h=wn(bt(u.data),r).map(m=>({label:o[m[0]],score:m[1]}));r===1?l.push(...h):l.push(h)}return n||r===1?l:l[0]}}class a3 extends tt{constructor(e){super(e),this.subtasks_mapping={panoptic:"post_process_panoptic_segmentation",instance:"post_process_instance_segmentation",semantic:"post_process_semantic_segmentation"}}async _call(e,{threshold:r=.5,mask_threshold:n=.5,overlap_mask_area_threshold:a=.8,label_ids_to_fuse:s=null,target_sizes:i=null,subtask:o=null}={}){if(Array.isArray(e)&&e.length!==1)throw Error("Image segmentation pipeline currently only supports a batch size of 1.");const u=await Er(e),d=u.map(x=>[x.height,x.width]),{pixel_values:h,pixel_mask:m}=await this.processor(u),g=await this.model({pixel_values:h,pixel_mask:m});let p=null;if(o!==null)p=this.subtasks_mapping[o];else for(let[x,$]of Object.entries(this.subtasks_mapping))if($ in this.processor.feature_extractor){p=this.processor.feature_extractor[$].bind(this.processor.feature_extractor),o=x;break}const w=this.model.config.id2label,v=[];if(o==="panoptic"||o==="instance"){const x=p(g,r,n,a,s,i??d)[0],$=x.segmentation;for(const C of x.segments_info){const T=new Uint8ClampedArray($.data.length);for(let P=0;P<$.data.length;++P)$.data[P]===C.id&&(T[P]=255);const A=new At(T,$.dims[1],$.dims[0],1);v.push({score:C.score,label:w[C.label_id],mask:A})}}else if(o==="semantic"){const{segmentation:x,labels:$}=p(g,i??d)[0];for(const C of $){const T=new Uint8ClampedArray(x.data.length);for(let P=0;Pn.replace("{}",m)),o=this.tokenizer(i,{padding:this.model.config.model_type==="siglip"?"max_length":!0,truncation:!0}),{pixel_values:l}=await this.processor(s),u=await this.model({...o,pixel_values:l}),d=this.model.config.model_type==="siglip"?m=>m.sigmoid().data:m=>bt(m.data),h=[];for(const m of u.logits_per_image){const p=[...d(m)].map((w,v)=>({score:w,label:r[v]}));p.sort((w,v)=>v.score-w.score),h.push(p)}return a?h:h[0]}}class s3 extends tt{constructor(e){super(e)}async _call(e,{threshold:r=.9,percentage:n=!1}={}){const a=Array.isArray(e);if(a&&e.length!==1)throw Error("Object detection pipeline currently only supports a batch size of 1.");const s=await Er(e),i=n?null:s.map(g=>[g.height,g.width]),{pixel_values:o,pixel_mask:l}=await this.processor(s),u=await this.model({pixel_values:o,pixel_mask:l}),d=this.processor.feature_extractor.post_process_object_detection(u,r,i),h=this.model.config.id2label,m=d.map(g=>g.boxes.map((p,w)=>({score:g.scores[w],label:h[g.classes[w]],box:K_(p,!n)})));return a?m:m[0]}}class o3 extends tt{constructor(e){super(e)}async _call(e,r,{threshold:n=.1,topk:a=null,percentage:s=!1}={}){const i=Array.isArray(e),o=await Er(e),l=this.tokenizer(r,{padding:!0,truncation:!0}),u=await this.processor(o),d=[];for(let h=0;h({score:v.scores[C],label:r[v.classes[C]],box:K_($,!s)})).sort(($,C)=>C.score-$.score);a!==null&&(x=x.slice(0,a)),d.push(x)}return i?d:d[0]}}class l3 extends tt{constructor(e){super(e)}async _call(e,r,n={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}}class u3 extends tt{constructor(r){super(r);D(this,"DEFAULT_VOCODER_ID","Xenova/speecht5_hifigan");this.vocoder=r.vocoder??null}async _call(r,{speaker_embeddings:n=null}={}){throw new Error("This pipeline is not yet supported in Transformers.js v3.")}async _call_text_to_waveform(r){const n=this.tokenizer(r,{padding:!0,truncation:!0}),{waveform:a}=await this.model(n),s=this.model.config.sampling_rate;return{audio:a.data,sampling_rate:s}}async _call_text_to_spectrogram(r,{speaker_embeddings:n}){if(this.vocoder||(console.log("No vocoder specified, using default HifiGan vocoder."),this.vocoder=await ln.from_pretrained(this.DEFAULT_VOCODER_ID,{dtype:"fp32"})),(typeof n=="string"||n instanceof URL)&&(n=new Float32Array(await(await fetch(n)).arrayBuffer())),n instanceof Float32Array)n=new fe("float32",n,[1,n.length]);else if(!(n instanceof fe))throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.");const{input_ids:a}=this.tokenizer(r,{padding:!0,truncation:!0}),{waveform:s}=await this.model.generate_speech(a,n,{vocoder:this.vocoder}),i=this.processor.feature_extractor.config.sampling_rate;return{audio:s.data,sampling_rate:i}}}class d3 extends tt{constructor(e){super(e)}async _call(e){const r=await Er(e),n=await this.processor(r),a=await this.model(n),s=[];for(const i of a.reconstruction){const o=i.squeeze().clamp_(0,1).mul_(255).round_().to("uint8");s.push(At.fromTensor(o))}return s.length>1?s:s[0]}}class c3 extends tt{constructor(e){super(e)}async _call(e){const r=await Er(e),n=await this.processor(r),{predicted_depth:a}=await this.model(n),s=[];for(let i=0;i1?s:s[0]}}const X_=Object.freeze({"text-classification":{tokenizer:ht,pipeline:Gk,model:ol,default:{model:"Xenova/distilbert-base-uncased-finetuned-sst-2-english"},type:"text"},"token-classification":{tokenizer:ht,pipeline:Hk,model:y_,default:{model:"Xenova/bert-base-multilingual-cased-ner-hrl"},type:"text"},"question-answering":{tokenizer:ht,pipeline:jk,model:S_,default:{model:"Xenova/distilbert-base-cased-distilled-squad"},type:"text"},"fill-mask":{tokenizer:ht,pipeline:qk,model:x_,default:{model:"Xenova/bert-base-uncased"},type:"text"},summarization:{tokenizer:ht,pipeline:Kk,model:Gi,default:{model:"Xenova/distilbart-cnn-6-6"},type:"text"},translation:{tokenizer:ht,pipeline:Y_,model:Gi,default:{model:"Xenova/t5-small"},type:"text"},"text2text-generation":{tokenizer:ht,pipeline:cl,model:Gi,default:{model:"Xenova/flan-t5-small"},type:"text"},"text-generation":{tokenizer:ht,pipeline:Yk,model:$_,default:{model:"Xenova/gpt2"},type:"text"},"zero-shot-classification":{tokenizer:ht,pipeline:Xk,model:ol,default:{model:"Xenova/distilbert-base-uncased-mnli"},type:"text"},"audio-classification":{pipeline:Jk,model:O_,processor:xt,default:{model:"Xenova/wav2vec2-base-superb-ks"},type:"audio"},"zero-shot-audio-classification":{tokenizer:ht,pipeline:e3,model:ln,processor:xt,default:{model:"Xenova/clap-htsat-unfused"},type:"multimodal"},"automatic-speech-recognition":{tokenizer:ht,pipeline:t3,model:[w_,M_],processor:xt,default:{model:"Xenova/whisper-tiny.en"},type:"multimodal"},"text-to-audio":{tokenizer:ht,pipeline:u3,model:[v_,b_],processor:[xt,null],default:{model:"Xenova/speecht5_tts"},type:"text"},"image-to-text":{tokenizer:ht,pipeline:r3,model:k_,processor:xt,default:{model:"Xenova/vit-gpt2-image-captioning"},type:"multimodal"},"image-classification":{pipeline:n3,model:E_,processor:xt,default:{model:"Xenova/vit-base-patch16-224"},type:"multimodal"},"image-segmentation":{pipeline:a3,model:[C_,T_],processor:xt,default:{model:"Xenova/detr-resnet-50-panoptic"},type:"multimodal"},"zero-shot-image-classification":{tokenizer:ht,pipeline:i3,model:ln,processor:xt,default:{model:"Xenova/clip-vit-base-patch32"},type:"multimodal"},"object-detection":{pipeline:s3,model:A_,processor:xt,default:{model:"Xenova/detr-resnet-50"},type:"multimodal"},"zero-shot-object-detection":{tokenizer:ht,pipeline:o3,model:I_,processor:xt,default:{model:"Xenova/owlvit-base-patch32"},type:"multimodal"},"document-question-answering":{tokenizer:ht,pipeline:l3,model:z_,processor:xt,default:{model:"Xenova/donut-base-finetuned-docvqa"},type:"multimodal"},"image-to-image":{pipeline:d3,model:P_,processor:xt,default:{model:"Xenova/swin2SR-classical-sr-x2-64"},type:"image"},"depth-estimation":{pipeline:c3,model:R_,processor:xt,default:{model:"Xenova/dpt-large"},type:"image"},"feature-extraction":{tokenizer:ht,pipeline:Qk,model:ln,default:{model:"Xenova/all-MiniLM-L6-v2"},type:"text"},"image-feature-extraction":{processor:xt,pipeline:Zk,model:[B_,ln],default:{model:"Xenova/vit-base-patch16-224-in21k"},type:"image"}}),p3=Object.freeze({"sentiment-analysis":"text-classification",ner:"token-classification",asr:"automatic-speech-recognition","text-to-speech":"text-to-audio",embeddings:"feature-extraction"});async function h3(t,e=null,{progress_callback:r=null,config:n=null,cache_dir:a=null,local_files_only:s=!1,revision:i="main",device:o=null,dtype:l=null,model_file_name:u=null,session_options:d={}}={}){t=p3[t]??t;const h=X_[t.split("_",1)[0]];if(!h)throw Error(`Unsupported pipeline: ${t}. Must be one of [${Object.keys(X_)}]`);e||(e=h.default.model,console.log(`No model specified. Using default model: "${e}".`));const m={progress_callback:r,config:n,cache_dir:a,local_files_only:s,revision:i,device:o,dtype:l,model_file_name:u,session_options:d},g=new Map([["tokenizer",h.tokenizer],["model",h.model],["processor",h.processor]]),p=await f3(g,e,m);p.task=t,yn(r,{status:"ready",task:t,model:e});const w=h.pipeline;return new w(p)}async function f3(t,e,r){const n=Object.create(null),a=[];for(let[s,i]of t.entries()){if(!i)continue;let o;Array.isArray(i)?o=new Promise(async(l,u)=>{var h;let d;for(let m of i){if(m===null){l(null);return}try{l(await m.from_pretrained(e,r));return}catch(g){if((h=g.message)!=null&&h.includes("Unsupported model type"))d=g;else{u(g);return}}}u(d)}):o=i.from_pretrained(e,r),n[s]=o,a.push(o)}await Promise.all(a);for(let[s,i]of Object.entries(n))n[s]=await i;return n}class m3{put(e){throw Error("Not implemented")}end(){throw Error("Not implemented")}}const g3=Gr.IS_PROCESS_AVAILABLE?t=>process.stdout.write(t):t=>console.log(t);class _3 extends m3{constructor(e,{skip_prompt:r=!1,callback_function:n=null,token_callback_function:a=null,decode_kwargs:s={},...i}={}){super(),this.tokenizer=e,this.skip_prompt=r,this.callback_function=n??g3,this.token_callback_function=a,this.decode_kwargs={...s,...i},this.token_cache=[],this.print_len=0,this.next_tokens_are_prompt=!0}put(e){var s;if(e.length>1)throw Error("TextStreamer only supports batch size of 1");const r=e[0];if((s=this.token_callback_function)==null||s.call(this,r),this.skip_prompt&&this.next_tokens_are_prompt){this.next_tokens_are_prompt=!1;return}this.token_cache=ct(this.token_cache,r);const n=this.tokenizer.decode(this.token_cache,this.decode_kwargs);let a;n.endsWith(` `)?(a=n.slice(this.print_len),this.token_cache=[],this.print_len=0):n.length>0&&zm(n.charCodeAt(n.length-1))?(a=n.slice(this.print_len),this.print_len+=a.length):(a=n.slice(this.print_len,n.lastIndexOf(" ")+1),this.print_len+=a.length),this.on_finalized_text(a,!1)}end(){let e;this.token_cache.length>0?(e=this.tokenizer.decode(this.token_cache,this.decode_kwargs).slice(this.print_len),this.token_cache=[],this.print_len=0):e="",this.next_tokens_are_prompt=!0,this.on_finalized_text(e,!0)}on_finalized_text(e,r){var n,a;e.length>0&&((n=this.callback_function)==null||n.call(this,e)),r&&((a=this.callback_function)==null||a.call(this,` -`))}}class y3 extends _3{constructor(e,{skip_prompt:r=!1,callback_function:n=null,token_callback_function:a=null,on_chunk_start:s=null,on_chunk_end:i=null,on_finalize:o=null,time_precision:l=.02,skip_special_tokens:u=!0,decode_kwargs:d={}}={}){super(e,{skip_prompt:r,callback_function:n,token_callback_function:a,decode_kwargs:{skip_special_tokens:u,...d}}),this.timestamp_begin=e.timestamp_begin,this.on_chunk_start=s,this.on_chunk_end=i,this.on_finalize=o,this.time_precision=l,this.waiting_for_timestamp=!1}put(e){var n,a;if(e.length>1)throw Error("WhisperTextStreamer only supports batch size of 1");const r=e[0];if(r.length===1){const s=Number(r[0])-this.timestamp_begin;if(s>=0){const i=s*this.time_precision;this.waiting_for_timestamp?(n=this.on_chunk_end)==null||n.call(this,i):(a=this.on_chunk_start)==null||a.call(this,i),this.waiting_for_timestamp=!this.waiting_for_timestamp,e=[[]]}}return super.put(e)}end(){var e;super.end(),(e=this.on_finalize)==null||e.call(this)}}class Ki{constructor(e,r){this.tokenizer=e,this.model=r}static async getInstance(e=null){return this.instance===null&&(this.instance=h3(this.task,this.model,{dtype:{encoder_model:"fp32",decoder_model_merged:"q4"},device:"webgpu",progress_callback:e})),this.instance}}D(Ki,"task",null),D(Ki,"model",null),D(Ki,"instance",null),self.addEventListener("message",async t=>{const e=t.data;let r=await w3(e);r!==null&&self.postMessage({status:"complete",data:r})});class pl extends Ki{}D(pl,"task","automatic-speech-recognition"),D(pl,"model",null);const w3=async({audio:t,model:e,subtask:r,language:n})=>{const a=e.startsWith("distil-whisper/"),s=pl;s.model!==e&&(s.model=e,s.instance!==null&&((await s.getInstance()).dispose(),s.instance=null));const i=await s.getInstance(x=>{self.postMessage(x)}),o=i.processor.feature_extractor.config.chunk_length/i.model.config.max_source_positions,l=[],u=a?20:30,d=a?3:5;let h=0,m,g=0,p;const w=new y3(i.tokenizer,{time_precision:o,on_chunk_start:x=>{const $=(u-d)*h;l.push({text:"",timestamp:[$+x,null],finalised:!1,offset:$})},token_callback_function:x=>{m??(m=performance.now()),g++>0&&(p=g/(performance.now()-m)*1e3)},callback_function:x=>{l.length!==0&&(l.at(-1).text+=x,self.postMessage({status:"update",data:{text:"",chunks:l,tps:p}}))},on_chunk_end:x=>{const $=l.at(-1);$.timestamp[1]=x+$.offset,$.finalised=!0},on_finalize:()=>{m=null,g=0,++h}}),v=await i(t,{top_k:0,do_sample:!1,chunk_length_s:u,stride_length_s:d,language:n,task:r,return_timestamps:!0,force_full_sequences:!1,streamer:w}).catch(x=>(console.error(x),self.postMessage({status:"error",data:x}),null));return{tps:p,...v}}})(); +`))}}class y3 extends _3{constructor(e,{skip_prompt:r=!1,callback_function:n=null,token_callback_function:a=null,on_chunk_start:s=null,on_chunk_end:i=null,on_finalize:o=null,time_precision:l=.02,skip_special_tokens:u=!0,decode_kwargs:d={}}={}){super(e,{skip_prompt:r,callback_function:n,token_callback_function:a,decode_kwargs:{skip_special_tokens:u,...d}}),this.timestamp_begin=e.timestamp_begin,this.on_chunk_start=s,this.on_chunk_end=i,this.on_finalize=o,this.time_precision=l,this.waiting_for_timestamp=!1}put(e){var n,a;if(e.length>1)throw Error("WhisperTextStreamer only supports batch size of 1");const r=e[0];if(r.length===1){const s=Number(r[0])-this.timestamp_begin;if(s>=0){const i=s*this.time_precision;this.waiting_for_timestamp?(n=this.on_chunk_end)==null||n.call(this,i):(a=this.on_chunk_start)==null||a.call(this,i),this.waiting_for_timestamp=!this.waiting_for_timestamp,e=[[]]}}return super.put(e)}end(){var e;super.end(),(e=this.on_finalize)==null||e.call(this)}}class Ki{constructor(e,r){this.tokenizer=e,this.model=r}static async getInstance(e=null){return this.instance===null&&(this.instance=h3(this.task,this.model,{dtype:{encoder_model:"fp32",decoder_model_merged:"fp32"},device:"webgpu",progress_callback:e})),this.instance}}D(Ki,"task",null),D(Ki,"model",null),D(Ki,"instance",null),self.addEventListener("message",async t=>{const e=t.data;let r=await w3(e);r!==null&&self.postMessage({status:"complete",data:r})});class pl extends Ki{}D(pl,"task","automatic-speech-recognition"),D(pl,"model",null);const w3=async({audio:t,model:e,subtask:r,language:n})=>{const a=e.startsWith("distil-whisper/"),s=pl;s.model!==e&&(s.model=e,s.instance!==null&&((await s.getInstance()).dispose(),s.instance=null));const i=await s.getInstance(x=>{self.postMessage(x)}),o=i.processor.feature_extractor.config.chunk_length/i.model.config.max_source_positions,l=[],u=a?20:30,d=a?3:5;let h=0,m,g=0,p;const w=new y3(i.tokenizer,{time_precision:o,on_chunk_start:x=>{const $=(u-d)*h;l.push({text:"",timestamp:[$+x,null],finalised:!1,offset:$})},token_callback_function:x=>{m??(m=performance.now()),g++>0&&(p=g/(performance.now()-m)*1e3)},callback_function:x=>{l.length!==0&&(l.at(-1).text+=x,self.postMessage({status:"update",data:{text:"",chunks:l,tps:p}}))},on_chunk_end:x=>{const $=l.at(-1);$.timestamp[1]=x+$.offset,$.finalised=!0},on_finalize:()=>{m=null,g=0,++h}}),v=await i(t,{top_k:0,do_sample:!1,chunk_length_s:u,stride_length_s:d,language:n,task:r,return_timestamps:!0,force_full_sequences:!1,streamer:w}).catch(x=>(console.error(x),self.postMessage({status:"error",data:x}),null));return{tps:p,...v}}})();