add prompt pt

Files changed (7) hide show

ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/config.json +0 -71
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/config.json +1 -0
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin +0 -3
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin +1 -0
ckpt/prompt.pt +3 -0
ckpt/songgeneration_base_zn/config.yaml +108 -0
ckpt/{60000_alnew.pt → songgeneration_base_zn/model.pt} +0 -0

ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/config.json DELETED Viewed

@@ -1,71 +0,0 @@
-{
-  "activation_dropout": 0.1,
-  "apply_spec_augment": true,
-  "architectures": [
-    "HubertModelWithFinalProj"
-  ],
-  "attention_dropout": 0.1,
-  "bos_token_id": 1,
-  "classifier_proj_size": 256,
-  "conv_bias": false,
-  "conv_dim": [
-    512,
-    512,
-    512,
-    512,
-    512,
-    512,
-    512
-  ],
-  "conv_kernel": [
-    10,
-    3,
-    3,
-    3,
-    3,
-    2,
-    2
-  ],
-  "conv_stride": [
-    5,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2
-  ],
-  "ctc_loss_reduction": "sum",
-  "ctc_zero_infinity": false,
-  "do_stable_layer_norm": false,
-  "eos_token_id": 2,
-  "feat_extract_activation": "gelu",
-  "feat_extract_norm": "group",
-  "feat_proj_dropout": 0.0,
-  "feat_proj_layer_norm": true,
-  "final_dropout": 0.1,
-  "hidden_act": "gelu",
-  "hidden_dropout": 0.1,
-  "hidden_size": 768,
-  "initializer_range": 0.02,
-  "intermediate_size": 3072,
-  "layer_norm_eps": 1e-05,
-  "layerdrop": 0.1,
-  "mask_feature_length": 10,
-  "mask_feature_min_masks": 0,
-  "mask_feature_prob": 0.0,
-  "mask_time_length": 10,
-  "mask_time_min_masks": 2,
-  "mask_time_prob": 0.05,
-  "model_type": "hubert",
-  "num_attention_heads": 12,
-  "num_conv_pos_embedding_groups": 16,
-  "num_conv_pos_embeddings": 128,
-  "num_feat_extract_layers": 7,
-  "num_hidden_layers": 12,
-  "pad_token_id": 0,
-  "torch_dtype": "float32",
-  "transformers_version": "4.27.3",
-  "use_weighted_layer_sum": false,
-  "vocab_size": 32
-}

ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../blobs/5186a71b15933aca2d9942db95e1aff02642d1f0

ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
-size 378342945

ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../blobs/d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e

ckpt/prompt.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:93fbcc4b88050a0ac9180beee723eb3600229ebcd22afa70aaeb450a622b9f49
+size 3133236

ckpt/songgeneration_base_zn/config.yaml ADDED Viewed

	@@ -0,0 +1,108 @@

+# ================ Train Config ================ #
+lyric_processor:
+max_dur: 150
+min_dur: 30
+prompt_len: 10
+pad_to_max: true
+# ================ Audio tokenizer ================ #
+audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
+audio_tokenizer_frame_rate: 25
+audio_tokenizer_code_depth: 1
+sample_rate: 48000
+audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
+audio_tokenizer_frame_rate_sep: 25
+audio_tokenizer_code_depth_sep: 2
+sample_rate_sep: 48000
+# ================ VAE ================ #
+vae_config: ./ckpt/vae/stable_audio_1920_vae.json
+vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt
+# ================== LM =========================== #
+lm:
+  lm_type: Llama # [Llama]
+  dim: 1536
+  intermediate_size: 8960
+  num_heads: 12
+  num_layers: 28
+  num_layers_sub: 12
+  code_depth: 3
+  code_size: 16384
+  max_position_embeddings: 8196
+  max_position_embeddings_sub: 10000
+  rope_theta: 100000.0
+  rope_theta_sub: 500000.0
+  dropout: 0.0
+  use_flash_attn_2: true
+  activation: gelu
+  norm_first: true
+  bias_ff: false
+  bias_attn: false
+  causal: true
+  custom: false
+  memory_efficient: true
+  attention_as_float32: false
+  layer_scale: null
+  positional_embedding: sin
+  xpos: false
+  checkpointing: torch
+  weight_init: gaussian
+  depthwise_init: current
+  zero_bias_init: true
+  norm: layer_norm
+  cross_attention: false
+  qk_layer_norm: false
+  qk_layer_norm_cross: false
+  attention_dropout: null
+  kv_repeat: 1
+codebooks_pattern:
+  modeling: delay
+  delay:
+    delays: [ 0, 250, 250 ]
+    flatten_first: 0
+    empty_initial: 0
+# ================ Conditioners ===================== #
+classifier_free_guidance:
+  # drop all conditions simultaneously
+  training_dropout: 0.15
+  inference_coef: 1.5
+attribute_dropout:
+  # drop each condition separately
+  args:
+    active_on_eval: false
+  text:
+    description: 0.0
+    type_info: 0.5
+  audio:
+    prompt_audio: 0.0
+use_text_training: True
+fuser:
+  sum: []
+  prepend: [ description, prompt_audio, type_info ] # this order is the SAME as the input concatenation order
+conditioners:
+  prompt_audio:
+    model: qt_embedding
+    qt_embedding:
+      code_size: 16384
+      code_depth: 3
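+      max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # 10*25+2 = 252 as written; the original note read "25*10+2+1" (TODO confirm whether an extra +1 is applied elsewhere)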
+  description:
+    model: QwTokenizer
+    QwTokenizer:
+      token_path: third_party/Qwen2-7B
+      max_len: 300
+      add_token_list: ${load_yaml:conf/vocab.yaml}
+  type_info:
+    model: QwTextTokenizer
+    QwTextTokenizer:
+      token_path: third_party/Qwen2-7B
+      max_len: 50

ckpt/{60000_alnew.pt → songgeneration_base_zn/model.pt} RENAMED Viewed

File without changes