waytan22 committed
Commit f41b710 · 1 Parent(s): e730386

add prompt pt

ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/config.json DELETED
@@ -1,71 +0,0 @@
- {
-   "activation_dropout": 0.1,
-   "apply_spec_augment": true,
-   "architectures": [
-     "HubertModelWithFinalProj"
-   ],
-   "attention_dropout": 0.1,
-   "bos_token_id": 1,
-   "classifier_proj_size": 256,
-   "conv_bias": false,
-   "conv_dim": [
-     512,
-     512,
-     512,
-     512,
-     512,
-     512,
-     512
-   ],
-   "conv_kernel": [
-     10,
-     3,
-     3,
-     3,
-     3,
-     2,
-     2
-   ],
-   "conv_stride": [
-     5,
-     2,
-     2,
-     2,
-     2,
-     2,
-     2
-   ],
-   "ctc_loss_reduction": "sum",
-   "ctc_zero_infinity": false,
-   "do_stable_layer_norm": false,
-   "eos_token_id": 2,
-   "feat_extract_activation": "gelu",
-   "feat_extract_norm": "group",
-   "feat_proj_dropout": 0.0,
-   "feat_proj_layer_norm": true,
-   "final_dropout": 0.1,
-   "hidden_act": "gelu",
-   "hidden_dropout": 0.1,
-   "hidden_size": 768,
-   "initializer_range": 0.02,
-   "intermediate_size": 3072,
-   "layer_norm_eps": 1e-05,
-   "layerdrop": 0.1,
-   "mask_feature_length": 10,
-   "mask_feature_min_masks": 0,
-   "mask_feature_prob": 0.0,
-   "mask_time_length": 10,
-   "mask_time_min_masks": 2,
-   "mask_time_prob": 0.05,
-   "model_type": "hubert",
-   "num_attention_heads": 12,
-   "num_conv_pos_embedding_groups": 16,
-   "num_conv_pos_embeddings": 128,
-   "num_feat_extract_layers": 7,
-   "num_hidden_layers": 12,
-   "pad_token_id": 0,
-   "torch_dtype": "float32",
-   "transformers_version": "4.27.3",
-   "use_weighted_layer_sum": false,
-   "vocab_size": 32
- }
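
The deleted config.json describes a 12-layer HuBERT encoder (hidden_size 768, 12 attention heads) whose "architectures" entry, HubertModelWithFinalProj, is not a stock transformers class; it is the usual content-vec wrapper that adds a 768 → 256 projection (classifier_proj_size). A minimal loading sketch under that assumption — the wrapper class below is illustrative, not code from this repo:

```python
# Sketch, assuming the common content-vec wrapper: a stock HubertModel plus a
# final hidden_size (768) -> classifier_proj_size (256) projection head.
import torch
from torch import nn
from transformers import HubertModel

class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)
        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)

model = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best")
model.eval()

wav = torch.randn(1, 16000)  # one second of 16 kHz mono audio
with torch.no_grad():
    hidden = model(wav).last_hidden_state  # (1, ~49 frames, 768)
    units = model.final_proj(hidden)       # (1, ~49 frames, 256)
```

Note that the conv_stride values multiply to 320, so the feature extractor emits one frame per 320 samples: about 50 frames per second at the 16 kHz input rate HuBERT expects.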
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/config.json ADDED
@@ -0,0 +1 @@
+ ../../blobs/5186a71b15933aca2d9942db95e1aff02642d1f0
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
- size 378342945
ckpt/models--lengyue233--content-vec-best/snapshots/c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin ADDED
@@ -0,0 +1 @@
+ ../../blobs/d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
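
These two pytorch_model.bin edits, like the config.json pair above, replace a full file under snapshots/<commit>/ with a relative path into blobs/ — the standard huggingface_hub cache layout, in which snapshot entries are symlinks and blobs/ holds the content-addressed data, named by its hash. A small sketch of how the indirection resolves, assuming the entries are checked out as real symlinks (on filesystems without symlink support they may appear as one-line text files instead):

```python
# Sketch: resolving the snapshot -> blob indirection introduced by this commit.
import os

link = ("ckpt/models--lengyue233--content-vec-best/snapshots/"
        "c0b9ba13db21beaa4053faae94c102ebe326fd68/pytorch_model.bin")
print(os.readlink(link))       # ../../blobs/d8dd400e05...  (the relative target)
print(os.path.realpath(link))  # absolute path of the deduplicated blob
```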
ckpt/prompt.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93fbcc4b88050a0ac9180beee723eb3600229ebcd22afa70aaeb450a622b9f49
+ size 3133236
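
ckpt/prompt.pt — the file this commit is named for — is stored as a Git LFS pointer: a three-line text stub (version, oid, size) standing in for the ~3 MB tensor file, which lives in LFS storage. The pointer format is fixed by the LFS spec (space-separated key/value lines), so it is easy to inspect; a minimal parser sketch:

```python
# Sketch: parsing a Git LFS pointer file such as ckpt/prompt.pt above.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:93fbcc4b88050a0ac9180beee723eb3600229ebcd22afa70aaeb450a622b9f49
size 3133236
"""
info = parse_lfs_pointer(pointer)
assert info["oid"].startswith("sha256:")
print(int(info["size"]))  # 3133236 bytes, about 3 MB
```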
ckpt/songgeneration_base_zn/config.yaml ADDED
@@ -0,0 +1,108 @@
+ # ================ Train Config ================ #
+ lyric_processor:
+   max_dur: 150
+   min_dur: 30
+   prompt_len: 10
+   pad_to_max: true
+
+
+ # ================ Audio tokenizer ================ #
+ audio_tokenizer_checkpoint: Flow1dVAE1rvq_./ckpt/model_1rvq/model_2_fixed.safetensors
+ audio_tokenizer_frame_rate: 25
+ audio_tokenizer_code_depth: 1
+ sample_rate: 48000
+
+ audio_tokenizer_checkpoint_sep: Flow1dVAESeparate_./ckpt/model_septoken/model_2.safetensors
+ audio_tokenizer_frame_rate_sep: 25
+ audio_tokenizer_code_depth_sep: 2
+ sample_rate_sep: 48000
+
+ # ================ VAE ================ #
+ vae_config: ./ckpt/vae/stable_audio_1920_vae.json
+ vae_model: ./ckpt/vae/autoencoder_music_1320k.ckpt
+
+ # ================== LM =========================== #
+ lm:
+   lm_type: Llama # [Llama]
+   dim: 1536
+   intermediate_size: 8960
+   num_heads: 12
+   num_layers: 28
+   num_layers_sub: 12
+   code_depth: 3
+   code_size: 16384
+   max_position_embeddings: 8196
+   max_position_embeddings_sub: 10000
+   rope_theta: 100000.0
+   rope_theta_sub: 500000.0
+   dropout: 0.0
+   use_flash_attn_2: true
+   activation: gelu
+   norm_first: true
+   bias_ff: false
+   bias_attn: false
+   causal: true
+   custom: false
+   memory_efficient: true
+   attention_as_float32: false
+   layer_scale: null
+   positional_embedding: sin
+   xpos: false
+   checkpointing: torch
+   weight_init: gaussian
+   depthwise_init: current
+   zero_bias_init: true
+   norm: layer_norm
+   cross_attention: false
+   qk_layer_norm: false
+   qk_layer_norm_cross: false
+   attention_dropout: null
+   kv_repeat: 1
+
+ codebooks_pattern:
+   modeling: delay
+   delay:
+     delays: [ 0, 250, 250 ]
+     flatten_first: 0
+     empty_initial: 0
+
+ # ================ Conditioners ===================== #
+ classifier_free_guidance:
+   # drop all conditions simultaneously
+   training_dropout: 0.15
+   inference_coef: 1.5
+
+ attribute_dropout:
+   # drop each condition separately
+   args:
+     active_on_eval: false
+   text:
+     description: 0.0
+     type_info: 0.5
+   audio:
+     prompt_audio: 0.0
+
+
+ use_text_training: True
+ fuser:
+   sum: []
+   prepend: [ description, prompt_audio, type_info ] # this order is the SAME as the input concatenation order
+
+ conditioners:
+   prompt_audio:
+     model: qt_embedding
+     qt_embedding:
+       code_size: 16384
+       code_depth: 3
+       max_len: ${eval:${prompt_len}*${audio_tokenizer_frame_rate}+2} # 10*25+2 = 252
+   description:
+     model: QwTokenizer
+     QwTokenizer:
+       token_path: third_party/Qwen2-7B
+       max_len: 300
+       add_token_list: ${load_yaml:conf/vocab.yaml}
+   type_info:
+     model: QwTextTokenizer
+     QwTextTokenizer:
+       token_path: third_party/Qwen2-7B
+       max_len: 50
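
Two details of this config are worth unpacking. The interpolated max_len for prompt_audio resolves to prompt_len × audio_tokenizer_frame_rate + 2 = 10 × 25 + 2 = 252 positions. And codebooks_pattern selects MusicGen-style delay interleaving: with code_depth 3 and delays: [0, 250, 250], codebooks 1 and 2 are emitted 250 steps after codebook 0, so generation stays autoregressive across the codebook stack. A sketch of that shift — an independent reimplementation for illustration, not this repo's code:

```python
# Sketch of the "delay" codebook pattern configured above (delays: [0, 250, 250]).
# Codebook k at timestep t moves to sequence position t + delays[k]; positions
# opened up by the shift are filled with a special "empty" token id.
import torch

def apply_delay_pattern(codes: torch.Tensor, delays, empty_id: int) -> torch.Tensor:
    # codes: (code_depth, T) integer token ids
    depth, T = codes.shape
    out = torch.full((depth, T + max(delays)), empty_id, dtype=codes.dtype)
    for k, d in enumerate(delays):
        out[k, d:d + T] = codes[k]
    return out

codes = torch.randint(0, 16384, (3, 1000))  # code_size = 16384; 40 s at 25 Hz
pattern = apply_delay_pattern(codes, [0, 250, 250], empty_id=16384)  # empty id is a hypothetical choice
print(pattern.shape)  # torch.Size([3, 1250])
```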
ckpt/{60000_alnew.pt → songgeneration_base_zn/model.pt} RENAMED
File without changes