4 files changed: +26 −24 lines changed.

Qwen2.5-14B-Instruct INI config:

@@ -1,11 +1,11 @@
 [qwen2]
-model_name = /data/models/Qwen1.5-14B-Chat
+model_name = /data/Qwen2.5-14B-Instruct
 head_num = 40
-kv_head_num = 40
+kv_head_num = 8
 size_per_head = 128
-inter_size = 13696
+inter_size = 13824
 max_pos_seq_len = 32768
-num_layer = 40
+num_layer = 48
 rms_norm_eps = 1e-06
 layernorm_type = pre_layernorm
 activation_type = silu
@@ -16,4 +16,6 @@ start_id = 151643
 end_id = 151645
 pad_id = 151643
 weight_data_type = fp16
+attn_params_type = GQAttnParams
+ffn_params_type = LlamaFFNParams

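The kv_head_num drop from 40 to 8 is the grouped-query attention (GQA) layout of Qwen2.5-14B: 40 query heads now share 8 KV heads, which shrinks the KV cache 5x. A minimal sketch of that arithmetic in Python, using only values from this INI (fp16 element size taken from weight_data_type):

# KV-cache bytes per token for Qwen2.5-14B, from the INI values above.
size_per_head = 128
num_layer = 48
bytes_per_elem = 2  # weight_data_type = fp16

def kv_cache_bytes_per_token(kv_head_num: int) -> int:
    # One K and one V tensor per layer, each kv_head_num * size_per_head elements
    return 2 * num_layer * kv_head_num * size_per_head * bytes_per_elem

gqa = kv_cache_bytes_per_token(8)    # new config: 8 KV heads
mha = kv_cache_bytes_per_token(40)   # hypothetical MHA at the same depth
print(f"GQA: {gqa} B/token vs MHA: {mha} B/token ({mha // gqa}x smaller)")

The added attn_params_type = GQAttnParams presumably selects this grouped layout in the engine.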

Qwen2.5-14B-Instruct config.json:

@@ -8,19 +8,19 @@
   "hidden_act": "silu",
99 "hidden_size" : 5120 ,
1010 "initializer_range" : 0.02 ,
11- "intermediate_size" : 13696 ,
11+ "intermediate_size" : 13824 ,
1212 "max_position_embeddings" : 32768 ,
13- "max_window_layers" : 35 ,
13+ "max_window_layers" : 70 ,
1414 "model_type" : " qwen2" ,
1515 "num_attention_heads" : 40 ,
16- "num_hidden_layers" : 40 ,
17- "num_key_value_heads" : 40 ,
16+ "num_hidden_layers" : 48 ,
17+ "num_key_value_heads" : 8 ,
1818 "rms_norm_eps" : 1e-06 ,
1919 "rope_theta" : 1000000.0 ,
20- "sliding_window" : 32768 ,
20+ "sliding_window" : 131072 ,
2121 "tie_word_embeddings" : false ,
2222 "torch_dtype" : " bfloat16" ,
23- "transformers_version" : " 4.37.0 " ,
23+ "transformers_version" : " 4.43.1 " ,
2424 "use_cache" : true ,
2525 "use_sliding_window" : false ,
2626 "vocab_size" : 152064

Qwen2.5-7B-Instruct INI config:

@@ -1,17 +1,17 @@
 [qwen2]
-model_name = /data/models/Qwen1.5-7B-Chat
-head_num = 32
-kv_head_num = 32
+model_name = /data/Qwen2.5-7B-Instruct
+head_num = 28
+kv_head_num = 4
 size_per_head = 128
-inter_size = 11008
+inter_size = 18944
 max_pos_seq_len = 32768
-num_layer = 32
+num_layer = 28
 rms_norm_eps = 1e-06
 layernorm_type = pre_layernorm
 activation_type = silu
 rope_theta = 1000000.0
 has_post_decoder_layernorm = 1
-vocab_size = 151936
+vocab_size = 152064
 start_id = 151643
 end_id = 151645
 pad_id = 151643
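For the 7B model the head count changes too (32 → 28) while size_per_head stays 128, so the hidden width becomes 28 × 128 = 3584, which is exactly what the config.json below shows. The arithmetic as a tiny sketch:

# Geometry implied by the 7B INI above.
head_num, kv_head_num, size_per_head = 28, 4, 128
hidden_size = head_num * size_per_head   # 3584, matches config.json below
gqa_groups = head_num // kv_head_num     # 7 query heads share each KV head
assert head_num % kv_head_num == 0       # required for GQA
print(hidden_size, gqa_groups)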

Qwen2.5-7B-Instruct config.json:

@@ -6,22 +6,22 @@
   "bos_token_id": 151643,
77 "eos_token_id" : 151645 ,
88 "hidden_act" : " silu" ,
9- "hidden_size" : 4096 ,
9+ "hidden_size" : 3584 ,
1010 "initializer_range" : 0.02 ,
11- "intermediate_size" : 11008 ,
11+ "intermediate_size" : 18944 ,
1212 "max_position_embeddings" : 32768 ,
1313 "max_window_layers" : 28 ,
1414 "model_type" : " qwen2" ,
15- "num_attention_heads" : 32 ,
16- "num_hidden_layers" : 32 ,
17- "num_key_value_heads" : 32 ,
15+ "num_attention_heads" : 28 ,
16+ "num_hidden_layers" : 28 ,
17+ "num_key_value_heads" : 4 ,
1818 "rms_norm_eps" : 1e-06 ,
1919 "rope_theta" : 1000000.0 ,
20- "sliding_window" : 32768 ,
20+ "sliding_window" : 131072 ,
2121 "tie_word_embeddings" : false ,
2222 "torch_dtype" : " bfloat16" ,
23- "transformers_version" : " 4.37.0 " ,
23+ "transformers_version" : " 4.43.1 " ,
2424 "use_cache" : true ,
2525 "use_sliding_window" : false ,
26- "vocab_size" : 151936
26+ "vocab_size" : 152064
2727}
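To guard against future drift between the two formats, a stdlib-only cross-check can compare a converted INI against its source config.json. Both filenames below are placeholders; the PR does not show the real paths:

import configparser, json

# Cross-check a converted engine INI against the source config.json.
# Both paths are hypothetical stand-ins for the files in this PR.
hf = json.load(open("/data/Qwen2.5-7B-Instruct/config.json"))
ini = configparser.ConfigParser()
ini.read("qwen2_7b.ini")
eng = ini["qwen2"]

# config.json key -> INI key, per the diffs above
mapping = {
    "num_attention_heads": "head_num",
    "num_key_value_heads": "kv_head_num",
    "num_hidden_layers": "num_layer",
    "intermediate_size": "inter_size",
    "vocab_size": "vocab_size",
    "max_position_embeddings": "max_pos_seq_len",
}
for hf_key, ini_key in mapping.items():
    assert hf[hf_key] == eng.getint(ini_key), f"{hf_key} != {ini_key}"
print("INI matches config.json")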