Delete deepseek_v3_tokenizer
deepseek_v3_tokenizer/.DS_Store
DELETED
Binary file (6.15 kB)
deepseek_v3_tokenizer/deepseek_service.py
DELETED
@@ -1,52 +0,0 @@
from flask import Flask, request, jsonify
from transformers import AutoTokenizer
import os

app = Flask(__name__)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(os.path.dirname(__file__))

@app.route('/count_tokens', methods=['POST'])
def count_tokens():
    try:
        data = request.json
        messages = data.get('messages', [])
        system = data.get('system')

        # Build the full prompt text
        text = ""
        if system:
            text += f"System: {system}\n\n"

        for msg in messages:
            role = msg.get('role', '')
            content = msg.get('content', '')
            if role == 'user':
                text += f"User: {content}\n"
            elif role == 'assistant':
                text += f"Assistant: {content}\n"
            else:
                text += f"{role}: {content}\n"

        # Count the tokens
        tokens = tokenizer.encode(text)
        token_count = len(tokens)

        return jsonify({
            'input_tokens': token_count
        })
    except Exception as e:
        return jsonify({
            'error': str(e)
        }), 400

@app.route('/health', methods=['GET'])
def health():
    return jsonify({
        'status': 'healthy',
        'tokenizer': 'deepseek-v3'
    })

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=7861)
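For context, a minimal client for the service above might have looked like the sketch below (not part of the deleted files); the endpoint path, port, and the messages/system payload fields come from the handler, while the example contents are placeholders.

# Hypothetical client sketch; assumes the service above is running on 127.0.0.1:7861.
import requests

payload = {
    "system": "You are a helpful assistant.",    # optional system prompt
    "messages": [                                # same shape the handler reads
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi, how can I help?"},
    ],
}

resp = requests.post("http://127.0.0.1:7861/count_tokens", json=payload, timeout=10)
resp.raise_for_status()
print(resp.json())   # e.g. {"input_tokens": <integer count>}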
deepseek_v3_tokenizer/deepseek_tokenizer.py
DELETED
@@ -1,12 +0,0 @@
# pip3 install transformers
# python3 deepseek_tokenizer.py
import transformers

chat_tokenizer_dir = "./"

tokenizer = transformers.AutoTokenizer.from_pretrained(
    chat_tokenizer_dir, trust_remote_code=True
)

result = tokenizer.encode("Hello!")
print(result)
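As a small extension of the script above (a sketch, not part of the original file), the same loading pattern can be used to count tokens for arbitrary text and to decode ids back to a string.

# Sketch only; assumes the tokenizer files sit in the current directory, as above.
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("./", trust_remote_code=True)

text = "DeepSeek-V3 counts tokens with this tokenizer."
ids = tokenizer.encode(text)       # list of token ids
print(len(ids))                    # token count for the raw text
print(tokenizer.decode(ids))       # decodes the ids back to the original string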
deepseek_v3_tokenizer/tokenizer.json
DELETED
The diff for this file is too large to render.
deepseek_v3_tokenizer/tokenizer_config.json
DELETED
@@ -1,35 +0,0 @@
{
  "add_bos_token": false,
  "add_eos_token": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|begin▁of▁sentence|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "clean_up_tokenization_spaces": false,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "legacy": true,
  "model_max_length": 16384,
  "pad_token": {
    "__type": "AddedToken",
    "content": "<|end▁of▁sentence|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "sp_model_kwargs": {},
  "unk_token": null,
  "tokenizer_class": "LlamaTokenizerFast",
"chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}"
}
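Because this config bundles a chat_template, a token count that matches the model's actual chat format can be obtained with apply_chat_template rather than the manual "User:"/"Assistant:" concatenation used in deepseek_service.py. The sketch below is an illustrative assumption, not part of the deleted files; the local path and message contents are placeholders.

# Sketch: chat-aware token counting via the chat_template in tokenizer_config.json.
# Assumes tokenizer.json and tokenizer_config.json sit in "./".
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# apply_chat_template renders the Jinja template above and returns token ids by default.
ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
print(len(ids))  # token count including special markers such as <|User|> and <|Assistant|>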