malt666 committed on
Commit
fc1ff51
·
verified ·
1 Parent(s): ad7a735

Delete deepseek_v3_tokenizer

Browse files
deepseek_v3_tokenizer/.DS_Store DELETED
Binary file (6.15 kB)
 
deepseek_v3_tokenizer/deepseek_service.py DELETED
@@ -1,52 +0,0 @@
1
- from flask import Flask, request, jsonify
2
- from transformers import AutoTokenizer
3
- import os
4
-
5
- app = Flask(__name__)
6
-
7
- # 加载tokenizer
8
- tokenizer = AutoTokenizer.from_pretrained(os.path.dirname(__file__))
9
-
10
- @app.route('/count_tokens', methods=['POST'])
11
- def count_tokens():
12
- try:
13
- data = request.json
14
- messages = data.get('messages', [])
15
- system = data.get('system')
16
-
17
- # 构建完整文本
18
- text = ""
19
- if system:
20
- text += f"System: {system}\n\n"
21
-
22
- for msg in messages:
23
- role = msg.get('role', '')
24
- content = msg.get('content', '')
25
- if role == 'user':
26
- text += f"User: {content}\n"
27
- elif role == 'assistant':
28
- text += f"Assistant: {content}\n"
29
- else:
30
- text += f"{role}: {content}\n"
31
-
32
- # 计算token数量
33
- tokens = tokenizer.encode(text)
34
- token_count = len(tokens)
35
-
36
- return jsonify({
37
- 'input_tokens': token_count
38
- })
39
- except Exception as e:
40
- return jsonify({
41
- 'error': str(e)
42
- }), 400
43
-
44
- @app.route('/health', methods=['GET'])
45
- def health():
46
- return jsonify({
47
- 'status': 'healthy',
48
- 'tokenizer': 'deepseek-v3'
49
- })
50
-
51
- if __name__ == '__main__':
52
- app.run(host='127.0.0.1', port=7861)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
deepseek_v3_tokenizer/deepseek_tokenizer.py DELETED
@@ -1,12 +0,0 @@
1
- # pip3 install transformers
2
- # python3 deepseek_tokenizer.py
3
- import transformers
4
-
5
- chat_tokenizer_dir = "./"
6
-
7
- tokenizer = transformers.AutoTokenizer.from_pretrained(
8
- chat_tokenizer_dir, trust_remote_code=True
9
- )
10
-
11
- result = tokenizer.encode("Hello!")
12
- print(result)
 
 
 
 
 
 
 
 
 
 
 
 
 
deepseek_v3_tokenizer/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
deepseek_v3_tokenizer/tokenizer_config.json DELETED
@@ -1,35 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "bos_token": {
5
- "__type": "AddedToken",
6
- "content": "<|begin▁of▁sentence|>",
7
- "lstrip": false,
8
- "normalized": true,
9
- "rstrip": false,
10
- "single_word": false
11
- },
12
- "clean_up_tokenization_spaces": false,
13
- "eos_token": {
14
- "__type": "AddedToken",
15
- "content": "<|end▁of▁sentence|>",
16
- "lstrip": false,
17
- "normalized": true,
18
- "rstrip": false,
19
- "single_word": false
20
- },
21
- "legacy": true,
22
- "model_max_length": 16384,
23
- "pad_token": {
24
- "__type": "AddedToken",
25
- "content": "<|end▁of▁sentence|>",
26
- "lstrip": false,
27
- "normalized": true,
28
- "rstrip": false,
29
- "single_word": false
30
- },
31
- "sp_model_kwargs": {},
32
- "unk_token": null,
33
- "tokenizer_class": "LlamaTokenizerFast",
34
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}"
35
- }