Upload folder using huggingface_hub

Files changed:
- config.json +0 -1
- model.py +2 -20
- special_tokens_map.json +1 -0
- tokenizer.py +50 -0
- tokenizer_config.json +13 -0
config.json CHANGED
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "noob_model",
   "architectures": [
     "Noob"
   ],
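The dropped "_name_or_path" entry is bookkeeping metadata rather than something the model needs, so removing it from the repo should not affect loading. A minimal sketch of loading the trimmed config, assuming the repo files sit in a local directory ./noob_model and that NoobConfig is importable from this repo's model.py (hypothetical paths):

# Hypothetical usage sketch: load the trimmed config from a local checkout.
from model import NoobConfig

cfg = NoobConfig.from_pretrained("./noob_model")
print(cfg.architectures)  # ["Noob"]
# transformers tracks the name/path at runtime, so the field is optional here.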
model.py CHANGED
@@ -3,7 +3,6 @@ import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from transformers import PreTrainedModel, PretrainedConfig
-import os
 
 # hyperparameters
 batch_size = 64  # number of text sequences per batch
@@ -13,19 +12,7 @@ n_head = 6
 n_layer = 6
 dropout = 0.2
 
-
-current_dir = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(current_dir, 'input.txt'), 'r', encoding='utf-8') as f:
-    text = f.read()
-
-chars = sorted(list(set(text)))
-vocab_size = len(chars)
-
-# encode/decode functions: convert between indices and characters
-stoi = { ch:i for i,ch in enumerate(chars) }
-itos = { i:ch for i,ch in enumerate(chars) }
-encode = lambda s: [stoi[c] for c in s]
-decode = lambda l: ''.join([itos[i] for i in l])
+vocab_size = 65
 
 class NoobConfig(PretrainedConfig):
     model_type = "Noob"
@@ -136,9 +123,4 @@ class Noob(PreTrainedModel):
             probs = F.softmax(logits, dim=-1)
             idx_next = torch.multinomial(probs, num_samples=1)
             idx = torch.cat((idx, idx_next), dim=1)
-        return idx
-
-    def save_pretrained(self, save_directory, **kwargs):
-        super().save_pretrained(save_directory, **kwargs)
-        with open(f"{save_directory}/vocab.json", "w") as f:
-            json.dump(stoi, f)
+        return idx
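This change removes the import-time dependency on input.txt: the character vocabulary is no longer rebuilt inside model.py (vocab_size is now fixed at 65), and the save_pretrained override that dumped stoi to vocab.json is gone. Vocabulary handling moves to the NoobTokenizer added in tokenizer.py below. A rough sketch of the equivalent encode/decode calls after the refactor, assuming a vocab.json with the 65-character mapping already exists (hypothetical paths):

# Hypothetical sketch: the module-level encode/decode removed above are now
# provided by the character-level tokenizer added in this commit.
from tokenizer import NoobTokenizer  # tokenizer.py from this repo

tok = NoobTokenizer(vocab_file="vocab.json")  # assumed to exist (65 entries)
ids = tok("hello")["input_ids"]               # replaces the removed encode("hello")
text = tok.decode(ids)                        # replaces the removed decode(ids)
assert text == "hello"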
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{}
tokenizer.py ADDED
@@ -0,0 +1,50 @@
+from transformers import PreTrainedTokenizer
+from typing import List, Optional
+import os
+import json
+import argparse
+
+class NoobTokenizer(PreTrainedTokenizer):
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(self, vocab_file=None, **kwargs):
+        if vocab_file is None:
+            # default vocabulary, used only for initialization
+            with open('input.txt', 'r', encoding='utf-8') as f:
+                text = f.read()
+            chars = sorted(list(set(text)))
+            self.stoi = {ch: i for i, ch in enumerate(chars)}
+            self.itos = {i: ch for i, ch in enumerate(chars)}
+        else:
+            # load the vocabulary from file
+            with open(vocab_file, 'r', encoding='utf-8') as f:
+                self.stoi = json.load(f)
+            self.itos = {int(i): ch for ch, i in self.stoi.items()}
+        super().__init__(**kwargs)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.stoi)
+
+    def get_vocab(self):
+        return dict(self.stoi)
+
+    def _tokenize(self, text: str) -> List[str]:
+        return list(text)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self.stoi.get(token, 0)
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self.itos[index]
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return "".join(tokens)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+        vocab_file = os.path.join(save_directory, 'vocab.json')
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            json.dump(self.stoi, f, ensure_ascii=False)
+
+        return (vocab_file,)
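NoobTokenizer is a plain character-level tokenizer: _tokenize splits text into single characters, and stoi/itos mirror the mappings that used to live in model.py. A minimal save/load round trip, sketched under the assumption that the repo's input.txt is in the working directory and that ./noob_tok is a scratch directory (hypothetical names):

# Hypothetical round-trip sketch for the character-level tokenizer.
from tokenizer import NoobTokenizer

tok = NoobTokenizer()              # falls back to building the vocab from ./input.txt
tok.save_pretrained("./noob_tok")  # writes tokenizer_config.json, special_tokens_map.json
                                   # and, via save_vocabulary above, vocab.json
tok2 = NoobTokenizer(vocab_file="./noob_tok/vocab.json")
assert tok2.get_vocab() == tok.get_vocab()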
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+{
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer.NoobTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "NoobTokenizer"
+}
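The auto_map entry points AutoTokenizer at the slow tokenizer class tokenizer.NoobTokenizer (the null slot is where a fast tokenizer class would go), so the custom class can be loaded straight from the Hub when remote code is allowed. A sketch of that load, assuming the repo is published under the hypothetical id your-username/noob_model:

# Hypothetical sketch: AutoTokenizer resolves "tokenizer.NoobTokenizer" via auto_map.
from transformers import AutoTokenizer

# trust_remote_code is required so that tokenizer.py from the repo is
# downloaded and executed to provide NoobTokenizer.
tok = AutoTokenizer.from_pretrained("your-username/noob_model", trust_remote_code=True)

# Caveat: since no vocab_file is declared for auto-resolution, __init__ takes the
# default branch and reads input.txt from the current working directory, so that
# file (or an explicit vocab_file kwarg) must be available locally.
print(tok("To be, or not to be")["input_ids"])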