simpx committed · Commit 0b0cd06 · verified · 1 Parent(s): b62aa4e

Upload folder using huggingface_hub

Files changed (5)
  1. config.json +0 -1
  2. model.py +2 -20
  3. special_tokens_map.json +1 -0
  4. tokenizer.py +50 -0
  5. tokenizer_config.json +13 -0
config.json CHANGED
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "noob_model",
   "architectures": [
     "Noob"
   ],
model.py CHANGED
@@ -3,7 +3,6 @@ import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from transformers import PreTrainedModel, PretrainedConfig
-import os
 
 # hyperparameters
 batch_size = 64  # number of text sequences per batch
@@ -13,19 +12,7 @@ n_head = 6
 n_layer = 6
 dropout = 0.2
 
-# build the vocabulary
-current_dir = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(current_dir, 'input.txt'), 'r', encoding='utf-8') as f:
-    text = f.read()
-
-chars = sorted(list(set(text)))
-vocab_size = len(chars)
-
-# encode/decode functions: convert between indices and characters
-stoi = { ch:i for i,ch in enumerate(chars) }
-itos = { i:ch for i,ch in enumerate(chars) }
-encode = lambda s: [stoi[c] for c in s]
-decode = lambda l: ''.join([itos[i] for i in l])
+vocab_size = 65
 
 class NoobConfig(PretrainedConfig):
     model_type = "Noob"
@@ -136,9 +123,4 @@ class Noob(PreTrainedModel):
         probs = F.softmax(logits, dim=-1)
         idx_next = torch.multinomial(probs, num_samples=1)
         idx = torch.cat((idx, idx_next), dim=1)
-        return idx
-
-    def save_pretrained(self, save_directory, **kwargs):
-        super().save_pretrained(save_directory, **kwargs)
-        with open(f"{save_directory}/vocab.json", "w") as f:
-            json.dump(stoi, f)
+        return idx
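With this change, model.py no longer builds its vocabulary from input.txt at import time: vocab_size is pinned to 65 and character-level encoding/decoding moves into the NoobTokenizer added in tokenizer.py below. A minimal sketch of the replacement for the removed encode/decode lambdas, assuming a vocab.json saved from the same input.txt (the file path here is illustrative):

from tokenizer import NoobTokenizer

tok = NoobTokenizer(vocab_file="vocab.json")          # assumed path to a saved character vocab
ids = tok.encode("hello", add_special_tokens=False)   # plays the role of the old encode() lambda
text = tok.decode(ids)                                # plays the role of the old decode() lambda
assert text == "hello"  # round-trips as long as every character is in the vocabulary
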
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{}
tokenizer.py ADDED
@@ -0,0 +1,50 @@
+from transformers import PreTrainedTokenizer
+from typing import List, Optional
+import os
+import json
+import argparse
+
+class NoobTokenizer(PreTrainedTokenizer):
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(self, vocab_file=None, **kwargs):
+        if vocab_file is None:
+            # default vocabulary, only used for initialization
+            with open('input.txt', 'r', encoding='utf-8') as f:
+                text = f.read()
+            chars = sorted(list(set(text)))
+            self.stoi = {ch: i for i, ch in enumerate(chars)}
+            self.itos = {i: ch for i, ch in enumerate(chars)}
+        else:
+            # load the vocabulary from a file
+            with open(vocab_file, 'r', encoding='utf-8') as f:
+                self.stoi = json.load(f)
+            self.itos = {int(i): ch for ch, i in self.stoi.items()}
+        super().__init__(**kwargs)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.stoi)
+
+    def get_vocab(self):
+        return dict(self.stoi)
+
+    def _tokenize(self, text: str) -> List[str]:
+        return list(text)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self.stoi.get(token, 0)
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self.itos[index]
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return "".join(tokens)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+        vocab_file = os.path.join(save_directory, 'vocab.json')
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            json.dump(self.stoi, f, ensure_ascii=False)
+
+        return (vocab_file,)
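
NoobTokenizer is character-level: _tokenize splits text into single characters and the vocabulary is just a character-to-id map. A small sketch of building and persisting that vocabulary, assuming input.txt (the original training text) is in the working directory; with that text the size should match the vocab_size = 65 hardcoded in model.py:

from tokenizer import NoobTokenizer

tok = NoobTokenizer()                      # no vocab_file: builds the vocab from input.txt
print(tok.vocab_size)                      # expected 65 for the original training text
tok.save_vocabulary(".")                   # writes ./vocab.json via json.dump(self.stoi, ...)
reloaded = NoobTokenizer(vocab_file="vocab.json")
assert reloaded.get_vocab() == tok.get_vocab()
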
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+{
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer.NoobTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "NoobTokenizer"
+}
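
The auto_map entry tells transformers which in-repo class implements the tokenizer. A sketch of how that mapping is meant to be consumed (the repo path is a placeholder, and loading assumes the repo ships whatever vocabulary source NoobTokenizer expects to find):

from transformers import AutoTokenizer

# trust_remote_code is needed because NoobTokenizer lives in tokenizer.py inside
# the repo rather than in the transformers library itself.
tok = AutoTokenizer.from_pretrained("path/or/repo-id", trust_remote_code=True)
ids = tok("hello")["input_ids"]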