Upload folder using huggingface_hub

Files changed:
- config.json +0 -1
- model.py +2 -20
- special_tokens_map.json +1 -0
- tokenizer.py +50 -0
- tokenizer_config.json +13 -0
config.json CHANGED
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "noob_model",
   "architectures": [
     "Noob"
   ],
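The dropped "_name_or_path" entry is bookkeeping metadata rather than something the model needs, so removing it from the repo should not affect loading. A minimal sketch of loading the trimmed config, assuming the repo files sit in a local directory ./noob_model and that NoobConfig is importable from this repo's model.py (hypothetical paths):

# Hypothetical usage sketch: load the trimmed config from a local checkout.
from model import NoobConfig

cfg = NoobConfig.from_pretrained("./noob_model")
print(cfg.architectures)  # ["Noob"]
# transformers tracks the name/path at runtime, so the field is optional here.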
model.py CHANGED
@@ -3,7 +3,6 @@ import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from transformers import PreTrainedModel, PretrainedConfig
-import os
 
 # hyperparameters
 batch_size = 64  # number of text sequences per batch
@@ -13,19 +12,7 @@ n_head = 6
 n_layer = 6
 dropout = 0.2
 
-
-current_dir = os.path.dirname(os.path.abspath(__file__))
-with open(os.path.join(current_dir, 'input.txt'), 'r', encoding='utf-8') as f:
-    text = f.read()
-
-chars = sorted(list(set(text)))
-vocab_size = len(chars)
-
-# encode/decode functions: convert between indices and characters
-stoi = { ch:i for i,ch in enumerate(chars) }
-itos = { i:ch for i,ch in enumerate(chars) }
-encode = lambda s: [stoi[c] for c in s]
-decode = lambda l: ''.join([itos[i] for i in l])
+vocab_size = 65
 
 class NoobConfig(PretrainedConfig):
     model_type = "Noob"
@@ -136,9 +123,4 @@ class Noob(PreTrainedModel):
             probs = F.softmax(logits, dim=-1)
             idx_next = torch.multinomial(probs, num_samples=1)
             idx = torch.cat((idx, idx_next), dim=1)
-        return idx
-
-    def save_pretrained(self, save_directory, **kwargs):
-        super().save_pretrained(save_directory, **kwargs)
-        with open(f"{save_directory}/vocab.json", "w") as f:
-            json.dump(stoi, f)
+        return idx
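This change removes the import-time dependency on input.txt: the character vocabulary is no longer rebuilt inside model.py (vocab_size is now fixed at 65), and the save_pretrained override that dumped stoi to vocab.json is gone. Vocabulary handling moves to the NoobTokenizer added in tokenizer.py below. A rough sketch of the equivalent encode/decode calls after the refactor, assuming a vocab.json with the 65-character mapping already exists (hypothetical paths):

# Hypothetical sketch: the module-level encode/decode removed above are now
# provided by the character-level tokenizer added in this commit.
from tokenizer import NoobTokenizer  # tokenizer.py from this repo

tok = NoobTokenizer(vocab_file="vocab.json")  # assumed to exist (65 entries)
ids = tok("hello")["input_ids"]               # replaces the removed encode("hello")
text = tok.decode(ids)                        # replaces the removed decode(ids)
assert text == "hello"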
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+{}
tokenizer.py ADDED
@@ -0,0 +1,50 @@
+from transformers import PreTrainedTokenizer
+from typing import List, Optional
+import os
+import json
+import argparse
+
+class NoobTokenizer(PreTrainedTokenizer):
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(self, vocab_file=None, **kwargs):
+        if vocab_file is None:
+            # default vocabulary, used only for initialization
+            with open('input.txt', 'r', encoding='utf-8') as f:
+                text = f.read()
+            chars = sorted(list(set(text)))
+            self.stoi = {ch: i for i, ch in enumerate(chars)}
+            self.itos = {i: ch for i, ch in enumerate(chars)}
+        else:
+            # load the vocabulary from file
+            with open(vocab_file, 'r', encoding='utf-8') as f:
+                self.stoi = json.load(f)
+            self.itos = {int(i): ch for ch, i in self.stoi.items()}
+        super().__init__(**kwargs)
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.stoi)
+
+    def get_vocab(self):
+        return dict(self.stoi)
+
+    def _tokenize(self, text: str) -> List[str]:
+        return list(text)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        return self.stoi.get(token, 0)
+
+    def _convert_id_to_token(self, index: int) -> str:
+        return self.itos[index]
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        return "".join(tokens)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple:
+        vocab_file = os.path.join(save_directory, 'vocab.json')
+
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            json.dump(self.stoi, f, ensure_ascii=False)
+
+        return (vocab_file,)
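NoobTokenizer is a plain character-level tokenizer: _tokenize splits text into single characters, and stoi/itos mirror the mappings that used to live in model.py. A minimal save/load round trip, sketched under the assumption that the repo's input.txt is in the working directory and that ./noob_tok is a scratch directory (hypothetical names):

# Hypothetical round-trip sketch for the character-level tokenizer.
from tokenizer import NoobTokenizer

tok = NoobTokenizer()              # falls back to building the vocab from ./input.txt
tok.save_pretrained("./noob_tok")  # writes tokenizer_config.json, special_tokens_map.json
                                   # and, via save_vocabulary above, vocab.json
tok2 = NoobTokenizer(vocab_file="./noob_tok/vocab.json")
assert tok2.get_vocab() == tok.get_vocab()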
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
+{
+  "added_tokens_decoder": {},
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenizer.NoobTokenizer",
+      null
+    ]
+  },
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "NoobTokenizer"
+}
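The auto_map entry points AutoTokenizer at the slow tokenizer class tokenizer.NoobTokenizer (the null slot is where a fast tokenizer class would go), so the custom class can be loaded straight from the Hub when remote code is allowed. A sketch of that load, assuming the repo is published under the hypothetical id your-username/noob_model:

# Hypothetical sketch: AutoTokenizer resolves "tokenizer.NoobTokenizer" via auto_map.
from transformers import AutoTokenizer

# trust_remote_code is required so that tokenizer.py from the repo is
# downloaded and executed to provide NoobTokenizer.
tok = AutoTokenizer.from_pretrained("your-username/noob_model", trust_remote_code=True)

# Caveat: since no vocab_file is declared for auto-resolution, __init__ takes the
# default branch and reads input.txt from the current working directory, so that
# file (or an explicit vocab_file kwarg) must be available locally.
print(tok("To be, or not to be")["input_ids"])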