Upload tokenizer (#28)
Upload tokenizer (2ca60a10925b961f7b3f37ed3646fd8991bac3f6)
Co-authored-by: Arthur Zucker <[email protected]>
- merges.txt +1 -0
- special_tokens_map.json +10 -4
- tokenizer.json +1 -0
- tokenizer_config.json +0 -0
 
    	
merges.txt
CHANGED
@@ -1,4 +1,5 @@
 #version: 0.2
+Ġ t
 Ġ a
 Ġt h
 i n
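The change to merges.txt restores "Ġ t" as the very first merge rule. Below is a minimal sketch of why that matters, using the tokenizers library with a made-up mini vocabulary (not the model's real one): byte-level BPE can only produce "Ġt" if the "Ġ t" merge exists, and the "Ġt h" rule on the next line depends on it.

```python
# Toy illustration only: a tiny vocab/merge list standing in for the real files.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel

vocab = {"Ġ": 0, "a": 1, "t": 2, "h": 3, "i": 4, "n": 5,
         "Ġa": 6, "Ġt": 7, "Ġth": 8, "in": 9}

def tokenize(merges, text=" thin"):
    tok = Tokenizer(BPE(vocab=vocab, merges=merges))
    tok.pre_tokenizer = ByteLevel(add_prefix_space=False)  # maps the leading space to "Ġ"
    return tok.encode(text).tokens

# Without "Ġ t": the "Ġt h" rule can never fire, so the space stays a lone token.
print(tokenize([("Ġ", "a"), ("Ġt", "h"), ("i", "n")]))              # ['Ġ', 't', 'h', 'in']
# With "Ġ t" restored as the first (highest-priority) merge:
print(tokenize([("Ġ", "t"), ("Ġ", "a"), ("Ġt", "h"), ("i", "n")]))  # ['Ġth', 'in']
```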
    	
special_tokens_map.json
CHANGED
@@ -111,22 +111,28 @@
   "bos_token": {
     "content": "<|endoftext|>",
     "lstrip": false,
-    "normalized": true,
+    "normalized": false,
     "rstrip": false,
     "single_word": false
   },
   "eos_token": {
     "content": "<|endoftext|>",
     "lstrip": false,
-    "normalized": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "pad_token": "<|endoftext|>",
   "unk_token": {
     "content": "<|endoftext|>",
     "lstrip": false,
-    "normalized": true,
+    "normalized": false,
     "rstrip": false,
     "single_word": false
   }
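Two things change here: "normalized" flips to false for every special token, and "pad_token" becomes a full token spec instead of a bare string. A small sketch of how to inspect the new layout, assuming the updated file from this commit is available in the working directory:

```python
import json

# Assumes special_tokens_map.json from this commit has been downloaded locally.
with open("special_tokens_map.json", encoding="utf-8") as f:
    specials = json.load(f)

# pad_token is now a dict carrying the same flags as the other specials.
assert isinstance(specials["pad_token"], dict)

for name in ("bos_token", "eos_token", "pad_token", "unk_token"):
    entry = specials[name]
    # normalized=False means "<|endoftext|>" is matched on the raw text,
    # without being run through any normalizer first.
    print(name, entry["content"], "normalized =", entry["normalized"])
```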
    	
tokenizer.json
CHANGED
@@ -64848,6 +64848,7 @@
       "<|endoftext|>": 50257
     },
     "merges": [
+      "Ġ t",
       "Ġ a",
       "Ġt h",
       "i n",
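The same missing merge is restored in the serialized fast tokenizer. A quick sanity check, assuming tokenizer.json from this commit is available locally and keeps the usual model.merges layout:

```python
import json

with open("tokenizer.json", encoding="utf-8") as f:
    data = json.load(f)

merges = data["model"]["merges"]
print(merges[:4])           # ['Ġ t', 'Ġ a', 'Ġt h', 'i n']
assert merges[0] == "Ġ t"   # first entry = highest-priority BPE merge
```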
    	
tokenizer_config.json
CHANGED
The diff for this file is too large to render. See raw diff.
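With all four files in place, the update can be checked end to end. A sketch, with a placeholder repository id ("org/model" stands in for this repo) and the commit hash above pinned via revision:

```python
from transformers import AutoTokenizer

# "org/model" is a placeholder; substitute the id of this repository.
tok = AutoTokenizer.from_pretrained(
    "org/model",
    revision="2ca60a10925b961f7b3f37ed3646fd8991bac3f6",
)

print(tok.pad_token)          # '<|endoftext|>', now defined via a full token spec
print(tok.tokenize(" that"))  # with the restored "Ġ t" merge, tokens spanning the leading space can form
```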