Shriti09 committed
Commit b8fa3db · 1 Parent(s): d56b253

Add application file

Files changed (5)
  1. Assign12_Model.py +314 -0
  2. README.md +1 -12
  3. app.py +109 -0
  4. requirements.txt +5 -0
  5. trained_model.pth +3 -0
Assign12_Model.py ADDED
@@ -0,0 +1,314 @@
+ # Solving for residual std scaling issue
+ import os
+ import math
+ import time
+ import inspect
+ from dataclasses import dataclass
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from tqdm import tqdm  # progress bars for the training loop
+
+
+ class CausalSelfAttention(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         # key, query, value projections for all heads, but in a batch
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+         # output projection
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+         self.c_proj.NANOGPT_SCALE_INIT = 1  # flag checked in GPT._init_weights (name kept consistent with MLP)
+         # regularization
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+         self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))
+
+     def forward(self, x):
+         B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
+         # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+         # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
+         # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
+         qkv = self.c_attn(x)
+         q, k, v = qkv.split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
+
+         att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+         att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
+         att = F.softmax(att, dim=-1)
+         y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
+
+         y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
+         # output projection
+         y = self.c_proj(y)
+         return y
+
+
+ class MLP(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
+         self.gelu = nn.GELU(approximate='tanh')
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
+         self.c_proj.NANOGPT_SCALE_INIT = 1
+
+     def forward(self, x):
+         x = self.c_fc(x)
+         x = self.gelu(x)
+         x = self.c_proj(x)
+         return x
+
+ class Block(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd)
+         self.mlp = MLP(config)
+
+     def forward(self, x):
+         x = x + self.attn(self.ln_1(x))
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+
+ @dataclass
+ class GPTConfig:
+     block_size: int = 1024  # max sequence length
+     vocab_size: int = 50257  # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
+     n_layer: int = 12  # number of layers
+     n_head: int = 12  # number of heads
+     n_embd: int = 768  # embedding dimension
+
+
+ class GPT(nn.Module):
+
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             wpe = nn.Embedding(config.block_size, config.n_embd),
+             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f = nn.LayerNorm(config.n_embd),
+         ))
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+         # weight sharing
+         self.transformer.wte.weight = self.lm_head.weight
+
+         # weight initialization
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             std = 0.02
+             if hasattr(module, 'NANOGPT_SCALE_INIT'):
+                 std *= (2 * self.config.n_layer) ** -0.5
+             torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+
+
+     def forward(self, idx, targets=None):
+         # idx is of shape (B, T)
+         B, T = idx.size()
+         assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
+         # forward the token and position embeddings
+         pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # shape (T)
+         pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
+         tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
+         x = tok_emb + pos_emb
+         # forward the blocks of the transformer
+         for block in self.transformer.h:
+             x = block(x)
+         # forward the final layernorm and the classifier
+         x = self.transformer.ln_f(x)
+         logits = self.lm_head(x)  # (B, T, vocab_size)
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
+         return logits, loss
+
+     @classmethod
+     def from_pretrained(cls, model_type):
+         """Loads pretrained GPT-2 model weights from huggingface"""
+         assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
+         from transformers import GPT2LMHeadModel
+         # print("loading weights from pretrained gpt: %s" % model_type)
+
+         # n_layer, n_head and n_embd are determined from model_type
+         config_args = {
+             'gpt2': dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
+             'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
+             'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
+             'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
+         }[model_type]
+         config_args['vocab_size'] = 50257  # always 50257 for GPT model checkpoints
+         config_args['block_size'] = 1024  # always 1024 for GPT model checkpoints
+         # create a from-scratch initialized minGPT model
+         config = GPTConfig(**config_args)
+         model = GPT(config)
+         sd = model.state_dict()
+         sd_keys = sd.keys()
+         sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]  # discard this mask / buffer, not a param
+
+         # init a huggingface/transformers model
+         model_hf = GPT2LMHeadModel.from_pretrained(model_type)
+         sd_hf = model_hf.state_dict()
+
+         # copy while ensuring all of the parameters are aligned and match in names and shapes
+         sd_keys_hf = sd_hf.keys()
+         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]  # ignore these, just a buffer
+         sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]  # same, just the mask (buffer)
+         transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
+         # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
+         # this means that we have to transpose these weights when we import them
+         assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
+         for k in sd_keys_hf:
+             if any(k.endswith(w) for w in transposed):
+                 # special treatment for the Conv1D weights we need to transpose
+                 assert sd_hf[k].shape[::-1] == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k].t())
+             else:
+                 # vanilla copy over the other parameters
+                 assert sd_hf[k].shape == sd[k].shape
+                 with torch.no_grad():
+                     sd[k].copy_(sd_hf[k])
+
+         return model
+
+ # model = GPT.from_pretrained('gpt2')
+
+ import tiktoken
+
+ class DataLoaderLite:
+     def __init__(self, B, T):
+         self.B = B
+         self.T = T
+
+         # at init load tokens from disk and store them in memory
+         with open('input.txt', 'r') as f:
+             text = f.read()
+         self.enc = tiktoken.get_encoding('gpt2')
+         tokens = self.enc.encode(text)
+         self.tokens = torch.tensor(tokens)
+         print(f'loaded {len(self.tokens)} tokens')
+         print(f'1 epoch = {len(self.tokens) // (B * T)} batches')
+
+         # state
+         self.current_position = 0
+
+     def next_batch(self):
+         B, T = self.B, self.T
+         buf = self.tokens[self.current_position: self.current_position + B * T + 1]
+         x = (buf[:-1]).view(B, T)  # inputs
+         y = (buf[1:]).view(B, T)  # targets
+         # advance the position in the tensor
+         self.current_position += B * T
+         # if loading the next batch would be out of bounds, reset
+         if self.current_position + (B * T + 1) > len(self.tokens):
+             self.current_position = 0
+         return x, y
+
+
+ if __name__ == "__main__":
+     # SEED
+     device = 'cpu'
+     if torch.cuda.is_available():
+         device = 'cuda'
+     elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+         device = "mps"
+     print(f"using device: {device}")
+
+     torch.manual_seed(1337)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(1337)
+
+     def count_parameters(model):
+         return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+     model = GPT(GPTConfig())
+     model.to(device)
+     print(f"Number of parameters: {count_parameters(model):,}")
+
+     train_loader = DataLoaderLite(B=16, T=128)  # Only changed B = 16, T = 128
+     target_loss = 0.08
+     best_loss = float('inf')
+     # NEW CODE
+     optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
+     num_batches = len(train_loader.tokens) // (train_loader.B * train_loader.T)
+     for epoch in range(100):
+         # Training Loop
+         model.train()  # Make sure the model is in training mode
+         total_train_loss = 0.0
+         with tqdm(total=num_batches, desc=f'Epoch {epoch+1}/100', ncols=100, leave=True) as pbar:
+             for _ in range(num_batches):  # Iterate over batches
+
+                 x, y = train_loader.next_batch()
+                 x, y = x.to(device), y.to(device)
+                 optimizer.zero_grad()
+                 logits, loss = model(x, y)
+                 loss.backward()
+                 optimizer.step()
+                 # Update the progress bar
+                 pbar.set_postfix(loss=loss.item())  # Display loss in the progress bar
+                 pbar.update(1)  # Increment progress by 1 step
+
+                 # Check if the target loss is reached
+                 if loss.item() < best_loss:
+                     best_loss = loss.item()
+                     if best_loss < target_loss:
+                         torch.save(model.state_dict(), "trained_model.pth")
+                         print(f"Target loss reached: {best_loss:.4f}. Model saved.")
+                         break
+
+         print(loss)
+
+
+
+     import sys; sys.exit(0)
+
+
+
+     # STOP: the sampling code below is unreachable (sys.exit above); it assumes x (a batch of prompt token ids) and enc (a tiktoken encoding) are already defined
+     num_return_sequences = 5
+     max_length = 30
+
+
+     torch.manual_seed(42)
+     torch.cuda.manual_seed(42)
+     while x.size(1) < max_length:
+         # forward the model to get the logits
+         with torch.no_grad():
+             logits = model(x)[0]  # (B, T, vocab_size)
+             # take the logits at the last position
+             logits = logits[:, -1, :]  # (B, vocab_size)
+             # get the probabilities
+             probs = F.softmax(logits, dim=-1)
+             # do top-k sampling of 50 (huggingface pipeline default)
+             # topk_probs here becomes (5, 50), topk_indices is (5, 50)
+             topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+             # select a token from the top-k probabilities
+             # note: multinomial does not demand the input to sum to 1
+             ix = torch.multinomial(topk_probs, 1)  # (B, 1)
+             # gather the corresponding indices
+             xcol = torch.gather(topk_indices, -1, ix)  # (B, 1)
+             # append to the sequence
+             x = torch.cat((x, xcol), dim=1)
+
+     # print the generated text
+     for i in range(num_return_sequences):
+         tokens = x[i, :max_length].tolist()
+         decoded = enc.decode(tokens)
+         print(">", decoded)
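">
For reference (this is not part of the committed diff), below is a minimal sketch of how the trained_model.pth checkpoint written by the training loop above could be reloaded through the GPT class and sampled with the same top-k scheme; the prompt text, the 50-token cap, and the device selection are illustrative assumptions.

# Minimal sketch (assumption: run from the repo root, next to Assign12_Model.py and trained_model.pth).
import torch
import tiktoken
from torch.nn import functional as F
from Assign12_Model import GPT, GPTConfig

device = "cuda" if torch.cuda.is_available() else "cpu"
model = GPT(GPTConfig())
model.load_state_dict(torch.load("trained_model.pth", map_location=device))
model.eval().to(device)

enc = tiktoken.get_encoding("gpt2")
x = torch.tensor([enc.encode("To be, or not to be")], dtype=torch.long, device=device)  # illustrative prompt

with torch.no_grad():
    while x.size(1) < 50:                                     # illustrative max length
        logits, _ = model(x)                                  # (1, T, vocab_size)
        probs = F.softmax(logits[:, -1, :], dim=-1)           # distribution over the next token
        topk_probs, topk_idx = torch.topk(probs, 50, dim=-1)  # top-k sampling, k = 50 as in the script
        ix = torch.multinomial(topk_probs, 1)
        x = torch.cat((x, torch.gather(topk_idx, -1, ix)), dim=1)

print(enc.decode(x[0].tolist()))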
README.md CHANGED
@@ -1,12 +1 @@
- ---
- title: TransformerModel
- emoji: ⚡
- colorFrom: red
- colorTo: blue
- sdk: gradio
- sdk_version: 5.12.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Assign12_ERAV3
app.py ADDED
@@ -0,0 +1,109 @@
+ print("Step 1: Importing libraries...")
2
+ import torch
3
+ from transformers import GPT2Tokenizer
4
+ import gradio as gr
5
+ from Assign12_Model import GPT, GPTConfig
6
+ import torchvision
7
+ torchvision.disable_beta_transforms_warning()
8
+
9
+ print("Step 2: Loading the model...")
10
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
+ config = GPTConfig(block_size=1024, vocab_size=50257, n_layer=12, n_head=12, n_embd=768)
12
+ model = GPT(config)
13
+
14
+ print("Step 3: Loading model weights...")
15
+ model.load_state_dict(torch.load("trained_model.pth", map_location=device, weights_only=True))
16
+ model.eval().to(device)
17
+
18
+ print("Step 4: Loading tokenizer...")
19
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
20
+
21
+ # Add print statements in the function
22
+ print("Step 5: Defining the inference function...")
23
+ def generate_text(prompt, max_length=50, num_return_sequences=1):
24
+ print(f"Received input prompt: {prompt}")
25
+ inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)
26
+ outputs = []
27
+ for _ in range(num_return_sequences):
28
+ with torch.no_grad():
29
+ logits, _ = model(inputs)
30
+ generated_token = torch.argmax(logits[:, -1, :], dim=-1)
31
+ inputs = torch.cat((inputs, generated_token.unsqueeze(0)), dim=1)
32
+ if inputs.size(1) >= max_length:
33
+ break
34
+ output = tokenizer.decode(inputs[0].tolist())
35
+ outputs.append(output)
36
+ return outputs
37
+
+ import os
+ import torch
+ import torch.nn.functional as F
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
+ import gradio as gr
+
+ # Initialize model and tokenizer (note: this block overrides the custom-model setup above, so the app serves the pretrained HF gpt2 and the generate_text defined below)
+ model_name = 'gpt2'  # You can replace this with your specific model
+ model = GPT2LMHeadModel.from_pretrained(model_name)
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+ model.eval()
+
+ # Ensure we're using CUDA if available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ # Define parameters
+ max_length = 100
+ num_return_sequences = 1  # Default number of sequences to generate
+
+ # Function to generate text
+ def generate_text(prompt, max_len=50, num_outputs=1):
+     global max_length, num_return_sequences
+
+     max_length = int(max_len)                # Gradio sliders/number inputs may pass floats
+     num_return_sequences = int(num_outputs)  # range() below needs an int
+
+     # Encode the input text
+     input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
+
+     generated_sequences = []  # List to store generated text
+
+     # Generate sequences
+     with torch.no_grad():
+         for i in range(num_return_sequences):
+             x = input_ids.clone()
+             while x.size(1) < max_length:
+                 logits = model(x).logits  # (B, T, vocab_size)
+                 logits = logits[:, -1, :]  # (B, vocab_size)
+                 probs = F.softmax(logits, dim=-1)
+                 topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
+                 ix = torch.multinomial(topk_probs, 1)  # (B, 1)
+                 xcol = torch.gather(topk_indices, -1, ix)  # (B, 1)
+                 x = torch.cat((x, xcol), dim=1)
+
+             # Decode the generated tokens and append them to the list
+             tokens = x[0, :max_length].tolist()
+             decoded = tokenizer.decode(tokens, skip_special_tokens=True)
+             generated_sequences.append(f"Generated Text {i+1}:")
+             generated_sequences.append(f"> {decoded}\n")
+
+     # Join the generated sequences into a structured output
+     structured_output = "\n".join(generated_sequences)
+
+     return structured_output
+
+ # Set up Gradio interface
+ print("Step 6: Setting up the Gradio interface...")
+ interface = gr.Interface(
+     fn=generate_text,
+     inputs=[
+         gr.Textbox(label="Input Prompt"),
+         gr.Slider(10, 200, step=10, label="Max Length", value=50),
+         gr.Number(label="Number of Outputs", value=10),
+     ],
+     outputs=gr.Textbox(label="Generated Text"),
+     title="Transformer Text Generator",
+     description="Enter a prompt and generate text using the trained transformer model.",
+ )
+
+ print("Step 7: Launching the Gradio interface...")
+ interface.launch(share=True)
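As a quick check outside the Gradio UI (again, not part of the commit), generate_text can be called directly, assuming the final interface.launch(share=True) line is first commented out or guarded so that importing app.py does not block on the running server; the prompt and argument values here are illustrative:

# Hypothetical direct call to the generate_text defined in app.py above.
from app import generate_text

print(generate_text("Once upon a time", max_len=60, num_outputs=2))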
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch
+ transformers
+ gradio
+ tqdm
+ torchvision
trained_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b2b357993c1ef80272691ea3e6a09e88bab5b28d03005ac0443b56fdcd1faedb
+ size 548146637