5ivatej committed · Commit f2caf55 · verified · 1 Parent(s): 08d60aa

Upload folder using huggingface_hub
README.md ADDED
@@ -0,0 +1,12 @@
+ # TinyLlama Early-Exit Heads (Adapter)
+
+ This repo contains early-exit classification heads for `TinyLlama/TinyLlama-1.1B-Chat-v1.0`.
+ Attach them to the base model to enable token-level early exiting.
+
+ ## Usage
+ ```python
+ from early_exit_wrapper import load_early_exit_from_hub, generate_with_early_exit
+ wrapped, tok = load_early_exit_from_hub("5ivatej/tinyllama-1.1b-early-exit", device=None)
+ out_ids = generate_with_early_exit("Explain early exit in one tweet.", wrapped, tok, max_new_tokens=64)
+ print(tok.decode(out_ids[0], skip_special_tokens=True))
+ ```
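The heads fire whenever a layer's top-1 probability clears the configured threshold (0.9 in `early_exit_config.json`). Since `confidence_threshold` is a plain attribute on the returned wrapper, it can also be adjusted after loading; a minimal sketch, assuming the repo files in this commit:

```python
from early_exit_wrapper import load_early_exit_from_hub

wrapped, tok = load_early_exit_from_hub("5ivatej/tinyllama-1.1b-early-exit")

# Stricter threshold: exits later, output closer to the full model's.
wrapped.confidence_threshold = 0.95
out = wrapped(**tok("The capital of France is", return_tensors="pt").to(wrapped._device))
print(out["exit_layer"], tok.decode(out["logits"].argmax(dim=-1)))
```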
early_exit_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+   "hidden_size": 2048,
+   "vocab_size": 32000,
+   "num_layers_to_check": 22,
+   "confidence_threshold": 0.9,
+   "dtype": "float32",
+   "format": "safetensors"
+ }
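Each field feeds `load_early_exit_from_hub` directly: `base_model` picks the checkpoint, `dtype` selects the precision for both base weights and heads, and the remaining values become `EarlyExitModelWrapper` constructor arguments. A small sketch of that mapping, assuming the config file has been downloaded locally (e.g. via `hf_hub_download`):

```python
import json, torch

with open("early_exit_config.json") as f:
    cfg = json.load(f)

# "float32" here means both the base model and the heads load in full precision.
dtype = torch.float16 if cfg.get("dtype", "float16") == "float16" else torch.float32
assert (cfg["hidden_size"], cfg["vocab_size"]) == (2048, 32000)  # TinyLlama-1.1B dims
```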
early_exit_heads.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df4576efb2127d1e98e4465f8c87e64dca9a16118ddba79a4b6646fd4000d5ad
+ size 5769987992
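The LFS pointer size is consistent with the config above: 22 heads, each a float32 `Linear(2048 → 32000)` with bias. A quick back-of-the-envelope check (assumed layout, not part of the commit):

```python
heads = 22                                 # num_layers_to_check
params_per_head = 2048 * 32000 + 32000     # Linear weight + bias
total_bytes = heads * params_per_head * 4  # float32 = 4 bytes per parameter
print(total_bytes)  # 5_769_984_000 — the 5_769_987_992-byte file minus the safetensors header
```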
early_exit_wrapper.py ADDED
@@ -0,0 +1,143 @@
+ # early_exit_wrapper.py
+ import json
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from huggingface_hub import hf_hub_download
+ from safetensors.torch import load_file
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+ class EarlyExitClassifier(nn.Module):
+     """A single linear head mapping a hidden state to vocabulary logits."""
+
+     def __init__(self, hidden_size, vocab_size, dtype=torch.float16, device=None):
+         super().__init__()
+         self.linear = nn.Linear(hidden_size, vocab_size, dtype=dtype)
+         if device is not None:
+             self.to(device)
+
+     def forward(self, hidden_states):
+         return self.linear(hidden_states)
+
+
+ class EarlyExitModelWrapper(nn.Module):
+     """Wraps a causal LM and attaches one exit head per checked layer.
+
+     Heads are probed from the bottom layer up; the first head whose top-1
+     probability clears `confidence_threshold` supplies the logits for the
+     last position. The base model still runs all layers (we need
+     `output_hidden_states=True`), so early exit here selects which layer's
+     prediction to trust rather than saving compute.
+     """
+
+     def __init__(self, model, confidence_threshold=0.9, num_layers_to_check=None, device=None, dtype=torch.float16):
+         super().__init__()
+         self.model = model
+         self.config = model.config
+         self.confidence_threshold = confidence_threshold
+         nl = num_layers_to_check or len(model.model.layers)
+         self.classifiers = nn.ModuleList([
+             EarlyExitClassifier(self.config.hidden_size, self.config.vocab_size, dtype=dtype, device=device)
+             for _ in range(nl)
+         ])
+         self._device = device or next(model.parameters()).device
+         self._dtype = dtype
+
+     def forward(self, input_ids, attention_mask=None, **kwargs):
+         outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True, **kwargs)
+         # hidden_states[0] is the embedding output; hidden_states[i] is the
+         # output of decoder layer i, so head k reads hidden_states[k + 1].
+         hidden_states = outputs.hidden_states
+         for layer_idx, classifier in enumerate(self.classifiers, start=1):
+             h_state = hidden_states[layer_idx]
+             logits = classifier(h_state[:, -1, :].to(self._dtype))
+             probs = F.softmax(logits, dim=-1)
+             max_prob = probs.max(dim=-1)[0]
+             # .item() assumes batch size 1, which is all the sampling loop below sends.
+             if bool((max_prob >= self.confidence_threshold).item()):
+                 return {"logits": logits, "exit_layer": layer_idx, "hidden_states": h_state}
+
+         # No head was confident enough: fall back to the base LM head. The last
+         # entry of hidden_states is already post-norm in HF Llama, so it feeds
+         # lm_head directly (re-applying model.model.norm would normalize twice).
+         final_logits = self.model.lm_head(hidden_states[-1][:, -1, :].to(self._dtype))
+         return {"logits": final_logits, "exit_layer": len(hidden_states) - 1, "hidden_states": hidden_states[-1]}
+
+
+ @torch.no_grad()
+ def generate_with_early_exit(prompt, model, tokenizer, max_new_tokens=64, temperature=0.7, top_p=0.9, device=None):
+     """Sampling loop with temperature and nucleus (top-p) filtering.
+
+     The full sequence is re-encoded on every step (no KV cache), which keeps
+     the loop simple at the cost of speed.
+     """
+     device = device or next(model.parameters()).device
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     input_ids = inputs["input_ids"]
+     attention_mask = inputs.get("attention_mask", None)
+     generated_ids = input_ids.clone()
+
+     for _ in range(max_new_tokens):
+         outputs = model(input_ids=generated_ids, attention_mask=attention_mask)
+         logits = outputs["logits"] / temperature
+
+         if top_p < 1.0:
+             # Nucleus filtering: drop tokens outside the smallest set whose
+             # cumulative probability exceeds top_p.
+             sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True)
+             cumprobs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+             to_remove = cumprobs > top_p
+             # Shift right so the first token past the threshold is kept.
+             to_remove[:, 1:] = to_remove[:, :-1].clone()
+             to_remove[:, 0] = False
+             indices_to_remove = to_remove.scatter(1, sorted_indices, to_remove)
+             logits[indices_to_remove] = float("-inf")
+
+         probs = torch.softmax(logits, dim=-1)
+         next_token_id = torch.multinomial(probs, num_samples=1)
+         generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
+         if attention_mask is not None:
+             attention_mask = torch.cat([attention_mask, torch.ones_like(next_token_id)], dim=-1)
+         if next_token_id.item() == tokenizer.eos_token_id:
+             break
+     return generated_ids
+
+
+ def load_early_exit_from_hub(repo_id: str, device: str = None):
+     """
+     Loads:
+       - early_exit_config.json
+       - early_exit_heads.safetensors
+     and returns (wrapped_model, tokenizer).
+     """
+     cfg_path = hf_hub_download(repo_id=repo_id, filename="early_exit_config.json")
+     with open(cfg_path, "r") as f:
+         cfg = json.load(f)
+
+     base_id = cfg["base_model"]
+     dtype = torch.float16 if cfg.get("dtype", "float16") == "float16" else torch.float32
+     # Prefer Apple-silicon MPS, then CUDA, then CPU.
+     device = device or ("mps" if torch.backends.mps.is_available() else ("cuda" if torch.cuda.is_available() else "cpu"))
+
+     tokenizer = AutoTokenizer.from_pretrained(base_id)
+     base_model = AutoModelForCausalLM.from_pretrained(
+         base_id,
+         torch_dtype=dtype,
+         device_map={"": device} if device != "cpu" else None,
+     )
+
+     wrapped = EarlyExitModelWrapper(
+         base_model,
+         confidence_threshold=float(cfg["confidence_threshold"]),
+         num_layers_to_check=int(cfg["num_layers_to_check"]),
+         device=device,
+         dtype=dtype,
+     )
+
+     heads_path = hf_hub_download(repo_id=repo_id, filename="early_exit_heads.safetensors")
+     state = load_file(heads_path)
+     wrapped.classifiers.load_state_dict(state, strict=True)
+
+     return wrapped, tokenizer
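The wrapper reports `exit_layer` on every forward call, but `generate_with_early_exit` discards it. A hedged sketch of how one might log the exit-layer distribution during decoding (greedy, for simplicity; everything outside the committed files is illustrative):

```python
import collections

import torch

from early_exit_wrapper import load_early_exit_from_hub

wrapped, tok = load_early_exit_from_hub("5ivatej/tinyllama-1.1b-early-exit")

# Count which layer each generated token exited at: maps exit layer -> token count.
exit_counts = collections.Counter()
ids = tok("Early exit lets shallow layers answer easy tokens.", return_tensors="pt")["input_ids"].to(wrapped._device)
with torch.no_grad():
    for _ in range(32):
        out = wrapped(input_ids=ids)
        exit_counts[out["exit_layer"]] += 1
        next_id = out["logits"].argmax(dim=-1, keepdim=True)  # shape [1, 1]
        ids = torch.cat([ids, next_id], dim=-1)
        if next_id.item() == tok.eos_token_id:
            break

print(dict(sorted(exit_counts.items())))
```

A skew toward low layer indices suggests the 0.9 threshold is permissive for the prompt at hand; mass at `len(hidden_states) - 1` means the heads rarely fire and the fallback LM head is doing the work.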