import math

import torch


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    Repeats key-value hidden states along the key-value head dimension.

    Args:
        hidden_states (torch.Tensor): Input tensor with shape either
            (batch, num_key_value_heads, seqlen, head_dim) or
            (num_layers, batch, num_key_value_heads, seqlen, head_dim).
        n_rep (int): Number of repetitions for key-value heads.

    Returns:
        torch.Tensor: The repeated tensor with shape either
            (batch, num_attention_heads, seqlen, head_dim) or
            (num_layers, batch, num_attention_heads, seqlen, head_dim).
    """
    if hidden_states.dim() == 4:  # (batch, num_key_value_heads, seqlen, head_dim)
        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states.unsqueeze(2).expand(batch, num_key_value_heads, n_rep, slen, head_dim)
        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
    elif hidden_states.dim() == 5:  # (num_layers, batch, num_key_value_heads, seqlen, head_dim)
        num_layers, batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states.unsqueeze(3).expand(num_layers, batch, num_key_value_heads, n_rep, slen, head_dim)
        return hidden_states.reshape(num_layers, batch, num_key_value_heads * n_rep, slen, head_dim)
    else:
        raise ValueError("Input tensor must have 4 or 5 dimensions.")
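

# Minimal sanity-check sketch (illustrative shapes; not part of the original
# app): with 8 KV heads repeated 4x, a (2, 8, 16, 64) cache expands to
# (2, 32, 16, 64), i.e. one KV head serves each group of 4 query heads, as in
# grouped-query attention.
def _demo_repeat_kv() -> None:
    kv = torch.randn(2, 8, 16, 64)  # (batch, num_key_value_heads, seqlen, head_dim)
    out = repeat_kv(kv, n_rep=4)
    assert out.shape == (2, 32, 16, 64)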


def calculate_tokens_suggest_compression_ratio(text, tokenizer, model):
    """
    Tokenizes the text and returns:
      - token_count: the number of tokens in the input text.
      - suggestions: a list of 6 candidate compression ratios.
      - tokenized: a dictionary containing 'input_ids' and 'attention_mask'.

    The suggestions are chosen so that compressing the token count by these
    ratios would (in the worst case) bring the count within the model's
    maximum context length (model.config.max_position_embeddings).
    If the text already fits within the context, the default suggestions
    [1, 2, 4, 8, 16, 32] are returned. If the text is too long, six values
    are generated in logarithmic space between max(required_ratio, 1) and 32
    (or a higher upper bound when required_ratio itself exceeds 32).
    """
    tokenized = tokenizer(text, return_tensors="pt", truncation=False)
    token_ids = tokenized["input_ids"][0]
    token_count = token_ids.size(0)
    max_context = model.config.max_position_embeddings

    # Smallest ratio that would fit the input into the context window.
    if token_count <= max_context:
        required_ratio = 1.0
    else:
        required_ratio = token_count / max_context

    if required_ratio <= 1.0:
        suggestions = [1, 2, 4, 8, 16, 32]
    else:
        # Six candidates spaced evenly in log space, from the smallest ratio
        # that fits up to 32 (or up to 32x the required ratio when that ratio
        # alone already exceeds 32).
        lower_bound = max(required_ratio, 1)
        upper_bound = 32 if required_ratio < 32 else required_ratio * 32
        suggestions = [
            round(math.exp(math.log(lower_bound) + i * (math.log(upper_bound) - math.log(lower_bound)) / (6 - 1)), 2)
            for i in range(6)
        ]
    return token_count, suggestions, tokenized
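

# Hedged usage sketch with stub objects (the stubs and numbers below are
# assumptions for illustration, not the app's real tokenizer/model): 300,000
# tokens against a 131,072-token context gives required_ratio ~= 2.29, so the
# six log-spaced suggestions run from 2.29 up to 32
# (about [2.29, 3.88, 6.57, 11.14, 18.88, 32.0]).
def _demo_suggestions() -> None:
    class _Config:
        max_position_embeddings = 131072

    class _Model:
        config = _Config()

    class _Tokenizer:
        def __call__(self, text, return_tensors="pt", truncation=False):
            n = 300000  # pretend the text tokenizes to 300k tokens
            return {"input_ids": torch.zeros(1, n, dtype=torch.long),
                    "attention_mask": torch.ones(1, n, dtype=torch.long)}

    count, ratios, _ = calculate_tokens_suggest_compression_ratio("dummy text", _Tokenizer(), _Model())
    assert count == 300000 and len(ratios) == 6
    assert ratios[0] == round(300000 / 131072, 2)  # ~= 2.29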


def update_retrieval_context(token_count, compression_ratio):
    """Report how many retrieval tokens remain after compressing by the given ratio."""
    retrieval_tokens = int(token_count / compression_ratio)
    return f"Retrieval context tokens (after compression): {retrieval_tokens}"