Update gen.py
gen.py CHANGED
@@ -1,5 +1,6 @@
 import gc
 import copy
+import time
 from tenacity import RetryError
 from tenacity import retry, stop_after_attempt, wait_fixed

@@ -13,6 +14,7 @@ from transformers import (
     MinNewTokensLengthLogitsProcessor,
     TemperatureLogitsWarper,
     TopPLogitsWarper,
+    MinLengthLogitsProcessor
 )

 def get_output_batch(
@@ -56,6 +58,11 @@ class StreamModel:
         self.tokenizer = tokenizer
         self.device = "cuda" if torch.cuda.is_available() else "cpu"

+        self.processor = LogitsProcessorList()
+        self.processor.append(TemperatureLogitsWarper(0.9))
+        self.processor.append(TopPLogitsWarper(0.75))
+
+
     def __call__(
         self,
         prompt,
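This constructor change pins sampling behaviour to a fixed `LogitsProcessorList` (temperature 0.9, top-p 0.75) that the decoding loop applies in the final hunk below. As a minimal sketch of what such a processor list does to next-token logits before sampling; the dummy token IDs and vocabulary size are made-up values, not anything from gen.py:

```python
# Sketch only: how a fixed LogitsProcessorList like the one added in __init__
# warps next-token logits before sampling. The dummy "prompt" IDs and the
# vocabulary size below are illustrative assumptions.
import torch
from transformers import LogitsProcessorList, TemperatureLogitsWarper, TopPLogitsWarper

processor = LogitsProcessorList()
processor.append(TemperatureLogitsWarper(0.9))  # divide logits by 0.9 (slightly sharper distribution)
processor.append(TopPLogitsWarper(0.75))        # mask everything outside the top-p nucleus

input_ids = torch.tensor([[101, 2023, 2003]])   # dummy token IDs, shape (batch, seq_len)
logits = torch.randn(1, 32000)                  # dummy next-token scores, shape (batch, vocab)

warped = processor(input_ids, logits)           # applied in append order: temperature, then top-p
probs = torch.softmax(warped, dim=-1)           # masked tokens end up with probability 0
next_token = torch.multinomial(probs, num_samples=1)
print(next_token)
```

Because these values are hard-coded at construction time and the per-request `_logits_processor` helper is deleted further down, the `temperature` and `top_p` arguments that `__call__` still accepts appear to no longer influence sampling.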
@@ -71,82 +78,40 @@ class StreamModel:
         logprobs = max(logprobs, 0)

         # bigger than 1
-        chunk_size =
+        chunk_size = 2
         chunk_count = 0

         # Generate completion tokens.
-        final_tokens = torch.empty(0)
-
-        final_tokens = torch.cat((final_tokens, tokens))
-
-        if chunk_count == chunk_size-1:
-            chunk_count = 0
-            yield self.tokenizer.decode(final_tokens, skip_special_tokens=True)
-
-        if chunk_count
-        yield self.tokenizer.decode(final_tokens, skip_special_tokens=True)
-
-        del input_ids
-        gc.collect()
-
-        del final_tokens
+        final_tokens = torch.empty(0)
+
+        for tokens in self.generate(
+            input_ids[None, :].repeat(n, 1),
+            logprobs=logprobs,
+            min_new_tokens=min_tokens,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+        ):
+            if chunk_count < chunk_size:
+                chunk_count = chunk_count + 1
+
+            final_tokens = torch.cat((final_tokens, tokens.to("cpu")))
+
+            if chunk_count == chunk_size-1:
+                chunk_count = 0
+                yield self.tokenizer.decode(final_tokens, skip_special_tokens=True)
+
+        if chunk_count > 0:
+            yield self.tokenizer.decode(final_tokens, skip_special_tokens=True)
+
+        del final_tokens, input_ids
         if self.device == "cuda":
             torch.cuda.empty_cache()

-    @retry(stop=stop_after_attempt(5), wait=wait_fixed(1))
     def _infer(self, model_fn, **kwargs):
-        """Call a model function in inference mode with auto retrying."""
-        # This is a temporary workaround for bitsandbytes #162:
-        # https://github.com/TimDettmers/bitsandbytes/issues/162
         with torch.inference_mode():
             return model_fn(**kwargs)

-    def _logits_processor(self, config, input_length):
-        """Set up logits processor based on the generation config."""
-        processor = LogitsProcessorList()
-
-        # Add processor for enforcing a min-length of new tokens.
-        if (
-            config.min_new_tokens is not None
-            and config.min_new_tokens > 0
-            and config.eos_token_id is not None
-        ):
-            processor.append(
-                MinNewTokensLengthLogitsProcessor(
-                    prompt_length_to_skip=input_length,
-                    min_new_tokens=config.min_new_tokens,
-                    eos_token_id=config.eos_token_id,
-                )
-            )
-
-        # Add processor for scaling output probability distribution.
-        if (
-            config.temperature is not None
-            and config.temperature > 0
-            and config.temperature != 1.0
-        ):
-            processor.append(TemperatureLogitsWarper(config.temperature))
-
-        # Add processor for nucleus sampling.
-        if config.top_p is not None and config.top_p > 0 and config.top_p < 1:
-            processor.append(TopPLogitsWarper(config.top_p))
-
-        return processor
-
     def tokenize(self, text):
         """Tokenize a string into a tensor of token IDs."""
         batch = self.tokenizer.encode(text, return_tensors="pt")
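The rewritten `__call__` streams output in chunks: each newly generated token batch is appended to `final_tokens`, and the accumulated sequence is re-decoded and yielded every `chunk_size` tokens, with a final flush for any remainder. A self-contained sketch of that accumulate-and-flush pattern follows; `fake_generate`, `stream_decode`, the hard-coded token IDs, and the simplified counter handling are illustrative assumptions, not code from this Space:

```python
# Sketch only: the accumulate-and-flush streaming pattern used above, wired to a
# dummy token generator so the chunking logic can run standalone.
import torch

def fake_generate():
    # Stand-in for StreamModel.generate(): yields one token ID tensor per step.
    for token_id in (464, 3290, 318, 257, 922, 3290, 13):
        yield torch.tensor([token_id])

def stream_decode(decode_fn, chunk_size=2):
    final_tokens = torch.empty(0, dtype=torch.long)
    chunk_count = 0
    for tokens in fake_generate():
        final_tokens = torch.cat((final_tokens, tokens))
        chunk_count += 1
        if chunk_count == chunk_size:
            chunk_count = 0
            yield decode_fn(final_tokens)  # flush the accumulated prefix every chunk_size tokens
    if chunk_count > 0:
        yield decode_fn(final_tokens)      # flush whatever is left after the loop

# With a trivial "decoder" that just reports the accumulated IDs:
for text in stream_decode(lambda t: str(t.tolist()), chunk_size=2):
    print(text)
```

As in the diff, every yield decodes the full accumulated prefix rather than only the new tokens, so a consumer presumably replaces its previously received text rather than appending to it.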
@@ -165,7 +130,7 @@ class StreamModel:
         kwargs = config.update(**kwargs)
         kwargs["output_attentions"] = False
         kwargs["output_hidden_states"] = False
-        kwargs["use_cache"] = True
+        kwargs["use_cache"] = True

         # Collect special token IDs.
         pad_token_id = config.pad_token_id
@@ -183,28 +148,6 @@ class StreamModel:
         input_ids = input_ids * eos_token_id[0]
         input_length = 1

-        # Prepare inputs for encoder-decoder models.
-        if self.model.config.is_encoder_decoder:
-            # Get outputs from the encoder.
-            encoder = self.model.get_encoder()
-            encoder_kwargs = kwargs.copy()
-            encoder_kwargs.pop("use_cache", None)
-            encoder_kwargs["input_ids"] = input_ids
-            encoder_kwargs["return_dict"] = True
-            encoder_outputs = self._infer(encoder, **encoder_kwargs)
-            kwargs["encoder_outputs"] = encoder_outputs
-
-            # Reinitialize inputs for the decoder.
-            decoder_start_token_id = config.decoder_start_token_id
-            if decoder_start_token_id is None:
-                decoder_start_token_id = bos_token_id
-            input_ids = input_ids.new_ones((batch_size, 1))
-            input_ids = input_ids * decoder_start_token_id
-            input_length = 1
-
-        # Set up logits processor.
-        processor = self._logits_processor(config, input_length)
-
         # Keep track of which sequences are already finished.
         unfinished = input_ids.new_ones(batch_size)

@@ -213,10 +156,11 @@ class StreamModel:
             inputs = self.model.prepare_inputs_for_generation(
                 input_ids, **kwargs
             )  # noqa: E501
+
             outputs = self._infer(
                 self.model,
                 **inputs,
-                return_dict=True,
+                # return_dict=True,
                 output_attentions=False,
                 output_hidden_states=False,
             )
@@ -224,7 +168,7 @@ class StreamModel:
             # Pre-process the probability distribution of the next tokens.
             logits = outputs.logits[:, -1, :]
             with torch.inference_mode():
-                logits = processor(input_ids, logits)
+                logits = self.processor(input_ids, logits)
                 probs = torch.nn.functional.softmax(logits, dim=-1)

             # Select deterministic or stochastic decoding strategy.
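The remaining hunks adjust the inner decoding step: `use_cache=True` is set on the generation kwargs, `return_dict=True` is commented out of the forward call, and the shared `self.processor` replaces the deleted per-request processor when warping the last-position logits. A rough sketch of one such sampling step is below; the `gpt2` checkpoint, the prompt, and calling the model directly rather than through `prepare_inputs_for_generation` (which gen.py uses, and whose signature varies across transformers versions) are assumptions for illustration:

```python
# Rough sketch of a single sampling step in the style of the loop above; not the
# Space's actual code. gen.py builds the forward kwargs via
# model.prepare_inputs_for_generation(...); here the model is called directly.
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessorList,
    TemperatureLogitsWarper,
    TopPLogitsWarper,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")           # assumed checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")
processor = LogitsProcessorList(
    [TemperatureLogitsWarper(0.9), TopPLogitsWarper(0.75)]  # mirrors the __init__ change above
)

input_ids = tokenizer.encode("The quick brown", return_tensors="pt")

with torch.inference_mode():
    outputs = model(
        input_ids,
        use_cache=True,                # keep the key/value cache for incremental decoding
        output_attentions=False,
        output_hidden_states=False,
    )
    logits = outputs.logits[:, -1, :]          # scores for the next token only
    logits = processor(input_ids, logits)      # shared temperature + top-p warping
    probs = torch.nn.functional.softmax(logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)

print(tokenizer.decode(torch.cat([input_ids, next_token], dim=-1)[0]))
```

gen.py's loop additionally tracks finished sequences via the `unfinished` mask and presumably threads the key/value cache through successive calls; this sketch performs a single step only.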