Spaces · Running on Zero
Commit 12437ad · 1 parent c1e0817 · init
app.py CHANGED
@@ -10,7 +10,7 @@ import time
import os
from typing import List, Dict, Optional, Tuple, Iterator, Set
import gradio as gr
-import spaces #
+import spaces  # import the spaces module

# Suppress some Hugging Face warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -24,6 +24,7 @@ from model_cache.llada.configuration_llada import LLaDAConfig
def set_seed(seed):
    torch.manual_seed(seed); random.seed(seed); np.random.seed(seed);
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed); torch.backends.cudnn.deterministic = True; torch.backends.cudnn.benchmark = False
+
def create_full_block_attention_mask(prompt_length, max_length, block_size, device=None, dtype=None):
    if dtype is None: dtype = torch.bfloat16
    attention_mask = torch.full((1, 1, max_length, max_length), -torch.inf, device=device, dtype=dtype)
@@ -38,12 +39,14 @@ def create_full_block_attention_mask(prompt_length, max_length, block_size, device=None, dtype=None):
        attention_mask[:, :, block_start:block_end, prev_start:prev_end] = 0
        attention_mask[:, :, block_start:block_end, block_start:block_end] = 0
    return attention_mask
+
def extract_attention_mask(full_mask, start_pos, input_length, cache_length):
    end_pos = start_pos + input_length; total_length = cache_length + input_length
    extracted_mask = torch.full((1, 1, input_length, total_length), -torch.inf, device=full_mask.device, dtype=full_mask.dtype)
    extracted_mask[:, :, :, :cache_length] = full_mask[:, :, start_pos:end_pos, :cache_length]
    extracted_mask[:, :, :, cache_length:] = full_mask[:, :, start_pos:end_pos, start_pos:end_pos]
    return extracted_mask
+
def top_p_logits(logits, top_p=None):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
@@ -54,11 +57,13 @@ def top_p_logits(logits, top_p=None):
    mask = mask.scatter_(-1, sorted_indices, sorted_indices_to_remove)
    logits = logits.masked_fill(mask, torch.finfo(logits.dtype).min)
    return logits
+
def top_k_logits(logits, top_k=None):
    top_k = min(top_k, logits.size(-1))
    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
    logits = logits.masked_fill(indices_to_remove, torch.finfo(logits.dtype).min)
    return logits
+
def sample_tokens(logits, temperature=0.0, top_p=None, top_k=None, margin_confidence=False, neg_entropy=False):
    if temperature > 0: logits = logits / temperature
    if top_p is not None and top_p < 1: logits = top_p_logits(logits, top_p)
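Note: the sampling helpers above compose into a standard filtered-sampling step. A minimal usage sketch mirroring the first lines of sample_tokens, assuming top_p_logits and top_k_logits as defined in this file; the vocabulary size, temperature, and thresholds below are made-up example values:

import torch
import torch.nn.functional as F

logits = torch.randn(1, 32000)                      # one position over a hypothetical vocabulary
scaled = logits / 0.8                               # temperature scaling, as in sample_tokens
filtered = top_p_logits(scaled, top_p=0.95)         # nucleus (top-p) filtering
filtered = top_k_logits(filtered, top_k=50)         # keep only the 50 highest-scoring tokens
probs = F.softmax(filtered, dim=-1)                 # masked entries end up with ~zero probability
next_token = torch.multinomial(probs, num_samples=1)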
@@ -178,39 +183,46 @@ class DreamLoRAInference:

    def __init__(self, **kwargs):
        print("Initializing DreamLoRAInference...")
+        # Only store the config; do not load the model yet
+        self.config = kwargs
+        self.model = None
+        self.tokenizer = None
        self.device = torch.device(kwargs.get("device", "cuda") if torch.cuda.is_available() else "cpu")
-   [old lines 182-187 were removed here; their content was not captured in this page extract]
+
+        if kwargs.get("dtype") == "bfloat16" and torch.cuda.is_bf16_supported():
+            self.target_dtype = torch.bfloat16
+        elif kwargs.get("dtype") == "float16":
+            self.target_dtype = torch.float16
+        else:
+            self.target_dtype = torch.float32
+
+        # Pull the remaining attributes from the config
+        for key, value in kwargs.items():
+            if not hasattr(self, key):
+                setattr(self, key, value)
+
+        print("DreamLoRAInference initialized (model will be loaded on first use).")
+
+    def _ensure_model_loaded(self):
+        """Lazily load the model, only when it is actually needed."""
+        if self.model is None:
+            print("Loading model for the first time...")
+            self._setup_model(self.config["pretrained_path"], self.config["lora_path"])
+            print("Model and tokenizer setup complete.")

    def _setup_model(self, pretrained_path, lora_path):
-        # --- MODIFICATION START ---
-        # The arguments `trust_remote_code=True` have been removed as they are not needed here
-        # and were causing warnings in the log.
        config = LLaDAConfig.from_pretrained(pretrained_path)
        self.model = LLaDAModelLM.from_pretrained(
            pretrained_path,
            config=config,
            torch_dtype=self.target_dtype,
-            # device_map="auto" is handled by accelerate for better memory management on Spaces
            device_map="auto"
        ).eval()
-
-        # THIS IS THE CRITICAL FIX: Tie the weights before loading the adapter.
-        # This resolves the error message from the log and allows `device_map="auto"` to work correctly.
-        # print("Tying model weights...")
-        # self.model.tie_weights()
-        # print("Weights tied.")

-        # Now, load the PEFT adapter on top of the correctly configured base model
        self.model = PeftModel.from_pretrained(self.model, lora_path)
-        # --- MODIFICATION END ---
-
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
-        if self.tokenizer.pad_token is None:
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _apply_chat_template(self, prompt):
        chat_history = [{"role": "user", "content": prompt}]
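Note: the rewritten __init__ deliberately does no heavy work. It stores the kwargs, leaves self.model and self.tokenizer as None, picks a target dtype, and defers the actual weight loading to _ensure_model_loaded. On a ZeroGPU ("Running on Zero") Space this is the usual pattern, since a GPU is generally only attached while a function decorated with @spaces.GPU is executing (added in a later hunk below), so loading the model at import time would run without a usable CUDA device.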
@@ -225,13 +237,11 @@ class DreamLoRAInference:
            if (next_block_id := block_id + 1) in block_states:
                block_states[next_block_id]['is_complete'] = True

-    # The rest of your class methods (_render_visualization_html, _render_status_html, stream_and_capture_for_gradio)
-    # remain completely unchanged.
    def _render_visualization_html(self, step: int, x_t: torch.Tensor, block_states: Dict, cache_length: int, updated_block_ids: Set[int]) -> str:
        timestamp = int(time.time() * 1000)

        html_parts = []
-        for block_id in sorted(k for k in block_states.keys() if k > 0):
+        for block_id in sorted(k for k in block_states.keys() if k > 0):
            state = block_states[block_id]
            container_classes = ["block-container"]
            if block_id in updated_block_ids: container_classes.append("block-updating")
@@ -370,7 +380,7 @@ class DreamLoRAInference:

        return complete_html

-    @spaces.GPU #
+    @spaces.GPU  # critical fix: the GPU decorator
    @torch.inference_mode()
    def stream_and_capture_for_gradio(
        self,
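Note: on ZeroGPU, @spaces.GPU is what requests a GPU for the duration of the decorated call; without it the streaming method would not see a CUDA device. A minimal sketch of the decorator in use, assuming the Hugging Face spaces package; the duration argument, the generate function, and engine.answer are illustrative and not part of this commit:

import spaces
import torch

@spaces.GPU(duration=120)             # ask ZeroGPU for a device for up to ~120 s per call
def generate(prompt: str) -> str:
    engine._ensure_model_loaded()     # the first call loads the weights while a GPU is attached
    with torch.inference_mode():
        return engine.answer(prompt)  # hypothetical generation entry point on the engine above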
@@ -382,6 +392,9 @@ class DreamLoRAInference:
        skip_threshold: float
    ) -> Iterator[Tuple[str, List[Tuple[str, str]], str, str, str]]:

+        # Make sure the model is loaded
+        self._ensure_model_loaded()
+
        start_time = time.time()
        captured_frames: List[Tuple[str, str]] = []

@@ -396,7 +409,7 @@ class DreamLoRAInference:

        # Capture initial state
        initial_viz_html = self._render_visualization_html(0, x_t, block_states, 0, set())
-        initial_status_html = self._render_status_html(0,
+        initial_status_html = self._render_status_html(0, block_states, 0)
        captured_frames.append((initial_viz_html, initial_status_html))

        yield "", captured_frames, "Initializing generation process...", "Initializing visualization...", "Initializing block status..."
@@ -507,6 +520,8 @@ if __name__ == "__main__":
        "sampling_strategy": "default",
    }
    set_seed(42)
+
+    # Create the inference engine without loading the model yet
    inference_engine = DreamLoRAInference(**config)

    def animate_visualization(html_frames_list: List[Tuple[str, str]], delay: float) -> Iterator[Tuple[str, str]]: