weiyi01191 committed
Commit 92180ac · verified · 1 Parent(s): ff51588

Update minigpt4/models/mini_gpt4_llama_v2.py

Files changed (1)
  1. minigpt4/models/mini_gpt4_llama_v2.py +18 -84
minigpt4/models/mini_gpt4_llama_v2.py CHANGED
@@ -111,94 +111,28 @@ class MiniGPT4_Video(Blip2Base, PreTrainedModel):
 
         print('Loading LLAMA')
 
-        # 🔧 Force-clear the GPU cache before loading Llama
-        import torch
-        import gc
-        if torch.cuda.is_available():
-            print("🔧 Clearing GPU cache...")
-            torch.cuda.empty_cache()
-            torch.cuda.ipc_collect()
-            gc.collect()
-            available_mem = torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated(0)
-            print(f"🔧 Free GPU memory after cleanup: {available_mem / 1024**3:.1f} GB")
-
         self.B_SYS, self.E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
         token=os.environ.get("HF_TKN")
-
-        # 🔧 Pick a tokenizer that matches the model type
-        print(f"🔧 Loading tokenizer for model {self.llama_model}...")
-
-        # Check whether this is a Qwen model
-        if "qwen" in self.llama_model.lower() or "Qwen" in self.llama_model:
-            print("🔧 Qwen model detected, using AutoTokenizer")
-            from transformers import AutoTokenizer
-            self.llama_tokenizer = AutoTokenizer.from_pretrained(
-                self.llama_model,
-                use_fast=False,
-                token=token,
-                trust_remote_code=True
-            )
-            # Special-token setup for Qwen models
-            if self.llama_tokenizer.pad_token is None:
-                self.llama_tokenizer.pad_token = self.llama_tokenizer.eos_token
-        else:
-            print("🔧 Using LlamaTokenizer")
-            self.llama_tokenizer = LlamaTokenizer.from_pretrained(
-                self.llama_model,
-                use_fast=False,
-                token=token
-            )
-            self.llama_tokenizer.pad_token = "$$"
-
-        print(f"✅ Tokenizer loaded: {type(self.llama_tokenizer)}")
+        self.llama_tokenizer = LlamaTokenizer.from_pretrained(self.llama_model,use_fast=False,token=token) #
+        self.llama_tokenizer.pad_token = "$$"
         print("self.low_resource",self.low_resource)
-
-        # 🔧 Clear memory again to make room for the model load
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            gc.collect()
-
         if self.low_resource:
-            # 🔧 Pick a loading strategy that matches the model type
-            if "qwen" in self.llama_model.lower() or "Qwen" in self.llama_model:
-                print("🔧 Using the Qwen-specific loading strategy")
-                # Qwen models load via AutoModelForCausalLM
-                from transformers import AutoModelForCausalLM
-                self.llama_model = AutoModelForCausalLM.from_pretrained(
-                    self.llama_model,
-                    torch_dtype=torch.float16,
-                    load_in_8bit=True,
-                    device_map={'':f"cuda:{self.minigpt4_gpu_id}"},
-                    token=token,
-                    trust_remote_code=True
-                )
-            else:
-                print("🔧 Using the Llama-specific loading strategy")
-                self.llama_model = llm_model.from_pretrained(
-                    self.llama_model,
-                    torch_dtype=torch.float16,
-                    load_in_8bit=True,
-                    device_map={'':f"cuda:{self.minigpt4_gpu_id}"},
-                    token=token
-                )
+            self.llama_model = llm_model.from_pretrained(
+                self.llama_model,
+                torch_dtype=torch.float16,
+                # torch_dtype = torch.bfloat16,
+                load_in_8bit=True,
+                # device_map = "balanced"
+                # device_map="auto",
+                # device_map={'':torch.cuda.current_device()},token=token
+                device_map={'':f"cuda:{self.minigpt4_gpu_id}"},token=token
+
+            )
         else:
-            # 🔧 Loading strategy for high-resource mode
-            if "qwen" in self.llama_model.lower() or "Qwen" in self.llama_model:
-                print("🔧 Using the Qwen high-resource loading strategy")
-                from transformers import AutoModelForCausalLM
-                self.llama_model = AutoModelForCausalLM.from_pretrained(
-                    self.llama_model,
-                    torch_dtype=torch.float16,
-                    token=token,
-                    trust_remote_code=True
-                )
-            else:
-                print("🔧 Using the Llama high-resource loading strategy")
-                self.llama_model = llm_model.from_pretrained(
-                    self.llama_model,
-                    torch_dtype=torch.float16,
-                    token=token
-                )
+            self.llama_model = llm_model.from_pretrained(
+                self.llama_model,
+                torch_dtype=torch.float16,token=token
+            )
 
         # self.llama_model.resize_token_embeddings(len(self.llama_tokenizer))
         self.llama_model = prepare_model_for_int8_training(self.llama_model)
@@ -874,4 +808,4 @@ def assign_imgs(batched_instruct_list, batched_img_embeds):
         n_assigned.append(None)
         batched_assigned.append(assigned_img)
 
-    return batched_assigned
+    return batched_assigned
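
For reference, here is a minimal, self-contained sketch of the loading path the file ends up with after this commit. The checkpoint name and gpu_id below are illustrative placeholders (the real code reads self.llama_model and self.minigpt4_gpu_id, and llm_model is resolved elsewhere in the class). It also assumes an older transformers release that still accepts load_in_8bit directly in from_pretrained, and an older peft release that still exports prepare_model_for_int8_training (newer versions renamed it prepare_model_for_kbit_training).

    # Hedged sketch of the post-commit loading path; placeholder names are marked below.
    import os

    import torch
    from peft import prepare_model_for_int8_training  # older peft API
    from transformers import LlamaForCausalLM, LlamaTokenizer

    model_name = "meta-llama/Llama-2-7b-chat-hf"  # placeholder for self.llama_model
    gpu_id = 0                                    # placeholder for self.minigpt4_gpu_id
    token = os.environ.get("HF_TKN")

    # Slow (SentencePiece) tokenizer with the "$$" sentinel pad token the diff restores.
    tokenizer = LlamaTokenizer.from_pretrained(model_name, use_fast=False, token=token)
    tokenizer.pad_token = "$$"

    # Low-resource path: fp16 weights loaded in 8-bit and pinned to a single GPU.
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        load_in_8bit=True,
        device_map={"": f"cuda:{gpu_id}"},
        token=token,
    )
    model = prepare_model_for_int8_training(model)  # freeze base weights, cast norms for int8 finetuning

Pinning the whole model to one device via device_map={'': ...} mirrors the commit's choice to keep everything on a single GPU rather than letting accelerate shard it, as the commented-out "auto"/"balanced" alternatives in the diff suggest was tried.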