Spaces:

Twelve2five
/

qlora-llama3-finetuning

Sleeping

App Files Files Community

Twelve2five committed on Apr 9

Commit

16c5c11

verified ·

1 Parent(s): fdebc65

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -9

app.py CHANGED Viewed

@@ -425,8 +425,27 @@ def train_model(
         )
         log.append(f"Model files downloaded to {local_model_path}")
         # Create a bnb configuration for loading the model in 4-bit
-        # Not strictly necessary for A100 but keeps memory usage lower
         progress(0.25, desc="Loading model...")
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -435,21 +454,26 @@ def train_model(
             bnb_4bit_use_double_quant=False
         )
-        # Load model and tokenizer
         model = AutoModelForCausalLM.from_pretrained(
             local_model_path,
             quantization_config=bnb_config,
             device_map="auto",
-            torch_dtype=torch.bfloat16,
         )
-        tokenizer = AutoTokenizer.from_pretrained(local_model_path)
-        # Handle tokenizer settings
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        log.append(f"Loaded model vocab size: {tokenizer.vocab_size}")
-        log.append(f"Input embedding shape: {model.get_input_embeddings().weight.shape}")
         # PEFT Configuration (Smaller LoRA for faster iteration)
         model = prepare_model_for_kbit_training(model)

         )
         log.append(f"Model files downloaded to {local_model_path}")
+        # Check and fix the model config if needed
+        config_path = os.path.join(local_model_path, "config.json")
+        if os.path.exists(config_path):
+            with open(config_path, 'r') as f:
+                config_data = json.load(f)
+            # Fix the rope_scaling configuration
+            if 'rope_scaling' in config_data:
+                if not isinstance(config_data['rope_scaling'], dict):
+                    config_data['rope_scaling'] = {"type": "linear", "factor": 2.0}
+                elif 'rope_type' in config_data['rope_scaling']:
+                    # Convert complex rope_scaling to the simple format expected
+                    rope_factor = config_data['rope_scaling'].get('factor', 2.0)
+                    config_data['rope_scaling'] = {"type": "linear", "factor": rope_factor}
+                # Write the updated config back
+                with open(config_path, 'w') as f:
+                    json.dump(config_data, f, indent=2)
+                log.append("Updated model configuration for rope_scaling")
         # Create a bnb configuration for loading the model in 4-bit
         progress(0.25, desc="Loading model...")
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_use_double_quant=False
         )
+        # Load the model with fixed configuration
         model = AutoModelForCausalLM.from_pretrained(
             local_model_path,
             quantization_config=bnb_config,
             device_map="auto",
+            use_cache=False,  # Needed for gradient checkpointing
+            torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
         )
+        # Load the tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            local_model_path,
+            padding_side="right",
+            use_fast=True,
+        )
+        tokenizer.pad_token = tokenizer.eos_token
+        # Find model's architecture type
+        model_type = model.config.model_type
+        log.append(f"Model architecture type: {model_type}")
         # PEFT Configuration (Smaller LoRA for faster iteration)
         model = prepare_model_for_kbit_training(model)