Spaces:

Twelve2five
/

qlora-llama3-finetuning

Sleeping

App Files Files Community

Twelve2five committed on Apr 9

Commit

14bbc11

verified ·

1 Parent(s): 192b89f

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -60

app.py CHANGED Viewed

@@ -298,7 +298,7 @@ def load_model():
     log.append(f"LoRA rank: 8, alpha: 16 (optimized for 1B model)")
     model_to_train.print_trainable_parameters()
-    return model, tokenizer  # Return both model and tokenizer
 def load_dataset():
     # --- Download the dataset repository files ---
@@ -670,77 +670,108 @@ def train_model(
             sample_data = torch.load(sample_file)
             log.append(f"Sample data type: {type(sample_data)}")
-            if isinstance(sample_data, dict):
-                log.append(f"Sample data is a dictionary with keys: {list(sample_data.keys())}")
-                # Print a few sample values to understand the structure
-                for key in list(sample_data.keys())[:3]:
-                    log.append(f"Key '{key}' has value of type {type(sample_data[key])}")
-                    if isinstance(sample_data[key], torch.Tensor):
-                        log.append(f"  Shape: {sample_data[key].shape}, Dtype: {sample_data[key].dtype}")
-            # Load all files with the appropriate structure
             input_ids_list = []
             labels_list = []
-            for pt_file in tqdm(pt_files, desc="Loading .pt files"):
-                data = torch.load(pt_file)
-                # Handling dictionary structure
-                if isinstance(data, dict):
-                    # Assume dictionary contains input_ids and labels keys
-                    if 'input_ids' in data and 'labels' in data:
-                        input_ids_list.append(data['input_ids'])
-                        labels_list.append(data['labels'])
-                    # Or maybe it has other keys that we need to convert
-                    elif 'prompt' in data and 'response' in data:
-                        input_ids_list.append(data['prompt'])
-                        labels_list.append(data['response'])
-                    # Or maybe it has source and target keys
-                    elif 'source' in data and 'target' in data:
-                        input_ids_list.append(data['source'])
-                        labels_list.append(data['target'])
-                    # If none of these patterns match, try to figure out the structure
                     else:
-                        log.append(f"Unknown dictionary structure in {pt_file} with keys: {list(data.keys())}")
-                        # Try the first two keys as input/output
-                        keys = list(data.keys())
-                        if len(keys) >= 2:
-                            input_ids_list.append(data[keys[0]])
-                            labels_list.append(data[keys[1]])
-                # Handling tuple/list structure - the original expected format
-                elif isinstance(data, (tuple, list)) and len(data) >= 2:
-                    input_ids_list.append(data[0])
-                    labels_list.append(data[1])
-                else:
-                    log.append(f"Unsupported data format in {pt_file}: {type(data)}")
-            log.append(f"Processed {len(input_ids_list)} input/label pairs")
-            # Process tensors to ensure they're the right format
-            processed_inputs = []
-            processed_labels = []
             for i, (inputs, labels) in enumerate(zip(input_ids_list, labels_list)):
-                # Convert to tensor if not already
-                if not isinstance(inputs, torch.Tensor):
-                    inputs = torch.tensor(inputs)
-                if not isinstance(labels, torch.Tensor):
-                    labels = torch.tensor(labels)
-                # Ensure they're integer tensors
-                inputs = inputs.long()
-                labels = labels.long()
-                # Append to lists, converting to standard Python lists for the Dataset
-                processed_inputs.append(inputs.tolist())
-                processed_labels.append(labels.tolist())
-                # Log some diagnostics for the first few pairs
-                if i < 3:
-                    log.append(f"Pair {i}: Input shape: {inputs.shape}, Label shape: {labels.shape}")
             # Create the dataset
-            log.append("Creating dataset from processed pairs...")
             dataset = Dataset.from_dict({
                 "input_ids": processed_inputs,
                 "labels": processed_labels
@@ -754,6 +785,7 @@ def train_model(
             log.append(f"Created dataset with {len(train_dataset)} training examples and {len(val_dataset)} validation examples")
         except Exception as e:
             error_msg = f"Error processing dataset: {str(e)}\n{traceback.format_exc()}"
             log.append(error_msg)
             return "\n".join(log)

     log.append(f"LoRA rank: 8, alpha: 16 (optimized for 1B model)")
     model_to_train.print_trainable_parameters()
+    return model, tokenizer
 def load_dataset():
     # --- Download the dataset repository files ---
             sample_data = torch.load(sample_file)
             log.append(f"Sample data type: {type(sample_data)}")
+            # Function to recursively explore the data structure
+            def explore_data(data, prefix=""):  # Recursively append a description of `data`'s structure to `log`; `prefix` is prepended to every message
+                if isinstance(data, (list, tuple)):
+                    log.append(f"{prefix}List/Tuple with {len(data)} items")
+                    if len(data) > 0:  # only the first element is explored; the rest are not inspected
+                        explore_data(data[0], prefix + "  [0]: ")
+                elif isinstance(data, dict):
+                    log.append(f"{prefix}Dictionary with keys: {list(data.keys())}")
+                    for key in list(data.keys())[:2]:  # Look at first 2 keys
+                        explore_data(data[key], prefix + f"  ['{key}']: ")
+                elif isinstance(data, torch.Tensor):
+                    log.append(f"{prefix}Tensor with shape {data.shape} and dtype {data.dtype}")
+                else:  # any other type is reported by type only; recursion stops here
+                    log.append(f"{prefix}Other type: {type(data)}")
+            # Explore the sample data
+            explore_data(sample_data, "Sample data: ")
+            # Function to extract tensor data from complex structures
+            def extract_tensor_data(data):  # Return a torch.Tensor found in (or built from) `data`, or None when nothing can be extracted
+                if isinstance(data, torch.Tensor):
+                    return data
+                elif isinstance(data, (list, tuple)) and len(data) > 0:
+                    if all(isinstance(item, (int, float)) for item in data):  # flat numeric sequence: convert directly
+                        return torch.tensor(data)
+                    # For lists of tensors/complex structures, use the first item
+                    return extract_tensor_data(data[0])  # NOTE(review): items after index 0 are ignored - confirm this is intended
+                elif isinstance(data, dict):
+                    # Try common keys for input data
+                    for key in ['input_ids', 'prompt', 'source', 'inputs', 'data']:
+                        if key in data:  # first matching key wins, even if extraction from its value returns None
+                            return extract_tensor_data(data[key])
+                    # If none of the known keys is present, fall back to the first value in the dict
+                    if len(data) > 0:
+                        return extract_tensor_data(next(iter(data.values())))
+                return None  # reached for empty containers and unsupported types
+            # Process all files
             input_ids_list = []
             labels_list = []
+            # Capture any errors for later analysis
+            file_errors = []
+            for i, pt_file in enumerate(tqdm(pt_files, desc="Loading .pt files")):
+                try:
+                    data = torch.load(pt_file)
+                    if isinstance(data, (list, tuple)) and len(data) >= 2:
+                        # Standard format: list/tuple with [input, label]
+                        input_tensor = extract_tensor_data(data[0])
+                        label_tensor = extract_tensor_data(data[1])
+                        if input_tensor is not None and label_tensor is not None:
+                            input_ids_list.append(input_tensor)
+                            labels_list.append(label_tensor)
+                        else:
+                            file_errors.append(f"Could not extract tensors from {pt_file}")
                     else:
+                        log.append(f"File {pt_file} has unexpected format. Skipping.")
+                        file_errors.append(f"Unexpected format in {pt_file}: {type(data)}")
+                except Exception as e:
+                    file_errors.append(f"Error processing file {pt_file}: {str(e)}")
+            # Log errors if any
+            if file_errors:
+                log.append(f"Encountered {len(file_errors)} errors during file processing:")
+                for i, error in enumerate(file_errors[:5]):  # Log first 5 errors
+                    log.append(f"  Error {i+1}: {error}")
+                if len(file_errors) > 5:
+                    log.append(f"  ...and {len(file_errors) - 5} more errors")
+            log.append(f"Successfully processed {len(input_ids_list)} input/label pairs")
+            # Verify all tensors are valid
+            valid_pairs = []
             for i, (inputs, labels) in enumerate(zip(input_ids_list, labels_list)):
+                # Perform safety checks on tensors
+                if not isinstance(inputs, torch.Tensor) or not isinstance(labels, torch.Tensor):
+                    log.append(f"Pair {i}: Invalid tensor types - skipping")
+                    continue
+                # Ensure tensors contain integers
+                try:
+                    inputs = inputs.long()
+                    labels = labels.long()
+                    # Convert to lists and add to valid pairs
+                    valid_pairs.append((inputs.tolist(), labels.tolist()))
+                    # Log some diagnostics for the first few pairs
+                    if i < 3:
+                        log.append(f"Pair {i}: Input shape: {inputs.shape}, Label shape: {labels.shape}")
+                except Exception as e:
+                    log.append(f"Error converting tensors for pair {i}: {str(e)}")
             # Create the dataset
+            log.append(f"Creating dataset from {len(valid_pairs)} valid pairs...")
+            processed_inputs = [pair[0] for pair in valid_pairs]
+            processed_labels = [pair[1] for pair in valid_pairs]
             dataset = Dataset.from_dict({
                 "input_ids": processed_inputs,
                 "labels": processed_labels
             log.append(f"Created dataset with {len(train_dataset)} training examples and {len(val_dataset)} validation examples")
         except Exception as e:
+            import traceback
             error_msg = f"Error processing dataset: {str(e)}\n{traceback.format_exc()}"
             log.append(error_msg)
             return "\n".join(log)