Spaces:

Twelve2five
/

qlora-llama3-finetuning

Sleeping

App Files Files Community

Twelve2five committed on Apr 9

Commit

192b89f

verified ·

1 Parent(s): 19ba848

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -80

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ import subprocess
 import sys
 import json
 import shutil
 # --- Configuration ---
 YOUR_HF_USERNAME = "Twelve2five"
@@ -653,95 +654,108 @@ def train_model(
         downloaded_files = glob.glob(f"{local_dataset_path}/**/*.pt", recursive=True)
         log.append(f"Found {len(downloaded_files)} .pt files in the dataset directory")
-        if len(downloaded_files) == 0:
-            log.append("No .pt files found. Checking for other file types...")
-            all_files = glob.glob(f"{local_dataset_path}/**/*.*", recursive=True)
-            log.append(f"All files found: {', '.join(all_files[:10])}")
-            if len(all_files) > 10:
-                log.append(f"...and {len(all_files) - 10} more files")
-        # Look for the pairs directory
         pairs_dir = os.path.join(local_dataset_path, "final_rvq_pairs")
-        if not os.path.exists(pairs_dir):
-            log.append(f"final_rvq_pairs directory not found. Looking for other possible directories...")
-            possible_dirs = [d for d in glob.glob(f"{local_dataset_path}/**/") if os.path.isdir(d)]
-            log.append(f"Available directories: {', '.join(possible_dirs)}")
-            # Try to find any directory containing .pt files
-            for dir_path in possible_dirs:
-                if glob.glob(f"{dir_path}/*.pt"):
-                    pairs_dir = dir_path
-                    log.append(f"Using {pairs_dir} as the pairs directory.")
-                    break
-        # If we found the pairs directory, we're good to go
-        if pairs_dir and os.path.exists(pairs_dir):
-            log.append(f"Using pairs directory: {pairs_dir}")
-            pt_files = glob.glob(f"{pairs_dir}/*.pt")
-            log.append(f"Found {len(pt_files)} .pt files in pairs directory")
-            # Load the dataset from the files
-            progress(0.5, desc="Loading pairs from dataset files...")
-            log.append("Loading dataset pairs...")
-            try:
-                # Load pairs from .pt files
-                pairs = []
-                for pt_file in tqdm(pt_files, desc="Loading .pt files"):
-                    pair_data = torch.load(pt_file)
-                    pairs.append(pair_data)
-                log.append(f"Loaded {len(pairs)} conversation pairs")
-                # Create a dataset from the pairs
-                dataset = Dataset.from_dict({
-                    "input_ids": [pair[0].tolist() for pair in pairs],
-                    "labels": [pair[1].tolist() for pair in pairs]
-                })
-                # Split into training and validation sets
-                train_test_split = dataset.train_test_split(test_size=0.05)
-                train_dataset = train_test_split["train"]
-                log.append(f"Created dataset with {len(train_dataset)} training examples")
-            except Exception as e:
-                log.append(f"Error loading pair data: {e}")
-                # Try an alternative approach - look for JSON or other formats
-                log.append("Attempting alternative dataset loading approaches...")
-                # Search for JSON files
-                json_files = glob.glob(f"{local_dataset_path}/**/*.json", recursive=True)
-                if json_files:
-                    log.append(f"Found {len(json_files)} JSON files. Trying to load from these...")
-                    # Load from JSON
-                    combined_data = []
-                    for json_file in json_files[:5]:  # Start with a few files
-                        try:
-                            with open(json_file, 'r') as f:
-                                file_data = json.load(f)
-                                log.append(f"Successfully loaded {json_file}")
-                                # Print sample of the data structure
-                                log.append(f"Sample data structure: {str(file_data)[:500]}...")
-                                combined_data.append(file_data)
-                        except Exception as je:
-                            log.append(f"Error loading {json_file}: {je}")
-                    # If we loaded any data, try to create a dataset from it
-                    if combined_data:
-                        log.append("Attempting to create dataset from JSON data...")
-                        # This will need adapting based on the actual JSON structure
                 else:
-                    log.append("No JSON files found. Looking for other formats...")
-                    # Add code for other formats if needed
-                log.append("Failed to load dataset after multiple attempts.")
-                return "\n".join(log)
-        else:
-            log.append("Could not locate pairs directory or any directory with .pt files.")
             return "\n".join(log)
     except Exception as e:

 import sys
 import json
 import shutil
+import traceback
 # --- Configuration ---
 YOUR_HF_USERNAME = "Twelve2five"
         downloaded_files = glob.glob(f"{local_dataset_path}/**/*.pt", recursive=True)
         log.append(f"Found {len(downloaded_files)} .pt files in the dataset directory")
+        # Look for the pairs directory (we know this exists from the log)
         pairs_dir = os.path.join(local_dataset_path, "final_rvq_pairs")
+        log.append(f"Using pairs directory: {pairs_dir}")
+        pt_files = glob.glob(f"{pairs_dir}/*.pt")
+        log.append(f"Found {len(pt_files)} .pt files in pairs directory")
+        # Load the dataset from the files
+        progress(0.5, desc="Loading pairs from dataset files...")
+        log.append("Loading dataset pairs...")
+        try:
+            # Load a single file first to understand its structure
+            sample_file = pt_files[0]
+            sample_data = torch.load(sample_file)
+            log.append(f"Sample data type: {type(sample_data)}")
+            if isinstance(sample_data, dict):
+                log.append(f"Sample data is a dictionary with keys: {list(sample_data.keys())}")
+                # Print a few sample values to understand the structure
+                for key in list(sample_data.keys())[:3]:
+                    log.append(f"Key '{key}' has value of type {type(sample_data[key])}")
+                    if isinstance(sample_data[key], torch.Tensor):
+                        log.append(f"  Shape: {sample_data[key].shape}, Dtype: {sample_data[key].dtype}")
+            # Load all files with the appropriate structure
+            input_ids_list = []
+            labels_list = []
+            for pt_file in tqdm(pt_files, desc="Loading .pt files"):
+                data = torch.load(pt_file)
+                # Handling dictionary structure
+                if isinstance(data, dict):
+                    # Assume dictionary contains input_ids and labels keys
+                    if 'input_ids' in data and 'labels' in data:
+                        input_ids_list.append(data['input_ids'])
+                        labels_list.append(data['labels'])
+                    # Or maybe it has other keys that we need to convert
+                    elif 'prompt' in data and 'response' in data:
+                        input_ids_list.append(data['prompt'])
+                        labels_list.append(data['response'])
+                    # Or maybe it has source and target keys
+                    elif 'source' in data and 'target' in data:
+                        input_ids_list.append(data['source'])
+                        labels_list.append(data['target'])
+                    # If none of these patterns match, try to figure out the structure
+                    else:
+                        log.append(f"Unknown dictionary structure in {pt_file} with keys: {list(data.keys())}")
+                        # Try the first two keys as input/output
+                        keys = list(data.keys())
+                        if len(keys) >= 2:
+                            input_ids_list.append(data[keys[0]])
+                            labels_list.append(data[keys[1]])
+                # Handling tuple/list structure - the original expected format
+                elif isinstance(data, (tuple, list)) and len(data) >= 2:
+                    input_ids_list.append(data[0])
+                    labels_list.append(data[1])
                 else:
+                    log.append(f"Unsupported data format in {pt_file}: {type(data)}")
+            log.append(f"Processed {len(input_ids_list)} input/label pairs")
+            # Process tensors to ensure they're the right format
+            processed_inputs = []
+            processed_labels = []
+            for i, (inputs, labels) in enumerate(zip(input_ids_list, labels_list)):
+                # Convert to tensor if not already
+                if not isinstance(inputs, torch.Tensor):
+                    inputs = torch.tensor(inputs)
+                if not isinstance(labels, torch.Tensor):
+                    labels = torch.tensor(labels)
+                # Ensure they're integer tensors
+                inputs = inputs.long()
+                labels = labels.long()
+                # Append to lists, converting to standard Python lists for the Dataset
+                processed_inputs.append(inputs.tolist())
+                processed_labels.append(labels.tolist())
+                # Log some diagnostics for the first few pairs
+                if i < 3:
+                    log.append(f"Pair {i}: Input shape: {inputs.shape}, Label shape: {labels.shape}")
+            # Create the dataset
+            log.append("Creating dataset from processed pairs...")
+            dataset = Dataset.from_dict({
+                "input_ids": processed_inputs,
+                "labels": processed_labels
+            })
+            # Split into training and validation
+            train_test_split = dataset.train_test_split(test_size=0.05)
+            train_dataset = train_test_split["train"]
+            val_dataset = train_test_split["test"]
+            log.append(f"Created dataset with {len(train_dataset)} training examples and {len(val_dataset)} validation examples")
+        except Exception as e:
+            error_msg = f"Error processing dataset: {str(e)}\n{traceback.format_exc()}"
+            log.append(error_msg)
             return "\n".join(log)
     except Exception as e: