Upload folder using huggingface_hub
Files changed:
- run_distributed.bat +56 -0
- run_distributed.sh +28 -0
- run_transformers_training.py +101 -63
run_distributed.bat
ADDED
@@ -0,0 +1,56 @@
+@echo off
+REM ======================================================================
+REM Distributed training launch script for Phi-4 training with torchrun
+REM This script launches multi-GPU training on Windows systems
+REM ======================================================================
+
+REM Set the number of GPUs to use (defaults to 4 if not given)
+set NUM_GPUS=%1
+if "%NUM_GPUS%"=="" set NUM_GPUS=4
+
+echo.
+echo ===== Phi-4 Distributed Training =====
+echo.
+echo Preparing to launch training with %NUM_GPUS% GPUs...
+
+REM Check if Python is available
+where python >nul 2>&1
+if %ERRORLEVEL% NEQ 0 (
+    echo ERROR: Python not found in PATH. Please make sure Python is installed and in your PATH.
+    exit /b 1
+)
+
+REM Check if PyTorch is installed by attempting to import it
+python -c "import torch" >nul 2>&1
+if %ERRORLEVEL% NEQ 0 (
+    echo ERROR: PyTorch not properly installed. Please install with:
+    echo pip install torch^>=2.0.0
+    exit /b 1
+)
+
+REM Check if torch.distributed is available
+python -c "import torch.distributed" >nul 2>&1
+if %ERRORLEVEL% NEQ 0 (
+    echo ERROR: torch.distributed module not available. Please check your PyTorch installation.
+    exit /b 1
+)
+
+echo Environment checks passed. Starting distributed training...
+echo.
+
+REM Launch the distributed training
+python -m torch.distributed.run --nproc_per_node=%NUM_GPUS% --master_port=29500 run_transformers_training.py --config transformers_config.json
+
+REM Check exit status
+if %ERRORLEVEL% EQU 0 (
+    echo.
+    echo ===== SUCCESS =====
+    echo Distributed training completed successfully!
+) else (
+    echo.
+    echo ===== ERROR =====
+    echo Distributed training failed with exit code %ERRORLEVEL%
+)
+
+echo.
+echo Training logs are available in the ./results directory.
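Both launchers pin `--master_port=29500`; if that port is already in use (for example, a second training job on the same machine), the rendezvous fails. A minimal sketch, using only the Python standard library, of how a free port could be chosen and passed to the launcher instead; the helper name `find_free_port` is hypothetical and not part of this commit:

```python
import socket

def find_free_port() -> int:
    """Ask the OS for an unused TCP port to use as torchrun's master port."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))  # port 0 lets the OS pick any free port
        return s.getsockname()[1]

if __name__ == "__main__":
    # e.g. pass the result to torchrun via --master_port
    print(find_free_port())
```

Note the small race: the port could be claimed between this check and the actual launch, so a fixed port is fine for a single-job machine.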
run_distributed.sh
ADDED
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Distributed training launch script for Phi-4 training
+# This script uses torchrun to launch multi-GPU training
+
+# Set the number of GPUs to use (defaults to 4 if not given)
+NUM_GPUS=${1:-4}
+
+# Check if torchrun is available
+if ! command -v torchrun &> /dev/null; then
+    echo "torchrun command not found. Make sure PyTorch is installed properly."
+    echo "Try: pip install torch>=2.0.0"
+    exit 1
+fi
+
+echo "Launching distributed training with $NUM_GPUS GPUs..."
+
+# Launch the distributed training
+torchrun --nproc_per_node=$NUM_GPUS \
+    --master_port=29500 \
+    run_transformers_training.py \
+    --config transformers_config.json
+
+EXIT_CODE=$?  # check exit status (capture it before another command resets $?)
+if [ $EXIT_CODE -eq 0 ]; then
+    echo "Distributed training completed successfully!"
+else
+    echo "Distributed training failed with exit code $EXIT_CODE"
+fi
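Both scripts fall back to a hard-coded 4 GPUs rather than detecting how many are actually available. A sketch of a launcher that detects the count and builds the same torchrun command, assuming PyTorch is installed; the file name `launch_distributed.py` is hypothetical:

```python
import subprocess
import sys
from typing import Optional

import torch

def launch(num_gpus: Optional[int] = None) -> int:
    """Build and run the same torchrun command as the shell scripts."""
    if num_gpus is None:
        num_gpus = torch.cuda.device_count()  # genuinely "all available"
    if num_gpus < 1:
        print("No CUDA GPUs detected.", file=sys.stderr)
        return 1
    cmd = [
        sys.executable, "-m", "torch.distributed.run",
        f"--nproc_per_node={num_gpus}",
        "--master_port=29500",
        "run_transformers_training.py",
        "--config", "transformers_config.json",
    ]
    return subprocess.call(cmd)

if __name__ == "__main__":
    sys.exit(launch(int(sys.argv[1]) if len(sys.argv) > 1 else None))
```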
run_transformers_training.py
CHANGED
@@ -432,7 +432,7 @@ def load_dataset_with_mapping(dataset_config):
 
     except Exception as e:
         logger.error(f"Error loading dataset: {str(e)}")
+        return 1
 
 def format_phi_chat(messages, dataset_config):
     """Format messages according to phi-4's chat template and dataset config."""
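With this change, `load_dataset_with_mapping` returns the integer 1 on failure instead of a dataset, so call sites have to distinguish the sentinel from a real dataset. A hypothetical call-site pattern, not taken from this commit:

```python
import sys

dataset = load_dataset_with_mapping(dataset_config)
if isinstance(dataset, int):
    # the loader signalled failure with its integer sentinel
    sys.exit(dataset)
```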
@@ -502,31 +502,50 @@ class SimpleDataCollator:
 
         for example in features:
             try:
-                # Get ID
-                paper_id = example.get("article_id", example.get("id", ""))
+                # Get ID for logging
+                paper_id = example.get("article_id", example.get("id", "unknown"))
 
-                #
-                raw_conversations = example.get("conversations"
+                # Safely get conversations with explicit None check
+                raw_conversations = example.get("conversations")
+                if raw_conversations is None:
+                    logger.warning(f"Conversations is None for example {paper_id}")
+                    self.stats["skipped"] += 1
+                    continue
+
+                # Ensure conversations is a list
+                if not isinstance(raw_conversations, list):
+                    logger.warning(f"Conversations is not a list for example {paper_id} (type: {type(raw_conversations)})")
+                    self.stats["skipped"] += 1
+                    continue
+
+                # Check for empty conversations list
                 if not raw_conversations:
-                    logger.warning(f"Empty conversations for example {paper_id}")
+                    logger.warning(f"Empty conversations list for example {paper_id}")
                     self.stats["skipped"] += 1
                     continue
 
                 # Extract only the 'content' field from each conversation item
-                # This simplifies the structure and avoids potential NoneType errors
                 try:
                     # Convert conversations to the simple format with only content
                     simplified_conversations = []
                     for item in raw_conversations:
+                        # Skip None items
+                        if item is None:
+                            logger.warning(f"Skipping None conversation item in example {paper_id}")
+                            continue
+
+                        if isinstance(item, dict):
+                            # Get content with explicit None check
+                            content = item.get("content")
+                            if content is not None:
+                                simplified_conversations.append({"role": "user", "content": content})
+                            else:
+                                logger.warning(f"Skipping conversation item with None content in example {paper_id}")
                         elif isinstance(item, str):
                             # If it's just a string, treat it as content
                            simplified_conversations.append({"role": "user", "content": item})
                         else:
-                            logger.warning(f"Skipping invalid conversation item: {item}")
+                            logger.warning(f"Skipping invalid conversation item type: {type(item)} in example {paper_id}")
 
                 # Skip if no valid conversations after filtering
                 if not simplified_conversations:
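The net effect of the new checks is a small normalization policy: drop None items, keep dict items only when their content is not None, wrap bare strings, and skip everything else. A self-contained sketch of that policy, useful for unit-testing it outside the collator; the function name `simplify_conversations` is hypothetical:

```python
from typing import Any, Dict, List

def simplify_conversations(raw: Any) -> List[Dict[str, str]]:
    """Mirror the collator's filtering rules shown in the hunk above."""
    if not isinstance(raw, list):
        return []  # None or any non-list input yields no conversations
    simplified = []
    for item in raw:
        if item is None:
            continue  # skip None items
        if isinstance(item, dict):
            content = item.get("content")
            if content is not None:
                simplified.append({"role": "user", "content": content})
        elif isinstance(item, str):
            simplified.append({"role": "user", "content": item})
        # any other type is skipped
    return simplified

# Inputs the collator now tolerates instead of crashing on:
assert simplify_conversations(None) == []
assert simplify_conversations("not a list") == []
assert simplify_conversations([None, {"content": None}, "hi"]) == [
    {"role": "user", "content": "hi"}
]
```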
@@ -536,62 +555,66 @@ class SimpleDataCollator:
 
                     # Log the simplified content for debugging
                     if len(simplified_conversations) > 0:
-                        first_content = simplified_conversations[0]
+                        first_content = simplified_conversations[0].get("content", "")
+                        if first_content:
+                            logger.debug(f"First content: {first_content[:50]}...")
 
                     # Let tokenizer handle the simplified conversations
+                    try:
+                        inputs = self.tokenizer.apply_chat_template(
+                            simplified_conversations,
+                            return_tensors=None,
+                            add_generation_prompt=False
+                        )
+                    except Exception as chat_error:
+                        # Fallback if apply_chat_template fails
+                        logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)}")
+
+                        # Create a basic representation of just the content
+                        conversation_text = ""
+                        for msg in simplified_conversations:
+                            if isinstance(msg, dict) and msg.get("content"):
+                                conversation_text += msg["content"] + "\n\n"
+
+                        if not conversation_text:
+                            logger.warning(f"No valid content to tokenize in example {paper_id}")
+                            self.stats["skipped"] += 1
+                            continue
+
+                        # Basic tokenization (keep the token id list, not the BatchEncoding)
+                        inputs = self.tokenizer(
+                            conversation_text,
+                            add_special_tokens=True,
+                            return_tensors=None
+                        )["input_ids"]
 
-                    inputs = inputs[:self.max_seq_length]
+                    # Apply length cap if needed
+                    if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
+                        logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
+                        inputs = inputs[:self.max_seq_length]
 
+                    # Create attention mask (1 for all tokens)
+                    attention_mask = [1] * len(inputs)
 
-                    # For causal language modeling, labels are the same as inputs
-                    labels = inputs.copy()
-
-                    batch["input_ids"].append(inputs)
-                    batch["attention_mask"].append(attention_mask)
-                    batch["labels"].append(labels)
+                    if len(inputs) > 0:
+                        # For causal language modeling, labels are the same as inputs
+                        labels = inputs.copy()
+
+                        batch["input_ids"].append(inputs)
+                        batch["attention_mask"].append(attention_mask)
+                        batch["labels"].append(labels)
+
+                        self.stats["processed"] += 1
+                        self.stats["total_tokens"] += len(inputs)
+                    else:
+                        logger.warning(f"Empty inputs after tokenization for example {paper_id}")
+                        self.stats["skipped"] += 1
+
+                except Exception as e:
+                    logger.warning(f"Error processing conversations in example {paper_id}: {str(e)}")
                     self.stats["skipped"] += 1
+                    continue
+
             except Exception as e:
                 logger.warning(f"Error processing example: {str(e)[:100]}...")
                 logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
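The collator appends variable-length token id lists, so the batch lists stay ragged until a padding step turns them into rectangular tensors. A minimal sketch of what that downstream step might look like; pad id 0 and label pad -100 are common Transformers conventions assumed here, not taken from this commit:

```python
from typing import Dict, List

import torch

def pad_batch(input_ids: List[List[int]],
              pad_token_id: int = 0,
              label_pad_id: int = -100) -> Dict[str, torch.Tensor]:
    """Right-pad ragged token id lists into rectangular tensors."""
    max_len = max(len(ids) for ids in input_ids)
    ids_out, mask_out, labels_out = [], [], []
    for ids in input_ids:
        pad = max_len - len(ids)
        ids_out.append(ids + [pad_token_id] * pad)
        mask_out.append([1] * len(ids) + [0] * pad)    # 0 marks padding positions
        labels_out.append(ids + [label_pad_id] * pad)  # -100 is ignored by the loss
    return {
        "input_ids": torch.tensor(ids_out),
        "attention_mask": torch.tensor(mask_out),
        "labels": torch.tensor(labels_out),
    }
```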
@@ -758,7 +781,22 @@ def check_dependencies():
         logger.info("flash-attn found. Flash attention will be used for faster training.")
     else:
         logger.warning("flash-attn not found. Training will work but may be slower.")
-        logger.warning("
+        logger.warning("Attempting to install flash-attn automatically...")
+
+        try:
+            import subprocess
+            subprocess.check_call([sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"])
+            logger.info("Successfully installed flash-attn!")
+
+            # Try to import it now that it's installed
+            try:
+                import flash_attn
+                logger.info("flash-attn imported successfully after installation.")
+            except ImportError:
+                logger.warning("flash-attn installed but import failed - may require restart.")
+        except Exception as e:
+            logger.warning(f"Failed to install flash-attn: {str(e)}")
+            logger.warning("To manually install flash attention, run: pip install flash-attn --no-build-isolation")
 
     # Additional optional packages that improve performance
     if find_spec("bitsandbytes"):
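The auto-install block above hard-codes flash-attn, but the same try-import / pip-install / re-import pattern generalizes. A hedged sketch of that pattern; the helper name `ensure_package` is hypothetical, and note that flash-attn in particular compiles CUDA kernels, so installs can take a long time and may still require a process restart, as the hunk's own log message admits:

```python
import importlib
import subprocess
import sys
from typing import Optional

def ensure_package(module_name: str, pip_name: Optional[str] = None) -> bool:
    """Import a module, attempting a pip install if it is missing."""
    try:
        importlib.import_module(module_name)
        return True
    except ImportError:
        pass
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pip_name or module_name]
        )
        importlib.invalidate_caches()  # let the import system see the new package
        importlib.import_module(module_name)
        return True
    except Exception:
        return False
```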