File size: 5,549 Bytes
b4b3dd2
 
 
 
ab1497e
b4b3dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab1497e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4b3dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab1497e
 
 
 
 
 
 
b4b3dd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab1497e
 
b4b3dd2
 
 
 
 
 
ab1497e
 
 
 
 
b4b3dd2
 
 
 
 
 
ab1497e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import gradio as gr
import os
import json
import torch
import subprocess
from dotenv import load_dotenv
import logging

# Logging setup: INFO-level records are sent to a stream handler and are
# also written to "app.log" (path relative to the working directory).
logging.basicConfig(
    handlers=[logging.StreamHandler(), logging.FileHandler("app.log")],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Load environment variables (python-dotenv) before any os.getenv lookups
load_dotenv()

def load_config(config_path="transformers_config.json"):
    """Read the JSON configuration file and return it as a dictionary.

    Loading is best-effort: any failure (missing or unreadable file,
    malformed JSON, ...) is logged and an empty dict is returned, so callers
    can keep chaining ``.get()`` with their own defaults.

    Args:
        config_path: Path of the JSON file to read.

    Returns:
        The parsed configuration, or ``{}`` when the file cannot be loaded
        or when its top-level JSON value is not an object.
    """
    try:
        # Read as UTF-8 explicitly instead of relying on the locale's
        # default encoding, which differs between platforms.
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except Exception as e:
        # Deliberately broad: a broken config must not prevent the module
        # from importing; the caller falls back to defaults.
        logger.error(f"Error loading config: {str(e)}")
        return {}

    # Valid JSON may still be a list/string/number; callers use dict
    # methods on the result, so anything else is treated as "no config".
    if not isinstance(config, dict):
        logger.error(
            "Error loading config: expected a JSON object at the top level, "
            f"got {type(config).__name__}"
        )
        return {}

    return config

# Settings that come from the environment / filesystem rather than the
# config file. TRAINING_ACTIVE is a snapshot taken once, at import time.
SPACE_NAME = os.environ.get("HF_SPACE_NAME", "phi4training")
TRAINING_ACTIVE = os.path.exists("TRAINING_ACTIVE")

# Parsed configuration file (empty dict if it could not be loaded) and the
# model section derived from it.
config = load_config()
model_config = config.get("model_config", {})
MODEL_NAME = model_config.get(
    "model_name_or_path",
    "unsloth/DeepSeek-R1-Distill-Qwen-14B-bnb-4bit",
)

def start_training():
    """Mark training as active and launch the training script in the background.

    Writes the ``TRAINING_ACTIVE`` marker file into the working directory,
    then spawns ``run_cloud_training.py`` as a child process without waiting
    for it to finish.

    Returns:
        A user-facing status message: a success message when the process
        was spawned, otherwise an error message containing the failure
        reason.
    """
    import sys  # local import: only needed to locate the running interpreter

    marker_path = "TRAINING_ACTIVE"
    marker_preexisted = os.path.exists(marker_path)
    try:
        # Create TRAINING_ACTIVE file
        # NOTE(review): nothing in this function removes the marker after a
        # successful start; presumably the training script does - verify.
        with open(marker_path, "w") as f:
            f.write("Training in progress")

        # Run the training script in the background.
        # - Use the interpreter that runs this app rather than whatever
        #   "python" happens to resolve to on PATH.
        # - Nobody reads the child's output, so send it to DEVNULL: an
        #   unread PIPE would block the child once the pipe buffer fills.
        subprocess.Popen(
            [sys.executable or "python", "run_cloud_training.py"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        return "✅ Training started! Check status below for updates."
    except Exception as e:
        # Do not leave a marker claiming training is running when the
        # launch failed; only remove it if this call created it.
        if not marker_preexisted:
            try:
                os.remove(marker_path)
            except OSError:
                pass
        logger.error(f"Error starting training: {str(e)}")
        return f"❌ Error starting training: {str(e)}"

# Create Gradio interface - training status only, no model outputs
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown(f"# {SPACE_NAME}: Training Status Dashboard")

    def refresh_status():
        """Build the status panel Markdown from the current on-disk state.

        Re-checks for the ``TRAINING_ACTIVE`` marker file on every call, so
        the returned text reflects the state at call time rather than at
        import time.

        Returns:
            The Markdown source for the status panel. The text is indented
            to match this function body, as in the original code, which
            already relied on Gradio rendering indented Markdown values.
        """
        # Re-check if training is active
        training_active = os.path.exists("TRAINING_ACTIVE")
        # Look the section up once instead of once per displayed value.
        training_cfg = config.get("training_config", {})
        status_line = (
            "🟢 Training in progress"
            if training_active
            else "⚪ Training not currently active"
        )
        return f"""
        ## Research Training Phase Active
        
        **Model**: {MODEL_NAME}
        **Dataset**: phi4-cognitive-dataset
        
        This is a multidisciplinary research training phase. The model is not available for interactive use.
        
        ### Training Configuration:
        - **Epochs**: {training_cfg.get("num_train_epochs", 3)}
        - **Batch Size**: {training_cfg.get("per_device_train_batch_size", 2)}
        - **Gradient Accumulation Steps**: {training_cfg.get("gradient_accumulation_steps", 4)}
        - **Learning Rate**: {training_cfg.get("learning_rate", 2e-5)}
        - **Max Sequence Length**: {training_cfg.get("max_seq_length", 2048)}
        
        ### Training Status:
        {status_line}
        
        ⚠️ **NOTE**: This space does not provide model outputs during the research training phase.
        """

    with gr.Row():
        with gr.Column():
            # The initial render uses the same builder as the refresh
            # button, so the two views cannot drift apart.
            status = gr.Markdown(refresh_status())

    with gr.Row():
        # Add buttons for starting training and refreshing status
        start_btn = gr.Button("Start Training", variant="primary")
        refresh_btn = gr.Button("Refresh Status")

    # Output area for training start messages
    training_output = gr.Markdown("")

    # Connect button clicks to functions.
    # After a start attempt the status panel is refreshed as well, so it
    # does not keep showing the pre-click state.
    start_btn.click(start_training, outputs=training_output).then(
        refresh_status, outputs=status
    )
    refresh_btn.click(refresh_status, outputs=status)

    # The component's initial value is computed once at import time;
    # recompute it on every page load so reloads show the current state.
    demo.load(refresh_status, outputs=status)

    gr.Markdown("""
    ### Research Training Information
    This model is being fine-tuned on research-focused datasets and is not available for interactive querying.
    Training logs are available to authorized researchers only.
    
    ### Instructions
    1. Click "Start Training" to begin the fine-tuning process
    2. Use "Refresh Status" to check training progress
    3. Training logs are saved to the output directory
    """)

# Launch the interface only when this file is run as a script; importing
# the module builds `demo` above but does not call launch().
if __name__ == "__main__":
    # Start Gradio with minimal features; record the startup in the log first.
    logger.info("Starting training status dashboard")
    # share=False is passed explicitly. The enable_queue parameter was
    # removed because it is no longer supported in Gradio 5.x.
    demo.launch(share=False)