ndc8 committed
Commit 3960f0f · 1 Parent(s): 78b611a
README.md CHANGED
@@ -431,4 +431,5 @@ To run with a real model locally:
 ```
 
 ## License
+
 Apache 2.0
 
gemma_gguf_backend.py CHANGED
@@ -14,8 +14,9 @@ import sys
 import subprocess
 import threading
 from pathlib import Path
+import signal  # Use signal.SIGTERM for process termination
 
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException, Query
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field, field_validator
@@ -28,6 +29,8 @@ except ImportError:
 llama_cpp_available = False
 
 import uvicorn
+import sqlite3
+import json  # For persisting job metadata
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -72,6 +75,7 @@ class HealthResponse(BaseModel):
     version: str
     backend: str
 
+from pathlib import Path
 # Global variables for model management
 current_model = os.environ.get("AI_MODEL", "unsloth/gemma-3n-E4B-it-GGUF")
 llm = None
@@ -277,19 +281,57 @@ async def create_chat_completion(
 # Training Job Management (Unsloth)
 # -----------------------------
 
-# Jobs are tracked in-memory; logs and artifacts are written to disk
+# Persistent job store: in-memory dict backed by SQLite
 TRAIN_JOBS: Dict[str, Dict[str, Any]] = {}
+# Initialize SQLite DB for job persistence
+DB_PATH = Path(os.environ.get("JOB_DB_PATH", "./jobs.db"))
+conn = sqlite3.connect(str(DB_PATH), check_same_thread=False)
+cursor = conn.cursor()
+cursor.execute(
+    """
+    CREATE TABLE IF NOT EXISTS jobs (
+        job_id TEXT PRIMARY KEY,
+        data TEXT NOT NULL
+    )
+    """
+)
+conn.commit()
+
+def load_jobs() -> None:
+    cursor.execute("SELECT job_id, data FROM jobs")
+    for job_id, data in cursor.fetchall():
+        TRAIN_JOBS[job_id] = json.loads(data)
+
+def save_job(job_id: str) -> None:
+    cursor.execute(
+        "INSERT OR REPLACE INTO jobs (job_id, data) VALUES (?, ?)",
+        (job_id, json.dumps({k: v for k, v in TRAIN_JOBS[job_id].items() if k != "log_file"})),  # skip the open log-file handle; it is not JSON-serializable
+    )
+    conn.commit()
+
+# Load existing jobs on startup
+load_jobs()
+
 TRAIN_DIR = Path(os.environ.get("TRAIN_DIR", "./training_runs")).resolve()
 TRAIN_DIR.mkdir(parents=True, exist_ok=True)
+# Maximum concurrent training jobs
+MAX_CONCURRENT_JOBS = int(os.environ.get("MAX_CONCURRENT_JOBS", "5"))
 
 def _start_training_subprocess(job_id: str, args: Dict[str, Any]) -> subprocess.Popen[Any]:
     """Spawn a subprocess to run the Unsloth fine-tuning script."""
     logs_dir = TRAIN_DIR / job_id
     logs_dir.mkdir(parents=True, exist_ok=True)
     log_file = open(logs_dir / "train.log", "w", encoding="utf-8")
+    # Store log file handle to close later
+    TRAIN_JOBS.setdefault(job_id, {})["log_file"] = log_file
+    save_job(job_id)
 
     # Build absolute script path to avoid module/package resolution issues
     script_path = (Path(__file__).parent / "training" / "train_gemma_unsloth.py").resolve()
+    # Verify training script exists
+    if not script_path.exists():
+        logger.error(f"Training script not found at {script_path}")
+        raise HTTPException(status_code=500, detail=f"Training script not found at {script_path}")
     python_exec = sys.executable
 
     cmd = [
@@ -338,6 +380,15 @@ def _watch_process(job_id: str, proc: subprocess.Popen[Any]):
     TRAIN_JOBS[job_id]["status"] = status
     TRAIN_JOBS[job_id]["return_code"] = return_code
     TRAIN_JOBS[job_id]["ended_at"] = int(time.time())
+    # Persist updated job status
+    save_job(job_id)
+    # Close the log file handle to prevent resource leaks
+    log_file = TRAIN_JOBS[job_id].get("log_file")
+    if log_file:
+        try:
+            log_file.close()
+        except Exception as close_err:
+            logger.warning(f"Failed to close log file for job {job_id}: {close_err}")
     logger.info(f"🏁 Training job {job_id} finished with status={status}, code={return_code}")
 
 class StartTrainingRequest(BaseModel):
@@ -376,6 +427,13 @@ class TrainStatusResponse(BaseModel):
 @app.post("/train/start", response_model=StartTrainingResponse)
 def start_training(req: StartTrainingRequest):
     """Start a background Unsloth fine-tuning job. Returns a job_id to poll."""
+    # Enforce maximum concurrent training jobs
+    running_jobs = sum(1 for job in TRAIN_JOBS.values() if job.get("status") == "running")
+    if running_jobs >= MAX_CONCURRENT_JOBS:
+        raise HTTPException(
+            status_code=429,
+            detail=f"Maximum concurrent training jobs reached ({MAX_CONCURRENT_JOBS}). Try again later."
+        )
     job_id = uuid.uuid4().hex[:12]
     now = int(time.time())
     output_dir = str((TRAIN_DIR / job_id).resolve())
@@ -386,18 +444,21 @@ def start_training(req: StartTrainingRequest):
         "args": req.model_dump(),
         "output_dir": output_dir,
     }
+    save_job(job_id)
 
     try:
         proc = _start_training_subprocess(job_id, req.model_dump())
         TRAIN_JOBS[job_id]["status"] = "running"
         TRAIN_JOBS[job_id]["pid"] = proc.pid
+        save_job(job_id)
         watcher = threading.Thread(target=_watch_process, args=(job_id, proc), daemon=True)
         watcher.start()
         return StartTrainingResponse(job_id=job_id, status="running", output_dir=output_dir)
     except Exception as e:
         logger.exception("Failed to start training job")
         TRAIN_JOBS[job_id]["status"] = "failed_to_start"
+        save_job(job_id)
         raise HTTPException(status_code=500, detail=f"Failed to start training: {e}")
 
 @app.get("/train/status/{job_id}", response_model=TrainStatusResponse)
 def train_status(job_id: str):
@@ -415,7 +476,10 @@ def train_status(job_id: str):
     )
 
 @app.get("/train/logs/{job_id}")
-def train_logs(job_id: str, tail: int = 200):
+def train_logs(
+    job_id: str,
+    tail: int = Query(200, ge=0, le=1000, description="Number of lines to tail, between 0 and 1000"),
+):
     job = TRAIN_JOBS.get(job_id)
     if not job:
         raise HTTPException(status_code=404, detail="Job not found")
@@ -438,11 +502,20 @@ def train_stop(job_id: str):
     if not pid:
         raise HTTPException(status_code=400, detail="Job does not have an active PID")
     try:
-        os.kill(pid, 15)  # SIGTERM
-        job["status"] = "stopping"
-        return {"job_id": job_id, "status": "stopping"}
+        os.kill(pid, signal.SIGTERM)
+    except ProcessLookupError:
+        logger.warning(
+            f"Process {pid} for job {job_id} not found; may have exited already"
+        )
+        job["status"] = "stopping_failed"
+        save_job(job_id)
+        return {"job_id": job_id, "status": job["status"]}
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Failed to stop job: {e}")
+    else:
+        job["status"] = "stopping"
+        save_job(job_id)
+        return {"job_id": job_id, "status": "stopping"}
 
 # Main entry point
 if __name__ == "__main__":
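A quick way to sanity-check the new persistence layer is to read the `jobs` table back out of SQLite. A minimal sketch, assuming the default `./jobs.db` path and the exact schema created in the hunk above:

```python
# Inspect persisted training jobs (illustrative; schema from the commit above).
import json
import sqlite3

con = sqlite3.connect("jobs.db")
for job_id, data in con.execute("SELECT job_id, data FROM jobs"):
    job = json.loads(data)
    print(job_id, job.get("status"), job.get("output_dir"))
con.close()
```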
 
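End to end, the new endpoints form a start/poll/tail workflow. The sketch below is illustrative only: the base URL is an assumption, the request payload fields follow `StartTrainingRequest` (whose fields are outside this diff), and the route paths and the `job_id`/`status`/`output_dir` response fields come from the handlers above.

```python
# Hypothetical client for the training endpoints added in this commit.
import time
import requests

BASE = "http://localhost:8000"  # assumed host/port

payload = {}  # fill in StartTrainingRequest fields (not shown in this diff)
job = requests.post(f"{BASE}/train/start", json=payload).json()
print("started", job["job_id"], "->", job["output_dir"])

# Poll until the watcher thread records a terminal status
while requests.get(f"{BASE}/train/status/{job['job_id']}").json()["status"] in ("running", "stopping"):
    time.sleep(10)

# tail is validated server-side by Query(200, ge=0, le=1000)
print(requests.get(f"{BASE}/train/logs/{job['job_id']}", params={"tail": 100}).text)
```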
space.yaml CHANGED
@@ -2,4 +2,4 @@ sdk: fastapi
 python_version: 3.10
 app_file: gemma_gguf_backend.py
 env:
-  - DEMO_MODE=1
+  - DEMO_MODE=0  # Ensure model loads properly in production
 
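How the backend consumes this flag is not part of the diff; the usual pattern for such an env toggle is a sketch like the following (hypothetical, not the file's actual check):

```python
import os

# Hypothetical read of the DEMO_MODE toggle set in space.yaml.
demo_mode = os.environ.get("DEMO_MODE", "0") == "1"
if demo_mode:
    print("Demo mode: serving canned responses, no GGUF model loaded.")
```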
training/train_gemma_unsloth.py CHANGED
@@ -12,6 +12,9 @@ import json
 import time
 from pathlib import Path
 from typing import Any, Dict
+import logging
+
+logger = logging.getLogger(__name__)
 
 # Lazy imports to keep API light
 
@@ -40,7 +43,12 @@ def _import_training_libs() -> Dict[str, Any]:
             "FastLanguageModel": FastLanguageModel,
             "AutoTokenizer": AutoTokenizer,
         }
-    except Exception:
+    except ImportError as e:
+        logger.warning(
+            "Primary Unsloth import failed, falling back to HF+PEFT: %s",
+            e,
+            exc_info=True,
+        )
         # Fallback: pure HF + PEFT (CPU / MPS friendly)
         from transformers import AutoTokenizer, AutoModelForCausalLM
         from peft import get_peft_model, LoraConfig
@@ -161,10 +169,18 @@ def main():
     tokenizer = AutoTokenizer.from_pretrained(args.model_id, use_fast=True, trust_remote_code=True)
     # Prefer MPS on Apple Silicon if available
     use_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
-    torch_dtype = torch.float16 if (args.use_fp16 or args.use_bf16) and not use_mps else torch.float32
+    if not use_mps:
+        if args.use_fp16:
+            dtype = torch.float16
+        elif args.use_bf16:
+            dtype = torch.bfloat16
+        else:
+            dtype = torch.float32
+    else:
+        dtype = torch.float32
     model = AutoModelForCausalLM.from_pretrained(
         args.model_id,
-        torch_dtype=torch_dtype,
+        torch_dtype=dtype,
         trust_remote_code=True,
     )
     if use_mps:
@@ -190,17 +206,25 @@ def main():
     response_field = args.response_field
 
     if text_field:
-        # Simple SFT: single text field
-        def format_row(ex):
+        # Simple SFT: single text field with validation
+        def format_row(ex: Dict[str, Any]) -> str:
+            if text_field not in ex:
+                raise KeyError(f"Missing required text field '{text_field}' in example: {ex}")
             return ex[text_field]
     elif prompt_field and response_field:
-        # Chat data: prompt + response
-        def format_row(ex):
-            return f"<start_of_turn>user\n{ex[prompt_field]}<end_of_turn>\n<start_of_turn>model\n{ex[response_field]}<end_of_turn>\n"
+        # Chat data: prompt + response with validation
+        def format_row(ex: Dict[str, Any]) -> str:
+            missing = [f for f in (prompt_field, response_field) if f not in ex]
+            if missing:
+                raise KeyError(f"Missing required field(s) {missing} in example: {ex}")
+            return (
+                f"<start_of_turn>user\n{ex[prompt_field]}<end_of_turn>\n"
+                f"<start_of_turn>model\n{ex[response_field]}<end_of_turn>\n"
+            )
     else:
         raise ValueError("Provide either --text-field or both --prompt-field and --response-field")
 
-    def map_fn(ex):
+    def map_fn(ex: Dict[str, Any]) -> Dict[str, str]:
         return {"text": format_row(ex)}
 
     ds = ds.map(map_fn, remove_columns=[c for c in ds.column_names if c != "text"])
@@ -237,13 +261,16 @@ def main():
     adapter_path.mkdir(parents=True, exist_ok=True)
     # Save adapter-only weights if PEFT; Unsloth path is also PEFT-compatible
    try:
+        # Primary model saving logic
         model.save_pretrained(str(adapter_path))
-    except Exception:
-        # Fallback: save full model (large); unlikely on LoRA
+    except Exception as e:
+        logger.error("Error during primary model saving: %s", e, exc_info=True)
         try:
-            model.base_model.save_pretrained(str(adapter_path))  # type: ignore[attr-defined]
-        except Exception:
-            pass
+            # Fallback model saving logic
+            model.base_model.save_pretrained(str(adapter_path))  # type: ignore[attr-defined]
+        except Exception as fallback_e:
+            logger.error("Fallback model saving failed: %s", fallback_e, exc_info=True)
+            pass  # Optionally re-raise or handle accordingly
     tokenizer.save_pretrained(str(adapter_path))
 
     # Write done file
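To see exactly what the chat branch emits, here is a standalone check of the turn markup from the hunk above; the `prompt`/`response` field names are illustrative values for `--prompt-field`/`--response-field`:

```python
# Standalone check of the Gemma turn template used by format_row above.
prompt_field, response_field = "prompt", "response"  # example field names

def format_row(ex):
    return (
        f"<start_of_turn>user\n{ex[prompt_field]}<end_of_turn>\n"
        f"<start_of_turn>model\n{ex[response_field]}<end_of_turn>\n"
    )

print(format_row({"prompt": "What is LoRA?", "response": "A low-rank adapter method."}))
# <start_of_turn>user
# What is LoRA?<end_of_turn>
# <start_of_turn>model
# A low-rank adapter method.<end_of_turn>
```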
 
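Once `save_pretrained` has written the adapter directory, it can be reloaded onto the base model with PEFT. A sketch under stated assumptions: the adapter directory name and run location are illustrative (`adapter_path` is defined outside the hunks shown), while the base model id comes from `meta.json` below.

```python
# Hedged sketch: reload a saved LoRA adapter with PEFT.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

adapter_dir = "training_runs/devlocal/lora_adapter"  # assumed location/name
base = AutoModelForCausalLM.from_pretrained("unsloth/gemma-3n-E4B-it", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
model = PeftModel.from_pretrained(base, adapter_dir)
model.eval()
```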
training_runs/devlocal/meta.json CHANGED
@@ -1,6 +1,6 @@
 {
   "job_id": "devlocal",
   "model_id": "unsloth/gemma-3n-E4B-it",
-  "dataset": "/Users/congnguyen/DevRepo/firstAI/sample_data/train.jsonl",
+  "dataset": "sample_data/train.jsonl",
   "created_at": 1754620844
 }
 