from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken  # For estimating token count
import logging

# === Configure Logging ===
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create a console handler and set its format
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

# Add the handler to the logger if it's not already added
if not logger.handlers:
    logger.addHandler(handler)

app = FastAPI()

# === Model Config ===
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False
        )
        logger.info(f"✅ Model downloaded to: {model_path}")
    except Exception as e:
        logger.error(f"❌ Error downloading model: {e}")
        # Exit if the model cannot be downloaded; the API is unusable without it
        exit(1)
else:
    logger.info(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Optimal thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = max(1, physical_cores or 1)  # Ensure at least 1 thread (cpu_count may return None)
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
logger.info(f"Using n_threads: {recommended_threads}")

# === Load the model ===
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,              # Context window size (still needed, though history is not retained)
        n_threads=recommended_threads,
        use_mlock=True,          # Lock model in RAM for faster access
        n_gpu_layers=0,          # CPU only
        chat_format="chatml",    # ChatML prompt format (matches the <|im_end|> stop token below)
        verbose=False            # Keep llama.cpp's internal verbose logging off
    )
    logger.info("✅ Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    exit(1)

# Initialize tiktoken encoder for token counting
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None


def count_tokens_in_text(text):
    """Estimates tokens in a given text using tiktoken or a simple character count."""
    if encoding:
        return len(encoding.encode(text))
    # Fallback when tiktoken isn't available: rough estimate of 1 token ~ 4 characters
    return len(text) // 4


@app.get("/")
def root():
    logger.info("Root endpoint accessed.")
    return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}


@app.get("/get_sys")
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1)  # CPU usage over 1 second
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            "model_name": FILENAME,
            "n_ctx": llm.n_ctx(),
            # Report the values passed to the constructor; the Llama object does not
            # expose n_threads()/use_mlock() accessor methods.
            "n_threads": recommended_threads,
            "use_mlock": True
        }
    }


@app.get("/process_list")
def process_list():
    """Returns a list of processes consuming significant CPU or memory."""
    logger.info("Process list endpoint accessed.")
    # Prime per-process CPU counters: the first cpu_percent() call always returns 0.0,
    # so call it once, wait a second, then read the real usage.
    procs = list(psutil.process_iter(['pid', 'name']))
    for proc in procs:
        try:
            proc.cpu_percent()
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    time.sleep(1)

    processes = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent()
            mem = proc.memory_percent()
            # Filter processes using more than 5% CPU or 2% memory
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass

    # Sort by CPU usage, descending
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}


@app.post("/generate")
async def generate(request: Request):
    """
    Generates a response from the LLM without retaining chat context.
    Expects a JSON body with 'prompt'.
    """
    logger.info("➡️ /generate endpoint received a request.")
    data = await request.json()
    user_input = data.get("prompt", "").strip()

    if not user_input:
        logger.warning("Prompt cannot be empty in /generate request.")
        return JSONResponse(status_code=400, content={"error": "Prompt cannot be empty"})

    # Define the system prompt - sent with every request
    system_prompt_content = (
        "You are a highly efficient and objective data analysis API. You are the 'assistant'. "
        "Your sole function is to process the user's data and instructions, then output ONLY the requested analysis in the specified format. "
        "**Crucially, do NOT include any conversational text, greetings, introductions, conclusions, or any remarks about being an AI.** "
        "Respond directly with the content. Adhere strictly to all formatting requirements. "
        "If a request cannot be fulfilled, respond ONLY with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.'"
    )

    # === FIX: Wrap user input in a clear instruction to prevent role confusion ===
    # This frames the user's text as 'data' for the model to analyze.
    user_content_template = f"""Please analyze the following data based on the instructions within it.
Provide only the direct output as requested. Do not add any extra conversational text.

--- DATA ---
{user_input}
"""

    # Construct messages for the current request only (no chat history is kept)
    messages_for_llm = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": user_content_template}
    ]

    # Calculate tokens in the user's prompt
    prompt_tokens = count_tokens_in_text(user_input)
    logger.info(f"🧾 Original user input: {user_input}")
    logger.info(f"Tokens in prompt: {prompt_tokens}")

    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=800,
            # === FIX: Lower temperature for more factual, less creative output ===
            temperature=0.2,
            # === FIX: Use the CORRECT stop token for the chatml format ===
            stop=["<|im_end|>"]
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)

        logger.info("✅ Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True)
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to generate response: {e}. Please try again."}
        )
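

# --- Optional local entry point (not part of the original listing) ---
# A minimal sketch, assuming uvicorn is installed and this file is saved as main.py,
# so the API can be started with `python main.py` instead of `uvicorn main:app`.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)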