from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken  # For estimating token count
import logging  # Import the logging module

# === Configure Logging ===
# Get a module-level logger for this application
logger = logging.getLogger(__name__)
# Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
logger.setLevel(logging.INFO)
# Create a console handler and set its format
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# Add the handler to the logger if it's not already added
if not logger.handlers:
    logger.addHandler(handler)

app = FastAPI()

# === Model Config ===
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False
        )
        logger.info(f"✅ Model downloaded to: {model_path}")
    except Exception as e:
        logger.error(f"❌ Error downloading model: {e}")
        # Exit or handle error appropriately if model download fails
        exit(1)
else:
    logger.info(f"✅ Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Optimal thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = max(1, physical_cores or 1)  # psutil may return None here; ensure at least 1 thread

logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
logger.info(f"Using n_threads: {recommended_threads}")

# === Load the model ===
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,  # Context window size for the model (still needed, but not fully utilized for history)
        n_threads=recommended_threads,
        use_mlock=True,  # Lock model in RAM for faster access
        n_gpu_layers=0,  # CPU only
        chat_format="chatml",  # TinyLlama Chat uses ChatML format
        verbose=False # Keep llama.cpp's internal verbose logging off
    )
    logger.info("� Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    exit(1)

# Initialize tiktoken encoder for token counting
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None

def count_tokens_in_text(text):
    """Estimates tokens in a given text using tiktoken or simple char count."""
    if encoding:
        return len(encoding.encode(text))
    else:
        # Fallback for when tiktoken isn't available or for simple estimation
        return len(text) // 4 # Rough estimate: 1 token ~ 4 characters

@app.get("/")
def root():
    logger.info("Root endpoint accessed.")
    return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}

@app.get("/get_sys")
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": logical_cores,
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1) # CPU usage over 1 second
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            "model_name": FILENAME,
            "n_ctx": llm.n_ctx(),
            "n_threads": llm.n_threads(),
            "use_mlock": llm.use_mlock()
        }
    }

@app.get("/process_list")
def process_list():
    """Returns a list of processes consuming significant CPU."""
    logger.info("Process list endpoint accessed.")
    time.sleep(1)  # Let CPU settle for accurate measurement
    processes = []
    for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
        try:
            cpu = proc.cpu_percent()
            mem = proc.memory_percent()
            # Filter processes using more than 5% CPU or 2% memory
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    # Sort by CPU usage descending
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}

@app.post("/generate")
async def generate(request: Request):
    """
    Generates a response from the LLM without retaining chat context.
    Expects a JSON body with 'prompt'.
    """
    logger.info("➡️ /generate endpoint received a request.")
    data = await request.json()
    user_input = data.get("prompt", "").strip() # Renamed to user_input for clarity

    if not user_input:
        logger.warning("Prompt cannot be empty in /generate request.")
        # Return a proper 400 response (FastAPI ignores Flask-style tuple returns)
        return JSONResponse(status_code=400, content={"error": "Prompt cannot be empty"})

    # Define the system prompt - sent with every request
    system_prompt_content = (
        "You are a highly efficient and objective data analysis API. You are the 'assistant'. "
        "Your sole function is to process the user's data and instructions, then output ONLY the requested analysis in the specified format. "
        "**Crucially, do NOT include any conversational text, greetings, introductions, conclusions, or any remarks about being an AI.** "
        "Respond directly with the content. Adhere strictly to all formatting requirements. "
        "If a request cannot be fulfilled, respond ONLY with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.'"
    )
    
    # === FIX: Wrap user input in a clear instruction to prevent role confusion ===
    # This frames the user's text as 'data' for the model to analyze.
    user_content_template = f"""Please analyze the following data based on the instructions within it.
Provide only the direct output as requested. Do not add any extra conversational text.

--- DATA ---
{user_input}
"""

    # Construct messages for the current request only
    messages_for_llm = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": user_content_template} # Use the new template
    ]

    # Calculate tokens in the user's prompt
    prompt_tokens = count_tokens_in_text(user_input)

    logger.info(f"🧾 Original user input: {user_input}")
    logger.info(f"Tokens in prompt: {prompt_tokens}")

    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=800,
            # === FIX: Lower temperature for more factual, less creative output ===
            temperature=0.2, 
            # === FIX: Use the CORRECT stop token for the chatml format ===
            stop=["<|im_end|>"] 
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()

        response_token_count = count_tokens_in_text(ai_response_content)
        
        logger.info("✅ Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True)
        return JSONResponse(status_code=500, content={"error": f"Failed to generate response: {e}. Please try again."})