from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import multiprocessing
import time
import tiktoken # For estimating token count
import logging # Import the logging module
# === Configure Logging ===
# Get a module-level logger
logger = logging.getLogger(__name__)
# Set the logging level (e.g., INFO, DEBUG, WARNING, ERROR, CRITICAL)
logger.setLevel(logging.INFO)
# Create a console handler and set its format
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
# Add the handler to the logger if it's not already added
if not logger.handlers:
    logger.addHandler(handler)
app = FastAPI()
# === Model Config ===
REPO_ID = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
FILENAME = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf" # Q4_K_M is a good balance of size and quality
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
try:
model_path = hf_hub_download(
repo_id=REPO_ID,
filename=FILENAME,
cache_dir=MODEL_DIR,
local_dir=MODEL_DIR,
local_dir_use_symlinks=False
)
logger.info(f"✅ Model downloaded to: {model_path}")
except Exception as e:
logger.error(f"❌ Error downloading model: {e}")
# Exit or handle error appropriately if model download fails
exit(1)
else:
logger.info(f"✅ Model already available at: {MODEL_PATH}")
model_path = MODEL_PATH
# === Optimal thread usage ===
logical_cores = psutil.cpu_count(logical=True)
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = max(1, physical_cores or 1)  # Ensure at least 1 thread, even if core detection returns None
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {logical_cores}")
logger.info(f"Using n_threads: {recommended_threads}")
# === Load the model ===
try:
    llm = Llama(
        model_path=model_path,
        n_ctx=1024,            # Context window size for the model (still needed, but not fully utilized for history)
        n_threads=recommended_threads,
        use_mlock=True,        # Lock model in RAM for faster access
        n_gpu_layers=0,        # CPU only
        chat_format="chatml",  # TinyLlama Chat uses ChatML format
        verbose=False          # Keep llama.cpp's internal verbose logging off
    )
    logger.info("✅ Llama model loaded successfully!")
except Exception as e:
    logger.error(f"❌ Error loading Llama model: {e}")
    exit(1)
# Initialize tiktoken encoder for token counting
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None
def count_tokens_in_text(text):
"""Estimates tokens in a given text using tiktoken or simple char count."""
if encoding:
return len(encoding.encode(text))
else:
# Fallback for when tiktoken isn't available or for simple estimation
return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
@app.get("/")
def root():
logger.info("Root endpoint accessed.")
return {"message": "✅ Data Analysis AI API is live and optimized for speed (no context retention)!"}
@app.get("/get_sys")
def get_sys_specs():
"""Returns system specifications including CPU, RAM, and OS details."""
logger.info("System specs endpoint accessed.")
memory = psutil.virtual_memory()
return {
"CPU": {
"physical_cores": physical_cores,
"logical_cores": logical_cores,
"max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
"cpu_usage_percent": psutil.cpu_percent(interval=1) # CPU usage over 1 second
},
"RAM": {
"total_GB": round(memory.total / (1024 ** 3), 2),
"available_GB": round(memory.available / (1024 ** 3), 2),
"usage_percent": memory.percent
},
"System": {
"platform": platform.platform(),
"architecture": platform.machine(),
"python_version": platform.python_version()
},
"Model_Config": {
"model_name": FILENAME,
"n_ctx": llm.n_ctx(),
"n_threads": llm.n_threads(),
"use_mlock": llm.use_mlock()
}
}
@app.get("/process_list")
def process_list():
"""Returns a list of processes consuming significant CPU."""
logger.info("Process list endpoint accessed.")
time.sleep(1) # Let CPU settle for accurate measurement
processes = []
for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
try:
cpu = proc.cpu_percent()
mem = proc.memory_percent()
# Filter processes using more than 5% CPU or 2% memory
if cpu > 5 or mem > 2:
processes.append({
"pid": proc.pid,
"name": proc.name(),
"cpu_percent": round(cpu, 2),
"memory_percent": round(mem, 2)
})
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
pass
# Sort by CPU usage descending
processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
return {"heavy_processes": processes}
@app.post("/generate")
async def generate(request: Request):
"""
Generates a response from the LLM without retaining chat context.
Expects a JSON body with 'prompt'.
"""
logger.info("➡️ /generate endpoint received a request.") # Log at the very beginning
data = await request.json()
prompt = data.get("prompt", "").strip()
if not prompt:
logger.warning("Prompt cannot be empty in /generate request.")
return {"error": "Prompt cannot be empty"}, 400
    # Define the system prompt - sent with every request
    # NOTE: the strings below are concatenated into a single string (no commas between them)
    system_prompt_content = (
        "You are a helpful AI assistant for data analysis. "
        "Provide concise and actionable suggestions based on the data provided or questions asked. "
        "Focus on data insights and actionable steps for report generation. "
        "Be concise and professional in your responses. "
        "Avoid unnecessary verbosity and focus on key insights. "
        "Ensure your responses are clear and directly address the questions asked. "
        "Always follow the instructions provided in the prompt and respond within instructed word limits."
    )
    # Construct messages for the current request only (no chat history is retained)
    messages_for_llm = [
        {"role": "system", "content": system_prompt_content},
        {"role": "user", "content": prompt}
    ]
    # Calculate tokens in the user's prompt
    prompt_tokens = count_tokens_in_text(prompt)
    logger.info(f"🧾 Prompt received: {prompt}")
    logger.info(f"Tokens in prompt: {prompt_tokens}")
    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=300,    # Keep response length short for maximum speed
            temperature=0.7,   # Adjust temperature for creativity vs. coherence (0.0-1.0)
            stop=["</s>"]      # Stop sequence for TinyLlama Chat
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        logger.info("✅ Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens  # Return tokens in the prompt
        }
    except Exception as e:
        logger.error(f"❌ Error during generation: {e}", exc_info=True)  # Log exception details
        return JSONResponse(status_code=500, content={"error": f"Failed to generate response: {e}. Please try again."})