from fastapi import FastAPI, Request, HTTPException
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken # For estimating token count
import logging # Import the logging module
from pydantic import BaseModel, Field
# === Configure Logging ===
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not logger.handlers:
logger.addHandler(handler)
app = FastAPI(
title="Data Analysis & News AI API",
description="API for efficient news summarization and keyword extraction using local LLMs.",
version="1.0.0"
)
# === Model Config ===
# Recommended Model for 16GB RAM CPU: Mistral-7B-Instruct-v0.2 Q4_K_M
# It offers a great balance of quality, speed, and memory footprint for your hardware.
# You can uncomment other models to test them out.
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
# Alternative: OpenHermes 2.5 Mistral 7B (also excellent instruction following)
# REPO_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"
# FILENAME = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"
# Alternative: Phi-3-mini (if you need extreme efficiency and speed, with good quality for its size)
# REPO_ID = "microsoft/Phi-3-mini-4k-instruct-GGUF"
# FILENAME = "phi-3-mini-4k-instruct-q4.gguf" # Often the standard Q4 for Phi-3
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)
# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
logger.info(f"⬇️ Downloading {FILENAME} from Hugging Face...")
try:
model_path = hf_hub_download(
repo_id=REPO_ID,
filename=FILENAME,
cache_dir=MODEL_DIR,
local_dir=MODEL_DIR,
local_dir_use_symlinks=False
)
logger.info(f"βœ… Model downloaded to: {model_path}")
except Exception as e:
logger.error(f"❌ Error downloading model: {e}")
# Re-raise the exception or exit, as the app cannot function without the model
raise RuntimeError(f"Failed to download model: {e}")
else:
logger.info(f"βœ… Model already available at: {MODEL_PATH}")
model_path = MODEL_PATH
# === Optimal thread usage ===
# For llama.cpp on CPU, using physical cores is generally more efficient than logical cores (hyperthreading).
physical_cores = psutil.cpu_count(logical=False)
recommended_threads = max(1, physical_cores or 1)  # psutil can return None here; ensure at least 1 thread
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {psutil.cpu_count(logical=True)}")
logger.info(f"Using n_threads: {recommended_threads}")
# === Load the model ===
try:
llm = Llama(
model_path=model_path,
n_ctx=4096, # Increased context window for better summarization of news articles
# 4096 is a good balance for 7B models on 16GB RAM.
# Test with 8192 if you often process very long articles.
n_threads=recommended_threads,
n_batch=512, # Max batch size for prompt processing. Larger can be faster for long prompts.
use_mlock=True, # Lock model in RAM for faster access, reducing disk I/O.
n_gpu_layers=0, # CPU only, as specified.
chat_format="chatml", # This works for many instruct models, including Mistral.
verbose=False # Keep llama.cpp's internal verbose logging off
)
logger.info("πŸš€ Llama model loaded successfully!")
except Exception as e:
logger.error(f"❌ Error loading Llama model: {e}")
raise RuntimeError(f"Failed to load Llama model: {e}")
# Initialize tiktoken encoder for token counting
try:
encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
logger.warning("⚠️ Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
encoding = None
def count_tokens_in_text(text):
"""Estimates tokens in a given text using tiktoken or simple char count."""
if encoding:
return len(encoding.encode(text))
else:
return len(text) // 4 # Rough estimate: 1 token ~ 4 characters
# === Pydantic Models for API Request Bodies ===
class NewsArticle(BaseModel):
article: str = Field(..., min_length=50, description="The full news article text to summarize.")
num_sentences: int = Field(3, ge=1, le=10, description="Number of sentences for the summary (1-10).")
max_tokens: int = Field(200, ge=50, le=500, description="Maximum tokens for the generated summary.")
class TextForKeywords(BaseModel):
text: str = Field(..., min_length=20, description="The text from which to extract keywords.")
num_keywords: int = Field(5, ge=1, le=15, description="Number of keywords to extract (1-15).")
max_tokens: int = Field(100, ge=30, le=200, description="Maximum tokens for the keyword output.")
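# Illustrative request bodies for the endpoints below (example values only,
# showing the schemas defined above):
#   POST /summarize_news   -> {"article": "<full article text>", "num_sentences": 3, "max_tokens": 200}
#   POST /extract_keywords -> {"text": "<text to analyse>", "num_keywords": 5, "max_tokens": 100}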
# === API Endpoints ===
@app.get("/")
def root():
logger.info("Root endpoint accessed.")
return {"message": "βœ… Data Analysis AI API is live and optimized for speed and accuracy!"}
@app.get("/get_sys")
def get_sys_specs():
"""Returns system specifications including CPU, RAM, and OS details."""
logger.info("System specs endpoint accessed.")
memory = psutil.virtual_memory()
return {
"CPU": {
"physical_cores": physical_cores,
"logical_cores": psutil.cpu_count(logical=True),
"max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
"cpu_usage_percent": psutil.cpu_percent(interval=1)
},
"RAM": {
"total_GB": round(memory.total / (1024 ** 3), 2),
"available_GB": round(memory.available / (1024 ** 3), 2),
"usage_percent": memory.percent
},
"System": {
"platform": platform.platform(),
"architecture": platform.machine(),
"python_version": platform.python_version()
},
"Model_Config": {
"model_name": FILENAME,
"n_ctx": llm.n_ctx(),
"n_threads": llm.n_threads(),
"n_batch": llm.n_batch(),
"use_mlock": llm.use_mlock(),
"chat_format": llm.chat_format,
"n_gpu_layers": llm.n_gpu_layers()
}
}
@app.get("/process_list")
def process_list():
"""Returns a list of processes consuming significant CPU."""
logger.info("Process list endpoint accessed.")
time.sleep(1)
processes = []
for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent']):
try:
cpu = proc.cpu_percent()
mem = proc.memory_percent()
if cpu > 5 or mem > 2:
processes.append({
"pid": proc.pid,
"name": proc.name(),
"cpu_percent": round(cpu, 2),
"memory_percent": round(mem, 2)
})
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
pass
processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
return {"heavy_processes": processes}
@app.post("/summarize_news")
async def summarize_news(request_body: NewsArticle):
"""
Summarizes a given news article.
"""
logger.info("➑️ /summarize_news endpoint received a request.")
# Define the system prompt - consistent for all LLM interactions
system_prompt_content = (
"You are a highly efficient, objective, and precise Data and News analysis API. "
"Your sole function is to process the provided text (data or news) and instructions, "
"then output ONLY the requested analysis in the exact specified format. "
"**Crucially, do NOT include any conversational text, greetings, introductions "
"(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
"or any remarks about being an AI. Respond directly with the content.** "
"Adhere strictly to all formatting requirements given in the user's prompt "
"(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
"Focus exclusively on data insights, statistics, trends, influencing factors, "
"and actionable recommendations if requested. Be concise, professional, and factual. "
"If a request cannot be fulfilled due to data limitations or model capabilities, "
"respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
"No other text should be included."
)
prompt = (
f"Summarize the following news article in {request_body.num_sentences} "
"concise sentences, focusing on the main event, key actors, and outcome. "
"Do not include any introductory phrases or conversational elements. "
f"Article: {request_body.article}"
)
messages_for_llm = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": prompt}
]
prompt_tokens = count_tokens_in_text(prompt)
logger.info(f"🧾 Prompt received (first 100 chars): {prompt[:100]}...")
logger.info(f"Tokens in prompt: {prompt_tokens}")
try:
response = llm.create_chat_completion(
messages=messages_for_llm,
max_tokens=request_body.max_tokens,
temperature=0.7,
stop=["</s>", "<|im_end|>", "\n\n---"],
top_p=0.9,
top_k=40,
repeat_penalty=1.1
)
ai_response_content = response["choices"][0]["message"]["content"].strip()
response_token_count = count_tokens_in_text(ai_response_content)
logger.info("βœ… Response generated successfully.")
return {
"response": ai_response_content,
"prompt_tokens": prompt_tokens,
"response_token_count": response_token_count
}
except Exception as e:
logger.error(f"❌ Error during generation: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
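# Example call for the endpoint above (hypothetical host/port, assuming the app is
# served locally with uvicorn; the article text is a placeholder):
#   curl -X POST http://localhost:8000/summarize_news \
#        -H "Content-Type: application/json" \
#        -d '{"article": "<full article text>", "num_sentences": 3, "max_tokens": 200}'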
@app.post("/extract_keywords")
async def extract_keywords(request_body: TextForKeywords):
"""
Extracts keywords from a given text.
"""
logger.info("➑️ /extract_keywords endpoint received a request.")
# Define the system prompt - consistent for all LLM interactions
system_prompt_content = (
"You are a highly efficient, objective, and precise Data and News analysis API. "
"Your sole function is to process the provided text (data or news) and instructions, "
"then output ONLY the requested analysis in the exact specified format. "
"**Crucially, do NOT include any conversational text, greetings, introductions "
"(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
"or any remarks about being an AI. Respond directly with the content.** "
"Adhere strictly to all formatting requirements given in the user's prompt "
"(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
"Focus exclusively on data insights, statistics, trends, influencing factors, "
"and actionable recommendations if requested. Be concise, professional, and factual. "
"If a request cannot be fulfilled due to data limitations or model capabilities, "
"respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
"No other text should be included."
)
# Modified prompt for clearer keyword extraction instruction
prompt = (
f"Extract exactly {request_body.num_keywords} most important keywords from the following text. "
"Your output should be ONLY the comma-separated list of keywords, nothing else. "
"For example, if the keywords are 'apple', 'banana', 'cherry', your output should be: 'apple, banana, cherry'. "
f"Text: {request_body.text}"
)
messages_for_llm = [
{"role": "system", "content": system_prompt_content},
{"role": "user", "content": prompt}
]
prompt_tokens = count_tokens_in_text(prompt)
logger.info(f"🧾 Prompt received (first 100 chars): {prompt[:100]}...")
logger.info(f"Tokens in prompt: {prompt_tokens}")
try:
response = llm.create_chat_completion(
messages=messages_for_llm,
max_tokens=request_body.max_tokens,
temperature=0.7,
stop=["</s>", "<|im_end|>", "\n\n---"],
top_p=0.9,
top_k=40,
repeat_penalty=1.1
)
ai_response_content = response["choices"][0]["message"]["content"].strip()
response_token_count = count_tokens_in_text(ai_response_content)
logger.info("βœ… Response generated successfully.")
return {
"response": ai_response_content,
"prompt_tokens": prompt_tokens,
"response_token_count": response_token_count
}
except Exception as e:
logger.error(f"❌ Error during generation: {e}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
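# Example call for the endpoint above (hypothetical host/port; the text is a placeholder):
#   curl -X POST http://localhost:8000/extract_keywords \
#        -H "Content-Type: application/json" \
#        -d '{"text": "<text to analyse>", "num_keywords": 5, "max_tokens": 100}'

# Minimal local entry point, assuming uvicorn is installed alongside FastAPI.
# Port 8000 is an arbitrary choice; hosting platforms usually inject their own port.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)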