from fastapi import FastAPI, Request, HTTPException
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken  # For estimating token count
import logging
from pydantic import BaseModel, Field

# === Configure Logging ===
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not logger.handlers:
    logger.addHandler(handler)

app = FastAPI(
    title="Data Analysis & News AI API",
    description="API for efficient news summarization and keyword extraction using local LLMs.",
    version="1.0.0"
)

# === Model Config ===
# Recommended model for a 16GB-RAM CPU machine: Mistral-7B-Instruct-v0.2 Q4_K_M.
# It offers a good balance of quality, speed, and memory footprint for this hardware.
# Uncomment one of the alternatives below to test it instead.
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Alternative: OpenHermes 2.5 Mistral 7B (also excellent instruction following)
# REPO_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"
# FILENAME = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"

# Alternative: Phi-3-mini (extreme efficiency and speed, with good quality for its size)
# REPO_ID = "microsoft/Phi-3-mini-4k-instruct-GGUF"
# FILENAME = "phi-3-mini-4k-instruct-q4.gguf"  # Commonly the standard Q4 file for Phi-3

MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    logger.info(f"Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False  # Deprecated (and ignored) in recent huggingface_hub releases, but harmless.
        )
        logger.info(f"Model downloaded to: {model_path}")
    except Exception as e:
        logger.error(f"Error downloading model: {e}")
        # Re-raise, as the app cannot function without the model.
        raise RuntimeError(f"Failed to download model: {e}")
else:
    logger.info(f"Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Optimal thread usage ===
# For llama.cpp on CPU, using physical cores is generally more efficient than logical cores (hyperthreading).
# psutil.cpu_count(logical=False) can return None on some platforms, so fall back to logical cores.
physical_cores = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) or 1
recommended_threads = max(1, physical_cores)  # Ensure at least 1 thread
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {psutil.cpu_count(logical=True)}")
logger.info(f"Using n_threads: {recommended_threads}")

# === Load the model ===
N_CTX = 4096            # Context window. 4096 balances long news articles against RAM use for a 7B model
                        # on 16GB RAM; test 8192 if you often process very long articles.
N_BATCH = 512           # Max batch size for prompt processing. Larger can be faster for long prompts.
N_GPU_LAYERS = 0        # CPU only, as specified.
CHAT_FORMAT = "chatml"  # Works for many instruct models, including Mistral.

try:
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=recommended_threads,
        n_batch=N_BATCH,
        use_mlock=True,             # Lock model in RAM for faster access, reducing disk I/O.
        n_gpu_layers=N_GPU_LAYERS,
        chat_format=CHAT_FORMAT,
        verbose=False               # Keep llama.cpp's internal verbose logging off.
    )
    logger.info("Llama model loaded successfully!")
except Exception as e:
    logger.error(f"Error loading Llama model: {e}")
    raise RuntimeError(f"Failed to load Llama model: {e}")

# Initialize tiktoken encoder for token counting.
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None

def count_tokens_in_text(text):
    """Estimates tokens in a given text using tiktoken or a simple character count."""
    if encoding:
        return len(encoding.encode(text))
    else:
        return len(text) // 4  # Rough estimate: 1 token ~ 4 characters
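
# Note: both paths above are only estimates. cl100k_base is OpenAI's tokenizer, so its counts will not
# exactly match the GGUF model's own tokenizer. For exact counts you could instead use
# len(llm.tokenize(text.encode("utf-8"))), at the cost of touching the model on every request.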

# === Pydantic Models for API Request Bodies ===
class NewsArticle(BaseModel):
    article: str = Field(..., min_length=50, description="The full news article text to summarize.")
    num_sentences: int = Field(3, ge=1, le=10, description="Number of sentences for the summary (1-10).")
    max_tokens: int = Field(200, ge=50, le=500, description="Maximum tokens for the generated summary.")

class TextForKeywords(BaseModel):
    text: str = Field(..., min_length=20, description="The text from which to extract keywords.")
    num_keywords: int = Field(5, ge=1, le=15, description="Number of keywords to extract (1-15).")
    max_tokens: int = Field(100, ge=30, le=200, description="Maximum tokens for the keyword output.")
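
# Illustrative request payloads for the models above (field values are examples, not fixed by the API):
#   NewsArticle     -> {"article": "<full article text>", "num_sentences": 3, "max_tokens": 200}
#   TextForKeywords -> {"text": "<text to analyze>", "num_keywords": 5, "max_tokens": 100}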

# === API Endpoints ===
@app.get("/")
def root():
    logger.info("Root endpoint accessed.")
    return {"message": "Data Analysis AI API is live and optimized for speed and accuracy!"}

@app.get("/system_specs")  # Route path assumed from the function name.
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": psutil.cpu_count(logical=True),
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1)
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            # Report the values the model was configured with; reading them back off the
            # Llama object varies across llama-cpp-python versions.
            "model_name": FILENAME,
            "n_ctx": N_CTX,
            "n_threads": recommended_threads,
            "n_batch": N_BATCH,
            "use_mlock": True,
            "chat_format": CHAT_FORMAT,
            "n_gpu_layers": N_GPU_LAYERS
        }
    }

@app.get("/process_list")  # Route path assumed from the function name.
def process_list():
    """Returns a list of processes consuming significant CPU or memory."""
    logger.info("Process list endpoint accessed.")
    # The first cpu_percent() call for a process always returns 0.0, so prime the
    # counters first, wait a measurement interval, then read the real values.
    procs = list(psutil.process_iter(['pid', 'name']))
    for proc in procs:
        try:
            proc.cpu_percent(interval=None)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    time.sleep(1)
    processes = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent(interval=None)
            mem = proc.memory_percent()
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}

# === Shared LLM System Prompt ===
# Used by both analysis endpoints; defined once at module level instead of inside each handler.
SYSTEM_PROMPT = (
    "You are a highly efficient, objective, and precise Data and News analysis API. "
    "Your sole function is to process the provided text (data or news) and instructions, "
    "then output ONLY the requested analysis in the exact specified format. "
    "**Crucially, do NOT include any conversational text, greetings, introductions "
    "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
    "or any remarks about being an AI. Respond directly with the content.** "
    "Adhere strictly to all formatting requirements given in the user's prompt "
    "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
    "Focus exclusively on data insights, statistics, trends, influencing factors, "
    "and actionable recommendations if requested. Be concise, professional, and factual. "
    "If a request cannot be fulfilled due to data limitations or model capabilities, "
    "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
    "No other text should be included."
)

@app.post("/summarize_news")
async def summarize_news(request_body: NewsArticle):
    """Summarizes a given news article."""
    logger.info("/summarize_news endpoint received a request.")
    prompt = (
        f"Summarize the following news article in {request_body.num_sentences} "
        "concise sentences, focusing on the main event, key actors, and outcome. "
        "Do not include any introductory phrases or conversational elements. "
        f"Article: {request_body.article}"
    )
    messages_for_llm = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    prompt_tokens = count_tokens_in_text(prompt)
    logger.info(f"Prompt received (first 100 chars): {prompt[:100]}...")
    logger.info(f"Tokens in prompt: {prompt_tokens}")
    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=request_body.max_tokens,
            temperature=0.7,
            stop=["</s>", "<|im_end|>", "\n\n---"],
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)
        logger.info("Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"Error during generation: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
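
# Example call (assumes the server is reachable locally, e.g. uvicorn on port 8000):
#   curl -X POST http://localhost:8000/summarize_news \
#        -H "Content-Type: application/json" \
#        -d '{"article": "<full article text>", "num_sentences": 3, "max_tokens": 200}'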

@app.post("/extract_keywords")
async def extract_keywords(request_body: TextForKeywords):
    """Extracts keywords from a given text."""
    logger.info("/extract_keywords endpoint received a request.")
    # The shared SYSTEM_PROMPT defined above is reused here.
    # Prompt phrased for a clearer keyword-extraction instruction.
    prompt = (
        f"Extract exactly {request_body.num_keywords} most important keywords from the following text. "
        "Your output should be ONLY the comma-separated list of keywords, nothing else. "
        "For example, if the keywords are 'apple', 'banana', 'cherry', your output should be: 'apple, banana, cherry'. "
        f"Text: {request_body.text}"
    )
    messages_for_llm = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    prompt_tokens = count_tokens_in_text(prompt)
    logger.info(f"Prompt received (first 100 chars): {prompt[:100]}...")
    logger.info(f"Tokens in prompt: {prompt_tokens}")
    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=request_body.max_tokens,
            temperature=0.7,
            stop=["</s>", "<|im_end|>", "\n\n---"],
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)
        logger.info("Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"Error during generation: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
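
# Optional local entry point (assumes uvicorn is installed; it is not imported above).
# Port 7860 is the Hugging Face Spaces convention; adjust host/port for other environments.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)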