from fastapi import FastAPI, Request, HTTPException
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import platform
import psutil
import time
import tiktoken  # For estimating token count
import logging
from pydantic import BaseModel, Field

# === Configure Logging ===
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
if not logger.handlers:
    logger.addHandler(handler)

app = FastAPI(
    title="Data Analysis & News AI API",
    description="API for efficient news summarization and keyword extraction using local LLMs.",
    version="1.0.0"
)

# === Model Config ===
# Recommended model for a 16GB-RAM CPU machine: Mistral-7B-Instruct-v0.2 Q4_K_M.
# It offers a good balance of quality, speed, and memory footprint for this hardware.
# Uncomment one of the alternatives below to test it instead.
REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
FILENAME = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Alternative: OpenHermes 2.5 Mistral 7B (also excellent instruction following)
# REPO_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GGUF"
# FILENAME = "openhermes-2.5-mistral-7b.Q4_K_M.gguf"

# Alternative: Phi-3-mini (extreme efficiency and speed, with good quality for its size)
# REPO_ID = "microsoft/Phi-3-mini-4k-instruct-GGUF"
# FILENAME = "phi-3-mini-4k-instruct-q4.gguf"  # Commonly the standard Q4 file for Phi-3

MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, FILENAME)

# === Download if model not available ===
if not os.path.exists(MODEL_PATH):
    logger.info(f"Downloading {FILENAME} from Hugging Face...")
    try:
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=MODEL_DIR,
            local_dir=MODEL_DIR,
            local_dir_use_symlinks=False  # Deprecated (and ignored) in recent huggingface_hub releases, but harmless.
        )
        logger.info(f"Model downloaded to: {model_path}")
    except Exception as e:
        logger.error(f"Error downloading model: {e}")
        # Re-raise, as the app cannot function without the model.
        raise RuntimeError(f"Failed to download model: {e}")
else:
    logger.info(f"Model already available at: {MODEL_PATH}")
    model_path = MODEL_PATH

# === Optimal thread usage ===
# For llama.cpp on CPU, using physical cores is generally more efficient than logical cores (hyperthreading).
# psutil.cpu_count(logical=False) can return None on some platforms, so fall back to logical cores.
physical_cores = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) or 1
recommended_threads = max(1, physical_cores)  # Ensure at least 1 thread
logger.info(f"Detected physical cores: {physical_cores}, logical cores: {psutil.cpu_count(logical=True)}")
logger.info(f"Using n_threads: {recommended_threads}")

# === Load the model ===
N_CTX = 4096            # Context window. 4096 balances long news articles against RAM use for a 7B model
                        # on 16GB RAM; test 8192 if you often process very long articles.
N_BATCH = 512           # Max batch size for prompt processing. Larger can be faster for long prompts.
N_GPU_LAYERS = 0        # CPU only, as specified.
CHAT_FORMAT = "chatml"  # Works for many instruct models, including Mistral.

try:
    llm = Llama(
        model_path=model_path,
        n_ctx=N_CTX,
        n_threads=recommended_threads,
        n_batch=N_BATCH,
        use_mlock=True,             # Lock model in RAM for faster access, reducing disk I/O.
        n_gpu_layers=N_GPU_LAYERS,
        chat_format=CHAT_FORMAT,
        verbose=False               # Keep llama.cpp's internal verbose logging off.
    )
    logger.info("Llama model loaded successfully!")
except Exception as e:
    logger.error(f"Error loading Llama model: {e}")
    raise RuntimeError(f"Failed to load Llama model: {e}")

# Initialize tiktoken encoder for token counting.
try:
    encoding = tiktoken.get_encoding("cl100k_base")
except Exception:
    logger.warning("Could not load tiktoken 'cl100k_base' encoding. Token count for prompt might be less accurate.")
    encoding = None

def count_tokens_in_text(text):
    """Estimates tokens in a given text using tiktoken or a simple character count."""
    if encoding:
        return len(encoding.encode(text))
    else:
        return len(text) // 4  # Rough estimate: 1 token ~ 4 characters
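
# Note: both paths above are only estimates. cl100k_base is OpenAI's tokenizer, so its counts will not
# exactly match the GGUF model's own tokenizer. For exact counts you could instead use
# len(llm.tokenize(text.encode("utf-8"))), at the cost of touching the model on every request.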

# === Pydantic Models for API Request Bodies ===
class NewsArticle(BaseModel):
    article: str = Field(..., min_length=50, description="The full news article text to summarize.")
    num_sentences: int = Field(3, ge=1, le=10, description="Number of sentences for the summary (1-10).")
    max_tokens: int = Field(200, ge=50, le=500, description="Maximum tokens for the generated summary.")

class TextForKeywords(BaseModel):
    text: str = Field(..., min_length=20, description="The text from which to extract keywords.")
    num_keywords: int = Field(5, ge=1, le=15, description="Number of keywords to extract (1-15).")
    max_tokens: int = Field(100, ge=30, le=200, description="Maximum tokens for the keyword output.")
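
# Illustrative request payloads for the models above (field values are examples, not fixed by the API):
#   NewsArticle     -> {"article": "<full article text>", "num_sentences": 3, "max_tokens": 200}
#   TextForKeywords -> {"text": "<text to analyze>", "num_keywords": 5, "max_tokens": 100}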

# === API Endpoints ===
@app.get("/")
def root():
    logger.info("Root endpoint accessed.")
    return {"message": "Data Analysis AI API is live and optimized for speed and accuracy!"}

@app.get("/system_specs")  # Route path assumed from the function name.
def get_sys_specs():
    """Returns system specifications including CPU, RAM, and OS details."""
    logger.info("System specs endpoint accessed.")
    memory = psutil.virtual_memory()
    return {
        "CPU": {
            "physical_cores": physical_cores,
            "logical_cores": psutil.cpu_count(logical=True),
            "max_freq_mhz": psutil.cpu_freq().max if psutil.cpu_freq() else "N/A",
            "cpu_usage_percent": psutil.cpu_percent(interval=1)
        },
        "RAM": {
            "total_GB": round(memory.total / (1024 ** 3), 2),
            "available_GB": round(memory.available / (1024 ** 3), 2),
            "usage_percent": memory.percent
        },
        "System": {
            "platform": platform.platform(),
            "architecture": platform.machine(),
            "python_version": platform.python_version()
        },
        "Model_Config": {
            # Report the values the model was configured with; reading them back off the
            # Llama object varies across llama-cpp-python versions.
            "model_name": FILENAME,
            "n_ctx": N_CTX,
            "n_threads": recommended_threads,
            "n_batch": N_BATCH,
            "use_mlock": True,
            "chat_format": CHAT_FORMAT,
            "n_gpu_layers": N_GPU_LAYERS
        }
    }

@app.get("/process_list")  # Route path assumed from the function name.
def process_list():
    """Returns a list of processes consuming significant CPU or memory."""
    logger.info("Process list endpoint accessed.")
    # The first cpu_percent() call for a process always returns 0.0, so prime the
    # counters first, wait a measurement interval, then read the real values.
    procs = list(psutil.process_iter(['pid', 'name']))
    for proc in procs:
        try:
            proc.cpu_percent(interval=None)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    time.sleep(1)
    processes = []
    for proc in procs:
        try:
            cpu = proc.cpu_percent(interval=None)
            mem = proc.memory_percent()
            if cpu > 5 or mem > 2:
                processes.append({
                    "pid": proc.pid,
                    "name": proc.name(),
                    "cpu_percent": round(cpu, 2),
                    "memory_percent": round(mem, 2)
                })
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    processes.sort(key=lambda x: x['cpu_percent'], reverse=True)
    return {"heavy_processes": processes}

# === Shared LLM System Prompt ===
# Used by both analysis endpoints; defined once at module level instead of inside each handler.
SYSTEM_PROMPT = (
    "You are a highly efficient, objective, and precise Data and News analysis API. "
    "Your sole function is to process the provided text (data or news) and instructions, "
    "then output ONLY the requested analysis in the exact specified format. "
    "**Crucially, do NOT include any conversational text, greetings, introductions "
    "(e.g., 'Here is the report', 'Below is the analysis'), conclusions, disclaimers, "
    "or any remarks about being an AI. Respond directly with the content.** "
    "Adhere strictly to all formatting requirements given in the user's prompt "
    "(e.g., 'summary:{}', numbered lists, bullet points, JSON structures). "
    "Focus exclusively on data insights, statistics, trends, influencing factors, "
    "and actionable recommendations if requested. Be concise, professional, and factual. "
    "If a request cannot be fulfilled due to data limitations or model capabilities, "
    "respond with: 'STATUS: FAILED_ANALYSIS; REASON: Unable to process this specific analytical request due to limitations.' "
    "No other text should be included."
)

@app.post("/summarize_news")
async def summarize_news(request_body: NewsArticle):
    """Summarizes a given news article."""
    logger.info("/summarize_news endpoint received a request.")
    prompt = (
        f"Summarize the following news article in {request_body.num_sentences} "
        "concise sentences, focusing on the main event, key actors, and outcome. "
        "Do not include any introductory phrases or conversational elements. "
        f"Article: {request_body.article}"
    )
    messages_for_llm = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    prompt_tokens = count_tokens_in_text(prompt)
    logger.info(f"Prompt received (first 100 chars): {prompt[:100]}...")
    logger.info(f"Tokens in prompt: {prompt_tokens}")
    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=request_body.max_tokens,
            temperature=0.7,
            stop=["</s>", "<|im_end|>", "\n\n---"],
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)
        logger.info("Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"Error during generation: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
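
# Example call (assumes the server is reachable locally, e.g. uvicorn on port 8000):
#   curl -X POST http://localhost:8000/summarize_news \
#        -H "Content-Type: application/json" \
#        -d '{"article": "<full article text>", "num_sentences": 3, "max_tokens": 200}'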

@app.post("/extract_keywords")
async def extract_keywords(request_body: TextForKeywords):
    """Extracts keywords from a given text."""
    logger.info("/extract_keywords endpoint received a request.")
    # The shared SYSTEM_PROMPT defined above is reused here.
    # Prompt phrased for a clearer keyword-extraction instruction.
    prompt = (
        f"Extract exactly {request_body.num_keywords} most important keywords from the following text. "
        "Your output should be ONLY the comma-separated list of keywords, nothing else. "
        "For example, if the keywords are 'apple', 'banana', 'cherry', your output should be: 'apple, banana, cherry'. "
        f"Text: {request_body.text}"
    )
    messages_for_llm = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}
    ]
    prompt_tokens = count_tokens_in_text(prompt)
    logger.info(f"Prompt received (first 100 chars): {prompt[:100]}...")
    logger.info(f"Tokens in prompt: {prompt_tokens}")
    try:
        response = llm.create_chat_completion(
            messages=messages_for_llm,
            max_tokens=request_body.max_tokens,
            temperature=0.7,
            stop=["</s>", "<|im_end|>", "\n\n---"],
            top_p=0.9,
            top_k=40,
            repeat_penalty=1.1
        )
        ai_response_content = response["choices"][0]["message"]["content"].strip()
        response_token_count = count_tokens_in_text(ai_response_content)
        logger.info("Response generated successfully.")
        return {
            "response": ai_response_content,
            "prompt_tokens": prompt_tokens,
            "response_token_count": response_token_count
        }
    except Exception as e:
        logger.error(f"Error during generation: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Failed to generate response: {e}. Please try again.")
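
# Optional local entry point (assumes uvicorn is installed; it is not imported above).
# Port 7860 is the Hugging Face Spaces convention; adjust host/port for other environments.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)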