import os
import re
import json
import redis
from typing import List, Dict
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.query_engine import RetrieverQueryEngine
from components.LLMs.Mistral import call_mistral

# πŸ” Environment variables
REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
INDEX_DIR = os.environ.get("INDEX_DIR", "storage/index")

# ✅ Redis client
redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)

# 📰 Topic list
TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
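# -> ["india", "world", "tech", "finance", "sports"]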

# 🧠 Summarization prompt
BASE_PROMPT = (
    "You are Nuse’s official news summarizer β€” fast, sharp, and never generic.\n"
    "Your task is to read the following **collection of news excerpts** and extract the most important stories.\n"
    "\n"
    "For each distinct news item you find, write a punchy summary β€” exactly one line, no more than 20 words. Aim for 15–20 words per summary.\n"
    "\n"
    "Formatting rules:\n"
    "- Each summary must begin with a dash (-)\n"
    "- Do **not** number the summaries\n"
    "- Do **not** include emojis or hashtags\n"
    "- Do **not** add the source name or publication\n"
    "\n"
    "If a person is mentioned, include their designation in brackets. Examples:\n"
    "- Jeff Bezos (Amazon founder)\n"
    "- Narendra Modi (Prime Minister of India)\n"
    "- NATO Chief Jens Stoltenberg\n"
    "\n"
    "βœ… Good examples:\n"
    "- India stuns Australia in last-ball World Cup thriller, secures spot in finals\n"
    "- U.S. imposes tariffs on Chinese tech giants, shaking global investor confidence\n"
    "- Ceasefire breakthrough as Netanyahu (Israeli PM) relents under diplomatic pressure\n"
    "\n"
    "❌ Avoid:\n"
    "- Source mentions like (The New York Times), (Reuters)\n"
    "- Introductory fluff or meta comments\n"
    "- Repeating prompt instructions or context\n"
    "\n"
    "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
)

# 🧠 Categorize a summary line into a topic bucket
def categorize_summary(summary: str) -> str:
    s = summary.lower()

    def has_any(words: List[str]) -> bool:
        # Match on word boundaries so short keywords like "us" or "ai"
        # don't fire inside words such as "status" or "said".
        return any(re.search(rf"\b{re.escape(w)}\b", s) for w in words)

    if has_any(["india", "modi"]):
        return "india"
    if has_any(["us", "uk", "gaza", "china", "russia", "bangladesh", "israel", "trump", "biden", "world"]):
        return "world"
    if has_any(["ai", "tech", "space", "innovation", "startup", "software", "device"]):
        return "tech"
    if has_any(["market", "stock", "inflation", "finance", "fed", "reserve", "earnings", "revenue", "economy"]):
        return "finance"
    if has_any(["cricket", "football", "nba", "nfl", "sports", "match", "league", "tournament"]):
        return "sports"
    return "world"  # default bucket for anything unmatched

# 📥 Load document texts from the persisted vector store
def load_all_documents() -> List[str]:
    storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
    index = load_index_from_storage(storage_context)
    retriever = index.as_retriever(similarity_top_k=50)
    query_engine = RetrieverQueryEngine(retriever=retriever)

    combined_docs: List[str] = []
    seen = set()
    for topic in TOPICS:
        response = query_engine.query(topic)
        for node in response.source_nodes:
            doc_text = str(node.get_content()).strip()
            # Skip empties and dedupe nodes retrieved under multiple topics
            if doc_text and doc_text not in seen:
                seen.add(doc_text)
                combined_docs.append(doc_text)
    return combined_docs

# 🧪 Summarize the entire day's content in one call
def summarize_and_categorize(docs: List[str]) -> Dict[str, List[Dict]]:
    merged_text = "\n\n---\n\n".join(docs)
    print("\n🧠 Sending merged prompt to summarizer...\n")
    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)

    categorized_feed = {key: [] for key in TOPIC_KEYS}
    if summary_block:
        for line in summary_block.splitlines():
            line = line.strip()
            if line.startswith("-"):
                # Strip leading dash/en-dash bullet markers from model output
                clean = line.lstrip("-–").strip()
                if clean:
                    topic_key = categorize_summary(clean)
                    categorized_feed[topic_key].append({
                        "summary": clean,
                        # Placeholder image and search link until real assets are wired in
                        "image_url": "https://source.unsplash.com/800x600/?news",
                        "article_link": f"https://google.com/search?q={topic_key}+news"
                    })
    return categorized_feed
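
# Illustrative shape of the returned mapping (summaries hypothetical):
# {
#   "india":   [{"summary": "...", "image_url": "...", "article_link": "..."}],
#   "world":   [...], "tech": [...], "finance": [...], "sports": [...],
# }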

# 🚀 Main callable
def generate_and_cache_daily_feed():
    docs = load_all_documents()
    if not docs:
        print("⚠️ No documents found in vector store.")
        return []

    feed_map = summarize_and_categorize(docs)
    final_feed = [{"topic": topic, "feed": feed_map[topic]} for topic in TOPIC_KEYS]

    redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
    print(f"βœ… Cached daily feed under key '{REDIS_KEY}'")
    return final_feed

# 📦 Get cached data
def get_cached_daily_feed():
    cached = redis_client.get(REDIS_KEY)
    return json.loads(cached) if cached else []
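
# Minimal manual smoke test: assumes a reachable Redis at UPSTASH_REDIS_URL
# and a persisted index under INDEX_DIR. Not part of the pipeline itself.
if __name__ == "__main__":
    feed = generate_and_cache_daily_feed()
    total = sum(len(entry["feed"]) for entry in feed)
    print(f"Generated {total} summaries across {len(feed)} topics")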