|
import os |
|
import json |
|
import redis |
|
from typing import List, Dict |
|
from llama_index.core import VectorStoreIndex, StorageContext |
|
from llama_index.core.query_engine import RetrieverQueryEngine |
|
from llama_index.llms.base import LLM, LLMMetadata |
|
from llama_index.core.llms import CompletionResponse |
|
from llama_index.core.settings import Settings |
|
from components.LLMs.Mistral import call_mistral |
|
from components.indexers.news_indexer import get_upstash_vector_store |
|
|
|
|
|
class DummyLLM(LLM):
    """Placeholder LLM whose completions are always empty.

    ``complete`` ignores its prompt and returns an empty ``CompletionResponse``;
    ``metadata`` reports a fixed, non-chat model profile.

    NOTE(review): only ``complete`` and ``metadata`` are overridden. If the
    installed ``LLM`` base class declares further abstract methods,
    instantiation will fail -- confirm against the pinned llama-index version.
    """

    @property
    def metadata(self) -> LLMMetadata:
        """Return static metadata (context_window=8192, num_output=1024, non-chat)."""
        static_metadata = LLMMetadata(
            context_window=8192,
            num_output=1024,
            is_chat_model=False,
        )
        return static_metadata

    def complete(self, prompt: str, **kwargs) -> CompletionResponse:
        """Return an empty completion regardless of ``prompt`` or ``kwargs``."""
        empty_completion = CompletionResponse(text="")
        return empty_completion


# Register the stub on the shared llama-index Settings object at import time.
Settings.llm = DummyLLM()
|
|
|
|
|
# Connection settings, read from the environment once at import time.
REDIS_URL = os.getenv("UPSTASH_REDIS_URL", "redis://localhost:6379")
# NOTE(review): this value is used as the Redis key the feed is cached under,
# and it is None when UPSTASH_REDIS_TOKEN is unset -- confirm that is intended.
REDIS_KEY = os.getenv("UPSTASH_REDIS_TOKEN")
INDEX_DIR = os.getenv("INDEX_DIR", "storage/index")

# Shared module-level client; decode_responses=True is passed so replies are
# returned as str rather than bytes.
redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
|
|
|
|
|
# Query phrases, one per feed section.
TOPICS = [
    "India news",
    "World news",
    "Tech news",
    "Finance news",
    "Sports news",
]

# Short lowercase identifiers derived from TOPICS ("India news" -> "india").
TOPIC_KEYS = [label.lower().replace(" news", "") for label in TOPICS]
|
|
|
|
|
# Instruction preamble passed as ``base_prompt`` to ``call_mistral`` ahead of the
# merged news excerpts.
# NOTE(review): the typographic characters below (curly apostrophe, em/en
# dashes, check and cross marks) were restored from a mis-decoded copy in which
# one literal was also split across two lines -- confirm the exact glyphs
# against the intended prompt.
BASE_PROMPT = (
    "You are Nuse’s official news summarizer — fast, sharp, and never generic.\n"
    "Your task is to read the following **collection of news excerpts** and extract the most important stories.\n"
    "\n"
    "For each distinct news item you find, write a punchy summary — exactly one line, no more than 20 words. Aim for 15–20 words per summary.\n"
    "\n"
    "Formatting rules:\n"
    "- Each summary must begin with a dash (-)\n"
    "- Do **not** number the summaries\n"
    "- Do **not** include emojis or hashtags\n"
    "- Do **not** add the source name or publication\n"
    "\n"
    "If a person is mentioned, include their designation in brackets. Examples:\n"
    "- Jeff Bezos (Amazon founder)\n"
    "- Narendra Modi (Prime Minister of India)\n"
    "- NATO Chief Jens Stoltenberg\n"
    "\n"
    "✅ Good examples:\n"
    "- India stuns Australia in last-ball World Cup thriller, secures spot in finals\n"
    "- U.S. imposes tariffs on Chinese tech giants, shaking global investor confidence\n"
    "- Ceasefire breakthrough as Netanyahu (Israeli PM) relents under diplomatic pressure\n"
    "\n"
    "❌ Avoid:\n"
    "- Source mentions like (The New York Times), (Reuters)\n"
    "- Introductory fluff or meta comments\n"
    "- Repeating prompt instructions or context\n"
    "\n"
    "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
)
|
|
|
|
|
def load_documents_by_topic() -> Dict[str, List[str]]:
    """Retrieve candidate article texts for every configured topic.

    Builds a retriever-backed query engine over the vector store returned by
    ``get_upstash_vector_store()`` and runs one query per entry in ``TOPICS``
    (``similarity_top_k=10``), collecting the text of each response's source
    nodes.

    Returns:
        Mapping from topic key (the matching entry of ``TOPIC_KEYS``) to the
        stripped, non-empty texts of the nodes returned for that topic. A
        topic with no usable nodes maps to an empty list.
    """
    vector_store = get_upstash_vector_store()
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # No nodes are passed in; presumably queries are served from whatever the
    # vector store already holds -- confirm against the indexer.
    index = VectorStoreIndex([], storage_context=storage_context)
    retriever = index.as_retriever(similarity_top_k=10)
    query_engine = RetrieverQueryEngine(retriever=retriever)

    topic_docs: Dict[str, List[str]] = {}
    for topic, key in zip(TOPICS, TOPIC_KEYS):
        response = query_engine.query(topic)
        texts: List[str] = []
        for node in response.source_nodes:
            content = node.get_content()
            if not content:
                continue
            text = str(content).strip()
            # Filter after stripping: whitespace-only content would otherwise
            # be kept as "" and make an effectively empty topic look non-empty.
            if text:
                texts.append(text)
        topic_docs[key] = texts
    return topic_docs
|
|
|
|
|
def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
    """Summarize one topic's documents into a list of feed entries.

    The documents are joined with a ``---`` separator and sent to
    ``call_mistral`` together with ``BASE_PROMPT``. Every response line that
    starts with a dash becomes one entry.

    Args:
        topic_key: Short topic identifier; used in the log line and in the
            generated search link.
        docs: Document texts for the topic.

    Returns:
        A list of dicts with keys ``summary``, ``image_url`` and
        ``article_link``. Empty when ``docs`` is empty, when the model returns
        nothing, or when no response line starts with a dash.
    """
    # Local import so the block brings its own dependency into scope.
    from urllib.parse import quote_plus

    if not docs:
        return []

    merged_text = "\n\n---\n\n".join(docs)
    print(f"🧠 Summarizing topic: {topic_key}")
    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)
    if not summary_block:
        return []

    # Escape the key so one containing spaces or reserved characters still
    # yields a valid query string; plain keys such as "india" are unchanged.
    article_link = f"https://google.com/search?q={quote_plus(str(topic_key))}+news"

    summaries = []
    for raw_line in summary_block.splitlines():
        line = raw_line.strip()
        if not line.startswith("-"):
            continue
        # Drop the leading bullet: hyphen, en dash or em dash, then whitespace.
        clean = line.lstrip("-–—").strip()
        if not clean:
            continue
        summaries.append({
            "summary": clean,
            # NOTE(review): placeholder image; the source.unsplash.com endpoint
            # has reportedly been retired -- confirm it still resolves.
            "image_url": "https://source.unsplash.com/800x600/?news",
            "article_link": article_link,
        })
    return summaries
|
|
|
|
|
def generate_and_cache_daily_feed():
    """Build the feed for every topic, store it in Redis, and return it.

    Loads documents per topic, summarizes each topic in ``TOPIC_KEYS`` order,
    and writes the result as JSON (``ensure_ascii=False``) under ``REDIS_KEY``.

    Returns:
        A list of ``{"topic": <key>, "feed": <summaries>}`` dicts, one per
        entry in ``TOPIC_KEYS``.
    """
    topic_docs = load_documents_by_topic()

    final_feed = [
        {
            "topic": topic_key,
            "feed": summarize_topic(topic_key, topic_docs.get(topic_key, [])),
        }
        for topic_key in TOPIC_KEYS
    ]

    redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
    # REDIS_KEY is read from UPSTASH_REDIS_TOKEN, so its value is deliberately
    # kept out of the log in case it is a credential.
    print("✅ Cached daily feed")
    return final_feed
|
|
|
|
|
def get_cached_daily_feed():
    """Return the feed stored under ``REDIS_KEY``, or ``[]`` when none is cached.

    The stored value is decoded with ``json.loads``; a missing or empty value
    yields an empty list.
    """
    payload = redis_client.get(REDIS_KEY)
    if not payload:
        return []
    return json.loads(payload)
|
|