File size: 5,009 Bytes
69210b9 6858714 3f4bef7 4df303e 430007a 3f4bef7 430007a 3f4bef7 430007a 69210b9 0e7d7a3 69210b9 4df303e 69210b9 0e7d7a3 69210b9 6858714 69210b9 6858714 c8b3b66 9266b3d c8b3b66 9266b3d c8b3b66 9266b3d c8b3b66 71257bd 3f4bef7 374bd8c af2f607 4df303e 3f4bef7 4df303e 3f4bef7 4df303e 3f4bef7 6858714 3f4bef7 7200af5 6858714 3f4bef7 6858714 7200af5 6858714 7200af5 3f4bef7 7200af5 4df303e 3f4bef7 4df303e 3f4bef7 69210b9 3f4bef7 69210b9 4df303e 69210b9 c8b3b66 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import os
import json
import redis
from typing import List, Dict
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.llms.base import LLM, LLMMetadata
from llama_index.core.llms import CompletionResponse
from llama_index.core.settings import Settings
from components.LLMs.Mistral import call_mistral
from components.indexers.news_indexer import get_upstash_vector_store
# ✅ Dummy LLM config to allow higher context window in LlamaIndex
class DummyLLM(LLM):
    """Stub LLM that generates nothing and only advertises fixed metadata.

    ``complete`` always returns an empty response; ``metadata`` reports the
    limits that callers of this LLM object will read.
    """

    def complete(self, prompt: str, **kwargs) -> CompletionResponse:
        """Ignore ``prompt`` and any keyword arguments; return an empty completion."""
        empty_response = CompletionResponse(text="")
        return empty_response

    @property
    def metadata(self) -> LLMMetadata:
        """Report context_window=8192, num_output=1024, and a non-chat model."""
        limits = {
            "context_window": 8192,
            "num_output": 1024,
            "is_chat_model": False,
        }
        return LLMMetadata(**limits)
# Install the stub on the shared LlamaIndex Settings object at import time.
Settings.llm = DummyLLM()
# Environment variables
# Redis connection URL; falls back to a local instance when the variable is unset.
REDIS_URL = os.getenv("UPSTASH_REDIS_URL", "redis://localhost:6379")
# Redis key under which the daily feed is stored and read back.
# NOTE(review): the value is taken from UPSTASH_REDIS_TOKEN and is None when
# that variable is unset — confirm a token-named variable is meant to hold the key.
REDIS_KEY = os.getenv("UPSTASH_REDIS_TOKEN")
# Index directory setting; defaults to "storage/index".
INDEX_DIR = os.getenv("INDEX_DIR", "storage/index")
# ✅ Redis client
# Shared module-level client built from REDIS_URL; decode_responses=True asks
# redis-py to return str values instead of bytes.
redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)
# 📰 Topic list
# Query strings used to look up documents, one per topic.
TOPICS = [
    "India news",
    "World news",
    "Tech news",
    "Finance news",
    "Sports news",
]
# Short key per topic: the query lower-cased with " news" removed (e.g. "india").
TOPIC_KEYS = [label.lower().replace(" news", "") for label in TOPICS]
# 🧠 Summarization prompt
# Instruction text sent ahead of the merged news excerpts. Output is expected
# as one dash-prefixed line per story, which summarize_topic() relies on when
# it parses the model's reply.
BASE_PROMPT = (
    "You are Nuse’s official news summarizer — fast, sharp, and never generic.\n"
    "Your task is to read the following **collection of news excerpts** and extract the most important stories.\n"
    "\n"
    "For each distinct news item you find, write a punchy summary — exactly one line, no more than 20 words. Aim for 15–20 words per summary.\n"
    "\n"
    "Formatting rules:\n"
    "- Each summary must begin with a dash (-)\n"
    "- Do **not** number the summaries\n"
    "- Do **not** include emojis or hashtags\n"
    "- Do **not** add the source name or publication\n"
    "\n"
    "If a person is mentioned, include their designation in brackets. Examples:\n"
    "- Jeff Bezos (Amazon founder)\n"
    "- Narendra Modi (Prime Minister of India)\n"
    "- NATO Chief Jens Stoltenberg\n"
    "\n"
    "✅ Good examples:\n"
    "- India stuns Australia in last-ball World Cup thriller, secures spot in finals\n"
    "- U.S. imposes tariffs on Chinese tech giants, shaking global investor confidence\n"
    "- Ceasefire breakthrough as Netanyahu (Israeli PM) relents under diplomatic pressure\n"
    "\n"
    "❌ Avoid:\n"
    "- Source mentions like (The New York Times), (Reuters)\n"
    "- Introductory fluff or meta comments\n"
    "- Repeating prompt instructions or context\n"
    "\n"
    "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
)
# 📥 Load topic-wise documents from Upstash vector store
def load_documents_by_topic() -> Dict[str, List[str]]:
    """Retrieve the stored excerpts for every topic from the Upstash vector store.

    Each entry of ``TOPICS`` is used as a similarity query (top 10 hits) and
    the results are filed under the matching entry of ``TOPIC_KEYS``.

    Returns:
        A dict mapping each topic key to the list of whitespace-stripped,
        non-empty texts of the nodes retrieved for that topic.
    """
    vector_store = get_upstash_vector_store()
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # An empty node list is passed, so nothing new is indexed here; retrieval
    # goes through the supplied vector store.
    index = VectorStoreIndex([], storage_context=storage_context)
    retriever = index.as_retriever(similarity_top_k=10)

    topic_docs: Dict[str, List[str]] = {}
    for topic, key in zip(TOPICS, TOPIC_KEYS):
        texts: List[str] = []
        # Only the retrieved nodes are needed, so ask the retriever directly
        # rather than running a query engine whose synthesized answer would
        # be thrown away.
        for hit in retriever.retrieve(topic):
            content = hit.get_content()
            if not content:
                continue
            text = str(content).strip()
            if text:  # drop nodes whose content is whitespace only
                texts.append(text)
        topic_docs[key] = texts
    return topic_docs
# 🧪 Summarize one topic at a time
def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
    """Summarize one topic's excerpts into a list of feed items.

    The excerpts are joined with ``---`` separators and sent to
    ``call_mistral`` together with ``BASE_PROMPT``. Every reply line that
    starts with a dash becomes one feed item.

    Args:
        topic_key: Short topic identifier (e.g. ``"india"``); used in the log
            line and in the generated search link.
        docs: Text excerpts for the topic.

    Returns:
        A list of dicts with ``summary``, ``image_url`` and ``article_link``
        keys. Empty when ``docs`` is empty, the model returns nothing, or no
        reply line starts with a dash.
    """
    from urllib.parse import quote_plus

    if not docs:
        return []

    merged_text = "\n\n---\n\n".join(docs)
    print(f"🧠 Summarizing topic: {topic_key}")
    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)
    if not summary_block:
        return []

    # Leading bullet characters to peel off: hyphen, en dash, em dash.
    # Written as escapes so the set survives re-encoding of this file.
    bullet_chars = "-\u2013\u2014"
    # Escape the key so one containing spaces or symbols still yields a valid URL.
    article_link = f"https://google.com/search?q={quote_plus(topic_key)}+news"

    summaries = []
    for raw_line in summary_block.splitlines():
        line = raw_line.strip()
        if not line.startswith("-"):
            continue  # not a summary bullet (blank line, preamble, etc.)
        text = line.lstrip(bullet_chars).strip()
        if text:
            summaries.append({
                "summary": text,
                "image_url": "https://source.unsplash.com/800x600/?news",
                "article_link": article_link,
            })
    return summaries
# Main callable
def generate_and_cache_daily_feed():
    """Build the per-topic summary feed, store it in Redis as JSON, and return it.

    Returns:
        A list with one ``{"topic": key, "feed": summaries}`` dict per entry
        of ``TOPIC_KEYS``, in that order.
    """
    topic_docs = load_documents_by_topic()
    final_feed = [
        {
            "topic": topic_key,
            "feed": summarize_topic(topic_key, topic_docs.get(topic_key, [])),
        }
        for topic_key in TOPIC_KEYS
    ]
    # ensure_ascii=False stores non-ASCII characters as-is instead of \u escapes.
    redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
    # NOTE(review): REDIS_KEY is read from the UPSTASH_REDIS_TOKEN variable, so
    # this line may print a credential — confirm before shipping logs anywhere.
    print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
    return final_feed
# 📦 Get cached data
def get_cached_daily_feed():
    """Return the JSON-decoded value stored under ``REDIS_KEY``.

    Returns an empty list when the key is missing or holds an empty value.
    """
    raw_payload = redis_client.get(REDIS_KEY)
    if not raw_payload:
        return []
    return json.loads(raw_payload)
|