import os
import re
import json
import redis
from typing import List, Dict
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.query_engine import RetrieverQueryEngine
from components.LLMs.Mistral import call_mistral

# πŸ” Environment variables
REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")
INDEX_DIR = os.environ.get("INDEX_DIR", "storage/index")

# ✅ Redis client
redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)

# 📰 Topic list
TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
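# -> ["india", "world", "tech", "finance", "sports"]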

# 🧠 Summarization prompt
BASE_PROMPT = (
    "You are Nuse’s official news summarizer β€” fast, sharp, and never generic.\n"
    "Your task is to read the following **collection of news excerpts** and extract the most important stories.\n"
    "\n"
    "For each distinct news item you find, write a punchy summary β€” exactly one line, no more than 20 words. Aim for 15–20 words per summary.\n"
    "\n"
    "Formatting rules:\n"
    "- Each summary must begin with a dash (-)\n"
    "- Do **not** number the summaries\n"
    "- Do **not** include emojis or hashtags\n"
    "- Do **not** add the source name or publication\n"
    "\n"
    "If a person is mentioned, include their designation in brackets. Examples:\n"
    "- Jeff Bezos (Amazon founder)\n"
    "- Narendra Modi (Prime Minister of India)\n"
    "- NATO Chief Jens Stoltenberg\n"
    "\n"
    "βœ… Good examples:\n"
    "- India stuns Australia in last-ball World Cup thriller, secures spot in finals\n"
    "- U.S. imposes tariffs on Chinese tech giants, shaking global investor confidence\n"
    "- Ceasefire breakthrough as Netanyahu (Israeli PM) relents under diplomatic pressure\n"
    "\n"
    "❌ Avoid:\n"
    "- Source mentions like (The New York Times), (Reuters)\n"
    "- Introductory fluff or meta comments\n"
    "- Repeating prompt instructions or context\n"
    "\n"
    "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
)

# 🧠 Categorize a summary line into a topic bucket
def categorize_summary(summary: str) -> str:
    s = summary.lower()

    def has_any(words: List[str]) -> bool:
        # Match on word boundaries so short keywords like "us" or "ai"
        # don't fire inside words such as "status" or "said".
        return any(re.search(rf"\b{re.escape(w)}\b", s) for w in words)

    if has_any(["india", "modi"]):
        return "india"
    if has_any(["us", "uk", "gaza", "china", "russia", "bangladesh", "israel", "trump", "biden", "world"]):
        return "world"
    if has_any(["ai", "tech", "space", "innovation", "startup", "software", "device"]):
        return "tech"
    if has_any(["market", "stock", "inflation", "finance", "fed", "reserve", "earnings", "revenue", "economy"]):
        return "finance"
    if has_any(["cricket", "football", "nba", "nfl", "sports", "match", "league", "tournament"]):
        return "sports"
    return "world"  # default bucket for anything unmatched

# 📥 Load document texts from the persisted vector store
def load_all_documents() -> List[str]:
    storage_context = StorageContext.from_defaults(persist_dir=INDEX_DIR)
    index = load_index_from_storage(storage_context)
    retriever = index.as_retriever(similarity_top_k=50)
    query_engine = RetrieverQueryEngine(retriever=retriever)

    combined_docs: List[str] = []
    seen = set()
    for topic in TOPICS:
        response = query_engine.query(topic)
        for node in response.source_nodes:
            doc_text = str(node.get_content()).strip()
            # Skip empties and dedupe nodes retrieved under multiple topics
            if doc_text and doc_text not in seen:
                seen.add(doc_text)
                combined_docs.append(doc_text)
    return combined_docs

# 🧪 Summarize the entire day's content in one call
def summarize_and_categorize(docs: List[str]) -> Dict[str, List[Dict]]:
    merged_text = "\n\n---\n\n".join(docs)
    print("\n🧠 Sending merged prompt to summarizer...\n")
    summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)

    categorized_feed = {key: [] for key in TOPIC_KEYS}
    if summary_block:
        for line in summary_block.splitlines():
            line = line.strip()
            if line.startswith("-"):
                # Strip leading dash/en-dash bullet markers from model output
                clean = line.lstrip("-–").strip()
                if clean:
                    topic_key = categorize_summary(clean)
                    categorized_feed[topic_key].append({
                        "summary": clean,
                        # Placeholder image and search link until real assets are wired in
                        "image_url": "https://source.unsplash.com/800x600/?news",
                        "article_link": f"https://google.com/search?q={topic_key}+news"
                    })
    return categorized_feed
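
# Illustrative shape of the returned mapping (summaries hypothetical):
# {
#   "india":   [{"summary": "...", "image_url": "...", "article_link": "..."}],
#   "world":   [...], "tech": [...], "finance": [...], "sports": [...],
# }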

# 🚀 Main callable
def generate_and_cache_daily_feed():
    docs = load_all_documents()
    if not docs:
        print("⚠️ No documents found in vector store.")
        return []

    feed_map = summarize_and_categorize(docs)
    final_feed = [{"topic": topic, "feed": feed_map[topic]} for topic in TOPIC_KEYS]

    redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
    print(f"βœ… Cached daily feed under key '{REDIS_KEY}'")
    return final_feed

# 📦 Get cached data
def get_cached_daily_feed():
    cached = redis_client.get(REDIS_KEY)
    return json.loads(cached) if cached else []
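
# Minimal manual smoke test: assumes a reachable Redis at UPSTASH_REDIS_URL
# and a persisted index under INDEX_DIR. Not part of the pipeline itself.
if __name__ == "__main__":
    feed = generate_and_cache_daily_feed()
    total = sum(len(entry["feed"]) for entry in feed)
    print(f"Generated {total} summaries across {len(feed)} topics")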