revert to by topic
components/generators/daily_feed.py
@@ -2,15 +2,15 @@ import os
 import json
 import redis
 from typing import List, Dict
-from llama_index.core import VectorStoreIndex, StorageContext
-from llama_index.core.schema import Document
+from llama_index.core import VectorStoreIndex, StorageContext
 from llama_index.core.query_engine import RetrieverQueryEngine
-from components.LLMs.Mistral import call_mistral
-from components.indexers.news_indexer import load_news_index,get_upstash_vector_store
-from llama_index.core.settings import Settings
 from llama_index.llms.base import LLM, LLMMetadata
 from llama_index.core.llms import CompletionResponse
+from llama_index.core.settings import Settings
+from components.LLMs.Mistral import call_mistral
+from components.indexers.news_indexer import get_upstash_vector_store
 
+# ✅ Dummy LLM config to allow higher context window in LlamaIndex
 class DummyLLM(LLM):
     def complete(self, prompt: str, **kwargs) -> CompletionResponse:
         return CompletionResponse(text="")
@@ -68,73 +68,54 @@ BASE_PROMPT = (
     "You are generating sharp, editorial-style headlines. Only output the summaries. Nothing else."
 )
 
-#
-def
-    s = summary.lower()
-    if "india" in s or "modi" in s:
-        return "india"
-    elif any(x in s for x in ["us", "uk", "gaza", "china", "russia", "bangladesh", "israel", "trump", "biden", "world"]):
-        return "world"
-    elif any(x in s for x in ["ai", "tech", "space", "innovation", "startup", "software", "device"]):
-        return "tech"
-    elif any(x in s for x in ["market", "stock", "inflation", "finance", "fed", "reserve", "earnings", "revenue", "economy"]):
-        return "finance"
-    elif any(x in s for x in ["cricket", "football", "nba", "nfl", "sports", "match", "league", "tournament"]):
-        return "sports"
-    else:
-        return "world"
-
-# 📥 Load all documents from the vector store
-def load_all_documents() -> List[str]:
+# 📥 Load topic-wise documents from Upstash vector store
+def load_documents_by_topic() -> Dict[str, List[str]]:
     vector_store = get_upstash_vector_store()
     storage_context = StorageContext.from_defaults(vector_store=vector_store)
-
-    # Rebuild index using storage context directly (not from_disk)
     index = VectorStoreIndex([], storage_context=storage_context)
-
     retriever = index.as_retriever(similarity_top_k=10)
     query_engine = RetrieverQueryEngine(retriever=retriever)
 
-
-    for topic in TOPICS:
+    topic_docs = {}
+    for topic, key in zip(TOPICS, TOPIC_KEYS):
         response = query_engine.query(topic)
-        for node in response.source_nodes:
-
-
-
-
-
-
-
+        doc_texts = [str(node.get_content()).strip() for node in response.source_nodes if node.get_content()]
+        topic_docs[key] = doc_texts
+    return topic_docs
+
+# 🧪 Summarize one topic at a time
+def summarize_topic(topic_key: str, docs: List[str]) -> List[Dict]:
+    if not docs:
+        return []
+
     merged_text = "\n\n---\n\n".join(docs)
-    print("
+    print(f"🧠 Summarizing topic: {topic_key}")
     summary_block = call_mistral(base_prompt=BASE_PROMPT, tail_prompt=merged_text)
 
-
+    summaries = []
     if summary_block:
         for line in summary_block.splitlines():
             line = line.strip()
             if line.startswith("-"):
                 clean = line.lstrip("-–").strip()
                 if clean:
-
-                    categorized_feed[topic_key].append({
+                    summaries.append({
                         "summary": clean,
                         "image_url": "https://source.unsplash.com/800x600/?news",
                         "article_link": f"https://google.com/search?q={topic_key}+news"
                     })
-    return
+    return summaries
 
 # 🚀 Main callable
 def generate_and_cache_daily_feed():
-    docs = load_all_documents()
-    if not docs:
-        print("⚠️ No documents found in vector store.")
-        return []
+    topic_docs = load_documents_by_topic()
+    feed_map = {}
 
-
-
+    for topic_key in TOPIC_KEYS:
+        summaries = summarize_topic(topic_key, topic_docs.get(topic_key, []))
+        feed_map[topic_key] = summaries
 
+    final_feed = [{"topic": topic, "feed": feed_map[topic]} for topic in TOPIC_KEYS]
     redis_client.set(REDIS_KEY, json.dumps(final_feed, ensure_ascii=False))
     print(f"✅ Cached daily feed under key '{REDIS_KEY}'")
     return final_feed
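Note: load_documents_by_topic() depends on TOPICS and TOPIC_KEYS staying aligned, because zip() pairs the two lists positionally. Neither list appears in these hunks, so the values below are hypothetical; the sketch only illustrates the pairing behavior:

    # Hypothetical stand-ins -- the real lists are defined above the first hunk.
    TOPICS = ["india news", "world news", "technology news", "finance news", "sports news"]
    TOPIC_KEYS = ["india", "world", "tech", "finance", "sports"]

    for topic, key in zip(TOPICS, TOPIC_KEYS):
        print(f"query {topic!r} -> bucket {key!r}")

zip() stops at the shorter list, so if one list gains an entry and the other does not, the extra topic is silently dropped rather than raising an error.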
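The bullet parsing in summarize_topic() can be checked in isolation. A minimal sketch, using made-up model output rather than a real call_mistral() response:

    # Mirrors the parsing loop in summarize_topic(); the sample block is illustrative.
    summary_block = "- Markets rally on rate-cut hopes\n-– Chip stocks lead the gains\nIntro line without a dash"

    summaries = []
    for line in summary_block.splitlines():
        line = line.strip()
        if line.startswith("-"):               # keep only dash-bulleted lines
            clean = line.lstrip("-–").strip()  # drop leading hyphens and en dashes
            if clean:
                summaries.append(clean)

    print(summaries)  # ['Markets rally on rate-cut hopes', 'Chip stocks lead the gains']

Note that the startswith("-") guard skips bullets that open with a bare en dash ("– ..."), even though lstrip("-–") would clean them; that behavior is the same on both sides of this diff.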
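Downstream consumers read the cached feed back under the same key. A minimal sketch, assuming a local Redis and a key name of "daily_feed" (the real REDIS_KEY and redis_client configuration are defined above these hunks and not shown here):

    import json
    import redis

    REDIS_KEY = "daily_feed"  # assumption: the real key is defined near the top of daily_feed.py
    redis_client = redis.Redis(host="localhost", port=6379, decode_responses=True)  # assumption: local instance

    raw = redis_client.get(REDIS_KEY)
    feed = json.loads(raw) if raw else []

    # Each entry mirrors what generate_and_cache_daily_feed() stores:
    # {"topic": <topic_key>, "feed": [{"summary": ..., "image_url": ..., "article_link": ...}, ...]}
    for entry in feed:
        print(f"{entry['topic']}: {len(entry['feed'])} summaries")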