|
import os |
|
import json |
|
import redis |
|
import numpy as np |
|
from typing import List, Dict |
|
from openai import OpenAI |
|
from components.indexers.news_indexer import get_upstash_vector_store |
|
from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator |
|
|
|
|
|
# Connection settings and credentials, read once from the process environment
# at import time. Only the Redis URL has a fallback value; the other two are
# None when their variable is unset.
REDIS_URL = os.getenv("UPSTASH_REDIS_URL", "redis://localhost:6379")
# Not referenced anywhere else in the visible part of this module.
REDIS_KEY = os.getenv("UPSTASH_REDIS_TOKEN")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
|
|
|
|
# Module-level Redis handle shared by the cache read/write functions below.
# decode_responses=True asks redis-py to hand back str values instead of bytes.
redis_client = redis.Redis.from_url(
    REDIS_URL,
    decode_responses=True,
)
|
|
|
|
|
# Human-readable topic labels, in the order the generated feed lists them.
TOPICS = [
    "India news",
    "World news",
    "Tech news",
    "Finance news",
    "Sports news",
]

# Short lowercase key per topic (the label with " news" removed),
# index-aligned with TOPICS.
TOPIC_KEYS = [label.lower().replace(" news", "") for label in TOPICS]
|
|
|
|
|
# System prompt sent with every summarization request. The apostrophe in
# "Nuse’s" had been mis-decoded into a stray Greek letter; it is restored here
# so the model receives clean text.
BASE_PROMPT = (
    "You are Nuse’s editorial summarizer. Read the excerpts below and extract the most important stories. "
    "Return up to 3 punchy headlines, each under 20 words, and include why the story matters as the second half of the line."
)
|
|
|
|
|
def load_docs_by_topic_with_refs(*, top_k: int = 50, embedding_dim: int = 384) -> Dict[str, List[Dict]]:
    """Collect stored documents for every topic key from the vector store.

    For each key in ``TOPIC_KEYS`` the store is queried with a metadata filter
    ``topic == key``. The query embedding is random, so the result is not tied
    to any query text.
    NOTE(review): presumably this is a way to sample documents per topic
    through a similarity-only API -- confirm.

    Errors are reported with ``print`` and never raised. A failure on one
    topic leaves that topic's list as collected so far and does not stop the
    remaining topics from being queried.

    Args:
        top_k: Value passed as ``similarity_top_k`` for each topic query.
        embedding_dim: Length of the random query vector.
            NOTE(review): presumably must match the index dimension -- confirm.

    Returns:
        A dict with one entry per topic key. Each value is a list of
        ``{"text": <node content>, "ref": <node id>}`` dicts; nodes with empty
        content or no id are skipped.
    """
    topic_docs: Dict[str, List[Dict]] = {key: [] for key in TOPIC_KEYS}

    # Without a store nothing can be queried; return the empty skeleton.
    try:
        vector_store = get_upstash_vector_store()
    except Exception as e:
        print("β [load_docs_by_topic_with_refs Error]", e)
        return topic_docs

    for topic_key in TOPIC_KEYS:
        # Per-topic try: one bad query must not discard the other topics.
        try:
            filters = MetadataFilters(
                filters=[MetadataFilter(key="topic", value=topic_key, operator=FilterOperator.EQ)]
            )
            dummy_vector = np.random.rand(embedding_dim).tolist()
            query = VectorStoreQuery(
                query_embedding=dummy_vector,
                similarity_top_k=top_k,
                filters=filters,
            )
            result = vector_store.query(query)

            # The query result's ``nodes`` field is optional; treat None as empty.
            for node in result.nodes or []:
                content = node.get_content().strip()
                ref_id = node.node_id or node.id_ or ""
                if content and ref_id:
                    topic_docs[topic_key].append({"text": content, "ref": ref_id})
        except Exception as e:
            print("β [load_docs_by_topic_with_refs Error]", f"topic={topic_key}:", e)

    return topic_docs
|
|
|
|
|
def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
    """Ask the chat model for headlines about one topic and wrap them as feed items.

    The ``"text"`` fields of ``docs`` are joined with ``---`` separators,
    truncated to 12000 characters, and sent as the user message alongside
    ``BASE_PROMPT``. Each non-empty line of the reply becomes one feed item.

    Args:
        topic_key: Short topic key; used in log output and in the search link.
        docs: Dicts with ``"text"`` and ``"ref"`` keys for this topic.
        start_index: Number given to the first headline. Later headlines are
            numbered consecutively with no gaps, so a caller can advance its
            counter by ``len(result)``.

    Returns:
        A list of dicts with keys ``"summary"``, ``"ref"``, ``"image_url"`` and
        ``"article_link"``. Returns ``[]`` when ``docs`` is empty or when any
        error occurs (the error is printed, not raised).
    """
    if not docs:
        print(f"β οΈ No docs for topic: {topic_key}")
        return []

    try:
        content = "\n\n---\n\n".join(d["text"] for d in docs)[:12000]
        client = OpenAI(api_key=OPENAI_API_KEY)
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": BASE_PROMPT},
                {"role": "user", "content": content},
            ],
            max_tokens=512,
            temperature=0.7,
        )

        # The message content can be None; treat that as an empty reply.
        raw_text = response.choices[0].message.content or ""

        # Strip list-marker characters (hyphen, en dash, bullet) and spaces
        # from both ends, then drop blank lines BEFORE numbering. Numbering
        # over the raw lines left gaps whenever the reply contained blank
        # lines, which made numbers collide with the next topic's range.
        headlines = [line.strip("-–• ") for line in raw_text.strip().splitlines()]
        headlines = [headline for headline in headlines if headline]

        result = []
        for offset, headline in enumerate(headlines):
            # NOTE(review): the ref is assigned purely by position (headline k
            # gets docs[k]); nothing here ties a headline to the document it
            # was actually derived from.
            ref_id = docs[offset]["ref"] if offset < len(docs) else ""
            result.append({
                "summary": f"{start_index + offset}. {headline}",
                "ref": ref_id,
                "image_url": "https://source.unsplash.com/800x600/?news",
                "article_link": f"https://google.com/search?q={topic_key}+news",
            })
        return result
    except Exception as e:
        print(f"β [Summarize topic '{topic_key}' Error]", e)
        return []
|
|
|
|
|
def generate_and_cache_daily_feed():
    """Build the feed for every topic and cache it in Redis for 24 hours.

    Documents are loaded once via ``load_docs_by_topic_with_refs``. Each topic
    is then summarized with ``summarize_topic``, with headline numbers running
    continuously across topics. The combined feed is stored as JSON under the
    key ``"daily_news_feed_cache"`` with an 86400-second expiry.

    Returns:
        A list with one ``{"topic": <label>, "feed": [...]}`` dict per entry of
        ``TOPICS``, in order. A topic whose summarization fails gets an empty
        ``"feed"``. The list is returned even if writing to Redis fails (the
        error is printed).
    """
    print("π Starting daily feed generation with OpenAI...")
    docs_by_topic = load_docs_by_topic_with_refs()

    all_feed = []
    counter = 1  # number assigned to the next headline, across all topics
    for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
        try:
            summaries = summarize_topic(topic_key, docs_by_topic[topic_key], start_index=counter)
            counter += len(summaries)
        except Exception as e:
            print(f"β [Feed generation error for {topic_key}]", e)
            summaries = []
        all_feed.append({"topic": topic, "feed": summaries})

    try:
        # Value and TTL are written in one command. A separate expire() call
        # could fail after set() succeeded and leave a key that never expires.
        redis_client.set(
            "daily_news_feed_cache",
            json.dumps(all_feed, ensure_ascii=False),
            ex=86400,
        )
        # This literal had been split across two lines by a stray line-break
        # character inside the string; it is rejoined here.
        print("✅ Cached final feed.")
    except Exception as e:
        print("β [Redis caching error]", e)

    return all_feed
|
|
|
|
|
def get_cached_daily_feed():
    """Return the feed stored under ``"daily_news_feed_cache"``.

    Returns:
        The JSON-decoded cached value, or ``[]`` when the key is missing or
        empty, or when reading or decoding fails (the error is printed).
    """
    try:
        cached = redis_client.get("daily_news_feed_cache")
        if not cached:
            return []
        return json.loads(cached)
    except Exception as err:
        print("β [Cache fetch error]", err)
        return []
|
|
|
# Script entry point: regenerate and cache the feed, then dump it to stdout
# as indented JSON with non-ASCII characters left unescaped.
if __name__ == "__main__":
    generated_feed = generate_and_cache_daily_feed()
    print(json.dumps(generated_feed, indent=2, ensure_ascii=False))
|
|