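"""Daily feed generator for Nuse.

Pulls topic-tagged news documents from the Upstash vector store, summarizes
each topic into numbered headlines with OpenAI, and caches the combined feed
in Redis for 24 hours.
"""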
import os
import json
import redis
import numpy as np
from typing import List, Dict
from openai import OpenAI
from components.indexers.news_indexer import get_upstash_vector_store
from llama_index.core.vector_stores.types import VectorStoreQuery, MetadataFilter, MetadataFilters, FilterOperator

# 🔑 Environment variables
REDIS_URL = os.environ.get("UPSTASH_REDIS_URL", "redis://localhost:6379")
REDIS_KEY = os.environ.get("UPSTASH_REDIS_TOKEN")  # not referenced below
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# ✅ Redis client
redis_client = redis.Redis.from_url(REDIS_URL, decode_responses=True)

# 📰 Topics
TOPICS = ["India news", "World news", "Tech news", "Finance news", "Sports news"]
TOPIC_KEYS = [t.lower().replace(" news", "") for t in TOPICS]
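# e.g. "India news" -> "india", "Tech news" -> "tech"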

# 🧠 Prompt for summarization
BASE_PROMPT = (
    "You are Nuse's editorial summarizer. Read the excerpts below and extract the most important stories. "
    "Return up to 3 punchy headlines, each under 20 words, and include why the story matters as the second half of the line."
)
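# Note: summarize_topic() parses the model reply line by line, so the prompt
# implicitly expects one headline per line.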

# 📥 Load documents by topic and collect references
def load_docs_by_topic_with_refs() -> Dict[str, List[Dict]]:
    topic_docs = {key: [] for key in TOPIC_KEYS}
    try:
        vector_store = get_upstash_vector_store()
        for full_topic, topic_key in zip(TOPICS, TOPIC_KEYS):
            # Filter purely on the "topic" metadata field; the query vector is a
            # random 384-dim placeholder (matching the index), so the similarity
            # ranking is arbitrary and we simply take up to 50 nodes per topic.
            filters = MetadataFilters(
                filters=[MetadataFilter(key="topic", value=topic_key, operator=FilterOperator.EQ)]
            )
            dummy_vector = np.random.rand(384).tolist()
            query = VectorStoreQuery(query_embedding=dummy_vector, similarity_top_k=50, filters=filters)
            result = vector_store.query(query)
            for node in result.nodes:
                content = node.get_content().strip()
                ref_id = node.node_id or node.id_ or ""
                if content and ref_id:
                    topic_docs[topic_key].append({"text": content, "ref": ref_id})
    except Exception as e:
        print("❌ [load_docs_by_topic_with_refs Error]", e)
    return topic_docs

# 🧪 Summarize topic with reference IDs
def summarize_topic(topic_key: str, docs: List[Dict], start_index: int) -> List[Dict]:
    if not docs:
        print(f"⚠️ No docs for topic: {topic_key}")
        return []
    try:
        # Concatenate the topic's excerpts, capped at 12,000 characters to keep
        # the request within a safe size for the model.
        content = "\n\n---\n\n".join([d["text"] for d in docs])[:12000]
        client = OpenAI(api_key=OPENAI_API_KEY)
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": BASE_PROMPT},
                {"role": "user", "content": content},
            ],
            max_tokens=512,
            temperature=0.7,
        )
        headlines = response.choices[0].message.content.strip().splitlines()
        result = []
        for i, line in enumerate(headlines):
            clean_line = line.strip("-–• ")  # drop leading/trailing dashes and bullets
            if clean_line:
                # Pair the i-th headline with the i-th source document, if one exists.
                ref_id = docs[i]["ref"] if i < len(docs) else ""
                result.append({
                    "summary": f"{start_index + i}. {clean_line}",
                    "ref": ref_id,
                    # Generic placeholder image and a search link for the topic.
                    "image_url": "https://source.unsplash.com/800x600/?news",
                    "article_link": f"https://google.com/search?q={topic_key}+news",
                })
        return result
    except Exception as e:
        print(f"❌ [Summarize topic '{topic_key}' Error]", e)
        return []

# 🚀 Generate and cache full feed
def generate_and_cache_daily_feed():
    print("🆕 Starting daily feed generation with OpenAI...")
    docs_by_topic = load_docs_by_topic_with_refs()
    all_feed = []
    counter = 1  # running headline number across all topics
    for topic, topic_key in zip(TOPICS, TOPIC_KEYS):
        try:
            summaries = summarize_topic(topic_key, docs_by_topic[topic_key], start_index=counter)
            counter += len(summaries)
            all_feed.append({"topic": topic, "feed": summaries})
        except Exception as e:
            print(f"❌ [Feed generation error for {topic_key}]", e)
            all_feed.append({"topic": topic, "feed": []})
    try:
        # Cache the combined feed for 24 hours.
        redis_client.set("daily_news_feed_cache", json.dumps(all_feed, ensure_ascii=False))
        redis_client.expire("daily_news_feed_cache", 86400)
        print("✅ Cached final feed.")
    except Exception as e:
        print("❌ [Redis caching error]", e)
    return all_feed

# 🗃️ Fetch from cache
def get_cached_daily_feed():
    try:
        data = redis_client.get("daily_news_feed_cache")
        return json.loads(data) if data else []
    except Exception as e:
        print("❌ [Cache fetch error]", e)
        return []

if __name__ == "__main__":
    feed = generate_and_cache_daily_feed()
    print(json.dumps(feed, indent=2, ensure_ascii=False))