Spaces:

nuseAI
/

fastAPIv2

Running

File size: 3,591 Bytes

import sys
import os
import json
from typing import List, Dict

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from components.indexers.news_indexer import get_or_build_index_from_docs
from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from components.generators.daily_feed import generate_and_cache_daily_feed
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import Document

# ✅ Set up local embedding model
# This assignment sits at module top level (outside the __main__ guard), so it
# runs on import as well as on direct execution, and it replaces `embed_model`
# on the shared `Settings` object imported from llama_index.core.settings.
# NOTE(review): presumably chosen so embeddings are computed in-process rather
# than through a hosted embedding API — confirm.
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")

# 🔐 Credentials handed to fetch_google_news, read from the environment.
# Each is None when its variable is unset; the __main__ block checks for that.
API_KEY = os.getenv("GOOGLE_API_KEY")
CSE_ID = os.getenv("GOOGLE_CX_ID")

# ✅ Search queries, one per news topic
QUERIES = [
    "India news",
    "World news",
    "Tech news",
    "Finance news",
    "Sports news",
]

# ✅ Storage locations (relative to the current working directory)
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")

def write_articles_jsonl(articles: List[Dict], file_path: str) -> None:
    """Write *articles* to *file_path* as JSON Lines (one object per line).

    The file is overwritten if it already exists. Non-ASCII characters are
    written as-is (UTF-8) rather than as ``\\uXXXX`` escapes.

    Args:
        articles: JSON-serializable dicts, written in the given order.
        file_path: Destination path. Missing parent directories are created.

    Raises:
        TypeError: If an article contains a value ``json.dumps`` cannot encode.
        OSError: If the directory or file cannot be created or written.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f:
        for article in articles:
            f.write(json.dumps(article, ensure_ascii=False) + "\n")

def build_documents(data: List[Dict]) -> List[Document]:
    """Convert article dicts into ``Document`` objects, preserving order.

    Each entry must provide the keys ``content``, ``title``, ``url``,
    ``topic`` and ``source``; a missing key raises ``KeyError``. ``content``
    becomes the document text and the other four keys become its metadata.
    """
    metadata_keys = ("title", "url", "topic", "source")

    documents: List[Document] = []
    for record in data:
        body = record["content"]
        meta = {key: record[key] for key in metadata_keys}
        documents.append(Document(text=body, metadata=meta))
    return documents

def _article_from_result(item: Dict, query: str, scrape) -> Dict:
    """Scrape one search result; return its article dict, or None if unusable."""
    # `or ""` also covers keys that are present but hold None.
    url = (item.get("link") or "").strip()
    title = (item.get("title") or "").strip()
    source = (item.get("displayLink") or "").strip()
    if not url or not title:
        return None

    print(f"\U0001F310 Scraping: {url}")
    article_text = scrape(url)
    if not article_text:
        print(f"⚠️ Skipped: {url}")
        return None

    return {
        "topic": query,
        "title": title,
        "url": url,
        "source": source,
        "content": article_text,
    }


def collect_articles(queries: List[str], api_key: str, cse_id: str, *,
                     num_results: int = 10, fetch=None, scrape=None) -> List[Dict]:
    """Search each query and scrape every result that has a URL and a title.

    Failures are isolated: a failed search skips only that query, and a failed
    scrape (or malformed result item) skips only that result, so the remaining
    links are still processed.

    Args:
        queries: Search strings; each is also stored as the article's topic.
        api_key: Passed through to the fetch callable.
        cse_id: Passed through to the fetch callable.
        num_results: Number of results requested per query.
        fetch: Callable ``(query, api_key, cse_id, num_results=...)`` returning
            a sequence of dicts with ``link``, ``title`` and ``displayLink``
            keys. Defaults to ``fetch_google_news``.
        scrape: Callable ``(url)`` returning the article text, or a falsy value
            when nothing could be extracted. Defaults to ``scrape_url``.

    Returns:
        Dicts with ``topic``, ``title``, ``url``, ``source`` and ``content``
        keys, in the order they were scraped.
    """
    # Resolved at call time so the module-level fetchers are only required
    # when the caller does not supply its own.
    fetch = fetch_google_news if fetch is None else fetch
    scrape = scrape_url if scrape is None else scrape

    articles: List[Dict] = []
    for query in queries:
        print(f"\U0001F50D Searching for: {query}")
        try:
            results = fetch(query, api_key, cse_id, num_results=num_results)
            print(f"   → Found {len(results)} links for '{query}'.")
        except Exception as e:  # best-effort: one failed search must not stop the run
            print(f"❌ Error fetching '{query}': {e}")
            continue

        for item in results:
            try:
                article = _article_from_result(item, query, scrape)
            except Exception as e:  # best-effort: keep going with the remaining links
                print(f"⚠️ Skipped result for '{query}': {e}")
                continue
            if article is not None:
                articles.append(article)

    return articles


def main() -> None:
    """Fetch, scrape, persist and index news for every query in QUERIES.

    Raises:
        EnvironmentError: If GOOGLE_API_KEY or GOOGLE_CX_ID is not set.
    """
    if not API_KEY or not CSE_ID:
        raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")

    print("\U0001F30D Fetching news URLs from Google...")
    all_articles = collect_articles(QUERIES, API_KEY, CSE_ID)

    if not all_articles:
        print("⚠️ No content scraped. Exiting.")
        return

    print(f"📝 Writing {len(all_articles)} articles to {RAW_JSON}...")
    write_articles_jsonl(all_articles, RAW_JSON)

    print("🧠 Building index...")
    documents = build_documents(all_articles)
    get_or_build_index_from_docs(documents)

    print("⚡ Generating daily feed...")
    generate_and_cache_daily_feed(documents)  # 👈 CALLS HEADLINE BUILDER

    print(f"✅ Indexed, headlines generated, and stored at: {INDEX_DIR}")


if __name__ == "__main__":
    main()