"""Fetch news articles via Google search, scrape them, and build an index.

Run as a script. For every topic in ``QUERIES`` the script asks
``fetch_google_news`` for result links, scrapes each link with ``scrape_url``,
writes all scraped texts to ``RAW_FILE`` and finally calls
``get_or_build_index`` on ``DATA_DIR``.

Requires the ``GOOGLE_API_KEY`` and ``GOOGLE_CX_ID`` environment variables.
"""

import sys
import os
import json
from typing import Iterable, List, Optional

# Put the parent of this file's directory on sys.path. This has to happen
# before the `components.*` imports below, which rely on it.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from components.indexers.news_indexer import get_or_build_index
from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Configure the embedding model globally (import-time side effect, kept as in
# the original script).
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
)

# Credentials are read from the environment; both are validated in main().
API_KEY = os.environ.get("GOOGLE_API_KEY")
CSE_ID = os.environ.get("GOOGLE_CX_ID")

# News topics to search for.
QUERIES = [
    "India news",
    "World news",
    "Tech news",
    "Finance news",
    "Sports news",
]

# Number of search results requested per topic.
RESULTS_PER_QUERY = 10

# Paths.
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_FILE = os.path.join(DATA_DIR, "news.txt")


def write_articles_to_file(articles: Iterable[str], file_path: str) -> None:
    """Write *articles* to *file_path*, overwriting any existing file.

    Each article is stripped of surrounding whitespace and followed by a
    blank line. The parent directory is created if it does not exist.

    Args:
        articles: Article texts to write, in order.
        file_path: Destination file; written as UTF-8.
    """
    parent_dir = os.path.dirname(file_path)
    # os.makedirs("") raises FileNotFoundError, so skip it for bare filenames
    # that live in the current working directory.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        f.writelines(article.strip() + "\n\n" for article in articles)


def _scrape_result(item: dict, query: str) -> Optional[str]:
    """Scrape one search result and return its topic-tagged text, or None.

    Returns None when the result has no usable link or the scraper yields no
    (or only whitespace) text.
    """
    # `or ""` also covers a "link" key that is present but set to None.
    url = (item.get("link") or "").strip()
    if not url:
        return None

    print(f"🌐 Scraping: {url}")
    article_text = scrape_url(url)
    if not article_text or not article_text.strip():
        print(f"⚠️ Skipped: {url}")
        return None

    tagged_text = f"[{query.upper()}]\n{article_text}"
    # Report only the size; printing the whole article floods the console.
    print(f"Adding text to vector ({len(tagged_text)} chars)")
    return tagged_text


def _collect_articles(
    queries: Iterable[str],
    api_key: str,
    cse_id: str,
    num_results: int = RESULTS_PER_QUERY,
) -> List[str]:
    """Search every query and return the tagged texts of all scraped results.

    Failures are reported and skipped (best effort): a failed search skips
    that query, a failed scrape skips only that single result.
    """
    articles: List[str] = []
    for query in queries:
        print(f"🔍 Searching for: {query}")
        try:
            results = fetch_google_news(
                query, api_key, cse_id, num_results=num_results
            )
            print(f" → Found {len(results)} links for '{query}'.")
        except Exception as e:
            print(f"❌ Error fetching '{query}': {e}")
            continue

        for item in results:
            # Isolate each result so one bad page does not discard the rest
            # of this query's results.
            try:
                tagged_text = _scrape_result(item, query)
            except Exception as e:
                print(f"❌ Error scraping result for '{query}': {e}")
                continue
            if tagged_text is not None:
                articles.append(tagged_text)
    return articles


def main() -> None:
    """Run the fetch -> scrape -> write -> index pipeline.

    Raises:
        EnvironmentError: If GOOGLE_API_KEY or GOOGLE_CX_ID is not set.
    """
    if not API_KEY or not CSE_ID:
        raise EnvironmentError(
            "Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment."
        )

    print("🌍 Fetching news URLs from Google...")
    all_articles = _collect_articles(QUERIES, API_KEY, CSE_ID)

    if not all_articles:
        print("⚠️ No content scraped. Exiting.")
        return

    print(f"📝 Writing {len(all_articles)} articles to {RAW_FILE}...")
    write_articles_to_file(all_articles, RAW_FILE)

    print("🧠 Building index...")
    # NOTE(review): INDEX_DIR is only reported below, never passed to the
    # indexer — presumably get_or_build_index persists there by default;
    # confirm against components.indexers.news_indexer.
    get_or_build_index(DATA_DIR)
    print(f"✅ Indexed and stored at: {INDEX_DIR}")


if __name__ == "__main__":
    main()