"""Fetch news articles via Google Custom Search, scrape their text, and build a vector index."""

import os
import sys

# Make the project root importable so the `components` packages resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from components.indexers.news_indexer import get_or_build_index
from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# βœ… Set up local embedding model
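# A small sentence-transformers model keeps embedding fully local (no external API calls).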
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")

# πŸ” Environment variables
API_KEY = os.environ.get("GOOGLE_API_KEY")
CSE_ID = os.environ.get("GOOGLE_CX_ID")  # Programmable Search Engine ID (the "cx" parameter)

# βœ… News topics to fetch
QUERIES = [
    "India news", "World news", "Tech news", "Finance news", "Sports news"
]

# βœ… Paths
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_FILE = os.path.join(DATA_DIR, "news.txt")

def write_articles_to_file(articles, file_path):
    """Write the scraped articles to a single text file, separated by blank lines."""
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for article in articles:
            f.write(article.strip() + "\n\n")

if __name__ == "__main__":
    if not API_KEY or not CSE_ID:
        raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")

    print("🌍 Fetching news URLs from Google...")

    all_articles = []

    for query in QUERIES:
        print(f"πŸ” Searching for: {query}")
        try:
            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
            print(f"   β†’ Found {len(results)} links for '{query}'.")

            for item in results:
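                # Each Custom Search result is a dict; only its "link" field is used here.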
                url = item.get("link", "").strip()
                if not url:
                    continue

                print(f"🌐 Scraping: {url}")
                article_text = scrape_url(url)

                if article_text:
                    tagged_text = f"[{query.upper()}]\n{article_text}"
                    print(f"➕ Adding article ({len(tagged_text)} chars) for '{query}'")
                    all_articles.append(tagged_text)
                else:
                    print(f"⚠️ Skipped: {url}")

        except Exception as e:
            print(f"❌ Error fetching '{query}': {e}")

    if not all_articles:
        print("⚠️ No content scraped. Exiting.")
    else:
        print(f"πŸ“ Writing {len(all_articles)} articles to {RAW_FILE}...")
        write_articles_to_file(all_articles, RAW_FILE)

        print("🧠 Building index...")
        get_or_build_index(DATA_DIR)

        print(f"βœ… Indexed and stored at: {INDEX_DIR}")