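"""Build a news vector index from Google Programmable Search results.

For each query in QUERIES, fetch result links from the Custom Search API,
scrape each page, tag the text with its query category, write everything
to RAW_FILE, then build (or load) the vector index over DATA_DIR.

Requires GOOGLE_API_KEY and GOOGLE_CX_ID to be set in the environment.
"""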
import os
import sys

# Make the project root importable when this script is run directly
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from components.indexers.news_indexer import get_or_build_index

# Small, fast sentence-transformers embedding model (downloaded on first use)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/paraphrase-MiniLM-L3-v2"
)
|
# Google Programmable Search credentials (must be set in the environment)
API_KEY = os.environ.get("GOOGLE_API_KEY")
CSE_ID = os.environ.get("GOOGLE_CX_ID")

# One search query per news category; scraped text is tagged with its query
QUERIES = [
    "India news", "World news", "Tech news", "Finance news", "Sports news",
]

# Where raw scraped text and the built index are stored
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_FILE = os.path.join(DATA_DIR, "news.txt")
|
|
def write_articles_to_file(articles, file_path):
    """Write scraped articles to file_path, separated by blank lines."""
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for article in articles:
            f.write(article.strip() + "\n\n")
|
|
if __name__ == "__main__":
    if not API_KEY or not CSE_ID:
        raise RuntimeError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")
|
print("π Fetching news URLs from Google...") |
|
|
|
all_articles = [] |
|
|
|
    for query in QUERIES:
        print(f"🔍 Searching for: {query}")
        try:
            # Each result is a Custom Search item dict; only its "link" field is used here
            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
            print(f" ✅ Found {len(results)} links for '{query}'.")

            for item in results:
                url = item.get("link", "").strip()
                if not url:
                    continue

                print(f"🌐 Scraping: {url}")
                article_text = scrape_url(url)

                if article_text:
                    # Tag each article with its query category for downstream filtering
                    tagged_text = f"[{query.upper()}]\n{article_text}"
                    print(f"Adding text to vector store ({len(tagged_text)} chars)")
                    all_articles.append(tagged_text)
                else:
                    print(f"⚠️ Skipped: {url}")

        except Exception as e:
            print(f"❌ Error fetching '{query}': {e}")
|
    if not all_articles:
        print("⚠️ No content scraped. Exiting.")
    else:
        print(f"📝 Writing {len(all_articles)} articles to {RAW_FILE}...")
        write_articles_to_file(all_articles, RAW_FILE)

        print("🧠 Building index...")
        get_or_build_index(DATA_DIR)

        print(f"✅ Indexed and stored at: {INDEX_DIR}")