File size: 4,928 Bytes
6d24925
 
 
8e17b80
 
6d24925
 
989b675
6d24925
 
69210b9
6d24925
 
8e17b80
6d24925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e17b80
6d24925
8e17b80
6d24925
 
 
8e17b80
 
d4f91e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e17b80
 
 
 
 
 
 
 
 
 
 
 
6d24925
d4f91e1
 
6d24925
 
 
d4f91e1
6d24925
 
 
 
d4f91e1
6d24925
 
 
 
 
 
8e17b80
 
 
6d24925
 
d4f91e1
6d24925
 
 
8e17b80
 
 
 
 
 
 
6d24925
 
 
 
 
 
 
 
d4f91e1
6d24925
d4f91e1
 
6d24925
d4f91e1
 
 
 
 
67fbb52
d4f91e1
 
 
 
 
 
69210b9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import sys
import os
import json
from typing import List, Dict

# Append the parent of this file's directory to sys.path; presumably this is
# what lets the `components.*` imports below resolve when run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from components.indexers.news_indexer import get_or_build_index_from_docs
from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from components.generators.daily_feed import generate_and_cache_daily_feed
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import Document

# Set up the local embedding model (assigned when this module is loaded).
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")

# Environment variables: each is None when the variable is unset.
API_KEY = os.environ.get("GOOGLE_API_KEY")
CSE_ID = os.environ.get("GOOGLE_CX_ID")  # read from GOOGLE_CX_ID (earlier name typo fixed)

# News topics to fetch.
QUERIES = [
    "India news", "World news", "Tech news", "Finance news", "Sports news"
]

# Paths (relative, so they resolve against the current working directory).
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")

def write_articles_jsonl(articles: List[Dict], file_path: str) -> None:
    """Write *articles* to *file_path* as JSON Lines, one object per line.

    The file is opened in "w" mode, so any existing content is replaced.
    Non-ASCII characters are written verbatim (UTF-8) rather than escaped.

    Args:
        articles: JSON-serialisable dictionaries to write.
        file_path: Destination file; missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for article in articles:
            f.write(json.dumps(article, ensure_ascii=False) + "\n")

import sys
import os
import json
import asyncio
from typing import List, Dict

# Append the parent of this file's directory to sys.path; presumably this is
# what lets the `components.*` imports below resolve when run as a script.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from components.indexers.news_indexer import get_or_build_index_from_docs
from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from components.generators.daily_feed import generate_and_cache_daily_feed
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import Document

# Set up the local embedding model (assigned when this module is loaded).
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")

# Environment variables: each is None when the variable is unset.
API_KEY = os.environ.get("GOOGLE_API_KEY")
CSE_ID = os.environ.get("GOOGLE_CX_ID")

# News topics to fetch.
QUERIES = [
    "India news", "World news", "Tech news", "Finance news", "Sports news"
]

# Paths (relative, so they resolve against the current working directory).
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")


def write_articles_jsonl(articles: List[Dict], file_path: str) -> None:
    """Write *articles* to *file_path* as JSON Lines, one object per line.

    The file is opened in "w" mode, so any existing content is replaced.
    Non-ASCII characters are written verbatim (UTF-8) rather than escaped.

    Args:
        articles: JSON-serialisable dictionaries to write.
        file_path: Destination file; missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for article in articles:
            f.write(json.dumps(article, ensure_ascii=False) + "\n")


async def build_documents(data: List[Dict]) -> List[Document]:
    """Convert article dictionaries into ``Document`` objects.

    Each entry must provide the keys "content", "title", "url", "topic" and
    "source"; a missing key raises ``KeyError``. "content" becomes the
    document text and the other four are copied into its metadata.

    Args:
        data: Article dictionaries to convert.

    Returns:
        One ``Document`` per entry, in input order.
    """
    documents = []
    for record in data:
        body = record["content"]
        meta = {
            field: record[field]
            for field in ("title", "url", "topic", "source")
        }
        documents.append(Document(text=body, metadata=meta))
    return documents


async def main():
    """Fetch, scrape, persist and index news articles, then build the daily feed.

    For every topic in ``QUERIES`` this searches via ``fetch_google_news``,
    scrapes each result that has both a link and a title with ``scrape_url``,
    and collects the successfully scraped articles. The articles are written
    to ``RAW_JSON``, converted to documents, passed to
    ``get_or_build_index_from_docs`` and finally to
    ``generate_and_cache_daily_feed``.

    Failures are best-effort: an error while scraping one URL skips only that
    URL, and any other error while handling a query skips the rest of that
    query. If nothing was scraped the function returns without writing.

    Raises:
        EnvironmentError: If ``GOOGLE_API_KEY`` or ``GOOGLE_CX_ID`` is unset.
    """
    if not API_KEY or not CSE_ID:
        raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")

    print("🌍 Fetching news URLs from Google...")

    all_articles = []

    for query in QUERIES:
        print(f"πŸ” Searching for: {query}")
        try:
            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
            print(f"   β†’ Found {len(results)} links for '{query}'.")

            for item in results:
                url = item.get("link", "").strip()
                title = item.get("title", "").strip()
                source = item.get("displayLink", "").strip()
                if not url or not title:
                    continue

                print(f"🌐 Scraping: {url}")
                # Isolate each scrape so that one bad page does not discard
                # the remaining results of this query.
                try:
                    article_text = scrape_url(url)
                except Exception as e:
                    print(f"❌ Error scraping '{url}': {e}")
                    continue

                if not article_text:
                    print(f"⚠️ Skipped: {url}")
                    continue

                all_articles.append({
                    "topic": query,
                    "title": title,
                    "url": url,
                    "source": source,
                    "content": article_text
                })

        except Exception as e:
            # Top-level boundary per query: report the problem and move on.
            print(f"❌ Error fetching '{query}': {e}")

    if not all_articles:
        print("⚠️ No content scraped. Exiting.")
        return

    print(f"πŸ“ Writing {len(all_articles)} articles to {RAW_JSON}...")
    write_articles_jsonl(all_articles, RAW_JSON)

    print("🧠 Building index...")
    documents = await build_documents(all_articles)
    get_or_build_index_from_docs(documents)

    print("⚑ Generating daily feed...")
    await generate_and_cache_daily_feed(documents)

    print(f"βœ… Indexed, headlines generated, and stored at: {INDEX_DIR}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())