File size: 4,928 Bytes
6d24925 8e17b80 6d24925 989b675 6d24925 69210b9 6d24925 8e17b80 6d24925 8e17b80 6d24925 8e17b80 6d24925 8e17b80 d4f91e1 8e17b80 6d24925 d4f91e1 6d24925 d4f91e1 6d24925 d4f91e1 6d24925 8e17b80 6d24925 d4f91e1 6d24925 8e17b80 6d24925 d4f91e1 6d24925 d4f91e1 6d24925 d4f91e1 67fbb52 d4f91e1 69210b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import sys
import os
import json
from typing import List, Dict
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from components.indexers.news_indexer import get_or_build_index_from_docs
from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from components.generators.daily_feed import generate_and_cache_daily_feed
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import Document
# Set up local embedding model.
# Assigned at import time on the shared llama_index Settings object.
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")

# Environment variables.
# os.environ.get returns None when a variable is unset.
API_KEY = os.environ.get("GOOGLE_API_KEY")
CSE_ID = os.environ.get("GOOGLE_CX_ID")  # fixed typo

# News topics to fetch (one search query per topic).
QUERIES = [
    "India news", "World news", "Tech news", "Finance news", "Sports news"
]

# Paths
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")
def write_articles_jsonl(articles: List[Dict], file_path: str):
    """Write *articles* to *file_path* as JSON Lines (one JSON object per line).

    The file is opened in ``"w"`` mode, so existing content is replaced.
    Non-ASCII characters are written as-is (``ensure_ascii=False``) in UTF-8.

    Args:
        articles: Dicts to serialize; each must be JSON-serializable.
        file_path: Destination path. Missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError; only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for article in articles:
            f.write(json.dumps(article, ensure_ascii=False) + "\n")
import sys
import os
import json
import asyncio
from typing import List, Dict
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from components.indexers.news_indexer import get_or_build_index_from_docs
from components.fetchers.google_search import fetch_google_news
from components.fetchers.scraper import scrape_url
from components.generators.daily_feed import generate_and_cache_daily_feed
from llama_index.core.settings import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import Document
# Set up local embedding model.
# Assigned at import time on the shared llama_index Settings object.
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/paraphrase-MiniLM-L3-v2")

# Environment variables.
# os.environ.get returns None when a variable is unset.
API_KEY = os.environ.get("GOOGLE_API_KEY")
CSE_ID = os.environ.get("GOOGLE_CX_ID")

# News topics to fetch (one search query per topic).
QUERIES = [
    "India news", "World news", "Tech news", "Finance news", "Sports news"
]

# Paths
INDEX_DIR = "storage/index"
DATA_DIR = "data/news"
RAW_JSON = os.path.join(DATA_DIR, "news.jsonl")
def write_articles_jsonl(articles: List[Dict], file_path: str):
    """Write *articles* to *file_path* as JSON Lines (one JSON object per line).

    The file is opened in ``"w"`` mode, so existing content is replaced.
    Non-ASCII characters are written as-is (``ensure_ascii=False``) in UTF-8.

    Args:
        articles: Dicts to serialize; each must be JSON-serializable.
        file_path: Destination path. Missing parent directories are created.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError; only create the directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        for article in articles:
            f.write(json.dumps(article, ensure_ascii=False) + "\n")
async def build_documents(data: List[Dict]) -> List[Document]:
    """Convert article dicts into ``Document`` objects.

    Each entry's ``"content"`` becomes the document text; its ``"title"``,
    ``"url"``, ``"topic"`` and ``"source"`` values are copied into the
    document metadata under the same keys.

    Args:
        data: Article dicts; every entry must contain all five keys above.

    Returns:
        One ``Document`` per entry, in input order.

    Raises:
        KeyError: if an entry lacks any of the required keys.
    """
    metadata_fields = ("title", "url", "topic", "source")
    documents = []
    for record in data:
        body = record["content"]
        meta = {field: record[field] for field in metadata_fields}
        documents.append(Document(text=body, metadata=meta))
    return documents
async def main():
    """Fetch, scrape, persist, index, and summarize news for each topic in QUERIES.

    Steps, in order:
      1. For every query, call ``fetch_google_news`` and scrape each result
         URL with ``scrape_url``; results without a link or title, and pages
         that yield no text, are skipped.
      2. Write all scraped articles to ``RAW_JSON`` as JSON Lines.
      3. Build documents and pass them to ``get_or_build_index_from_docs``.
      4. Await ``generate_and_cache_daily_feed`` on the same documents.

    Fetching is best-effort: an error while processing one query is printed
    and the loop moves on to the next query. If nothing was scraped at all,
    the function prints a notice and returns without writing or indexing.

    Raises:
        EnvironmentError: if GOOGLE_API_KEY or GOOGLE_CX_ID is not set.
    """
    if not API_KEY or not CSE_ID:
        raise EnvironmentError("Missing GOOGLE_API_KEY or GOOGLE_CX_ID in environment.")

    print("π Fetching news URLs from Google...")
    all_articles = []
    for query in QUERIES:
        print(f"π Searching for: {query}")
        try:
            results = fetch_google_news(query, API_KEY, CSE_ID, num_results=10)
            print(f" β Found {len(results)} links for '{query}'.")
            for item in results:
                # `or ""` guards against keys that are present but None.
                url = (item.get("link") or "").strip()
                title = (item.get("title") or "").strip()
                source = (item.get("displayLink") or "").strip()
                if not url or not title:
                    continue

                print(f"π Scraping: {url}")
                try:
                    article_text = scrape_url(url)
                except Exception as e:
                    # One bad page must not discard the remaining results
                    # for this topic; report it and keep going.
                    print(f"⚠️ Skipped: {url} ({e})")
                    continue

                if article_text:
                    all_articles.append({
                        "topic": query,
                        "title": title,
                        "url": url,
                        "source": source,
                        "content": article_text,
                    })
                else:
                    print(f"⚠️ Skipped: {url}")
        except Exception as e:
            # Top-level boundary for a single query: report and continue
            # with the next topic.
            print(f"β Error fetching '{query}': {e}")

    if not all_articles:
        print("⚠️ No content scraped. Exiting.")
        return

    print(f"π Writing {len(all_articles)} articles to {RAW_JSON}...")
    write_articles_jsonl(all_articles, RAW_JSON)

    print("🧠 Building index...")
    documents = await build_documents(all_articles)
    get_or_build_index_from_docs(documents)

    print("⚡ Generating daily feed...")
    await generate_and_cache_daily_feed(documents)

    print(f"✅ Indexed, headlines generated, and stored at: {INDEX_DIR}")
# Run the async pipeline only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
|