# Spaces: Runtime error
import requests | |
from parsel import Selector | |
def _normalize_fragments(fragments):
    """Collapse whitespace inside each text fragment and drop empty ones."""
    collapsed = (" ".join(fragment.split()) for fragment in fragments)
    return [text for text in collapsed if text]


def extract_structured_data(url):
    """Fetch a web page and extract a small, whitespace-normalised summary.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``"url"``, ``"title"``, ``"content"`` (at most
        5000 characters) and ``"key_points"`` (at most five strings), or
        ``None`` if the download or the extraction raised. Failures are
        reported with ``print`` rather than propagated.
    """
    try:
        response = requests.get(
            url,
            timeout=10,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)'},
        )
        response.raise_for_status()
        sel = Selector(text=response.text)

        # Strip the title so a whitespace-only <title> still falls back to
        # the URL instead of yielding a blank heading.
        title = " ".join((sel.css("title::text").get() or "").split())

        # ``::text`` returns raw text nodes, including the whitespace-only
        # ones between child tags. Normalise before joining/slicing so the
        # 5000-character and five-item budgets are spent on real text.
        body_fragments = _normalize_fragments(
            sel.css("p::text, article::text, .main-content::text").getall()
        )
        key_points = _normalize_fragments(
            sel.css("li::text, .summary::text, .highlight::text").getall()
        )

        return {
            "url": url,
            "title": title or url,
            "content": " ".join(body_fragments)[:5000],
            "key_points": key_points[:5],
        }
    except Exception as e:
        # Deliberately best-effort: one page that cannot be scraped must not
        # abort the caller's loop over several URLs.
        print(f"Error scraping {url}: {e}")
        return None
def search_google(query, num_results=5):
    """Return structured page data for a fixed list of placeholder hits.

    ``query`` is accepted but not used: the candidate pages are hard-coded.
    At most ``num_results`` of them are passed, in order, to
    ``extract_structured_data``; pages for which it returns a falsy value
    are left out of the returned list.
    """
    placeholder_hits = [
        {"url": "https://en.wikipedia.org/wiki/Quantum_computing", "title": "Quantum Computing - Wikipedia"},
        {"url": "https://www.ibm.com/quantum-computing", "title": "IBM Quantum Computing"},
        {"url": "https://quantum.microsoft.com", "title": "Microsoft Quantum"},
    ]
    selected_urls = [hit["url"] for hit in placeholder_hits[:num_results]]
    # map() is lazy, so pages are still fetched one at a time, in order.
    scraped_pages = map(extract_structured_data, selected_urls)
    return [page for page in scraped_pages if page]