# Page metadata captured alongside this file (not part of the program):
#   Spaces: Runtime error
#   File size: 1,404 Bytes
#   Commits: 513fc9a e1f9ee6 513fc9a 547b150 513fc9a
import requests
from parsel import Selector
def _clean_text_nodes(nodes):
    """Collapse whitespace in each text node and drop nodes that end up empty."""
    collapsed = (" ".join(node.split()) for node in nodes)
    return [text for text in collapsed if text]


def extract_structured_data(url):
    """Fetch a web page and extract a small, structured summary of it.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``"url"``, ``"title"``, ``"content"`` (at most
        5000 characters) and ``"key_points"`` (at most 5 strings), or
        ``None`` when fetching or parsing fails. Failures are printed, never
        raised.
    """
    try:
        response = requests.get(
            url,
            timeout=10,
            headers={'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)'},
        )
        response.raise_for_status()
        sel = Selector(text=response.text)

        # Strip before the fallback so a whitespace-only <title> still
        # falls back to the URL.
        title = (sel.css("title::text").get() or "").strip()

        # Text nodes often consist only of the newlines/indentation around
        # child tags; without cleaning they pollute the content and can use
        # up all of the key-point slots.
        paragraphs = _clean_text_nodes(
            sel.css("p::text, article::text, .main-content::text").getall()
        )
        key_points = _clean_text_nodes(
            sel.css("li::text, .summary::text, .highlight::text").getall()
        )

        return {
            "url": url,
            "title": title or url,
            "content": " ".join(paragraphs)[:5000],
            "key_points": key_points[:5],
        }
    except Exception as e:
        # Deliberately broad: scraping is best-effort, so any network, HTTP
        # or parsing problem is reported and turned into a None result.
        print(f"Error scraping {url}: {e}")
        return None
def search_google(query: str, num_results: int = 5) -> list:
    """Return structured page data for the top results of a search.

    NOTE: this is currently a mock. ``query`` is ignored and a fixed list of
    result URLs is used in place of a real search.

    Args:
        query: Search terms (unused by the mock implementation).
        num_results: Maximum number of results to scrape. Zero or negative
            values produce an empty list.

    Returns:
        A list of the dicts produced by ``extract_structured_data``, one per
        result that was scraped successfully; failed pages are skipped.
    """
    mock_results = [
        {"url": "https://en.wikipedia.org/wiki/Quantum_computing", "title": "Quantum Computing - Wikipedia"},
        {"url": "https://www.ibm.com/quantum-computing", "title": "IBM Quantum Computing"},
        {"url": "https://quantum.microsoft.com", "title": "Microsoft Quantum"},
    ]

    # Clamp so a negative count means "no results" rather than being read as
    # a from-the-end slice bound (e.g. [: -1] would return all but the last).
    limit = max(num_results, 0)

    # Extract structured data from each URL
    structured_results = []
    for result in mock_results[:limit]:
        if data := extract_structured_data(result["url"]):
            structured_results.append(data)
    return structured_results