Spaces:
Runtime error
Runtime error
Update search.py
Browse files
search.py
CHANGED
@@ -1,36 +1,76 @@
|
|
1 |
import requests
|
2 |
from parsel import Selector
|
|
|
3 |
|
4 |
def extract_structured_data(url):
|
5 |
"""Extract clean, structured content from webpage"""
|
6 |
try:
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
9 |
response.raise_for_status()
|
10 |
|
|
|
11 |
sel = Selector(text=response.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
return {
|
14 |
"url": url,
|
15 |
-
"title":
|
16 |
-
"content":
|
17 |
-
"
|
18 |
}
|
19 |
except Exception as e:
|
20 |
-
print(f"Error scraping {url}: {e}")
|
21 |
return None
|
22 |
|
23 |
-
def search_google(query, num_results=5):
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import requests
|
2 |
from parsel import Selector
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
|
5 |
def extract_structured_data(url):
    """Extract clean, structured content from a webpage.

    Fetches *url*, parses the HTML once with BeautifulSoup, and pulls out
    the page title, the main body text (capped at 2000 chars), and the
    meta description.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        dict with keys "url", "title", "content", "description",
        or None when the fetch or parse fails for any reason.
    """
    try:
        # Identify ourselves and prefer English content.
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        response = requests.get(url, timeout=8, headers=headers)
        response.raise_for_status()

        # Parse the document ONCE. The previous revision parsed the same
        # HTML twice (Parsel Selector + BeautifulSoup) under a comment
        # claiming the hybrid was "for efficiency" — it doubled the parse
        # work; BeautifulSoup alone covers every query below.
        soup = BeautifulSoup(response.text, 'lxml')

        # Page title, falling back to the URL when <title> is missing/empty.
        title = (soup.title.get_text(strip=True) if soup.title else "") or url

        # Try common article-body containers first.
        main_content = ""
        for selector in ['article', '.article-body', '.post-content', '#content']:
            if elem := soup.select_one(selector):
                main_content = elem.get_text(separator=' ', strip=True)[:2000]
                break

        # Fallback: aggregate all paragraph text.
        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = " ".join(p.get_text().strip() for p in paragraphs)[:2000]

        # Meta description; empty string when the tag or attribute is absent.
        meta = soup.find('meta', attrs={'name': 'description'})
        description = (meta.get('content') if meta else None) or ""

        return {
            "url": url,
            "title": title,
            "content": main_content,
            "description": description
        }
    except Exception as e:
        # Best-effort scraper: report a truncated error and signal failure
        # with None so callers can fall back to search metadata.
        print(f"Error scraping {url}: {str(e)[:100]}")
        return None
|
46 |
|
47 |
+
def search_google(query, num_results=5):
|
48 |
+
"""Search with enhanced result parsing"""
|
49 |
+
try:
|
50 |
+
# Use custom search API or fallback
|
51 |
+
from googlesearch import search
|
52 |
+
urls = list(search(query, num_results=num_results, advanced=True))
|
53 |
+
|
54 |
+
results = []
|
55 |
+
for i, result in enumerate(urls[:num_results]):
|
56 |
+
# Use metadata if available
|
57 |
+
base_data = {
|
58 |
+
"url": result.url,
|
59 |
+
"title": result.title,
|
60 |
+
"description": result.description
|
61 |
+
}
|
62 |
+
|
63 |
+
# Only do full extraction for top 2 results
|
64 |
+
if i < 2:
|
65 |
+
if structured := extract_structured_data(result.url):
|
66 |
+
results.append(structured)
|
67 |
+
else:
|
68 |
+
results.append(base_data)
|
69 |
+
else:
|
70 |
+
results.append(base_data)
|
71 |
+
|
72 |
+
return results
|
73 |
+
except:
|
74 |
+
# Fallback to simple search
|
75 |
+
from googlesearch import search
|
76 |
+
return [{"url": url} for url in search(query, num_results=num_results)]
|