Spaces:
Runtime error
Runtime error
Update search.py
Browse files
search.py
CHANGED
@@ -1,76 +1,54 @@
|
|
|
|
1 |
import requests
|
2 |
-
from parsel import Selector
|
3 |
from bs4 import BeautifulSoup
|
|
|
|
|
4 |
|
5 |
-
|
6 |
-
|
|
|
|
|
|
|
|
|
7 |
try:
|
8 |
headers = {
|
9 |
'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
|
10 |
'Accept-Language': 'en-US,en;q=0.9'
|
11 |
}
|
12 |
-
response = requests.get(url, timeout=
|
13 |
response.raise_for_status()
|
14 |
|
15 |
-
# Using Parsel + BeautifulSoup hybrid for efficiency
|
16 |
-
sel = Selector(text=response.text)
|
17 |
soup = BeautifulSoup(response.text, 'lxml')
|
18 |
|
19 |
-
|
20 |
-
title =
|
21 |
-
main_content = ""
|
22 |
-
|
23 |
-
# Try to find article body
|
24 |
-
for selector in ['article', '.article-body', '.post-content', '#content']:
|
25 |
-
if elem := soup.select_one(selector):
|
26 |
-
main_content = elem.get_text(separator=' ', strip=True)[:2000]
|
27 |
-
break
|
28 |
|
29 |
-
#
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
# Extract metadata
|
35 |
-
description = sel.css('meta[name="description"]::attr(content)').get() or ""
|
36 |
|
37 |
return {
|
38 |
"url": url,
|
39 |
"title": title,
|
40 |
-
"content": main_content,
|
41 |
"description": description
|
42 |
}
|
43 |
except Exception as e:
|
44 |
-
|
45 |
-
return None
|
46 |
|
47 |
def search_google(query, num_results=5):
|
48 |
"""Search with enhanced result parsing"""
|
49 |
try:
|
50 |
-
#
|
51 |
-
|
52 |
-
urls = list(search(query, num_results=num_results, advanced=True))
|
53 |
|
|
|
54 |
results = []
|
55 |
-
for
|
56 |
-
|
57 |
-
|
58 |
-
"url": result.url,
|
59 |
-
"title": result.title,
|
60 |
-
"description": result.description
|
61 |
-
}
|
62 |
-
|
63 |
-
# Only do full extraction for top 2 results
|
64 |
-
if i < 2:
|
65 |
-
if structured := extract_structured_data(result.url):
|
66 |
-
results.append(structured)
|
67 |
-
else:
|
68 |
-
results.append(base_data)
|
69 |
-
else:
|
70 |
-
results.append(base_data)
|
71 |
|
72 |
return results
|
73 |
-
except:
|
74 |
-
|
75 |
-
|
76 |
-
return [{"url": url} for url in search(query, num_results=num_results)]
|
|
|
1 |
+
from googlesearch import search
|
2 |
import requests
|
|
|
3 |
from bs4 import BeautifulSoup
|
4 |
+
import re
|
5 |
+
from functools import lru_cache
|
6 |
|
7 |
+
# Precompiled pattern matching a single HTML/XML tag such as "<p>" or
# "</div>"; intended for stripping markup from text in one regex pass.
# NOTE(review): not referenced anywhere in this visible chunk — confirm a
# caller elsewhere uses it before removing.
TAG_CLEANER = re.compile(r"<[^>]+>")
|
9 |
+
|
10 |
+
@lru_cache(maxsize=500)
def extract_metadata(url):
    """Fetch *url* and extract basic page metadata.

    Returns a dict with keys:
      "url"         -- the requested URL, echoed back
      "title"       -- the page <title> text, stripped and capped at 200
                       chars; falls back to the URL itself when absent
      "description" -- content of <meta name="description">, capped at
                       300 chars; "" when absent

    Never raises: on any failure (network error, bad status, parse
    error) a dict with an "Error: ..." title is returned so callers
    always receive a uniform shape.

    NOTE(review): lru_cache also memoizes error results, so a transient
    network failure for a URL is cached for the process lifetime —
    confirm that is acceptable for this app.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        response = requests.get(url, timeout=5, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # BUG FIX: an empty <title></title> tag makes soup.title truthy while
        # soup.title.string is None; the original unconditional .strip() then
        # raised AttributeError and the broad except below converted a
        # successfully fetched page into an "Error: ..." result. Guard both
        # the tag and its string, and fall back to the URL.
        title = url
        if soup.title and soup.title.string:
            title = soup.title.string.strip() or url
        title = title[:200]

        # Try to get description from the standard <meta name="description">.
        description = ""
        if meta_desc := soup.find("meta", attrs={"name": "description"}):
            description = meta_desc.get("content", "")[:300]

        return {
            "url": url,
            "title": title,
            "description": description
        }
    except Exception as e:
        # Best-effort boundary: surface a truncated error message instead of
        # raising, keeping the return shape uniform for callers.
        return {"url": url, "title": f"Error: {str(e)[:30]}", "description": ""}
|
|
|
38 |
|
39 |
def search_google(query, num_results=5):
    """Search with enhanced result parsing"""
    try:
        # Run the web search first, materializing the generator so any
        # search-side failure is caught here rather than mid-iteration.
        found_urls = list(search(query, num_results=num_results, advanced=False))

        # Resolve per-URL metadata, keeping only truthy results.
        return [
            meta
            for candidate in found_urls[:num_results]
            if (meta := extract_metadata(candidate))
        ]
    except Exception as e:
        print(f"Search error: {e}")
        return []
|
|