gaur3009 committed
Commit 399f464 · verified · 1 Parent(s): 2e65f14

Update search.py

Files changed (1)
  1. search.py +60 -20
search.py CHANGED
@@ -1,36 +1,76 @@
 import requests
 from parsel import Selector
+from bs4 import BeautifulSoup

 def extract_structured_data(url):
     """Extract clean, structured content from webpage"""
     try:
-        response = requests.get(url, timeout=10,
-                                headers={'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)'})
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
+            'Accept-Language': 'en-US,en;q=0.9'
+        }
+        response = requests.get(url, timeout=8, headers=headers)
         response.raise_for_status()

+        # Parsel + BeautifulSoup hybrid: CSS extraction plus robust text cleanup
         sel = Selector(text=response.text)
+        soup = BeautifulSoup(response.text, 'lxml')
+
+        # Extract key elements
+        title = sel.css("title::text").get() or url
+        main_content = ""
+
+        # Try common article-body containers first
+        for selector in ['article', '.article-body', '.post-content', '#content']:
+            if elem := soup.select_one(selector):
+                main_content = elem.get_text(separator=' ', strip=True)[:2000]
+                break
+
+        # Fall back to paragraph aggregation
+        if not main_content:
+            paragraphs = soup.find_all('p')
+            main_content = " ".join(p.get_text().strip() for p in paragraphs)[:2000]
+
+        # Extract metadata
+        description = sel.css('meta[name="description"]::attr(content)').get() or ""

         return {
             "url": url,
-            "title": sel.css("title::text").get() or url,
-            "content": " ".join(sel.css("p::text, article::text, .main-content::text").getall())[:5000],
-            "key_points": sel.css("li::text, .summary::text, .highlight::text").getall()[:5]
+            "title": title,
+            "content": main_content,
+            "description": description
         }
     except Exception as e:
-        print(f"Error scraping {url}: {e}")
+        print(f"Error scraping {url}: {str(e)[:100]}")
         return None

-def search_google(query, num_results=5):
-    mock_results = [
-        {"url": "https://en.wikipedia.org/wiki/Quantum_computing", "title": "Quantum Computing - Wikipedia"},
-        {"url": "https://www.ibm.com/quantum-computing", "title": "IBM Quantum Computing"},
-        {"url": "https://quantum.microsoft.com", "title": "Microsoft Quantum"}
-    ]
-
-    # Extract structured data from each URL
-    structured_results = []
-    for result in mock_results[:num_results]:
-        if data := extract_structured_data(result["url"]):
-            structured_results.append(data)
-
-    return structured_results
+def search_google(query, num_results=5):
+    """Search with enhanced result parsing"""
+    try:
+        # googlesearch-python's advanced search yields url/title/description objects
+        from googlesearch import search
+        urls = list(search(query, num_results=num_results, advanced=True))
+
+        results = []
+        for i, result in enumerate(urls[:num_results]):
+            # Use search metadata if available
+            base_data = {
+                "url": result.url,
+                "title": result.title,
+                "description": result.description
+            }
+
+            # Only do full extraction for the top 2 results
+            if i < 2:
+                if structured := extract_structured_data(result.url):
+                    results.append(structured)
+                else:
+                    results.append(base_data)
+            else:
+                results.append(base_data)
+
+        return results
+    except Exception:
+        # Fall back to simple search (URLs only)
+        from googlesearch import search
+        return [{"url": url} for url in search(query, num_results=num_results)]
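
For reference, a minimal usage sketch of the updated module (not part of the commit). It assumes search.py is importable and that the googlesearch-python, parsel, beautifulsoup4, and lxml packages are installed; the query string is illustrative.

from search import search_google

if __name__ == "__main__":
    # Hypothetical query; per the commit, the top 2 results include scraped
    # "content", later results carry only search metadata, and the fallback
    # path returns dicts with just a "url" key.
    for item in search_google("quantum computing", num_results=3):
        print(item["url"], "->", item.get("title", "(no title)"))

Deferring full page extraction to the top two results keeps latency bounded: only two HTTP fetches happen per query, while the remaining entries reuse the metadata the search call already returned.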