gaur3009 commited on
Commit
1ac9fd6
·
verified ·
1 Parent(s): 768d1ad

Update search.py

Browse files
Files changed (1) hide show
  1. search.py +26 -48
search.py CHANGED
@@ -1,76 +1,54 @@
 
1
  import requests
2
- from parsel import Selector
3
  from bs4 import BeautifulSoup
 
 
4
 
5
def extract_structured_data(url):
    """Extract clean, structured content from a webpage.

    Args:
        url: Address of the page to fetch.

    Returns:
        dict with keys "url", "title", "content", "description",
        or None when the request or parsing fails.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        response = requests.get(url, timeout=8, headers=headers)
        response.raise_for_status()

        # Parse once with BeautifulSoup. The previous Parsel + BS4 "hybrid"
        # parsed the identical document twice — pure overhead, not efficiency.
        soup = BeautifulSoup(response.text, 'lxml')

        # <title> text, falling back to the URL; guard a bare <title/>
        # whose .string is None before calling .strip().
        title = url
        if soup.title and soup.title.string:
            title = soup.title.string.strip() or url

        main_content = ""
        # Try common article-body containers first; stop at the first hit.
        for selector in ['article', '.article-body', '.post-content', '#content']:
            if elem := soup.select_one(selector):
                main_content = elem.get_text(separator=' ', strip=True)[:2000]
                break

        # Fallback: aggregate all paragraph text, capped at 2000 chars.
        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = " ".join(p.get_text().strip() for p in paragraphs)[:2000]

        # <meta name="description"> content; empty string when absent.
        description = ""
        if meta_desc := soup.find("meta", attrs={"name": "description"}):
            description = meta_desc.get("content", "") or ""

        return {
            "url": url,
            "title": title,
            "content": main_content,
            "description": description
        }
    except Exception as e:
        # Best-effort scraper: report the failure and signal it with None.
        print(f"Error scraping {url}: {str(e)[:100]}")
        return None
46
 
47
def search_google(query, num_results=5):
    """Search with enhanced result parsing.

    Returns a list of result dicts. The top two hits get a full page
    extraction (to bound latency); the rest only carry the metadata
    supplied by the search API. Falls back to a plain URL list when
    advanced search fails.
    """
    try:
        from googlesearch import search
        urls = list(search(query, num_results=num_results, advanced=True))

        results = []
        for i, result in enumerate(urls[:num_results]):
            # Metadata the search API already provides.
            base_data = {
                "url": result.url,
                "title": result.title,
                "description": result.description
            }
            # Full extraction only for the top 2 results; fall back to
            # base_data when extraction returns None.
            structured = extract_structured_data(result.url) if i < 2 else None
            results.append(structured or base_data)

        return results
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed. Fallback to simple (non-advanced) search.
        from googlesearch import search
        return [{"url": url} for url in search(query, num_results=num_results)]
 
1
+ from googlesearch import search
2
  import requests
 
3
  from bs4 import BeautifulSoup
4
+ import re
5
+ from functools import lru_cache
6
 
7
# Precompiled regex matching anything shaped like an HTML/XML tag ("<...>"),
# for stripping markup with TAG_CLEANER.sub("", text).
TAG_CLEANER = re.compile(r"<[^>]+>")
9
+
10
# NOTE(review): lru_cache memoizes error results too — a transient network
# failure for a URL stays cached for the life of the process.
@lru_cache(maxsize=500)
def extract_metadata(url):
    """Extract title and description from URL.

    Always returns a dict with "url", "title", "description" keys;
    on failure the title carries a truncated error message.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }
        response = requests.get(url, timeout=5, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # Guard both a missing <title> and an empty one: <title/> has
        # .string == None, and the old .string.strip() raised
        # AttributeError, mislabeling a fine page as "Error: ...".
        if soup.title and soup.title.string:
            title = soup.title.string.strip()[:200]
        else:
            title = url[:200]

        # <meta name="description"> content, capped at 300 chars.
        description = ""
        if meta_desc := soup.find("meta", attrs={"name": "description"}):
            description = (meta_desc.get("content") or "")[:300]

        return {
            "url": url,
            "title": title,
            "description": description
        }
    except Exception as e:
        # Never raise to the caller; encode the failure in the result.
        return {"url": url, "title": f"Error: {str(e)[:30]}", "description": ""}
 
38
 
39
def search_google(query, num_results=5):
    """Search with enhanced result parsing"""
    try:
        # Collect candidate URLs from the search backend.
        found_urls = list(search(query, num_results=num_results, advanced=False))

        # Attach title/description metadata to each hit.
        collected = []
        for link in found_urls[:num_results]:
            if metadata := extract_metadata(link):
                collected.append(metadata)

        return collected
    except Exception as e:
        print(f"Search error: {e}")
        return []