gaur3009 committed on
Commit
513fc9a
·
verified ·
1 Parent(s): 9015035

Update search.py

Browse files
Files changed (1) hide show
  1. search.py +34 -11
search.py CHANGED
@@ -1,13 +1,36 @@
1
- from googlesearch import search
 
2
 
3
- PREFERRED_DOMAINS = [
4
- 'amazon.in', 'flipkart.com', 'wikipedia.org',
5
- 'makemytrip.com', 'yatra.com', 'skyscanner.co.in'
6
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- def search_google(query, num_results=5):
9
- raw_urls = list(search(query, num_results=num_results))
10
- # Prioritize known trusted domains
11
- preferred = [u for u in raw_urls if any(dom in u for dom in PREFERRED_DOMAINS)]
12
- others = [u for u in raw_urls if u not in preferred]
13
- return preferred + others
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from parsel import Selector
3
 
4
+ def extract_structured_data(url):
5
+ """Extract clean, structured content from webpage"""
6
+ try:
7
+ response = requests.get(url, timeout=10,
8
+ headers={'User-Agent': 'Mozilla/5.0 (compatible; ResearchBot/1.0)'})
9
+ response.raise_for_status()
10
+
11
+ sel = Selector(text=response.text)
12
+
13
+ return {
14
+ "url": url,
15
+ "title": sel.css("title::text").get() or url,
16
+ "content": " ".join(sel.css("p::text, article::text, .main-content::text").getall())[:5000],
17
+ "key_points": sel.css("li::text, .summary::text, .highlight::text").getall()[:5]
18
+ }
19
+ except Exception as e:
20
+ print(f"Error scraping {url}: {e}")
21
+ return None
22
 
23
+ def search_google(query, num_results=5):
24
+ mock_results = [
25
+ {"url": "https://en.wikipedia.org/wiki/Quantum_computing", "title": "Quantum Computing - Wikipedia"},
26
+ {"url": "https://www.ibm.com/quantum-computing", "title": "IBM Quantum Computing"},
27
+ {"url": "https://quantum.microsoft.com", "title": "Microsoft Quantum"}
28
+ ]
29
+
30
+ # Extract structured data from each URL
31
+ structured_results = []
32
+ for result in mock_results[:num_results]:
33
+ if data := extract_structured_data(result["url"]):
34
+ structured_results.append(data)
35
+
36
+ return structured_results