That1BrainCell committed · verified
Commit 5fe4f2b · 1 Parent(s): 9efaf5f

Update search.py

Files changed (1)
  1. search.py +227 -227
search.py CHANGED
@@ -1,227 +1,227 @@
# Library Imports
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from duckduckgo_search import DDGS
import concurrent.futures
import re




# Search Functions -------------------------------------------------------------->

# Function to search DuckDuckGo
def search_duckduckgo(query):
    print("Fetching Duckduckgo Links -----")
    try:
        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
        return [res['href'] for res in results]
    except:
        return []

# Function to search Google
def search_google(query):
    print("Fetching Google Links -----")

    links = []
    try:
        api_key = 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
        search_engine_id = 'c4ca951b9fc6949cb'

        url = f"https://www.googleapis.com/customsearch/v1"
        params = {
            "key": api_key,
            "cx": search_engine_id,
            "q": query + " manual filetype:pdf"
        }

        response = requests.get(url, params=params)
        results = response.json()

        for item in results.get('items', []):
            links.append(item['link'])
    except:
        pass

    try:
        extension = "ext:pdf"
        for result in search(query + " manual " + extension, num_results=5):
            if result.endswith('.pdf'):
                links.append(result)
    except:
        pass

    return links

# Function to search Internet Archive
def search_archive(query):
-     print("Fetching Archieve Links -----")
+     print("Fetching Archive Links -----")

    try:
        url = "https://archive.org/advancedsearch.php"
        params = {
            'q': f'{query} manual',
            'fl[]': ['identifier', 'title', 'format'],
            'rows': 50,
            'page': 1,
            'output': 'json'
        }

        # Make the request
        response = requests.get(url, params=params)
        data = response.json()

        # Function to extract hyperlinks from a webpage
        def extract_hyperlinks(url):
            # Send a GET request to the URL
            response = requests.get(url)

            # Check if the request was successful
            if response.status_code == 200:
                # Parse the HTML content of the page
                soup = BeautifulSoup(response.text, 'html.parser')

                # Find all <a> tags (hyperlinks)
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    if href.endswith('.pdf'):
                        pdf_files.append(url+'/'+href)
                    if href.endswith('.iso'):
                        # If the link ends with .iso, follow the link and extract .pdf hyperlinks
                        extract_pdf_from_iso(url+'/'+href+'/')

        # Function to extract .pdf hyperlinks from an .iso file
        def extract_pdf_from_iso(iso_url):
            # Send a GET request to the ISO URL
            iso_response = requests.get(iso_url)

            # Check if the request was successful
            if iso_response.status_code == 200:
                # Parse the HTML content of the ISO page
                iso_soup = BeautifulSoup(iso_response.text, 'html.parser')

                # Find all <a> tags (hyperlinks) in the ISO page
                for link in iso_soup.find_all('a', href=True):
                    href = link['href']
                    if href.endswith('.pdf'):
                        pdf_files.append('https:'+href)

        pdf_files = []

        def process_doc(doc):
            identifier = doc.get('identifier', 'N/A')
            # title = doc.get('title', 'N/A')
            # format = doc.get('format', 'N/A')
            pdf_link = f"https://archive.org/download/{identifier}"
            extract_hyperlinks(pdf_link)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(process_doc, doc) for doc in data['response']['docs']]

            # Optionally, wait for all futures to complete and handle any exceptions
            for future in concurrent.futures.as_completed(futures):
                try:
                    future.result() # This will raise an exception if the function call raised
                except Exception as exc:
                    print(f'Generated an exception: {exc}')


        return pdf_files

    except:
        return []

def search_github(query):
    print("Fetching Github Links -----")

    try:
        # GitHub Search API endpoint
        url = f"https://api.github.com/search/code?q={query}+extension:md"

        headers = {
            'Authorization': 'Token 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'
        }

        # Make the request
        response = requests.get(url,headers=headers)
        data = response.json()
        links = [item['html_url'] for item in data['items']]

        return links

    except:
        return []

def search_wikipedia(product):
-     print("Fetching Duckduckgo Links -----")
+     print("Fetching Wikipedia Links -----")

    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "opensearch",
        "search": product,
        "limit": 5,
        "namespace": 0,
        "format": "json"
    }

    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx and 5xx)
        data = response.json()

        if data and len(data) > 3 and len(data[3]) > 0:
            return data[3] # The URL is in the fourth element of the response array
        else:
            return []

    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return []

# def search_all(product,num):

#     similar_products = extract_similar_products(product)[num]

#     # results = {
#     #     product : [{'duckduckgo': duckduckgo_search(product)},{'google': google_search(product)},{'github': github_search(product)},{'archive': archive_search(product)}]
#     # }

#     results = {}

#     def search_product(p):
#         return {
#             'product': p,
#             'duckduckgo': duckduckgo_search(p),
#             'google': google_search(p),
#             'github': github_search(p),
#             'archive': archive_search(p),
#             'wikipedia': wikipedia_search(p)
#         }

#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         future_to_product = {executor.submit(search_product, p): p for p in similar_products}

#         for future in concurrent.futures.as_completed(future_to_product):
#             result = future.result()
#             product = result['product']
#             results[product] = [
#                 {'duckduckgo': result['duckduckgo']},
#                 {'google': result['google']},
#                 {'github': result['github']},
#                 {'archive': result['archive']},
#                 {'wikipedia': result['wikipedia']}
#             ]

#     return results

# Similarity Check -------------------------------------->

def extract_similar_products(query):
    print(f"\nFetching similar items of -----> {query}")
    results = DDGS().chat(f'{query} Similar Products')

    pattern = r'^\d+\.\s(.+)$'
    matches = re.findall(pattern, results, re.MULTILINE)
    matches = [item.split(': ')[0] for item in matches]
    return matches
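For context, a minimal usage sketch (not part of this commit) showing how the helpers in search.py could be combined. It assumes search.py is importable as a module and that the hard-coded credentials above are valid; gather_links and the sample product name are illustrative, not defined in the repository.

# Hypothetical driver script, assuming search.py sits on the import path.
import concurrent.futures
from search import (search_duckduckgo, search_google, search_archive,
                    search_github, search_wikipedia)

def gather_links(product):
    # Query every source in parallel and collect the links per source name.
    sources = {
        'duckduckgo': search_duckduckgo,
        'google': search_google,
        'archive': search_archive,
        'github': search_github,
        'wikipedia': search_wikipedia,
    }
    links = {}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {executor.submit(fn, product): name for name, fn in sources.items()}
        for future in concurrent.futures.as_completed(futures):
            links[futures[future]] = future.result()
    return links

if __name__ == "__main__":
    print(gather_links("ThinkPad T480"))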