ginipick committed · verified
Commit 1326112 · 1 Parent(s): b57eaf6

Update app.py

Files changed (1)
  1. app.py +117 -50
app.py CHANGED
@@ -207,6 +207,7 @@ def generate_mock_search_results(query):
 
 # Google search function (direct search instead of SerpAPI)
 # Google search function (parses the results with BeautifulSoup)
+# Improved Google search function
 def do_google_search(query, num_results=5):
     try:
         # Use a realistic User-Agent (helps avoid being blocked by Google)
@@ -214,19 +215,14 @@ def do_google_search(query, num_results=5):
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
-            'Accept-Encoding': 'gzip, deflate, br',
             'Referer': 'https://www.google.com/',
-            'DNT': '1',
-            'Connection': 'keep-alive',
-            'Upgrade-Insecure-Requests': '1',
-            'Cache-Control': 'max-age=0',
         }
 
-        # Search URL (with a few extra parameters)
+        # Search URL
         search_url = f"https://www.google.com/search?q={query}&num={num_results}&hl=ko&gl=kr"
         logging.info(f"Google search URL: {search_url}")
 
-        # Send the request (with a short timeout)
+        # Send the request
         response = requests.get(search_url, headers=headers, timeout=10)
 
         # Check whether the response succeeded
@@ -234,62 +230,133 @@ def do_google_search(query, num_results=5):
             logging.error(f"Google search response status code: {response.status_code}")
             return generate_mock_search_results(query)
 
+        # Save the HTML so the parsing can be debugged
+        with open("google_response.html", "w", encoding="utf-8") as f:
+            f.write(response.text)
+        logging.info("Saved the Google response HTML to 'google_response.html'.")
+
         # Parse the HTML with BeautifulSoup
         soup = BeautifulSoup(response.text, 'html.parser')
 
         # Extract the search results
         organic_results = []
 
-        # Find the result containers (may change with Google's HTML structure)
-        result_containers = soup.select('div.g')
+        # Try the 2023-2025 Google search result selectors (a range of candidates)
+        selectors = [
+            'div.g', 'div.Gx5Zad', 'div.tF2Cxc', 'div.yuRUbf',
+            '.v5yQqb', '.MjjYud', 'div[jscontroller]', '.fP1Qef',
+            'div[data-sokoban-container]', '.hlcw0c'
+        ]
 
-        if not result_containers:
-            logging.warning("Could not find the Google result containers. Trying a fallback selector.")
-            # Try a fallback selector
-            result_containers = soup.select('div[data-hveid]')
+        # Try every selector
+        containers = []
+        for selector in selectors:
+            elements = soup.select(selector)
+            if elements:
+                logging.info(f"Found {len(elements)} elements with selector '{selector}'.")
+                containers.extend(elements)
+                # Stop once enough elements have been found
+                if len(containers) >= num_results * 2:  # collect twice as many to allow for duplicates
+                    break
 
-        counter = 0
-        for container in result_containers:
-            if counter >= num_results:
-                break
+        # Fallback: keep only <a> tags whose href starts with http
+        if not containers:
+            logging.warning("No structured containers found; scanning links directly.")
+            link_elements = soup.select('a[href^="http"]')
+
+            for link in link_elements:
+                if len(organic_results) >= num_results:
+                    break
+
+                href = link.get('href', '')
+                # Filter out Google redirect URLs
+                if '/url?' in href or 'google.com' in href:
+                    continue
 
-            # Extract the title
-            title_element = container.select_one('h3')
-            if not title_element:
-                continue
+                # Try to pull text out of the parent element
+                parent = link.parent
+                title = link.get_text(strip=True) or "No title"
 
-            title = title_element.get_text()
-
-            # Extract the link
-            link_element = container.select_one('a')
-            if not link_element:
-                continue
+                # Only add a result when the text is long enough
+                if len(title) > 5:  # meaningful titles usually run past 5 characters
+                    # Try to extract the surrounding text
+                    surrounding_text = ""
+                    for sibling in parent.next_siblings:
+                        if sibling.name and sibling.get_text(strip=True):
+                            surrounding_text = sibling.get_text(strip=True)
+                            break
+
+                    # Add the result
+                    organic_results.append({
+                        "title": title,
+                        "link": href,
+                        "snippet": surrounding_text or "No further description",
+                        "displayed_link": href.split('/')[2] if '/' in href else href
+                    })
+
+        # Selector-based parsing
+        else:
+            for container in containers:
+                if len(organic_results) >= num_results:
+                    break
 
-            link = link_element.get('href', '')
-            if link.startswith('/url?'):
-                # Pull the real URL out of Google's redirect URL
-                link = link.split('q=')[1].split('&')[0] if 'q=' in link else link
-            elif not link.startswith('http'):
-                continue
-
-            # Extract the snippet
-            snippet_element = container.select_one('div.VwiC3b') or container.select_one('span.aCOpRe')
-            snippet = snippet_element.get_text() if snippet_element else "No description"
-
-            # Extract the displayed link
-            displayed_link_element = container.select_one('cite')
-            displayed_link = displayed_link_element.get_text() if displayed_link_element else link
-
-            organic_results.append({
-                "title": title,
-                "link": link,
-                "snippet": snippet,
-                "displayed_link": displayed_link
-            })
-            counter += 1
+                # 1. Find the link
+                link_element = container.select_one('a[href^="http"]')
+                if not link_element:
+                    continue
+
+                link = link_element.get('href', '')
+                # Skip Google redirect URLs
+                if '/url?' in link or 'google.com' in link:
+                    continue
+
+                # 2. Find the title (various selectors)
+                title_selectors = ['h3', '.LC20lb', '.DKV0Md', '.l', '.vvjwJb']
+                title = None
+                for selector in title_selectors:
+                    title_element = container.select_one(selector)
+                    if title_element and title_element.get_text(strip=True):
+                        title = title_element.get_text(strip=True)
+                        break
+
+                if not title:
+                    title = link_element.get_text(strip=True) or "No title"
+
+                # 3. Find the snippet (various selectors)
+                snippet_selectors = ['.VwiC3b', '.lyLwlc', '.yXK7lf', '.lEBKkf', '.s', '.st']
+                snippet = "No description"
+                for selector in snippet_selectors:
+                    snippet_element = container.select_one(selector)
+                    if snippet_element and snippet_element.get_text(strip=True):
+                        snippet = snippet_element.get_text(strip=True)
+                        break
+
+                # 4. Find the displayed link
+                displayed_link_selectors = ['cite', '.UPmit', '.qLRx3b', '.iUh30']
+                displayed_link = link.split('/')[2] if '/' in link else link
+                for selector in displayed_link_selectors:
+                    element = container.select_one(selector)
+                    if element and element.get_text(strip=True):
+                        displayed_link = element.get_text(strip=True)
+                        break
+
+                # Skip duplicates
+                is_duplicate = False
+                for result in organic_results:
+                    if result["link"] == link or result["title"] == title:
+                        is_duplicate = True
+                        break
+
+                if not is_duplicate:
+                    organic_results.append({
+                        "title": title,
+                        "link": link,
+                        "snippet": snippet,
+                        "displayed_link": displayed_link
+                    })
 
         if not organic_results:
-            logging.warning("Could not parse the search results. The selectors may have changed.")
+            logging.warning("Could not parse the search results. Returning mock results.")
            return generate_mock_search_results(query)
 
        # Convert the search results to markdown format
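
A note on the headers: both versions keep the comment about varying the User-Agent while always sending the same hardcoded Chrome string. If rotation were actually wanted, the usual approach is to pick from a small pool per request; a minimal sketch (the pool and the random.choice approach are an assumption, not something this commit does):

import random

# Assumed rotation, not in the commit: the diff always sends one Chrome UA.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
]
headers = {'User-Agent': random.choice(USER_AGENTS)}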
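
A note on the removed redirect handling: the old hunk unwrapped Google's /url?q=... redirect links with link.split('q=')[1].split('&')[0], which keeps the target percent-encoded and depends on parameter order. A minimal sketch of the same unwrapping done with the standard library (unwrap_google_redirect is an illustrative name, not a helper in app.py):

from urllib.parse import urlparse, parse_qs

# Illustrative helper, not part of this commit: unwrap a Google
# '/url?q=...' redirect href. parse_qs percent-decodes the target
# and does not care about parameter order.
def unwrap_google_redirect(href):
    if href.startswith('/url?'):
        target = parse_qs(urlparse(href).query).get('q')
        if target:
            return target[0]
    return href

# The naive split would return 'https://example.com/a%3Fb' here:
print(unwrap_google_redirect('/url?q=https://example.com/a%3Fb&sa=U'))
# -> https://example.com/a?b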
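
The title, snippet, and displayed-link extraction in the new hunk all repeat one pattern: walk a list of candidate CSS selectors and keep the first non-empty match. A sketch of that pattern as a single helper (first_text is illustrative, not in app.py):

from bs4 import BeautifulSoup

# Illustrative helper: stripped text of the first selector that matches
# `container` with non-empty text, else `default`.
def first_text(container, selectors, default=""):
    for sel in selectors:
        el = container.select_one(sel)
        if el and el.get_text(strip=True):
            return el.get_text(strip=True)
    return default

# Usage mirroring the title and displayed-link cascades in the diff:
html = '<div class="g"><h3>Example result</h3><cite>example.com</cite></div>'
container = BeautifulSoup(html, 'html.parser').select_one('div.g')
print(first_text(container, ['h3', '.LC20lb', '.DKV0Md']))   # Example result
print(first_text(container, ['cite', '.UPmit'], 'no cite'))  # example.com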
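
The added selector-based branch deduplicates each new result by scanning every earlier one, which is quadratic in the number of results. That is harmless at num_results=5; if the count grew, a pair of seen-sets gives the same first-occurrence-wins behavior in constant time per item (dedupe_results is an illustrative name):

# Illustrative sketch, not part of this commit: order-preserving
# dedupe over the organic_results dicts built in the diff.
def dedupe_results(results):
    seen_links, seen_titles, unique = set(), set(), []
    for r in results:
        if r["link"] in seen_links or r["title"] in seen_titles:
            continue  # same link or same title already kept
        seen_links.add(r["link"])
        seen_titles.add(r["title"])
        unique.append(r)
    return unique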