ginipick committed on
Commit 3070ad9 · verified · 1 Parent(s): 1b0e9b2

Update app.py

Files changed (1)
  1. app.py +101 -8
app.py CHANGED
@@ -10,6 +10,7 @@ import tempfile
 import base64
 from datetime import datetime
 import re
+from bs4 import BeautifulSoup  # BeautifulSoup added for HTML parsing

 # Logging configuration
 logging.basicConfig(
@@ -205,15 +206,24 @@ def generate_mock_search_results(query):
     return notice + "\n".join(summary_lines)

 # Google search function (direct search instead of SerpAPI)
+# Google search function (parses results with BeautifulSoup)
 def do_google_search(query, num_results=5):
     try:
-        # Basic header setup (to look like a browser)
+        # Use a browser-like User-Agent and full headers (to prevent blocking by Google)
         headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+            'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Referer': 'https://www.google.com/',
+            'DNT': '1',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Cache-Control': 'max-age=0',
         }

-        # Build the search URL
-        search_url = f"https://www.google.com/search?q={query}&num={num_results}"
+        # Search URL (with extra parameters added)
+        search_url = f"https://www.google.com/search?q={query}&num={num_results}&hl=ko&gl=kr"
         logging.info(f"Google search URL: {search_url}")

         # Send the request (with a short timeout)
@@ -224,10 +234,92 @@ def do_google_search(query, num_results=5):
             logging.error(f"Google search response status code: {response.status_code}")
             return generate_mock_search_results(query)

-        # HTML parsing here is complex, so in practice a library like BeautifulSoup should be used
-        # For simple demo purposes, return mock data instead
-        logging.info("Google search succeeded, but returning mock data because of parsing limitations")
-        return generate_mock_search_results(query)
+        # Parse the HTML with BeautifulSoup
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Extracted search results
+        organic_results = []
+
+        # Find the search result containers (may change with Google's HTML structure)
+        result_containers = soup.select('div.g')
+
+        if not result_containers:
+            logging.warning("Could not find Google search result containers. Trying an alternative selector.")
+            # Try an alternative selector
+            result_containers = soup.select('div[data-hveid]')
+
+        counter = 0
+        for container in result_containers:
+            if counter >= num_results:
+                break
+
+            # Extract the title
+            title_element = container.select_one('h3')
+            if not title_element:
+                continue
+
+            title = title_element.get_text()
+
+            # Extract the link
+            link_element = container.select_one('a')
+            if not link_element:
+                continue
+
+            link = link_element.get('href', '')
+            if link.startswith('/url?'):
+                # Extract the actual URL from Google's redirect URL
+                link = link.split('q=')[1].split('&')[0] if 'q=' in link else link
+            elif not link.startswith('http'):
+                continue
+
+            # Extract the snippet
+            snippet_element = container.select_one('div.VwiC3b') or container.select_one('span.aCOpRe')
+            snippet = snippet_element.get_text() if snippet_element else "No description"
+
+            # Extract the displayed link
+            displayed_link_element = container.select_one('cite')
+            displayed_link = displayed_link_element.get_text() if displayed_link_element else link
+
+            organic_results.append({
+                "title": title,
+                "link": link,
+                "snippet": snippet,
+                "displayed_link": displayed_link
+            })
+            counter += 1
+
+        if not organic_results:
+            logging.warning("Could not parse the search results. The selectors may have changed.")
+            return generate_mock_search_results(query)
+
+        # Convert the search results to Markdown
+        summary_lines = []
+        for idx, item in enumerate(organic_results, start=1):
+            title = item.get("title", "No title")
+            link = item.get("link", "#")
+            snippet = item.get("snippet", "No description")
+            displayed_link = item.get("displayed_link", link)
+
+            summary_lines.append(
+                f"### Result {idx}: {title}\n\n"
+                f"{snippet}\n\n"
+                f"**Source**: [{displayed_link}]({link})\n\n"
+                f"---\n"
+            )
+
+        # Add explicit instructions for the model
+        instructions = """
+# Web Search Results
+Below are the search results. Use this information when answering the question:
+1. Refer to each result's title, content, and source link
+2. Explicitly cite the sources of relevant information in your answer (e.g., "According to source X, ...")
+3. Include the actual source links in your response
+4. Synthesize information from multiple sources in your answer
+"""
+
+        search_results = instructions + "\n".join(summary_lines)
+        logging.info(f"Finished parsing {len(organic_results)} Google search results")
+        return search_results

     except Exception as e:
         logging.error(f"Google search failed: {e}")
@@ -613,5 +705,6 @@ if __name__ == "__main__":
         f.write("requests>=2.32.3\n")
         f.write("markdown>=3.5.1\n")
         f.write("pillow>=10.1.0\n")
+        f.write("beautifulsoup4>=4.12.0\n")  # BeautifulSoup added

     main()
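
A minimal, offline sketch of the parsing logic this commit adds (not part of the commit itself): the same div.g / h3 / div.VwiC3b / cite selectors used in do_google_search(), exercised against a hand-written HTML snippet instead of a live Google response. The sample markup is invented for illustration; real Google result pages change structure frequently, which is why the diff keeps the mock-data fallback.

from bs4 import BeautifulSoup

# Invented stand-in for a Google results page; only the selectors matter here.
SAMPLE_HTML = """
<div class="g">
  <a href="/url?q=https://example.com/page&sa=U"><h3>Example result title</h3></a>
  <div class="VwiC3b">A short snippet describing the page.</div>
  <cite>example.com</cite>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
for container in soup.select("div.g"):
    title = container.select_one("h3").get_text()        # result title
    link = container.select_one("a").get("href", "")     # raw (possibly /url?...) link
    snippet_el = container.select_one("div.VwiC3b") or container.select_one("span.aCOpRe")
    snippet = snippet_el.get_text() if snippet_el else "No description"
    cite = container.select_one("cite")
    displayed_link = cite.get_text() if cite else link
    print(title, "|", link, "|", snippet, "|", displayed_link)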
 
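The commit unwraps Google's /url?q=... redirect links by splitting on 'q=' and '&', which leaves the target percent-encoded and matches the first 'q=' substring anywhere in the href. A hedged alternative (not in the commit) using urllib.parse from the standard library, which looks up the q parameter by name and decodes it:

from urllib.parse import urlparse, parse_qs

def unwrap_google_redirect(href: str) -> str:
    # Return the decoded target of a /url?q=... redirect, or href unchanged.
    if href.startswith("/url?"):
        params = parse_qs(urlparse(href).query)
        if "q" in params:
            return params["q"][0]
    return href

print(unwrap_google_redirect("/url?q=https://example.com/page%3Fid%3D1&sa=U"))
# -> https://example.com/page?id=1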
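
search_url interpolates the raw query string; a query containing characters such as '&', '#', or '+' would change the meaning of the URL. A small sketch (also not part of the commit) that encodes the query with urllib.parse.quote_plus before building the URL:

from urllib.parse import quote_plus

query = "python requests & bs4 tutorial"  # example query, invented for illustration
num_results = 5
search_url = (
    f"https://www.google.com/search"
    f"?q={quote_plus(query)}&num={num_results}&hl=ko&gl=kr"
)
print(search_url)
# -> https://www.google.com/search?q=python+requests+%26+bs4+tutorial&num=5&hl=ko&gl=kr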