Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,7 @@ import tempfile
|
|
10 |
import base64
|
11 |
from datetime import datetime
|
12 |
import re
|
|
|
13 |
|
14 |
# 로깅 설정
|
15 |
logging.basicConfig(
|
@@ -205,15 +206,24 @@ def generate_mock_search_results(query):
|
|
205 |
return notice + "\n".join(summary_lines)
|
206 |
|
207 |
# Google 검색 함수 (SerpAPI 대신 직접 검색)
|
|
|
208 |
def do_google_search(query, num_results=5):
|
209 |
try:
|
210 |
-
#
|
211 |
headers = {
|
212 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
213 |
}
|
214 |
|
215 |
-
# 검색 URL
|
216 |
-
search_url = f"https://www.google.com/search?q={query}&num={num_results}"
|
217 |
logging.info(f"๊ตฌ๊ธ ๊ฒ์ URL: {search_url}")
|
218 |
|
219 |
# 요청 보내기 (짧은 타임아웃 설정)
|
@@ -224,10 +234,92 @@ def do_google_search(query, num_results=5):
|
|
224 |
logging.error(f"Google ๊ฒ์ ์๋ต ์ํ ์ฝ๋: {response.status_code}")
|
225 |
return generate_mock_search_results(query)
|
226 |
|
227 |
-
#
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
except Exception as e:
|
233 |
logging.error(f"Google ๊ฒ์ ์คํจ: {e}")
|
@@ -613,5 +705,6 @@ if __name__ == "__main__":
|
|
613 |
f.write("requests>=2.32.3\n")
|
614 |
f.write("markdown>=3.5.1\n")
|
615 |
f.write("pillow>=10.1.0\n")
|
|
|
616 |
|
617 |
main()
|
|
|
10 |
import base64
|
11 |
from datetime import datetime
|
12 |
import re
|
13 |
+
from bs4 import BeautifulSoup # BeautifulSoup 추가
|
14 |
|
15 |
# 로깅 설정
|
16 |
logging.basicConfig(
|
|
|
206 |
return notice + "\n".join(summary_lines)
|
207 |
|
208 |
# Google 검색 함수 (SerpAPI 대신 직접 검색)
|
209 |
+
# Google ๊ฒ์ ํจ์ (BeautifulSoup์ ์ฌ์ฉํ์ฌ ๊ฒฐ๊ณผ ํ์ฑ)
|
210 |
def do_google_search(query, num_results=5):
|
211 |
try:
|
212 |
+
# 다양한 User-Agent 사용 (Google 차단 방지)
|
213 |
headers = {
|
214 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
215 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
216 |
+
'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
|
217 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
218 |
+
'Referer': 'https://www.google.com/',
|
219 |
+
'DNT': '1',
|
220 |
+
'Connection': 'keep-alive',
|
221 |
+
'Upgrade-Insecure-Requests': '1',
|
222 |
+
'Cache-Control': 'max-age=0',
|
223 |
}
|
224 |
|
225 |
+
# 검색 URL (일부 파라미터 추가)
|
226 |
+
search_url = f"https://www.google.com/search?q={query}&num={num_results}&hl=ko&gl=kr"
|
227 |
logging.info(f"๊ตฌ๊ธ ๊ฒ์ URL: {search_url}")
|
228 |
|
229 |
# 요청 보내기 (짧은 타임아웃 설정)
|
|
|
234 |
logging.error(f"Google ๊ฒ์ ์๋ต ์ํ ์ฝ๋: {response.status_code}")
|
235 |
return generate_mock_search_results(query)
|
236 |
|
237 |
+
# BeautifulSoup์ผ๋ก HTML ํ์ฑ
|
238 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
239 |
+
|
240 |
+
# 검색 결과 추출
|
241 |
+
organic_results = []
|
242 |
+
|
243 |
+
# ๊ฒ์ ๊ฒฐ๊ณผ ์ปจํ
์ด๋ ์ฐพ๊ธฐ (Google์ HTML ๊ตฌ์กฐ์ ๋ฐ๋ผ ๋ณ๊ฒฝ๋ ์ ์์)
|
244 |
+
result_containers = soup.select('div.g')
|
245 |
+
|
246 |
+
if not result_containers:
|
247 |
+
logging.warning("Google ๊ฒ์ ๊ฒฐ๊ณผ ์ปจํ
์ด๋๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค. ๋์ฒด ์ ํ์๋ฅผ ์๋ํฉ๋๋ค.")
|
248 |
+
# 대체 선택자 시도
|
249 |
+
result_containers = soup.select('div[data-hveid]')
|
250 |
+
|
251 |
+
counter = 0
|
252 |
+
for container in result_containers:
|
253 |
+
if counter >= num_results:
|
254 |
+
break
|
255 |
+
|
256 |
+
# 제목 추출
|
257 |
+
title_element = container.select_one('h3')
|
258 |
+
if not title_element:
|
259 |
+
continue
|
260 |
+
|
261 |
+
title = title_element.get_text()
|
262 |
+
|
263 |
+
# 링크 추출
|
264 |
+
link_element = container.select_one('a')
|
265 |
+
if not link_element:
|
266 |
+
continue
|
267 |
+
|
268 |
+
link = link_element.get('href', '')
|
269 |
+
if link.startswith('/url?'):
|
270 |
+
# Google์ ๋ฆฌ๋ค์ด๋ ํธ URL์์ ์ค์ URL ์ถ์ถ
|
271 |
+
link = link.split('q=')[1].split('&')[0] if 'q=' in link else link
|
272 |
+
elif not link.startswith('http'):
|
273 |
+
continue
|
274 |
+
|
275 |
+
# 스니펫 추출
|
276 |
+
snippet_element = container.select_one('div.VwiC3b') or container.select_one('span.aCOpRe')
|
277 |
+
snippet = snippet_element.get_text() if snippet_element else "์ค๋ช
์์"
|
278 |
+
|
279 |
+
# 표시 링크 추출
|
280 |
+
displayed_link_element = container.select_one('cite')
|
281 |
+
displayed_link = displayed_link_element.get_text() if displayed_link_element else link
|
282 |
+
|
283 |
+
organic_results.append({
|
284 |
+
"title": title,
|
285 |
+
"link": link,
|
286 |
+
"snippet": snippet,
|
287 |
+
"displayed_link": displayed_link
|
288 |
+
})
|
289 |
+
counter += 1
|
290 |
+
|
291 |
+
if not organic_results:
|
292 |
+
logging.warning("๊ฒ์ ๊ฒฐ๊ณผ๋ฅผ ํ์ฑํ ์ ์์ต๋๋ค. ์ ํ์๊ฐ ๋ณ๊ฒฝ๋์์ ์ ์์ต๋๋ค.")
|
293 |
+
return generate_mock_search_results(query)
|
294 |
+
|
295 |
+
# 검색 결과 마크다운 형식으로 변환
|
296 |
+
summary_lines = []
|
297 |
+
for idx, item in enumerate(organic_results, start=1):
|
298 |
+
title = item.get("title", "No title")
|
299 |
+
link = item.get("link", "#")
|
300 |
+
snippet = item.get("snippet", "No description")
|
301 |
+
displayed_link = item.get("displayed_link", link)
|
302 |
+
|
303 |
+
summary_lines.append(
|
304 |
+
f"### Result {idx}: {title}\n\n"
|
305 |
+
f"{snippet}\n\n"
|
306 |
+
f"**์ถ์ฒ**: [{displayed_link}]({link})\n\n"
|
307 |
+
f"---\n"
|
308 |
+
)
|
309 |
+
|
310 |
+
# 모델에게 명확한 지침 추가
|
311 |
+
instructions = """
|
312 |
+
# ์น ๊ฒ์ ๊ฒฐ๊ณผ
|
313 |
+
์๋๋ ๊ฒ์ ๊ฒฐ๊ณผ์
๋๋ค. ์ง๋ฌธ์ ๋ต๋ณํ ๋ ์ด ์ ๋ณด๋ฅผ ํ์ฉํ์ธ์:
|
314 |
+
1. ๊ฐ ๊ฒฐ๊ณผ์ ์ ๋ชฉ, ๋ด์ฉ, ์ถ์ฒ ๋งํฌ๋ฅผ ์ฐธ๊ณ ํ์ธ์
|
315 |
+
2. ๋ต๋ณ์ ๊ด๋ จ ์ ๋ณด์ ์ถ์ฒ๋ฅผ ๋ช
์์ ์ผ๋ก ์ธ์ฉํ์ธ์ (์: "X ์ถ์ฒ์ ๋ฐ๋ฅด๋ฉด...")
|
316 |
+
3. ์๋ต์ ์ค์ ์ถ์ฒ ๋งํฌ๋ฅผ ํฌํจํ์ธ์
|
317 |
+
4. ์ฌ๋ฌ ์ถ์ฒ์ ์ ๋ณด๋ฅผ ์ข
ํฉํ์ฌ ๋ต๋ณํ์ธ์
|
318 |
+
"""
|
319 |
+
|
320 |
+
search_results = instructions + "\n".join(summary_lines)
|
321 |
+
logging.info(f"Google ๊ฒ์ ๊ฒฐ๊ณผ {len(organic_results)}๊ฐ ํ์ฑ ์๋ฃ")
|
322 |
+
return search_results
|
323 |
|
324 |
except Exception as e:
|
325 |
logging.error(f"Google ๊ฒ์ ์คํจ: {e}")
|
|
|
705 |
f.write("requests>=2.32.3\n")
|
706 |
f.write("markdown>=3.5.1\n")
|
707 |
f.write("pillow>=10.1.0\n")
|
708 |
+
f.write("beautifulsoup4>=4.12.0\n") # BeautifulSoup 추가
|
709 |
|
710 |
main()
|