Kims12 commited on
Commit
e37ebe5
·
verified ·
1 Parent(s): d479e4e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -3
app.py CHANGED
@@ -19,6 +19,12 @@ USER_AGENTS = [
19
  def scrape_blog(url):
20
  logging.debug("์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘: %s", url)
21
 
 
 
 
 
 
 
22
  # ์š”์ฒญ ์ „์— 1~3์ดˆ ๋žœ๋ค ๋”œ๋ ˆ์ด ์ ์šฉ (์„œ๋ฒ„ ๋ถ€ํ•˜ ๋ฐ ๋ด‡ ๊ฐ์ง€ ํšŒํ”ผ)
23
  delay = random.uniform(1, 3)
24
  logging.debug("์š”์ฒญ ์ „ ๋žœ๋ค ๋”œ๋ ˆ์ด: %.2f์ดˆ", delay)
@@ -28,7 +34,7 @@ def scrape_blog(url):
28
  user_agent = random.choice(USER_AGENTS)
29
  headers = {
30
  "User-Agent": user_agent,
31
- "Referer": "https://m.blog.naver.com", # ๋„ค์ด๋ฒ„ ๋ชจ๋ฐ”์ผ ํŽ˜์ด์ง€์—์„œ ์˜จ ๊ฒƒ์ฒ˜๋Ÿผ ์„ค์ •
32
  "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
33
  }
34
  logging.debug("์„ค์ •๋œ HTTP ํ—ค๋”: %s", headers)
@@ -47,7 +53,7 @@ def scrape_blog(url):
47
  soup = BeautifulSoup(response.text, "html.parser")
48
  logging.debug("HTML ํŒŒ์‹ฑ ์™„๋ฃŒ")
49
 
50
- # ์ œ๋ชฉ ์ถ”์ถœ (HTML ๊ตฌ์กฐ ์˜ˆ์‹œ ์ฐธ๊ณ )
51
  title_div = soup.find("div", class_="se-module se-module-text se-title-text")
52
  if title_div:
53
  title_tag = title_div.find(["p", "span"])
@@ -59,7 +65,7 @@ def scrape_blog(url):
59
  title_text = ""
60
  logging.debug("์ถ”์ถœ๋œ ์ œ๋ชฉ: %s", title_text)
61
 
62
- # ๋ณธ๋ฌธ ์ถ”์ถœ (HTML ๊ตฌ์กฐ ๋‚ด 'se-main-container' ํด๋ž˜์Šค ์‚ฌ์šฉ)
63
  body_div = soup.find("div", class_="se-main-container")
64
  if body_div:
65
  # ๋ณธ๋ฌธ ๋‚ด ๋ชจ๋“  ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜๋ฉฐ ๊ฐ ์š”์†Œ ์‚ฌ์ด์— ์ค„๋ฐ”๊ฟˆ ์ ์šฉ
 
19
  def scrape_blog(url):
20
  logging.debug("์Šคํฌ๋ž˜ํ•‘ ์‹œ์ž‘: %s", url)
21
 
22
+ # URL์ด ๋ชจ๋ฐ”์ผ ๋ฒ„์ „์ด ์•„๋‹ ๊ฒฝ์šฐ ๋ชจ๋ฐ”์ผ ๋ฒ„์ „์œผ๋กœ ๋ณ€ํ™˜
23
+ if "m.blog.naver.com" not in url:
24
+ new_url = url.replace("blog.naver.com", "m.blog.naver.com")
25
+ logging.debug("URL์„ ๋ชจ๋ฐ”์ผ ๋ฒ„์ „์œผ๋กœ ๋ณ€ํ™˜: %s", new_url)
26
+ url = new_url
27
+
28
  # ์š”์ฒญ ์ „์— 1~3์ดˆ ๋žœ๋ค ๋”œ๋ ˆ์ด ์ ์šฉ (์„œ๋ฒ„ ๋ถ€ํ•˜ ๋ฐ ๋ด‡ ๊ฐ์ง€ ํšŒํ”ผ)
29
  delay = random.uniform(1, 3)
30
  logging.debug("์š”์ฒญ ์ „ ๋žœ๋ค ๋”œ๋ ˆ์ด: %.2f์ดˆ", delay)
 
34
  user_agent = random.choice(USER_AGENTS)
35
  headers = {
36
  "User-Agent": user_agent,
37
+ "Referer": "https://m.blog.naver.com", # ๋ชจ๋ฐ”์ผ ํŽ˜์ด์ง€์—์„œ ์˜จ ๊ฒƒ์ฒ˜๋Ÿผ ์„ค์ •
38
  "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7"
39
  }
40
  logging.debug("์„ค์ •๋œ HTTP ํ—ค๋”: %s", headers)
 
53
  soup = BeautifulSoup(response.text, "html.parser")
54
  logging.debug("HTML ํŒŒ์‹ฑ ์™„๋ฃŒ")
55
 
56
+ # ์ œ๋ชฉ ์ถ”์ถœ (๋ชจ๋ฐ”์ผ ๋ฒ„์ „ HTML ๊ตฌ์กฐ ์‚ฌ์šฉ)
57
  title_div = soup.find("div", class_="se-module se-module-text se-title-text")
58
  if title_div:
59
  title_tag = title_div.find(["p", "span"])
 
65
  title_text = ""
66
  logging.debug("์ถ”์ถœ๋œ ์ œ๋ชฉ: %s", title_text)
67
 
68
+ # ๋ณธ๋ฌธ ์ถ”์ถœ (๋ชจ๋ฐ”์ผ ๋ฒ„์ „ HTML ๊ตฌ์กฐ ์‚ฌ์šฉ)
69
  body_div = soup.find("div", class_="se-main-container")
70
  if body_div:
71
  # ๋ณธ๋ฌธ ๋‚ด ๋ชจ๋“  ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜๋ฉฐ ๊ฐ ์š”์†Œ ์‚ฌ์ด์— ์ค„๋ฐ”๊ฟˆ ์ ์šฉ