Kims12 committed
Commit f8570dc Β· verified Β· 1 Parent(s): f9c1fca

Update app.py

Files changed (1):
  1. app.py +105 -128
app.py CHANGED
@@ -1,7 +1,5 @@
 import gradio as gr
 import requests
-import aiohttp
-import asyncio
 from bs4 import BeautifulSoup
 import urllib.parse  # module used to resolve the iframe path
 import re
@@ -19,8 +17,8 @@ import base64
 def debug_log(message: str):
     print(f"[DEBUG] {message}")

-# --- Naver blog scraping (async version) ---
-async def scrape_naver_blog(url: str) -> str:
+# --- Naver blog scraping ---
+def scrape_naver_blog(url: str) -> str:
     debug_log("scrape_naver_blog ν•¨μˆ˜ μ‹œμž‘")
     debug_log(f"μš”μ²­λ°›μ€ URL: {url}")
     headers = {
@@ -31,51 +29,75 @@ async def scrape_naver_blog(url: str) -> str:
         )
     }
     try:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url, headers=headers) as response:
-                debug_log("HTTP GET μš”μ²­(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
-                if response.status != 200:
-                    debug_log(f"μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {response.status}")
-                    return f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {response.status}"
-                html = await response.text()
-        soup = BeautifulSoup(html, "html.parser")
-        debug_log("HTML νŒŒμ‹±(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
-        iframe = soup.select_one("iframe#mainFrame")
-        if not iframe:
-            debug_log("iframe#mainFrame νƒœκ·Έλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
-            return "λ³Έλ¬Έ iframe을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-        iframe_src = iframe.get("src")
-        if not iframe_src:
-            debug_log("iframe srcκ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")
-            return "λ³Έλ¬Έ iframe의 srcλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
-        debug_log(f"iframe νŽ˜μ΄μ§€ μš”μ²­ URL: {parsed_iframe_url}")
-        async with aiohttp.ClientSession() as session:
-            async with session.get(parsed_iframe_url, headers=headers) as iframe_response:
-                debug_log("HTTP GET μš”μ²­(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
-                if iframe_response.status != 200:
-                    debug_log(f"iframe μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {iframe_response.status}")
-                    return f"iframeμ—μ„œ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {iframe_response.status}"
-                iframe_html = await iframe_response.text()
-        iframe_soup = BeautifulSoup(iframe_html, "html.parser")
-        debug_log("HTML νŒŒμ‹±(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
-        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
-        title = title_div.get_text(strip=True) if title_div else "제λͺ©μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-        debug_log(f"μΆ”μΆœλœ 제λͺ©: {title}")
-        content_div = iframe_soup.select_one('.se-main-container')
-        if content_div:
-            content = content_div.get_text("\n", strip=True)
-        else:
-            content = "본문을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-        debug_log("λ³Έλ¬Έ μΆ”μΆœ μ™„λ£Œ")
-        result = f"[제λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
-        debug_log("제λͺ©κ³Ό λ³Έλ¬Έ ν•©μΉ¨ μ™„λ£Œ")
-        return result
+        response = requests.get(url, headers=headers)
+        debug_log("HTTP GET μš”μ²­(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
+        if response.status_code != 200:
+            debug_log(f"μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {response.status_code}")
+            return f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {response.status_code}"
+        soup = BeautifulSoup(response.text, "html.parser")
+        debug_log("HTML νŒŒμ‹±(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
+        iframe = soup.select_one("iframe#mainFrame")
+        if not iframe:
+            debug_log("iframe#mainFrame νƒœκ·Έλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
+            return "λ³Έλ¬Έ iframe을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+        iframe_src = iframe.get("src")
+        if not iframe_src:
+            debug_log("iframe srcκ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")
+            return "λ³Έλ¬Έ iframe의 srcλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
+        debug_log(f"iframe νŽ˜μ΄μ§€ μš”μ²­ URL: {parsed_iframe_url}")
+        iframe_response = requests.get(parsed_iframe_url, headers=headers)
+        debug_log("HTTP GET μš”μ²­(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
+        if iframe_response.status_code != 200:
+            debug_log(f"iframe μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {iframe_response.status_code}")
+            return f"iframeμ—μ„œ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {iframe_response.status_code}"
+        iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
+        debug_log("HTML νŒŒμ‹±(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
+        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
+        title = title_div.get_text(strip=True) if title_div else "제λͺ©μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+        debug_log(f"μΆ”μΆœλœ 제λͺ©: {title}")
+        content_div = iframe_soup.select_one('.se-main-container')
+        if content_div:
+            content = content_div.get_text("\n", strip=True)
+        else:
+            content = "본문을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+        debug_log("λ³Έλ¬Έ μΆ”μΆœ μ™„λ£Œ")
+        result = f"[제λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
+        debug_log("제λͺ©κ³Ό λ³Έλ¬Έ ν•©μΉ¨ μ™„λ£Œ")
+        return result
     except Exception as e:
         debug_log(f"μ—λŸ¬ λ°œμƒ: {str(e)}")
         return f"μŠ€ν¬λž˜ν•‘ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"

-# --- Naver Search and Ads API ---
+# --- Morphological analysis (reference code 1) ---
+def analyze_text(text: str):
+    logging.basicConfig(level=logging.DEBUG)
+    logger = logging.getLogger(__name__)
+    logger.debug("원본 ν…μŠ€νŠΈ: %s", text)
+    filtered_text = re.sub(r'[^κ°€-힣]', '', text)
+    logger.debug("ν•„ν„°λ§λœ ν…μŠ€νŠΈ: %s", filtered_text)
+    if not filtered_text:
+        logger.debug("μœ νš¨ν•œ ν•œκ΅­μ–΄ ν…μŠ€νŠΈκ°€ μ—†μŒ.")
+        return pd.DataFrame(columns=["단어", "λΉˆλ„μˆ˜"]), ""
+    mecab_instance = mecab.MeCab()
+    tokens = mecab_instance.pos(filtered_text)
+    logger.debug("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό: %s", tokens)
+    freq = {}
+    for word, pos in tokens:
+        if word and word.strip() and pos.startswith("NN"):
+            freq[word] = freq.get(word, 0) + 1
+            logger.debug("단어: %s, ν’ˆμ‚¬: %s, λΉˆλ„: %d", word, pos, freq[word])
+    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
+    logger.debug("μ •λ ¬λœ 단어 λΉˆλ„: %s", sorted_freq)
+    df = pd.DataFrame(sorted_freq, columns=["단어", "λΉˆλ„μˆ˜"])
+    logger.debug("ν˜•νƒœμ†Œ 뢄석 DataFrame 생성됨, shape: %s", df.shape)
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
+    df.to_excel(temp_file.name, index=False, engine='openpyxl')
+    temp_file.close()
+    logger.debug("Excel 파일 생성됨: %s", temp_file.name)
+    return df, temp_file.name
+
+# --- Naver Search and Ads API (reference code 2) ---
 def generate_signature(timestamp, method, uri, secret_key):
     message = f"{timestamp}.{method}.{uri}"
     digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
@@ -92,8 +114,7 @@ def get_header(method, uri, api_key, secret_key, customer_id):
         "X-Signature": signature
     }

-# --- Related-keyword lookup (async) ---
-async def fetch_related_keywords(keyword):
+def fetch_related_keywords(keyword):
     debug_log(f"fetch_related_keywords 호좜, ν‚€μ›Œλ“œ: {keyword}")
     API_KEY = os.environ["NAVER_API_KEY"]
     SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
@@ -106,9 +127,8 @@ async def fetch_related_keywords(keyword):
         "hintKeywords": [keyword],
         "showDetail": "1"
     }
-    async with aiohttp.ClientSession() as session:
-        async with session.get(BASE_URL + uri, headers=headers, params=params) as response:
-            data = await response.json()
+    response = requests.get(BASE_URL + uri, params=params, headers=headers)
+    data = response.json()
     if "keywordList" not in data:
         return pd.DataFrame()
     df = pd.DataFrame(data["keywordList"])
@@ -127,8 +147,7 @@ async def fetch_related_keywords(keyword):
     debug_log("fetch_related_keywords μ™„λ£Œ")
     return result_df

-# --- Blog document-count lookup (async) ---
-async def fetch_blog_count(keyword):
+def fetch_blog_count(keyword):
     debug_log(f"fetch_blog_count 호좜, ν‚€μ›Œλ“œ: {keyword}")
     client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
     client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
@@ -138,30 +157,28 @@ async def fetch_blog_count(keyword):
         "X-Naver-Client-Secret": client_secret
     }
     params = {"query": keyword, "display": 1}
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url, headers=headers, params=params) as response:
-            if response.status == 200:
-                data = await response.json()
-                debug_log(f"fetch_blog_count κ²°κ³Ό: {data.get('total', 0)}")
-                return data.get("total", 0)
-            else:
-                debug_log(f"fetch_blog_count 였λ₯˜, μƒνƒœμ½”λ“œ: {response.status}")
-                return 0
+    response = requests.get(url, headers=headers, params=params)
+    if response.status_code == 200:
+        data = response.json()
+        debug_log(f"fetch_blog_count κ²°κ³Ό: {data.get('total', 0)}")
+        return data.get("total", 0)
+    else:
+        debug_log(f"fetch_blog_count 였λ₯˜, μƒνƒœμ½”λ“œ: {response.status_code}")
+        return 0

 def create_excel_file(df):
     with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
         excel_path = tmp.name
-    df.to_excel(excel_path, index=False, engine='openpyxl')
+    df.to_excel(excel_path, index=False)
     debug_log(f"Excel 파일 생성됨: {excel_path}")
     return excel_path

-# --- Keyword search (async) ---
-async def process_keyword(keywords: str, include_related: bool):
+def process_keyword(keywords: str, include_related: bool):
     debug_log(f"process_keyword 호좜, ν‚€μ›Œλ“œλ“€: {keywords}, 연관검색어 포함: {include_related}")
     input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
     result_dfs = []
     for idx, kw in enumerate(input_keywords):
-        df_kw = await fetch_related_keywords(kw)
+        df_kw = fetch_related_keywords(kw)
         if df_kw.empty:
             continue
         row_kw = df_kw[df_kw["μ •λ³΄ν‚€μ›Œλ“œ"] == kw]
@@ -178,44 +195,13 @@ async def process_keyword(keywords: str, include_related: bool):
         result_df.drop_duplicates(subset=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)
     else:
         result_df = pd.DataFrame(columns=["μ •λ³΄ν‚€μ›Œλ“œ", "PCμ›”κ²€μƒ‰λŸ‰", "λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", "ν† νƒˆμ›”κ²€μƒ‰λŸ‰"])
-    # fetch blog document counts in parallel
-    tasks = [fetch_blog_count(kw) for kw in result_df["μ •λ³΄ν‚€μ›Œλ“œ"]]
-    counts = await asyncio.gather(*tasks)
-    result_df["λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜"] = counts
+    result_df["λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜"] = result_df["μ •λ³΄ν‚€μ›Œλ“œ"].apply(fetch_blog_count)
     result_df.sort_values(by="ν† νƒˆμ›”κ²€μƒ‰λŸ‰", ascending=False, inplace=True)
     debug_log("process_keyword μ™„λ£Œ")
     return result_df, create_excel_file(result_df)

-# --- Morphological analysis (reference code 1, sync) ---
-def analyze_text(text: str):
-    logging.basicConfig(level=logging.DEBUG)
-    logger = logging.getLogger(__name__)
-    logger.debug("원본 ν…μŠ€νŠΈ: %s", text)
-    filtered_text = re.sub(r'[^κ°€-힣]', '', text)
-    logger.debug("ν•„ν„°λ§λœ ν…μŠ€νŠΈ: %s", filtered_text)
-    if not filtered_text:
-        logger.debug("μœ νš¨ν•œ ν•œκ΅­μ–΄ ν…μŠ€νŠΈκ°€ μ—†μŒ.")
-        return pd.DataFrame(columns=["단어", "λΉˆλ„μˆ˜"]), ""
-    mecab_instance = mecab.MeCab()
-    tokens = mecab_instance.pos(filtered_text)
-    logger.debug("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό: %s", tokens)
-    freq = {}
-    for word, pos in tokens:
-        if word and word.strip() and pos.startswith("NN"):
-            freq[word] = freq.get(word, 0) + 1
-            logger.debug("단어: %s, ν’ˆμ‚¬: %s, λΉˆλ„: %d", word, pos, freq[word])
-    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
-    logger.debug("μ •λ ¬λœ 단어 λΉˆλ„: %s", sorted_freq)
-    df = pd.DataFrame(sorted_freq, columns=["단어", "λΉˆλ„μˆ˜"])
-    logger.debug("ν˜•νƒœμ†Œ 뢄석 DataFrame 생성됨, shape: %s", df.shape)
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
-    df.to_excel(temp_file.name, index=False, engine='openpyxl')
-    temp_file.close()
-    logger.debug("Excel 파일 생성됨: %s", temp_file.name)
-    return df, temp_file.name
-
-# --- Merge morphological analysis with search volume / blog document counts (async) ---
-async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
+# --- Merge morphological analysis with search volume / blog document counts ---
+def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich ν•¨μˆ˜ μ‹œμž‘")
     df_freq, _ = analyze_text(text)
     if df_freq.empty:
@@ -227,7 +213,7 @@ async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
         debug_log(f"λΉˆλ„μˆ˜ 1 제거 적용됨. {before_shape} -> {df_freq.shape}")
     keywords = "\n".join(df_freq["단어"].tolist())
     debug_log(f"λΆ„μ„λœ ν‚€μ›Œλ“œ: {keywords}")
-    df_keyword_info, _ = await process_keyword(keywords, include_related=False)
+    df_keyword_info, _ = process_keyword(keywords, include_related=False)
     debug_log("κ²€μƒ‰λŸ‰ 및 λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜ 쑰회 μ™„λ£Œ")
     merged_df = pd.merge(df_freq, df_keyword_info, left_on="단어", right_on="μ •λ³΄ν‚€μ›Œλ“œ", how="left")
     merged_df.drop(columns=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)
@@ -235,8 +221,8 @@ async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich ν•¨μˆ˜ μ™„λ£Œ")
     return merged_df, merged_excel_path

-# --- Direct keyword analysis (standalone, async) ---
-async def direct_keyword_analysis(text: str, keyword_input: str):
+# --- Direct keyword analysis (standalone) ---
+def direct_keyword_analysis(text: str, keyword_input: str):
     debug_log("direct_keyword_analysis ν•¨μˆ˜ μ‹œμž‘")
     keywords = re.split(r'[\n,]+', keyword_input)
     keywords = [kw.strip() for kw in keywords if kw.strip()]
@@ -246,28 +232,15 @@ async def direct_keyword_analysis(text: str, keyword_input: str):
         count = text.count(kw)
         results.append((kw, count))
         debug_log(f"ν‚€μ›Œλ“œ '{kw}'의 λΉˆλ„μˆ˜: {count}")
-        # if a directly entered keyword is absent from the text, look it up separately
-        if kw not in text:
-            df_direct, _ = await process_keyword(kw, include_related=False)
-            if (not df_direct.empty) and (kw in df_direct["μ •λ³΄ν‚€μ›Œλ“œ"].values):
-                row = df_direct[df_direct["μ •λ³΄ν‚€μ›Œλ“œ"] == kw].iloc[0]
-                pc = row.get("PCμ›”κ²€μƒ‰λŸ‰", None)
-                mobile = row.get("λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", None)
-                total = row.get("ν† νƒˆμ›”κ²€μƒ‰λŸ‰", None)
-                blog_count = row.get("λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜", None)
-            else:
-                pc = mobile = total = blog_count = None
-            # append a new row to the results
-            results.append((kw, count))
     df = pd.DataFrame(results, columns=["ν‚€μ›Œλ“œ", "λΉˆλ„μˆ˜"])
     excel_path = create_excel_file(df)
     debug_log("direct_keyword_analysis ν•¨μˆ˜ μ™„λ£Œ")
     return df, excel_path

-# --- Combined analysis (morphological + direct keyword, async) ---
-async def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
+# --- Combined analysis (morphological + direct keyword) ---
+def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
     debug_log("combined_analysis ν•¨μˆ˜ μ‹œμž‘")
-    merged_df, _ = await morphological_analysis_and_enrich(blog_text, remove_freq1)
+    merged_df, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
     if "μ§μ ‘μž…λ ₯" not in merged_df.columns:
         merged_df["μ§μ ‘μž…λ ₯"] = ""
     direct_keywords = re.split(r'[\n,]+', direct_keyword_input)
@@ -278,7 +251,7 @@ async def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
             merged_df.loc[merged_df["단어"] == dk, "μ§μ ‘μž…λ ₯"] = "μ§μ ‘μž…λ ₯"
         else:
             freq = blog_text.count(dk)
-            df_direct, _ = await process_keyword(dk, include_related=False)
+            df_direct, _ = process_keyword(dk, include_related=False)
             if (not df_direct.empty) and (dk in df_direct["μ •λ³΄ν‚€μ›Œλ“œ"].values):
                 row = df_direct[df_direct["μ •λ³΄ν‚€μ›Œλ“œ"] == dk].iloc[0]
                 pc = row.get("PCμ›”κ²€μƒ‰λŸ‰", None)
@@ -302,18 +275,20 @@ async def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
     debug_log("combined_analysis ν•¨μˆ˜ μ™„λ£Œ")
     return merged_df, combined_excel

-# --- Analysis handler (async) ---
-async def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
+# --- Analysis handler ---
+def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
     debug_log("analysis_handler ν•¨μˆ˜ μ‹œμž‘")
     if direct_keyword_only:
-        return await direct_keyword_analysis(blog_text, direct_keyword_input)
+        # run the standalone analysis when "direct keyword input only" is selected
+        return direct_keyword_analysis(blog_text, direct_keyword_input)
     else:
-        return await combined_analysis(blog_text, remove_freq1, direct_keyword_input)
+        # run the default combined analysis
+        return combined_analysis(blog_text, remove_freq1, direct_keyword_input)

-# --- Scraping handler (async) ---
-async def fetch_blog_content(url: str):
+# --- Scraping handler ---
+def fetch_blog_content(url: str):
     debug_log("fetch_blog_content ν•¨μˆ˜ μ‹œμž‘")
-    content = await scrape_naver_blog(url)
+    content = scrape_naver_blog(url)
     debug_log("fetch_blog_content ν•¨μˆ˜ μ™„λ£Œ")
     return content

@@ -399,6 +374,7 @@ custom_css = """
 # --- Gradio interface layout ---
 with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€", css=custom_css) as demo:
     gr.HTML("<div class='custom-header'>넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€ πŸš€</div>")
+    # place the blog link and the scrape button in one group (button centered)
     with gr.Group(elem_classes="custom-group"):
         with gr.Row():
             blog_url_input = gr.Textbox(label="넀이버 λΈ”λ‘œκ·Έ 링크", placeholder="예: https://blog.naver.com/ssboost/222983068507", lines=1)
@@ -420,6 +396,7 @@ with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€", css=custom_css) as demo:
         result_df = gr.Dataframe(label="톡합 뢄석 κ²°κ³Ό (단어, λΉˆλ„μˆ˜, κ²€μƒ‰λŸ‰, λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜, μ§μ ‘μž…λ ₯)", interactive=True)
     with gr.Group(elem_classes="custom-group"):
         excel_file = gr.File(label="Excel λ‹€μš΄λ‘œλ“œ")
+    # usage-instructions HTML block (placed at the bottom)
     with gr.Group(elem_classes="custom-group"):
         usage_html = gr.HTML("""
         <div class="usage-instructions">
@@ -441,7 +418,7 @@ with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€", css=custom_css) as demo:
         <p><strong>Tip:</strong> 뢄석 κ²°κ³ΌλŠ” μ‹€μ‹œκ°„μœΌλ‘œ μ—…λ°μ΄νŠΈλ˜λ©°, ν•„μš”μ‹œ μˆ˜μ • ν›„ λ‹€μ‹œ 뢄석할 수 μžˆμŠ΅λ‹ˆλ‹€. 즐거운 뢄석 λ˜μ„Έμš”! 😊</p>
         </div>
         """)
-    # event wiring (uses async functions)
+    # event wiring
     scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
     analyze_button.click(fn=analysis_handler,
                          inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
@@ -450,4 +427,4 @@ with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€", css=custom_css) as demo:
 if __name__ == "__main__":
     debug_log("Gradio μ•± μ‹€ν–‰ μ‹œμž‘")
     demo.launch()
-    debug_log("Gradio μ•± μ‹€ν–‰ μ’…λ£Œ")
+    debug_log("Gradio μ•± μ‹€ν–‰ μ’…λ£Œ")