Kims12 committed
Commit f9c1fca Β· verified Β· 1 Parent(s): b06ebdb

Update app.py

Files changed (1):
  1. app.py +127 -104
app.py CHANGED
@@ -1,5 +1,7 @@
import gradio as gr
import requests
+ import aiohttp
+ import asyncio
from bs4 import BeautifulSoup
import urllib.parse  # module for correcting iframe paths
import re
@@ -17,8 +19,8 @@ import base64
def debug_log(message: str):
    print(f"[DEBUG] {message}")

- # --- Naver blog scraping ---
- def scrape_naver_blog(url: str) -> str:
+ # --- Naver blog scraping (async version) ---
+ async def scrape_naver_blog(url: str) -> str:
    debug_log("scrape_naver_blog ν•¨μˆ˜ μ‹œμž‘")
    debug_log(f"μš”μ²­λ°›μ€ URL: {url}")
    headers = {
@@ -29,75 +31,51 @@ def scrape_naver_blog(url: str) -> str:
        )
    }
    try:
-         response = requests.get(url, headers=headers)
-         debug_log("HTTP GET μš”μ²­(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
-         if response.status_code != 200:
-             debug_log(f"μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {response.status_code}")
-             return f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {response.status_code}"
-         soup = BeautifulSoup(response.text, "html.parser")
-         debug_log("HTML νŒŒμ‹±(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
-         iframe = soup.select_one("iframe#mainFrame")
-         if not iframe:
-             debug_log("iframe#mainFrame νƒœκ·Έλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
-             return "λ³Έλ¬Έ iframe을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-         iframe_src = iframe.get("src")
-         if not iframe_src:
-             debug_log("iframe srcκ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")
-             return "λ³Έλ¬Έ iframe의 srcλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-         parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
-         debug_log(f"iframe νŽ˜μ΄μ§€ μš”μ²­ URL: {parsed_iframe_url}")
-         iframe_response = requests.get(parsed_iframe_url, headers=headers)
-         debug_log("HTTP GET μš”μ²­(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
-         if iframe_response.status_code != 200:
-             debug_log(f"iframe μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {iframe_response.status_code}")
-             return f"iframeμ—μ„œ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {iframe_response.status_code}"
-         iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
-         debug_log("HTML νŒŒμ‹±(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
-         title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
-         title = title_div.get_text(strip=True) if title_div else "제λͺ©μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-         debug_log(f"μΆ”μΆœλœ 제λͺ©: {title}")
-         content_div = iframe_soup.select_one('.se-main-container')
-         if content_div:
-             content = content_div.get_text("\n", strip=True)
-         else:
-             content = "본문을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-         debug_log("λ³Έλ¬Έ μΆ”μΆœ μ™„λ£Œ")
-         result = f"[제λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
-         debug_log("제λͺ©κ³Ό λ³Έλ¬Έ ν•©μΉ¨ μ™„λ£Œ")
-         return result
+         async with aiohttp.ClientSession() as session:
+             async with session.get(url, headers=headers) as response:
+                 debug_log("HTTP GET μš”μ²­(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
+                 if response.status != 200:
+                     debug_log(f"μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {response.status}")
+                     return f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {response.status}"
+                 html = await response.text()
+         soup = BeautifulSoup(html, "html.parser")
+         debug_log("HTML νŒŒμ‹±(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
+         iframe = soup.select_one("iframe#mainFrame")
+         if not iframe:
+             debug_log("iframe#mainFrame νƒœκ·Έλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
+             return "λ³Έλ¬Έ iframe을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+         iframe_src = iframe.get("src")
+         if not iframe_src:
+             debug_log("iframe srcκ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")
+             return "λ³Έλ¬Έ iframe의 srcλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+         parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
+         debug_log(f"iframe νŽ˜μ΄μ§€ μš”μ²­ URL: {parsed_iframe_url}")
+         async with aiohttp.ClientSession() as session:
+             async with session.get(parsed_iframe_url, headers=headers) as iframe_response:
+                 debug_log("HTTP GET μš”μ²­(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
+                 if iframe_response.status != 200:
+                     debug_log(f"iframe μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {iframe_response.status}")
+                     return f"iframeμ—μ„œ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {iframe_response.status}"
+                 iframe_html = await iframe_response.text()
+         iframe_soup = BeautifulSoup(iframe_html, "html.parser")
+         debug_log("HTML νŒŒμ‹±(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
+         title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
+         title = title_div.get_text(strip=True) if title_div else "제λͺ©μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+         debug_log(f"μΆ”μΆœλœ 제λͺ©: {title}")
+         content_div = iframe_soup.select_one('.se-main-container')
+         if content_div:
+             content = content_div.get_text("\n", strip=True)
+         else:
+             content = "본문을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
+         debug_log("λ³Έλ¬Έ μΆ”μΆœ μ™„λ£Œ")
+         result = f"[제λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
+         debug_log("제λͺ©κ³Ό λ³Έλ¬Έ ν•©μΉ¨ μ™„λ£Œ")
+         return result
    except Exception as e:
        debug_log(f"μ—λŸ¬ λ°œμƒ: {str(e)}")
        return f"μŠ€ν¬λž˜ν•‘ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"

- # --- Morphological analysis (reference code-1) ---
- def analyze_text(text: str):
-     logging.basicConfig(level=logging.DEBUG)
-     logger = logging.getLogger(__name__)
-     logger.debug("원본 ν…μŠ€νŠΈ: %s", text)
-     filtered_text = re.sub(r'[^κ°€-힣]', '', text)
-     logger.debug("ν•„ν„°λ§λœ ν…μŠ€νŠΈ: %s", filtered_text)
-     if not filtered_text:
-         logger.debug("μœ νš¨ν•œ ν•œκ΅­μ–΄ ν…μŠ€νŠΈκ°€ μ—†μŒ.")
-         return pd.DataFrame(columns=["단어", "λΉˆλ„μˆ˜"]), ""
-     mecab_instance = mecab.MeCab()
-     tokens = mecab_instance.pos(filtered_text)
-     logger.debug("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό: %s", tokens)
-     freq = {}
-     for word, pos in tokens:
-         if word and word.strip() and pos.startswith("NN"):
-             freq[word] = freq.get(word, 0) + 1
-             logger.debug("단어: %s, ν’ˆμ‚¬: %s, λΉˆλ„: %d", word, pos, freq[word])
-     sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
-     logger.debug("μ •λ ¬λœ 단어 λΉˆλ„: %s", sorted_freq)
-     df = pd.DataFrame(sorted_freq, columns=["단어", "λΉˆλ„μˆ˜"])
-     logger.debug("ν˜•νƒœμ†Œ 뢄석 DataFrame 생성됨, shape: %s", df.shape)
-     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
-     df.to_excel(temp_file.name, index=False, engine='openpyxl')
-     temp_file.close()
-     logger.debug("Excel 파일 생성됨: %s", temp_file.name)
-     return df, temp_file.name
-
- # --- Naver search and ad API (reference code-2) ---
+ # --- Naver search and ad API ---
def generate_signature(timestamp, method, uri, secret_key):
    message = f"{timestamp}.{method}.{uri}"
    digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
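The signing scheme in the context lines above is untouched by this commit: the header value is HMAC-SHA256 over `"{timestamp}.{method}.{uri}"`, base64-encoded. A minimal standalone sketch for smoke-testing it (the millisecond timestamp convention and the `dummy-secret` value are assumptions for illustration; the real key comes from `NAVER_SECRET_KEY`):

```python
import base64
import hashlib
import hmac
import time

def sign(timestamp: str, method: str, uri: str, secret_key: str) -> str:
    # same message layout as generate_signature above
    message = f"{timestamp}.{method}.{uri}"
    digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
    return base64.b64encode(digest).decode("utf-8")

if __name__ == "__main__":
    ts = str(round(time.time() * 1000))  # assumed millisecond timestamp
    print(sign(ts, "GET", "/keywordstool", "dummy-secret"))
```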
@@ -114,7 +92,8 @@ def get_header(method, uri, api_key, secret_key, customer_id):
        "X-Signature": signature
    }

- def fetch_related_keywords(keyword):
+ # --- Related-keyword lookup (async) ---
+ async def fetch_related_keywords(keyword):
    debug_log(f"fetch_related_keywords 호좜, ν‚€μ›Œλ“œ: {keyword}")
    API_KEY = os.environ["NAVER_API_KEY"]
    SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
@@ -127,8 +106,9 @@ def fetch_related_keywords(keyword):
        "hintKeywords": [keyword],
        "showDetail": "1"
    }
-     response = requests.get(BASE_URL + uri, params=params, headers=headers)
-     data = response.json()
+     async with aiohttp.ClientSession() as session:
+         async with session.get(BASE_URL + uri, headers=headers, params=params) as response:
+             data = await response.json()
    if "keywordList" not in data:
        return pd.DataFrame()
    df = pd.DataFrame(data["keywordList"])
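Each converted function opens a fresh `aiohttp.ClientSession` per request. aiohttp generally favors one long-lived session when many requests share a host; a sketch of the shape such a refinement could take (the `fetch_json` helper is hypothetical, not part of this commit):

```python
import aiohttp

async def fetch_json(session: aiohttp.ClientSession, url: str, **kwargs) -> dict:
    # reuse a caller-owned session instead of creating one per call
    async with session.get(url, **kwargs) as resp:
        return await resp.json()

# usage sketch:
#   async with aiohttp.ClientSession() as s:
#       data = await fetch_json(s, BASE_URL + uri, headers=headers, params=params)
```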
@@ -147,7 +127,8 @@ def fetch_related_keywords(keyword):
    debug_log("fetch_related_keywords μ™„λ£Œ")
    return result_df

- def fetch_blog_count(keyword):
+ # --- Blog document-count lookup (async) ---
+ async def fetch_blog_count(keyword):
    debug_log(f"fetch_blog_count 호좜, ν‚€μ›Œλ“œ: {keyword}")
    client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
    client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
@@ -157,28 +138,30 @@ def fetch_blog_count(keyword):
        "X-Naver-Client-Secret": client_secret
    }
    params = {"query": keyword, "display": 1}
-     response = requests.get(url, headers=headers, params=params)
-     if response.status_code == 200:
-         data = response.json()
-         debug_log(f"fetch_blog_count κ²°κ³Ό: {data.get('total', 0)}")
-         return data.get("total", 0)
-     else:
-         debug_log(f"fetch_blog_count 였λ₯˜, μƒνƒœμ½”λ“œ: {response.status_code}")
-         return 0
+     async with aiohttp.ClientSession() as session:
+         async with session.get(url, headers=headers, params=params) as response:
+             if response.status == 200:
+                 data = await response.json()
+                 debug_log(f"fetch_blog_count κ²°κ³Ό: {data.get('total', 0)}")
+                 return data.get("total", 0)
+             else:
+                 debug_log(f"fetch_blog_count 였λ₯˜, μƒνƒœμ½”λ“œ: {response.status}")
+                 return 0

def create_excel_file(df):
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
-     df.to_excel(excel_path, index=False)
+     df.to_excel(excel_path, index=False, engine='openpyxl')
    debug_log(f"Excel 파일 생성됨: {excel_path}")
    return excel_path

- def process_keyword(keywords: str, include_related: bool):
+ # --- Keyword lookup (async) ---
+ async def process_keyword(keywords: str, include_related: bool):
    debug_log(f"process_keyword 호좜, ν‚€μ›Œλ“œλ“€: {keywords}, 연관검색어 포함: {include_related}")
    input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
    result_dfs = []
    for idx, kw in enumerate(input_keywords):
-         df_kw = fetch_related_keywords(kw)
+         df_kw = await fetch_related_keywords(kw)
        if df_kw.empty:
            continue
        row_kw = df_kw[df_kw["μ •λ³΄ν‚€μ›Œλ“œ"] == kw]
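A side note on `create_excel_file` above: `NamedTemporaryFile(delete=False)` means the workbooks accumulate until the OS clears the temp directory. If that ever matters, a self-cleaning variant could look like this (a sketch; `excel_bytes` is hypothetical and not used by the app):

```python
import os
import tempfile

import pandas as pd

def excel_bytes(df: pd.DataFrame) -> bytes:
    # write to a temp .xlsx, read it back, and always remove the file
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        path = tmp.name
    try:
        df.to_excel(path, index=False, engine="openpyxl")
        with open(path, "rb") as f:
            return f.read()
    finally:
        os.remove(path)
```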
@@ -195,13 +178,44 @@ def process_keyword(keywords: str, include_related: bool):
        result_df.drop_duplicates(subset=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)
    else:
        result_df = pd.DataFrame(columns=["μ •λ³΄ν‚€μ›Œλ“œ", "PCμ›”κ²€μƒ‰λŸ‰", "λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", "ν† νƒˆμ›”κ²€μƒ‰λŸ‰"])
-     result_df["λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜"] = result_df["μ •λ³΄ν‚€μ›Œλ“œ"].apply(fetch_blog_count)
+     # fetch blog document counts in parallel
+     tasks = [fetch_blog_count(kw) for kw in result_df["μ •λ³΄ν‚€μ›Œλ“œ"]]
+     counts = await asyncio.gather(*tasks)
+     result_df["λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜"] = counts
    result_df.sort_values(by="ν† νƒˆμ›”κ²€μƒ‰λŸ‰", ascending=False, inplace=True)
    debug_log("process_keyword μ™„λ£Œ")
    return result_df, create_excel_file(result_df)

- # --- Merge morphological analysis with search volume / blog document counts ---
- def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
+ # --- Morphological analysis (reference code-1, sync) ---
+ def analyze_text(text: str):
+     logging.basicConfig(level=logging.DEBUG)
+     logger = logging.getLogger(__name__)
+     logger.debug("원본 ν…μŠ€νŠΈ: %s", text)
+     filtered_text = re.sub(r'[^κ°€-힣]', '', text)
+     logger.debug("ν•„ν„°λ§λœ ν…μŠ€νŠΈ: %s", filtered_text)
+     if not filtered_text:
+         logger.debug("μœ νš¨ν•œ ν•œκ΅­μ–΄ ν…μŠ€νŠΈκ°€ μ—†μŒ.")
+         return pd.DataFrame(columns=["단어", "λΉˆλ„μˆ˜"]), ""
+     mecab_instance = mecab.MeCab()
+     tokens = mecab_instance.pos(filtered_text)
+     logger.debug("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό: %s", tokens)
+     freq = {}
+     for word, pos in tokens:
+         if word and word.strip() and pos.startswith("NN"):
+             freq[word] = freq.get(word, 0) + 1
+             logger.debug("단어: %s, ν’ˆμ‚¬: %s, λΉˆλ„: %d", word, pos, freq[word])
+     sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
+     logger.debug("μ •λ ¬λœ 단어 λΉˆλ„: %s", sorted_freq)
+     df = pd.DataFrame(sorted_freq, columns=["단어", "λΉˆλ„μˆ˜"])
+     logger.debug("ν˜•νƒœμ†Œ 뢄석 DataFrame 생성됨, shape: %s", df.shape)
+     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
+     df.to_excel(temp_file.name, index=False, engine='openpyxl')
+     temp_file.close()
+     logger.debug("Excel 파일 생성됨: %s", temp_file.name)
+     return df, temp_file.name
+
+ # --- Merge morphological analysis with search volume / blog document counts (async) ---
+ async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
    debug_log("morphological_analysis_and_enrich ν•¨μˆ˜ μ‹œμž‘")
    df_freq, _ = analyze_text(text)
    if df_freq.empty:
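The `asyncio.gather` fan-out above fires one blog-count request per keyword simultaneously, which can trip API rate limits on long keyword lists. A bounded variant is a common refinement (a sketch; the limit of 5 is an assumption, not something this commit specifies):

```python
import asyncio

async def bounded_gather(coros, limit: int = 5):
    # run the given coroutines concurrently, at most `limit` at a time
    sem = asyncio.Semaphore(limit)

    async def run_one(coro):
        async with sem:
            return await coro

    return await asyncio.gather(*(run_one(c) for c in coros))

# in process_keyword this would become:
#   counts = await bounded_gather(tasks, limit=5)
```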
@@ -213,7 +227,7 @@ def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
        debug_log(f"λΉˆλ„μˆ˜ 1 제거 적용됨. {before_shape} -> {df_freq.shape}")
    keywords = "\n".join(df_freq["단어"].tolist())
    debug_log(f"λΆ„μ„λœ ν‚€μ›Œλ“œ: {keywords}")
-     df_keyword_info, _ = process_keyword(keywords, include_related=False)
+     df_keyword_info, _ = await process_keyword(keywords, include_related=False)
    debug_log("κ²€μƒ‰λŸ‰ 및 λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜ 쑰회 μ™„λ£Œ")
    merged_df = pd.merge(df_freq, df_keyword_info, left_on="단어", right_on="μ •λ³΄ν‚€μ›Œλ“œ", how="left")
    merged_df.drop(columns=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)
@@ -221,8 +235,8 @@ def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
    debug_log("morphological_analysis_and_enrich ν•¨μˆ˜ μ™„λ£Œ")
    return merged_df, merged_excel_path

- # --- Direct keyword analysis (standalone) ---
- def direct_keyword_analysis(text: str, keyword_input: str):
+ # --- Direct keyword analysis (standalone, async) ---
+ async def direct_keyword_analysis(text: str, keyword_input: str):
    debug_log("direct_keyword_analysis ν•¨μˆ˜ μ‹œμž‘")
    keywords = re.split(r'[\n,]+', keyword_input)
    keywords = [kw.strip() for kw in keywords if kw.strip()]
@@ -232,15 +246,28 @@ def direct_keyword_analysis(text: str, keyword_input: str):
        count = text.count(kw)
        results.append((kw, count))
        debug_log(f"ν‚€μ›Œλ“œ '{kw}'의 λΉˆλ„μˆ˜: {count}")
+         # if a directly entered keyword is absent from the text, do an extra API lookup
+         if kw not in text:
+             df_direct, _ = await process_keyword(kw, include_related=False)
+             if (not df_direct.empty) and (kw in df_direct["μ •λ³΄ν‚€μ›Œλ“œ"].values):
+                 row = df_direct[df_direct["μ •λ³΄ν‚€μ›Œλ“œ"] == kw].iloc[0]
+                 pc = row.get("PCμ›”κ²€μƒ‰λŸ‰", None)
+                 mobile = row.get("λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", None)
+                 total = row.get("ν† νƒˆμ›”κ²€μƒ‰λŸ‰", None)
+                 blog_count = row.get("λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜", None)
+             else:
+                 pc = mobile = total = blog_count = None
+             # append a new row to the results
+             results.append((kw, count))
    df = pd.DataFrame(results, columns=["ν‚€μ›Œλ“œ", "λΉˆλ„μˆ˜"])
    excel_path = create_excel_file(df)
    debug_log("direct_keyword_analysis ν•¨μˆ˜ μ™„λ£Œ")
    return df, excel_path

- # --- Combined analysis (morphological + direct keyword) ---
- def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
+ # --- Combined analysis (morphological + direct keyword, async) ---
+ async def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
    debug_log("combined_analysis ν•¨μˆ˜ μ‹œμž‘")
-     merged_df, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
+     merged_df, _ = await morphological_analysis_and_enrich(blog_text, remove_freq1)
    if "μ§μ ‘μž…λ ₯" not in merged_df.columns:
        merged_df["μ§μ ‘μž…λ ₯"] = ""
    direct_keywords = re.split(r'[\n,]+', direct_keyword_input)
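Note that the branch added above computes `pc`, `mobile`, `total`, and `blog_count` from the API response, but its closing `results.append((kw, count))` records only the keyword and count again, so those values never reach the two-column DataFrame. If the intent was to surface them, the rows and columns would need to widen; a sketch under that assumption (`build_direct_df` is hypothetical, not in the commit):

```python
import pandas as pd

def build_direct_df(rows: list[tuple]) -> pd.DataFrame:
    # rows: (keyword, count, pc, mobile, total, blog_count); None where the API had no data
    return pd.DataFrame(
        rows,
        columns=["ν‚€μ›Œλ“œ", "λΉˆλ„μˆ˜", "PCμ›”κ²€μƒ‰λŸ‰", "λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", "ν† νƒˆμ›”κ²€μƒ‰λŸ‰", "λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜"],
    )
```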
@@ -251,7 +278,7 @@ def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input:
            merged_df.loc[merged_df["단어"] == dk, "μ§μ ‘μž…λ ₯"] = "μ§μ ‘μž…λ ₯"
        else:
            freq = blog_text.count(dk)
-             df_direct, _ = process_keyword(dk, include_related=False)
+             df_direct, _ = await process_keyword(dk, include_related=False)
            if (not df_direct.empty) and (dk in df_direct["μ •λ³΄ν‚€μ›Œλ“œ"].values):
                row = df_direct[df_direct["μ •λ³΄ν‚€μ›Œλ“œ"] == dk].iloc[0]
                pc = row.get("PCμ›”κ²€μƒ‰λŸ‰", None)
@@ -275,20 +302,18 @@ def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input:
    debug_log("combined_analysis ν•¨μˆ˜ μ™„λ£Œ")
    return merged_df, combined_excel

- # --- Analysis handler ---
- def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
+ # --- Analysis handler (async) ---
+ async def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
    debug_log("analysis_handler ν•¨μˆ˜ μ‹œμž‘")
    if direct_keyword_only:
-         # run the standalone analysis when "analyze direct keyword input only" is selected
-         return direct_keyword_analysis(blog_text, direct_keyword_input)
+         return await direct_keyword_analysis(blog_text, direct_keyword_input)
    else:
-         # run the default combined analysis
-         return combined_analysis(blog_text, remove_freq1, direct_keyword_input)
+         return await combined_analysis(blog_text, remove_freq1, direct_keyword_input)

- # --- Run scraping ---
- def fetch_blog_content(url: str):
+ # --- Scraping handler (async) ---
+ async def fetch_blog_content(url: str):
    debug_log("fetch_blog_content ν•¨μˆ˜ μ‹œμž‘")
-     content = scrape_naver_blog(url)
+     content = await scrape_naver_blog(url)
    debug_log("fetch_blog_content ν•¨μˆ˜ μ™„λ£Œ")
    return content
@@ -374,7 +399,6 @@ custom_css = """
# --- Gradio interface setup ---
with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€", css=custom_css) as demo:
    gr.HTML("<div class='custom-header'>넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€ πŸš€</div>")
-     # place the blog link and the scrape button in one group (button centered)
    with gr.Group(elem_classes="custom-group"):
        with gr.Row():
            blog_url_input = gr.Textbox(label="넀이버 λΈ”λ‘œκ·Έ 링크", placeholder="예: https://blog.naver.com/ssboost/222983068507", lines=1)
@@ -396,7 +420,6 @@ with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€", css=custom_css) as demo:
        result_df = gr.Dataframe(label="톡합 뢄석 κ²°κ³Ό (단어, λΉˆλ„μˆ˜, κ²€μƒ‰λŸ‰, λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜, μ§μ ‘μž…λ ₯)", interactive=True)
    with gr.Group(elem_classes="custom-group"):
        excel_file = gr.File(label="Excel λ‹€μš΄λ‘œλ“œ")
-     # usage-instructions HTML block (placed below)
    with gr.Group(elem_classes="custom-group"):
        usage_html = gr.HTML("""
        <div class="usage-instructions">
@@ -418,7 +441,7 @@ with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 μ„œλΉ„μŠ€", css=custom_css) as demo:
        <p><strong>Tip:</strong> 뢄석 κ²°κ³ΌλŠ” μ‹€μ‹œκ°„μœΌλ‘œ μ—…λ°μ΄νŠΈλ˜λ©°, ν•„μš”μ‹œ μˆ˜μ • ν›„ λ‹€μ‹œ 뢄석할 수 μžˆμŠ΅λ‹ˆλ‹€. 즐거운 뢄석 λ˜μ„Έμš”! 😊</p>
        </div>
        """)
-     # wire up events
+     # wire up events (using async handlers)
    scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
    analyze_button.click(fn=analysis_handler,
                         inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
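Nothing else in the wiring has to change for the async conversion: `gr.Button.click` accepts coroutine functions directly and awaits them on Gradio's event loop. A minimal standalone check (a hypothetical echo demo, not part of the app):

```python
import asyncio

import gradio as gr

async def echo(text: str) -> str:
    await asyncio.sleep(0)  # stand-in for real awaited I/O
    return text

with gr.Blocks() as demo:
    inp = gr.Textbox(label="in")
    out = gr.Textbox(label="out")
    gr.Button("Run").click(fn=echo, inputs=inp, outputs=out)

# demo.launch()
```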