Kims12 committed (verified)
Commit: b4650b8
Parent: fdac880

Update app.py

Files changed (1):
  1. app.py +59 -95

app.py CHANGED
@@ -17,11 +17,10 @@ import base64
 def debug_log(message: str):
     print(f"[DEBUG] {message}")
 
-# [Base code] Naver blog scraping
+# --- Naver blog scraping ---
 def scrape_naver_blog(url: str) -> str:
     debug_log("scrape_naver_blog ν•¨μˆ˜ μ‹œμž‘")
     debug_log(f"μš”μ²­λ°›μ€ URL: {url}")
-
     headers = {
         "User-Agent": (
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -29,20 +28,14 @@ def scrape_naver_blog(url: str) -> str:
             "Chrome/96.0.4664.110 Safari/537.36"
         )
     }
-
     try:
-        # 1) Request the Naver blog 'main' page
         response = requests.get(url, headers=headers)
         debug_log("HTTP GET μš”μ²­(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
         if response.status_code != 200:
            debug_log(f"μš”μ²­ μ‹€νŒ¨, μƒνƒœμ½”λ“œ: {response.status_code}")
            return f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {response.status_code}"
-
-        # 2) Parse the main page
         soup = BeautifulSoup(response.text, "html.parser")
         debug_log("HTML νŒŒμ‹±(메인 νŽ˜μ΄μ§€) μ™„λ£Œ")
-
-        # 3) Find the iframe tag
         iframe = soup.select_one("iframe#mainFrame")
         if not iframe:
             debug_log("iframe#mainFrame νƒœκ·Έλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€.")
@@ -51,12 +44,8 @@ def scrape_naver_blog(url: str) -> str:
         if not iframe_src:
             debug_log("iframe srcκ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€.")
             return "λ³Έλ¬Έ iframe의 srcλ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
-
-        # 4) Resolve the iframe src to an absolute URL
         parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
         debug_log(f"iframe νŽ˜μ΄μ§€ μš”μ²­ URL: {parsed_iframe_url}")
-
-        # 5) Request and parse the iframe page
         iframe_response = requests.get(parsed_iframe_url, headers=headers)
         debug_log("HTTP GET μš”μ²­(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
         if iframe_response.status_code != 200:
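
Note on the hunk above: `urllib.parse.urljoin` is what turns a root-relative iframe `src` into a fetchable absolute URL. A one-line illustration (the URL and src values here are made up, not taken from the code):

    import urllib.parse
    base = "https://blog.naver.com/someuser/223456789"        # hypothetical post URL
    src = "/PostView.naver?blogId=someuser&logNo=223456789"   # hypothetical relative iframe src
    print(urllib.parse.urljoin(base, src))
    # https://blog.naver.com/PostView.naver?blogId=someuser&logNo=223456789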
@@ -64,8 +53,6 @@ def scrape_naver_blog(url: str) -> str:
             return f"iframeμ—μ„œ 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€. μƒνƒœμ½”λ“œ: {iframe_response.status_code}"
         iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
         debug_log("HTML νŒŒμ‹±(iframe νŽ˜μ΄μ§€) μ™„λ£Œ")
-
-        # 6) Extract the title and body
         title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
         title = title_div.get_text(strip=True) if title_div else "제λͺ©μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
         debug_log(f"μΆ”μΆœλœ 제λͺ©: {title}")
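
The four hunks above only trim comments and blank lines; the scraping logic is unchanged. For reference, the technique it implements: a Naver blog URL serves a shell page whose actual post lives inside `iframe#mainFrame`, so the scraper fetches the shell, resolves the iframe `src`, and parses the embedded document. A condensed sketch of that two-step fetch (logging and error handling omitted; selectors as in the diff):

    import urllib.parse
    import requests
    from bs4 import BeautifulSoup

    def fetch_post_soup(url: str, headers: dict) -> BeautifulSoup:
        # Step 1: the shell page, which contains only the content iframe.
        shell = BeautifulSoup(requests.get(url, headers=headers).text, "html.parser")
        iframe = shell.select_one("iframe#mainFrame")
        # Step 2: resolve the iframe src and fetch the actual post document.
        post_url = urllib.parse.urljoin(url, iframe["src"])
        return BeautifulSoup(requests.get(post_url, headers=headers).text, "html.parser")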
@@ -75,58 +62,42 @@ def scrape_naver_blog(url: str) -> str:
         else:
             content = "본문을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
         debug_log("λ³Έλ¬Έ μΆ”μΆœ μ™„λ£Œ")
-
         result = f"[제λͺ©]\n{title}\n\n[λ³Έλ¬Έ]\n{content}"
-        debug_log("제λͺ©κ³Ό 본문을 합쳐 λ°˜ν™˜ μ€€λΉ„ μ™„λ£Œ")
+        debug_log("제λͺ©κ³Ό λ³Έλ¬Έ ν•©μΉ¨ μ™„λ£Œ")
         return result
-
     except Exception as e:
         debug_log(f"μ—λŸ¬ λ°œμƒ: {str(e)}")
         return f"μŠ€ν¬λž˜ν•‘ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
 
-# [Reference code 1] Morphological analysis
+# --- Morphological analysis (reference code 1) ---
 def analyze_text(text: str):
     logging.basicConfig(level=logging.DEBUG)
     logger = logging.getLogger(__name__)
     logger.debug("원본 ν…μŠ€νŠΈ: %s", text)
-
-    # 1. Keep Korean characters only (strip spaces, English, symbols)
     filtered_text = re.sub(r'[^κ°€-힣]', '', text)
-    logger.debug("ν•„ν„°λ§λœ ν…μŠ€νŠΈ (ν•œκ΅­μ–΄λ§Œ, 곡백 제거): %s", filtered_text)
-
+    logger.debug("ν•„ν„°λ§λœ ν…μŠ€νŠΈ: %s", filtered_text)
     if not filtered_text:
         logger.debug("μœ νš¨ν•œ ν•œκ΅­μ–΄ ν…μŠ€νŠΈκ°€ μ—†μŒ.")
         return pd.DataFrame(columns=["단어", "λΉˆλ„μˆ˜"]), ""
-
-    # 2. Morphological analysis with Mecab (nouns and compound nouns only)
     mecab_instance = mecab.MeCab()
     tokens = mecab_instance.pos(filtered_text)
     logger.debug("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό: %s", tokens)
-
     freq = {}
     for word, pos in tokens:
-        if word and word.strip():
-            if pos.startswith("NN"):
-                freq[word] = freq.get(word, 0) + 1
-                logger.debug("단어: %s, ν’ˆμ‚¬: %s, ν˜„μž¬ λΉˆλ„: %d", word, pos, freq[word])
-
-    # 3. Sort by frequency, descending
+        if word and word.strip() and pos.startswith("NN"):
+            freq[word] = freq.get(word, 0) + 1
+            logger.debug("단어: %s, ν’ˆμ‚¬: %s, λΉˆλ„: %d", word, pos, freq[word])
     sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
-    logger.debug("λ‚΄λ¦Όμ°¨μˆœ μ •λ ¬λœ 단어 λΉˆλ„: %s", sorted_freq)
-
-    # 4. Build the result DataFrame
+    logger.debug("μ •λ ¬λœ 단어 λΉˆλ„: %s", sorted_freq)
     df = pd.DataFrame(sorted_freq, columns=["단어", "λΉˆλ„μˆ˜"])
-    logger.debug("κ²°κ³Ό DataFrame 생성됨, shape: %s", df.shape)
-
-    # 5. Write a temporary Excel file
+    logger.debug("ν˜•νƒœμ†Œ 뢄석 DataFrame 생성됨, shape: %s", df.shape)
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
     df.to_excel(temp_file.name, index=False, engine='openpyxl')
     temp_file.close()
     logger.debug("Excel 파일 생성됨: %s", temp_file.name)
-
     return df, temp_file.name
 
-# [Reference code 2] Naver ad API: search volume and blog document counts
+# --- Naver search and ad API (reference code 2) ---
 def generate_signature(timestamp, method, uri, secret_key):
     message = f"{timestamp}.{method}.{uri}"
     digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
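
The main behavioral edit in the hunk above is the counting loop in `analyze_text`: the nested `if` pair collapses into a single condition, so only non-empty tokens whose POS tag starts with "NN" (the noun tags in the tagset python-mecab-ko uses) are counted. A minimal standalone sketch of that idea (the sample sentence and output are illustrative only):

    import mecab

    tagger = mecab.MeCab()
    freq = {}
    for word, pos in tagger.pos("넀이버 λΈ”λ‘œκ·Έ 뢄석기가 λΈ”λ‘œκ·Έ 본문을 λΆ„μ„ν•œλ‹€"):
        if word.strip() and pos.startswith("NN"):   # keep nouns only
            freq[word] = freq.get(word, 0) + 1
    print(sorted(freq.items(), key=lambda x: x[1], reverse=True))
    # e.g. [('λΈ”λ‘œκ·Έ', 2), ('넀이버', 1), ...] depending on the dictionary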
@@ -148,7 +119,6 @@ def fetch_related_keywords(keyword):
     API_KEY = os.environ["NAVER_API_KEY"]
     SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
     CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
-
     BASE_URL = "https://api.naver.com"
     uri = "/keywordstool"
     method = "GET"
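
`generate_signature` (shown as context above) implements the Naver Search Ad API signing scheme: HMAC-SHA256 over `"{timestamp}.{method}.{uri}"`, base64-encoded. The diff truncates the function body and never shows how the request headers are assembled, so the following is only a sketch; the base64 return and the header names `X-Timestamp`, `X-API-KEY`, `X-Customer`, `X-Signature` are assumptions here, not taken from the diff:

    import base64, hashlib, hmac, time

    def generate_signature(timestamp, method, uri, secret_key):
        message = f"{timestamp}.{method}.{uri}"
        digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"),
                          hashlib.sha256).digest()
        # base64 return is assumed; the diff cuts off here (note `import base64` up top)
        return base64.b64encode(digest).decode("utf-8")

    def build_headers(api_key, secret_key, customer_id, method="GET", uri="/keywordstool"):
        ts = str(round(time.time() * 1000))   # millisecond timestamp
        return {
            "Content-Type": "application/json; charset=UTF-8",
            "X-Timestamp": ts,
            "X-API-KEY": api_key,
            "X-Customer": str(customer_id),
            "X-Signature": generate_signature(ts, method, uri, secret_key),
        }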
@@ -164,13 +134,11 @@ def fetch_related_keywords(keyword):
     df = pd.DataFrame(data["keywordList"])
     if len(df) > 100:
         df = df.head(100)
-
     def parse_count(x):
         try:
             return int(str(x).replace(",", ""))
         except:
             return 0
-
     df["PCμ›”κ²€μƒ‰λŸ‰"] = df["monthlyPcQcCnt"].apply(parse_count)
     df["λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰"] = df["monthlyMobileQcCnt"].apply(parse_count)
     df["ν† νƒˆμ›”κ²€μƒ‰λŸ‰"] = df["PCμ›”κ²€μƒ‰λŸ‰"] + df["λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰"]
@@ -209,7 +177,6 @@ def process_keyword(keywords: str, include_related: bool):
     debug_log(f"process_keyword 호좜, ν‚€μ›Œλ“œλ“€: {keywords}, 연관검색어 포함: {include_related}")
     input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
     result_dfs = []
-
     for idx, kw in enumerate(input_keywords):
         df_kw = fetch_related_keywords(kw)
         if df_kw.empty:
@@ -223,81 +190,86 @@ def process_keyword(keywords: str, include_related: bool):
         df_related = df_kw[df_kw["μ •λ³΄ν‚€μ›Œλ“œ"] != kw]
         if not df_related.empty:
             result_dfs.append(df_related)
-
     if result_dfs:
         result_df = pd.concat(result_dfs, ignore_index=True)
         result_df.drop_duplicates(subset=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)
     else:
         result_df = pd.DataFrame(columns=["μ •λ³΄ν‚€μ›Œλ“œ", "PCμ›”κ²€μƒ‰λŸ‰", "λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", "ν† νƒˆμ›”κ²€μƒ‰λŸ‰"])
-
     result_df["λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜"] = result_df["μ •λ³΄ν‚€μ›Œλ“œ"].apply(fetch_blog_count)
     result_df.sort_values(by="ν† νƒˆμ›”κ²€μƒ‰λŸ‰", ascending=False, inplace=True)
     debug_log("process_keyword μ™„λ£Œ")
     return result_df, create_excel_file(result_df)
 
-# Morphological analysis enriched with search volume and blog document counts via reference code 1 and 2 (with a frequency-1 removal option)
+# --- Merge morphological analysis with search volume / blog document counts ---
 def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich ν•¨μˆ˜ μ‹œμž‘")
     df_freq, _ = analyze_text(text)
     if df_freq.empty:
         debug_log("ν˜•νƒœμ†Œ 뢄석 κ²°κ³Όκ°€ 빈 λ°μ΄ν„°ν”„λ ˆμž„μž…λ‹ˆλ‹€.")
         return df_freq, ""
-
     if remove_freq1:
         before_shape = df_freq.shape
         df_freq = df_freq[df_freq["λΉˆλ„μˆ˜"] != 1]
         debug_log(f"λΉˆλ„μˆ˜ 1 제거 적용됨. {before_shape} -> {df_freq.shape}")
-
-    # Extract keywords from the analysis result (one word per line)
     keywords = "\n".join(df_freq["단어"].tolist())
     debug_log(f"λΆ„μ„λœ ν‚€μ›Œλ“œ: {keywords}")
-
-    # Look up search volume and blog document counts per keyword via reference code 2 (no related keywords)
     df_keyword_info, _ = process_keyword(keywords, include_related=False)
     debug_log("κ²€μƒ‰λŸ‰ 및 λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜ 쑰회 μ™„λ£Œ")
-
-    # Merge the frequency table with the search-volume data on the keyword
     merged_df = pd.merge(df_freq, df_keyword_info, left_on="단어", right_on="μ •λ³΄ν‚€μ›Œλ“œ", how="left")
     merged_df.drop(columns=["μ •λ³΄ν‚€μ›Œλ“œ"], inplace=True)
-
-    # Write the merged result to an Excel file
     merged_excel_path = create_excel_file(merged_df)
     debug_log("morphological_analysis_and_enrich ν•¨μˆ˜ μ™„λ£Œ")
     return merged_df, merged_excel_path
 
-# New features 1-3: count how often directly entered keywords (several, separated by newlines or ',') appear in the blog body
-def direct_keyword_analysis(text: str, keyword_input: str):
-    debug_log("direct_keyword_analysis ν•¨μˆ˜ μ‹œμž‘")
-    # Split on newlines or commas to build the keyword list
-    keywords = re.split(r'[\n,]+', keyword_input)
-    keywords = [kw.strip() for kw in keywords if kw.strip()]
-    debug_log(f"μž…λ ₯된 ν‚€μ›Œλ“œ λͺ©λ‘: {keywords}")
-    results = []
-    for kw in keywords:
-        count = text.count(kw)
-        results.append((kw, count))
-        debug_log(f"ν‚€μ›Œλ“œ '{kw}'의 λΉˆλ„μˆ˜: {count}")
-    df = pd.DataFrame(results, columns=["ν‚€μ›Œλ“œ", "λΉˆλ„μˆ˜"])
-    excel_path = create_excel_file(df)
-    debug_log("direct_keyword_analysis ν•¨μˆ˜ μ™„λ£Œ")
-    return df, excel_path
-
-# On clicking the analyze button, run morphological analysis and direct keyword analysis together on the editable blog body
-def analyze_combined(blog_text: str, remove_freq1: bool, keyword_input: str):
-    debug_log("analyze_combined ν•¨μˆ˜ μ‹œμž‘")
-    morph_df, morph_excel = morphological_analysis_and_enrich(blog_text, remove_freq1)
-    direct_df, direct_excel = direct_keyword_analysis(blog_text, keyword_input)
-    debug_log("analyze_combined ν•¨μˆ˜ μ™„λ£Œ")
-    return morph_df, morph_excel, direct_df, direct_excel
+# --- Combined analysis (morphological analysis + direct keywords) ---
+def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
+    debug_log("combined_analysis ν•¨μˆ˜ μ‹œμž‘")
+    # Morphological analysis merged with search volume / blog document counts
+    merged_df, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
+    # Add a 'μ§μ ‘μž…λ ₯' (direct input) column, initially empty
+    if "μ§μ ‘μž…λ ₯" not in merged_df.columns:
+        merged_df["μ§μ ‘μž…λ ₯"] = ""
+    # Directly entered keywords (separated by newlines or commas)
+    direct_keywords = re.split(r'[\n,]+', direct_keyword_input)
+    direct_keywords = [kw.strip() for kw in direct_keywords if kw.strip()]
+    debug_log(f"μž…λ ₯된 직접 ν‚€μ›Œλ“œ: {direct_keywords}")
+    for dk in direct_keywords:
+        if dk in merged_df["단어"].values:
+            merged_df.loc[merged_df["단어"] == dk, "μ§μ ‘μž…λ ₯"] = "μ§μ ‘μž…λ ₯"
+        else:
+            freq = blog_text.count(dk)
+            df_direct, _ = process_keyword(dk, include_related=False)
+            if (not df_direct.empty) and (dk in df_direct["μ •λ³΄ν‚€μ›Œλ“œ"].values):
+                row = df_direct[df_direct["μ •λ³΄ν‚€μ›Œλ“œ"] == dk].iloc[0]
+                pc = row.get("PCμ›”κ²€μƒ‰λŸ‰", None)
+                mobile = row.get("λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰", None)
+                total = row.get("ν† νƒˆμ›”κ²€μƒ‰λŸ‰", None)
+                blog_count = row.get("λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜", None)
+            else:
+                pc = mobile = total = blog_count = None
+            new_row = {
+                "단어": dk,
+                "λΉˆλ„μˆ˜": freq,
+                "PCμ›”κ²€μƒ‰λŸ‰": pc,
+                "λͺ¨λ°”μΌμ›”κ²€μƒ‰λŸ‰": mobile,
+                "ν† νƒˆμ›”κ²€μƒ‰λŸ‰": total,
+                "λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜": blog_count,
+                "μ§μ ‘μž…λ ₯": "μ§μ ‘μž…λ ₯"
+            }
+            merged_df = pd.concat([merged_df, pd.DataFrame([new_row])], ignore_index=True)
+    merged_df = merged_df.sort_values(by="λΉˆλ„μˆ˜", ascending=False).reset_index(drop=True)
+    combined_excel = create_excel_file(merged_df)
+    debug_log("combined_analysis ν•¨μˆ˜ μ™„λ£Œ")
+    return merged_df, combined_excel
 
-# Run scraping: fetch the blog content from the link into the editable textbox
+# --- Scraping entry point ---
 def fetch_blog_content(url: str):
     debug_log("fetch_blog_content ν•¨μˆ˜ μ‹œμž‘")
     content = scrape_naver_blog(url)
     debug_log("fetch_blog_content ν•¨μˆ˜ μ™„λ£Œ")
     return content
 
-# Gradio interface (single tab)
+# --- Gradio interface ---
 with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 슀페이슀", css=".gradio-container { max-width: 960px; margin: auto; }") as demo:
     gr.Markdown("# 넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 슀페이슀")
     with gr.Row():
@@ -307,26 +279,18 @@ with gr.Blocks(title="넀이버 λΈ”λ‘œκ·Έ ν˜•νƒœμ†Œ 뢄석 슀페이슀", css=".
         blog_content_box = gr.Textbox(label="λΈ”λ‘œκ·Έ λ‚΄μš© (μˆ˜μ • κ°€λŠ₯)", lines=10, placeholder="μŠ€ν¬λž˜ν•‘λœ λΈ”λ‘œκ·Έ λ‚΄μš©μ΄ 여기에 ν‘œμ‹œλ©λ‹ˆλ‹€.")
     with gr.Row():
         remove_freq_checkbox = gr.Checkbox(label="λΉˆλ„μˆ˜1 제거", value=False)
-    with gr.Row():
-        keyword_input_box = gr.Textbox(label="직접 ν‚€μ›Œλ“œ μž…λ ₯ (μ—”ν„° λ˜λŠ” ','둜 ꡬ뢄)", lines=2, placeholder="예: ν‚€μ›Œλ“œ1, ν‚€μ›Œλ“œ2\nν‚€μ›Œλ“œ3")
+        direct_keyword_box = gr.Textbox(label="직접 ν‚€μ›Œλ“œ μž…λ ₯ (μ—”ν„° λ˜λŠ” ','둜 ꡬ뢄)", lines=2, placeholder="예: ν‚€μ›Œλ“œ1, ν‚€μ›Œλ“œ2\nν‚€μ›Œλ“œ3")
     with gr.Row():
         analyze_button = gr.Button("뢄석 μ‹€ν–‰")
-
-    gr.Markdown("### ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό")
     with gr.Row():
-        morph_result_df = gr.Dataframe(label="ν˜•νƒœμ†Œ 뢄석 κ²°κ³Ό (단어, λΉˆλ„μˆ˜, κ²€μƒ‰λŸ‰, λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜ λ“±)")
-        morph_excel_file = gr.File(label="ν˜•νƒœμ†Œ 뢄석 Excel λ‹€μš΄λ‘œλ“œ")
-
-    gr.Markdown("### 직접 ν‚€μ›Œλ“œ 뢄석 κ²°κ³Ό")
+        result_df = gr.Dataframe(label="톡합 뢄석 κ²°κ³Ό (단어, λΉˆλ„μˆ˜, κ²€μƒ‰λŸ‰, λΈ”λ‘œκ·Έλ¬Έμ„œμˆ˜, μ§μ ‘μž…λ ₯)", interactive=True)
     with gr.Row():
-        direct_result_df = gr.Dataframe(label="직접 ν‚€μ›Œλ“œ 뢄석 κ²°κ³Ό (ν‚€μ›Œλ“œ, λΉˆλ„μˆ˜)")
-        direct_excel_file = gr.File(label="직접 ν‚€μ›Œλ“œ 뢄석 Excel λ‹€μš΄λ‘œλ“œ")
+        excel_file = gr.File(label="Excel λ‹€μš΄λ‘œλ“œ")
 
-    # Scrape: entering a URL fills the editable textbox with the blog content
+    # Wire up events
     scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
-    # Analyze: run both analyses on the edited content, the frequency-1 option, and the direct keywords
-    analyze_button.click(fn=analyze_combined, inputs=[blog_content_box, remove_freq_checkbox, keyword_input_box],
-                         outputs=[morph_result_df, morph_excel_file, direct_result_df, direct_excel_file])
+    analyze_button.click(fn=combined_analysis, inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box],
+                         outputs=[result_df, excel_file])
 
 if __name__ == "__main__":
     debug_log("Gradio μ•± μ‹€ν–‰ μ‹œμž‘")
 