Kims12 committed
Commit c850803 · verified · 1 Parent(s): d5fb63f

Update app.py

Files changed (1)
  1. app.py +70 -85
app.py CHANGED
@@ -17,14 +17,11 @@ import base64
 def debug_log(message: str):
     print(f"[DEBUG] {message}")
 
-# =============================================================================
-# [Base code]: function that extracts the title and body from a Naver blog
-# =============================================================================
+# [Base code] - Naver blog scraping feature
 def scrape_naver_blog(url: str) -> str:
     debug_log("scrape_naver_blog 함수 시작")
     debug_log(f"요청받은 URL: {url}")
 
-    # Header setup (avoids crawler blocking)
     headers = {
         "User-Agent": (
             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -34,57 +31,51 @@ def scrape_naver_blog(url: str) -> str:
     }
 
     try:
-        # 1) Request the Naver blog main page
+        # 1) Request the Naver blog 'main' page
         response = requests.get(url, headers=headers)
         debug_log("HTTP GET 요청(메인 페이지) 완료")
-
         if response.status_code != 200:
             debug_log(f"요청 실패, 상태코드: {response.status_code}")
             return f"오류가 발생했습니다. 상태코드: {response.status_code}"
-
+
+        # 2) Parse the main page
         soup = BeautifulSoup(response.text, "html.parser")
         debug_log("HTML 파싱(메인 페이지) 완료")
-
-        # 2) Find the iframe tag
+
+        # 3) Find the iframe tag
         iframe = soup.select_one("iframe#mainFrame")
         if not iframe:
             debug_log("iframe#mainFrame 태그를 찾을 수 없습니다.")
             return "본문 iframe을 찾을 수 없습니다."
-
         iframe_src = iframe.get("src")
         if not iframe_src:
             debug_log("iframe src가 존재하지 않습니다.")
             return "본문 iframe의 src를 찾을 수 없습니다."
-
-        # 3) If the iframe src is a relative path, resolve it to an absolute one
+
+        # 4) Normalize the iframe src (absolute-path handling)
         parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
         debug_log(f"iframe 페이지 요청 URL: {parsed_iframe_url}")
-
-        # 4) Re-request the iframe page
+
+        # 5) Request and parse the iframe page
         iframe_response = requests.get(parsed_iframe_url, headers=headers)
         debug_log("HTTP GET 요청(iframe 페이지) 완료")
-
         if iframe_response.status_code != 200:
             debug_log(f"iframe 요청 실패, 상태코드: {iframe_response.status_code}")
             return f"iframe에서 오류가 발생했습니다. 상태코드: {iframe_response.status_code}"
-
         iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
         debug_log("HTML 파싱(iframe 페이지) 완료")
-
-        # Extract the title
+
+        # 6) Extract the title and body
         title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
         title = title_div.get_text(strip=True) if title_div else "제목을 찾을 수 없습니다."
        debug_log(f"추출된 제목: {title}")
-
-        # Extract the body
         content_div = iframe_soup.select_one('.se-main-container')
         if content_div:
             content = content_div.get_text("\n", strip=True)
         else:
             content = "본문을 찾을 수 없습니다."
         debug_log("본문 추출 완료")
-
-        # Combine the results
+
         result = f"[제목]\n{title}\n\n[본문]\n{content}"
         debug_log("제목과 본문을 합쳐 반환 준비 완료")
         return result
@@ -93,13 +84,10 @@ def scrape_naver_blog(url: str) -> str:
         debug_log(f"에러 발생: {str(e)}")
         return f"스크래핑 중 오류가 발생했습니다: {str(e)}"
 
-# =============================================================================
-# [Reference code-1]: morphological analysis function (using Mecab)
-# =============================================================================
-logging.basicConfig(level=logging.DEBUG)
-logger = logging.getLogger(__name__)
-
+# [Reference code-1] Morphological analysis feature
 def analyze_text(text: str):
+    logging.basicConfig(level=logging.DEBUG)
+    logger = logging.getLogger(__name__)
     logger.debug("원본 텍스트: %s", text)
 
     # 1. Keep only Korean (strip spaces, English, symbols, etc.)
@@ -111,7 +99,7 @@ def analyze_text(text: str):
         return pd.DataFrame(columns=["단어", "빈도수"]), ""
 
     # 2. Morphological analysis with Mecab (extract nouns and compound nouns only)
-    mecab_instance = mecab.MeCab()  # create an instance
+    mecab_instance = mecab.MeCab()
     tokens = mecab_instance.pos(filtered_text)
     logger.debug("형태소 분석 결과: %s", tokens)
 
@@ -130,7 +118,7 @@ def analyze_text(text: str):
     df = pd.DataFrame(sorted_freq, columns=["단어", "빈도수"])
     logger.debug("결과 DataFrame 생성됨, shape: %s", df.shape)
 
-    # 5. Create the Excel file (saved as a temp file)
+    # 5. Create the Excel file (temp file)
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
     df.to_excel(temp_file.name, index=False, engine='openpyxl')
     temp_file.close()
@@ -138,9 +126,7 @@ def analyze_text(text: str):
 
     return df, temp_file.name
 
-# =============================================================================
-# [Reference code-2]: functions for keyword search volume and blog post count lookups
-# =============================================================================
+# [Reference code-2] Naver Ads API and search-volume/blog-post-count lookup feature
 def generate_signature(timestamp, method, uri, secret_key):
     message = f"{timestamp}.{method}.{uri}"
     digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
@@ -158,6 +144,7 @@ def get_header(method, uri, api_key, secret_key, customer_id):
     }
 
 def fetch_related_keywords(keyword):
+    debug_log(f"fetch_related_keywords 호출, 키워드: {keyword}")
     API_KEY = os.environ["NAVER_API_KEY"]
     SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
     CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
@@ -189,9 +176,11 @@ def fetch_related_keywords(keyword):
     df["토탈월검색량"] = df["PC월검색량"] + df["모바일월검색량"]
     df.rename(columns={"relKeyword": "정보키워드"}, inplace=True)
     result_df = df[["정보키워드", "PC월검색량", "모바일월검색량", "토탈월검색량"]]
+    debug_log("fetch_related_keywords 완료")
     return result_df
 
 def fetch_blog_count(keyword):
+    debug_log(f"fetch_blog_count 호출, 키워드: {keyword}")
     client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
     client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
     url = "https://openapi.naver.com/v1/search/blog.json"
@@ -203,23 +192,21 @@ def fetch_blog_count(keyword):
     response = requests.get(url, headers=headers, params=params)
     if response.status_code == 200:
         data = response.json()
+        debug_log(f"fetch_blog_count 결과: {data.get('total', 0)}")
         return data.get("total", 0)
     else:
+        debug_log(f"fetch_blog_count 오류, 상태코드: {response.status_code}")
         return 0
 
 def create_excel_file(df):
     with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
         excel_path = tmp.name
     df.to_excel(excel_path, index=False)
+    debug_log(f"Excel 파일 생성됨: {excel_path}")
     return excel_path
 
 def process_keyword(keywords: str, include_related: bool):
-    """
-    Split the keywords on newlines into a list, look up search volume for each
-    via the Naver Ads API, optionally add related keywords for the first one,
-    then look up the blog post count for each keyword and return a DataFrame
-    and an Excel file.
-    """
+    debug_log(f"process_keyword 호출, 키워드들: {keywords}, 연관검색어 포함: {include_related}")
     input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
     result_dfs = []
 
@@ -245,64 +232,62 @@ def process_keyword(keywords: str, include_related: bool):
 
     result_df["블로그문서수"] = result_df["정보키워드"].apply(fetch_blog_count)
     result_df.sort_values(by="토탈월검색량", ascending=False, inplace=True)
-
+    debug_log("process_keyword 완료")
     return result_df, create_excel_file(result_df)
 
-# =============================================================================
-# Combined pipeline: run morphological analysis on the blog text, then add
-# keyword search volume and blog post counts and return the final result.
-# =============================================================================
-def process_blog_content(text: str):
-    debug_log("process_blog_content 함수 시작")
-    # 1. Run morphological analysis (via [Reference code-1])
-    df_morph, morph_excel = analyze_text(text)
-    debug_log("형태소 분석 완료")
-
-    if df_morph.empty:
-        debug_log("형태소 분석 결과가 비어있음")
-        return df_morph, ""
-
-    # 2. Extract the analyzed word list (for keyword lookups)
-    keywords = "\n".join(df_morph["단어"].tolist())
-    debug_log(f"추출된 단어 목록: {keywords}")
+# New feature: on '블로그내용가져오기', scrape the title/body from the blog link
+def fetch_blog_content(url: str):
+    debug_log("fetch_blog_content 함수 시작")
+    content = scrape_naver_blog(url)
+    debug_log("fetch_blog_content 함수 완료")
+    return content
+
+# New feature: morphological analysis enriched with search volume and blog post counts
+def morphological_analysis_and_enrich(text: str):
+    debug_log("morphological_analysis_and_enrich 함수 시작")
+    df_freq, _ = analyze_text(text)
+    if df_freq.empty:
+        debug_log("형태소 분석 결과가 빈 데이터프레임입니다.")
+        return df_freq, ""
 
-    # 3. Look up keyword search volume and blog post counts (via [Reference code-2])
-    df_keyword, keyword_excel = process_keyword(keywords, include_related=False)
-    debug_log("키워드 검색 정보 조회 완료")
+    # Extract keywords from the analysis result (one word per line)
+    keywords = "\n".join(df_freq["단어"].tolist())
+    debug_log(f"분석된 키워드: {keywords}")
 
-    # 4. Merge the analysis result with the keyword info on the word column
-    df_merged = pd.merge(df_morph, df_keyword, left_on="단어", right_on="정보키워드", how="left")
-    debug_log("데이터 병합 완료")
-    df_merged.drop(columns=["정보키워드"], inplace=True)
+    # Use [Reference code-2] to look up search volume and blog post counts (no related keywords)
+    df_keyword_info, _ = process_keyword(keywords, include_related=False)
+    debug_log("검색량 및 블로그문서수 조회 완료")
 
-    # 5. Write the merged result to an Excel file
-    merged_excel = create_excel_file(df_merged)
-    debug_log(f"병합 결과 Excel 파일 생성됨: {merged_excel}")
+    # Merge the analysis result with the search-volume info (keyed on the word)
+    merged_df = pd.merge(df_freq, df_keyword_info, left_on="단어", right_on="정보키워드", how="left")
+    merged_df.drop(columns=["정보키워드"], inplace=True)
 
-    return df_merged, merged_excel
+    # Create an Excel file from the merged result
+    merged_excel_path = create_excel_file(merged_df)
+    debug_log("morphological_analysis_and_enrich 함수 완료")
+    return merged_df, merged_excel_path
 
-# =============================================================================
-# Gradio interface (Hugging Face Gradio environment)
-# =============================================================================
-with gr.Blocks() as demo:
-    gr.Markdown("# 블로그 글 형태소 분석 및 키워드 정보 조회")
+# Gradio interface (suited to the Hugging Face Spaces environment)
+with gr.Blocks(title="블로그글 형태소 분석 스페이스", css=".gradio-container { max-width: 960px; margin: auto; }") as demo:
+    gr.Markdown("# 블로그글 형태소 분석 스페이스")
 
-    with gr.Tab("블로그 내용 입력 및 스크래핑"):
+    with gr.Tab("블로그 내용 가져오기"):
         with gr.Row():
-            blog_url = gr.Textbox(label="네이버 블로그 링크", placeholder="예: https://blog.naver.com/ssboost/222983068507")
+            blog_url_input = gr.Textbox(label="네이버 블로그 링크", placeholder="예: https://blog.naver.com/ssboost/222983068507", lines=1)
             fetch_button = gr.Button("블로그내용가져오기")
-        blog_content = gr.Textbox(label="블로그 내용 (제목 및 본문)", lines=10, placeholder="블로그 내용을 가져오거나 직접 입력하세요.")
-        # On '블로그내용가져오기' click, run the scraper and put the result into blog_content
-        fetch_button.click(fn=scrape_naver_blog, inputs=blog_url, outputs=blog_content)
+        blog_content = gr.Textbox(label="블로그 내용", lines=10, placeholder="블로그 내용을 가져오거나 직접 입력하세요.")
+        fetch_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content)
 
-    with gr.Tab("형태소 분석 실행"):
+    with gr.Tab("형태소 분석"):
+        with gr.Row():
+            analysis_input = gr.Textbox(label="분석할 텍스트", lines=10, placeholder="분석할 텍스트를 입력하거나 '블로그 내용 가져오기'에서 가져온 내용을 수정하세요.")
+        with gr.Row():
+            analyze_button = gr.Button("형태소분석")
+        with gr.Row():
+            analysis_result = gr.Dataframe(label="분석 결과 (단어, 빈도수, 검색량, 블로그문서수 등)")
         with gr.Row():
-            analysis_button = gr.Button("형태소분석")
-            # interactive=True so the result table is editable
-            output_table = gr.Dataframe(label="분석 결과 (형태소 및 키워드 정보)", interactive=True)
-        output_file = gr.File(label="Excel 다운로드")
-        # Run process_blog_content on '형태소분석' click
-        analysis_button.click(fn=process_blog_content, inputs=blog_content, outputs=[output_table, output_file])
+            analysis_excel = gr.File(label="Excel 다운로드")
+        analyze_button.click(fn=morphological_analysis_and_enrich, inputs=analysis_input, outputs=[analysis_result, analysis_excel])
 
 if __name__ == "__main__":
     debug_log("Gradio 앱 실행 시작")