Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,10 +17,11 @@ import base64
|
|
17 |
def debug_log(message: str):
|
18 |
print(f"[DEBUG] {message}")
|
19 |
|
20 |
-
#
|
21 |
def scrape_naver_blog(url: str) -> str:
|
22 |
debug_log("scrape_naver_blog ํจ์ ์์")
|
23 |
debug_log(f"์์ฒญ๋ฐ์ URL: {url}")
|
|
|
24 |
headers = {
|
25 |
"User-Agent": (
|
26 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
@@ -28,14 +29,20 @@ def scrape_naver_blog(url: str) -> str:
|
|
28 |
"Chrome/96.0.4664.110 Safari/537.36"
|
29 |
)
|
30 |
}
|
|
|
31 |
try:
|
|
|
32 |
response = requests.get(url, headers=headers)
|
33 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
34 |
if response.status_code != 200:
|
35 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
36 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
|
|
|
|
37 |
soup = BeautifulSoup(response.text, "html.parser")
|
38 |
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
|
|
|
|
39 |
iframe = soup.select_one("iframe#mainFrame")
|
40 |
if not iframe:
|
41 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
@@ -44,8 +51,12 @@ def scrape_naver_blog(url: str) -> str:
|
|
44 |
if not iframe_src:
|
45 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
46 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
|
|
|
|
47 |
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
48 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
|
|
|
|
49 |
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
50 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
51 |
if iframe_response.status_code != 200:
|
@@ -53,6 +64,8 @@ def scrape_naver_blog(url: str) -> str:
|
|
53 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
54 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
55 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
|
|
|
|
56 |
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
57 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
58 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
@@ -62,42 +75,58 @@ def scrape_naver_blog(url: str) -> str:
|
|
62 |
else:
|
63 |
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
64 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
|
|
65 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
66 |
-
debug_log("์ ๋ชฉ๊ณผ
|
67 |
return result
|
|
|
68 |
except Exception as e:
|
69 |
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
70 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
71 |
|
72 |
-
#
|
73 |
def analyze_text(text: str):
|
74 |
logging.basicConfig(level=logging.DEBUG)
|
75 |
logger = logging.getLogger(__name__)
|
76 |
logger.debug("์๋ณธ ํ
์คํธ: %s", text)
|
|
|
|
|
77 |
filtered_text = re.sub(r'[^๊ฐ-ํฃ]', '', text)
|
78 |
-
logger.debug("ํํฐ๋ง๋
|
|
|
79 |
if not filtered_text:
|
80 |
logger.debug("์ ํจํ ํ๊ตญ์ด ํ
์คํธ๊ฐ ์์.")
|
81 |
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
|
|
|
|
82 |
mecab_instance = mecab.MeCab()
|
83 |
tokens = mecab_instance.pos(filtered_text)
|
84 |
logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)
|
|
|
85 |
freq = {}
|
86 |
for word, pos in tokens:
|
87 |
-
if word and word.strip()
|
88 |
-
|
89 |
-
|
|
|
|
|
|
|
90 |
sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
|
91 |
-
logger.debug("์ ๋ ฌ๋ ๋จ์ด ๋น๋: %s", sorted_freq)
|
|
|
|
|
92 |
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
93 |
-
logger.debug("
|
|
|
|
|
94 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
95 |
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
96 |
temp_file.close()
|
97 |
logger.debug("Excel ํ์ผ ์์ฑ๋จ: %s", temp_file.name)
|
|
|
98 |
return df, temp_file.name
|
99 |
|
100 |
-
#
|
101 |
def generate_signature(timestamp, method, uri, secret_key):
|
102 |
message = f"{timestamp}.{method}.{uri}"
|
103 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
@@ -119,6 +148,7 @@ def fetch_related_keywords(keyword):
|
|
119 |
API_KEY = os.environ["NAVER_API_KEY"]
|
120 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
121 |
CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
|
|
|
122 |
BASE_URL = "https://api.naver.com"
|
123 |
uri = "/keywordstool"
|
124 |
method = "GET"
|
@@ -134,11 +164,13 @@ def fetch_related_keywords(keyword):
|
|
134 |
df = pd.DataFrame(data["keywordList"])
|
135 |
if len(df) > 100:
|
136 |
df = df.head(100)
|
|
|
137 |
def parse_count(x):
|
138 |
try:
|
139 |
return int(str(x).replace(",", ""))
|
140 |
except:
|
141 |
return 0
|
|
|
142 |
df["PC์๊ฒ์๋"] = df["monthlyPcQcCnt"].apply(parse_count)
|
143 |
df["๋ชจ๋ฐ์ผ์๊ฒ์๋"] = df["monthlyMobileQcCnt"].apply(parse_count)
|
144 |
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
@@ -177,6 +209,7 @@ def process_keyword(keywords: str, include_related: bool):
|
|
177 |
debug_log(f"process_keyword ํธ์ถ, ํค์๋๋ค: {keywords}, ์ฐ๊ด๊ฒ์์ด ํฌํจ: {include_related}")
|
178 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
179 |
result_dfs = []
|
|
|
180 |
for idx, kw in enumerate(input_keywords):
|
181 |
df_kw = fetch_related_keywords(kw)
|
182 |
if df_kw.empty:
|
@@ -190,241 +223,73 @@ def process_keyword(keywords: str, include_related: bool):
|
|
190 |
df_related = df_kw[df_kw["์ ๋ณดํค์๋"] != kw]
|
191 |
if not df_related.empty:
|
192 |
result_dfs.append(df_related)
|
|
|
193 |
if result_dfs:
|
194 |
result_df = pd.concat(result_dfs, ignore_index=True)
|
195 |
result_df.drop_duplicates(subset=["์ ๋ณดํค์๋"], inplace=True)
|
196 |
else:
|
197 |
result_df = pd.DataFrame(columns=["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"])
|
|
|
198 |
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].apply(fetch_blog_count)
|
199 |
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
200 |
debug_log("process_keyword ์๋ฃ")
|
201 |
return result_df, create_excel_file(result_df)
|
202 |
|
203 |
-
#
|
204 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
205 |
debug_log("morphological_analysis_and_enrich ํจ์ ์์")
|
206 |
df_freq, _ = analyze_text(text)
|
207 |
if df_freq.empty:
|
208 |
debug_log("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ๊ฐ ๋น ๋ฐ์ดํฐํ๋ ์์
๋๋ค.")
|
209 |
return df_freq, ""
|
210 |
-
|
211 |
-
|
212 |
-
df_freq = df_freq[df_freq["๋น๋์"] != 1]
|
213 |
-
debug_log(f"๋น๋์ 1 ์ ๊ฑฐ ์ ์ฉ๋จ. {before_shape} -> {df_freq.shape}")
|
214 |
keywords = "\n".join(df_freq["๋จ์ด"].tolist())
|
215 |
debug_log(f"๋ถ์๋ ํค์๋: {keywords}")
|
|
|
|
|
216 |
df_keyword_info, _ = process_keyword(keywords, include_related=False)
|
217 |
debug_log("๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ์๋ฃ")
|
|
|
|
|
218 |
merged_df = pd.merge(df_freq, df_keyword_info, left_on="๋จ์ด", right_on="์ ๋ณดํค์๋", how="left")
|
219 |
merged_df.drop(columns=["์ ๋ณดํค์๋"], inplace=True)
|
|
|
|
|
220 |
merged_excel_path = create_excel_file(merged_df)
|
221 |
debug_log("morphological_analysis_and_enrich ํจ์ ์๋ฃ")
|
222 |
return merged_df, merged_excel_path
|
223 |
|
224 |
-
#
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
debug_log(f"์
๋ ฅ๋ ํค์๋ ๋ชฉ๋ก: {keywords}")
|
230 |
-
results = []
|
231 |
-
for kw in keywords:
|
232 |
-
count = text.count(kw)
|
233 |
-
results.append((kw, count))
|
234 |
-
debug_log(f"ํค์๋ '{kw}'์ ๋น๋์: {count}")
|
235 |
-
df = pd.DataFrame(results, columns=["ํค์๋", "๋น๋์"])
|
236 |
-
excel_path = create_excel_file(df)
|
237 |
-
debug_log("direct_keyword_analysis ํจ์ ์๋ฃ")
|
238 |
-
return df, excel_path
|
239 |
-
|
240 |
-
# --- ํตํฉ ๋ถ์ (ํํ์ ๋ถ์ + ์ง์ ํค์๋ ๋ถ์) ---
|
241 |
-
def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
|
242 |
-
debug_log("combined_analysis ํจ์ ์์")
|
243 |
-
merged_df, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
|
244 |
-
if "์ง์ ์
๋ ฅ" not in merged_df.columns:
|
245 |
-
merged_df["์ง์ ์
๋ ฅ"] = ""
|
246 |
-
direct_keywords = re.split(r'[\n,]+', direct_keyword_input)
|
247 |
-
direct_keywords = [kw.strip() for kw in direct_keywords if kw.strip()]
|
248 |
-
debug_log(f"์
๋ ฅ๋ ์ง์ ํค์๋: {direct_keywords}")
|
249 |
-
for dk in direct_keywords:
|
250 |
-
if dk in merged_df["๋จ์ด"].values:
|
251 |
-
merged_df.loc[merged_df["๋จ์ด"] == dk, "์ง์ ์
๋ ฅ"] = "์ง์ ์
๋ ฅ"
|
252 |
-
else:
|
253 |
-
freq = blog_text.count(dk)
|
254 |
-
df_direct, _ = process_keyword(dk, include_related=False)
|
255 |
-
if (not df_direct.empty) and (dk in df_direct["์ ๋ณดํค์๋"].values):
|
256 |
-
row = df_direct[df_direct["์ ๋ณดํค์๋"] == dk].iloc[0]
|
257 |
-
pc = row.get("PC์๊ฒ์๋", None)
|
258 |
-
mobile = row.get("๋ชจ๋ฐ์ผ์๊ฒ์๋", None)
|
259 |
-
total = row.get("ํ ํ์๊ฒ์๋", None)
|
260 |
-
blog_count = row.get("๋ธ๋ก๊ทธ๋ฌธ์์", None)
|
261 |
-
else:
|
262 |
-
pc = mobile = total = blog_count = None
|
263 |
-
new_row = {
|
264 |
-
"๋จ์ด": dk,
|
265 |
-
"๋น๋์": freq,
|
266 |
-
"PC์๊ฒ์๋": pc,
|
267 |
-
"๋ชจ๋ฐ์ผ์๊ฒ์๋": mobile,
|
268 |
-
"ํ ํ์๊ฒ์๋": total,
|
269 |
-
"๋ธ๋ก๊ทธ๋ฌธ์์": blog_count,
|
270 |
-
"์ง์ ์
๋ ฅ": "์ง์ ์
๋ ฅ"
|
271 |
-
}
|
272 |
-
merged_df = pd.concat([merged_df, pd.DataFrame([new_row])], ignore_index=True)
|
273 |
-
merged_df = merged_df.sort_values(by="๋น๋์", ascending=False).reset_index(drop=True)
|
274 |
-
combined_excel = create_excel_file(merged_df)
|
275 |
-
debug_log("combined_analysis ํจ์ ์๋ฃ")
|
276 |
-
return merged_df, combined_excel
|
277 |
-
|
278 |
-
# --- ๋ถ์ ํธ๋ค๋ฌ ---
|
279 |
-
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
|
280 |
-
debug_log("analysis_handler ํจ์ ์์")
|
281 |
-
if direct_keyword_only:
|
282 |
-
# "์ง์ ํค์๋ ์
๋ ฅ๋ง ๋ถ์" ์ ํ ์ ๋จ๋
๋ถ์ ์ํ
|
283 |
-
return direct_keyword_analysis(blog_text, direct_keyword_input)
|
284 |
-
else:
|
285 |
-
# ๊ธฐ๋ณธ ํตํฉ ๋ถ์ ์ํ
|
286 |
-
return combined_analysis(blog_text, remove_freq1, direct_keyword_input)
|
287 |
-
|
288 |
-
# --- ์คํฌ๋ํ ์คํ ---
|
289 |
-
def fetch_blog_content(url: str):
|
290 |
-
debug_log("fetch_blog_content ํจ์ ์์")
|
291 |
-
content = scrape_naver_blog(url)
|
292 |
-
debug_log("fetch_blog_content ํจ์ ์๋ฃ")
|
293 |
-
return content
|
294 |
-
|
295 |
-
# --- Custom CSS ---
|
296 |
-
custom_css = """
|
297 |
-
/* ์ ์ฒด ์ปจํ
์ด๋ ์คํ์ผ */
|
298 |
-
.gradio-container {
|
299 |
-
max-width: 960px;
|
300 |
-
margin: auto;
|
301 |
-
font-family: 'Helvetica Neue', Arial, sans-serif;
|
302 |
-
background: #f5f7fa;
|
303 |
-
padding: 2rem;
|
304 |
-
}
|
305 |
-
/* ํค๋ ์คํ์ผ */
|
306 |
-
.custom-header {
|
307 |
-
text-align: center;
|
308 |
-
font-size: 2.5rem;
|
309 |
-
font-weight: bold;
|
310 |
-
margin-bottom: 1.5rem;
|
311 |
-
color: #333;
|
312 |
-
}
|
313 |
-
/* ๊ทธ๋ฃน ๋ฐ์ค ์คํ์ผ */
|
314 |
-
.custom-group {
|
315 |
-
background: #ffffff;
|
316 |
-
border-radius: 8px;
|
317 |
-
padding: 1.5rem;
|
318 |
-
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
319 |
-
margin-bottom: 1.5rem;
|
320 |
-
}
|
321 |
-
/* ๋ฒํผ ์คํ์ผ */
|
322 |
-
.custom-button {
|
323 |
-
background-color: #007bff;
|
324 |
-
color: #fff;
|
325 |
-
border: none;
|
326 |
-
border-radius: 4px;
|
327 |
-
padding: 0.6rem 1.2rem;
|
328 |
-
font-size: 1rem;
|
329 |
-
cursor: pointer;
|
330 |
-
transition: background-color 0.3s;
|
331 |
-
}
|
332 |
-
.custom-button:hover {
|
333 |
-
background-color: #0056b3;
|
334 |
-
}
|
335 |
-
/* ์ฒดํฌ๋ฐ์ค ์คํ์ผ */
|
336 |
-
.custom-checkbox {
|
337 |
-
margin-right: 1rem;
|
338 |
-
font-size: 1rem;
|
339 |
-
font-weight: bold;
|
340 |
-
}
|
341 |
-
/* ๊ฒฐ๊ณผ ํ
์ด๋ธ ๋ฐ ๋ค์ด๋ก๋ ๋ฒํผ */
|
342 |
-
.custom-result {
|
343 |
-
margin-top: 1.5rem;
|
344 |
-
}
|
345 |
-
/* ๊ฐ์ด๋ฐ ์ ๋ ฌ */
|
346 |
-
.centered {
|
347 |
-
display: flex;
|
348 |
-
justify-content: center;
|
349 |
-
align-items: center;
|
350 |
-
}
|
351 |
-
/* ์ฌ์ฉ์ค๋ช
์คํ์ผ */
|
352 |
-
.usage-instructions {
|
353 |
-
font-size: 1.1rem;
|
354 |
-
line-height: 1.6;
|
355 |
-
color: #555;
|
356 |
-
background: #fff;
|
357 |
-
padding: 1.5rem;
|
358 |
-
border-radius: 8px;
|
359 |
-
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
|
360 |
-
margin-top: 2rem;
|
361 |
-
}
|
362 |
-
.usage-instructions h2 {
|
363 |
-
font-size: 1.8rem;
|
364 |
-
font-weight: bold;
|
365 |
-
margin-bottom: 1rem;
|
366 |
-
color: #333;
|
367 |
-
}
|
368 |
-
.usage-instructions ul {
|
369 |
-
list-style: disc;
|
370 |
-
margin-left: 2rem;
|
371 |
-
}
|
372 |
-
"""
|
373 |
-
|
374 |
-
# --- Gradio ์ธํฐํ์ด์ค ๊ตฌ์ฑ ---
|
375 |
-
with gr.Blocks(title="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ํํ์ ๋ถ์ ์๋น์ค", css=custom_css) as demo:
|
376 |
-
gr.HTML("<div class='custom-header'>๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ํํ์ ๋ถ์ ์๋น์ค ๐</div>")
|
377 |
-
# ๋ธ๋ก๊ทธ ๋งํฌ์ ์คํฌ๋ํ ์คํ ๋ฒํผ์ ํ ๊ทธ๋ฃน ๋ด์ ๋ฐฐ์น (๋ฒํผ์ ๊ฐ์ด๋ฐ ์ ๋ ฌ)
|
378 |
-
with gr.Group(elem_classes="custom-group"):
|
379 |
with gr.Row():
|
380 |
blog_url_input = gr.Textbox(label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ", placeholder="์: https://blog.naver.com/ssboost/222983068507", lines=1)
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
with gr.
|
|
|
|
|
386 |
with gr.Row():
|
387 |
-
|
388 |
with gr.Row():
|
389 |
-
|
390 |
with gr.Row():
|
391 |
-
|
392 |
-
|
393 |
-
with gr.Row(elem_classes="centered"):
|
394 |
-
analyze_button = gr.Button("๋ถ์ ์คํ", elem_classes="custom-button")
|
395 |
-
with gr.Group(elem_classes="custom-group custom-result"):
|
396 |
-
result_df = gr.Dataframe(label="ํตํฉ ๋ถ์ ๊ฒฐ๊ณผ (๋จ์ด, ๋น๋์, ๊ฒ์๋, ๋ธ๋ก๊ทธ๋ฌธ์์, ์ง์ ์
๋ ฅ)", interactive=True)
|
397 |
-
with gr.Group(elem_classes="custom-group"):
|
398 |
-
excel_file = gr.File(label="Excel ๋ค์ด๋ก๋")
|
399 |
-
# ์ฌ์ฉ์ค๋ช
HTML ๋ธ๋ก (์๋์ ๋ฐฐ์น)
|
400 |
-
with gr.Group(elem_classes="custom-group"):
|
401 |
-
usage_html = gr.HTML("""
|
402 |
-
<div class="usage-instructions">
|
403 |
-
<h2>์ฌ์ฉ ์ค๋ช
๐</h2>
|
404 |
-
<ul>
|
405 |
-
<li>๐ <strong>๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ</strong>: ๋ถ์ํ ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ์ URL์ ์
๋ ฅํ์ธ์.</li>
|
406 |
-
<li>โ๏ธ <strong>์คํฌ๋ํ ์คํ</strong>: ๋งํฌ ์
๋ ฅ ํ ๋ฒํผ์ ํด๋ฆญํ๋ฉด ๋ธ๋ก๊ทธ์ ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ด ์๋์ผ๋ก ๋ถ๋ฌ์์ง๋๋ค.</li>
|
407 |
-
<li>๐ <strong>๋ธ๋ก๊ทธ ๋ด์ฉ (์์ ๊ฐ๋ฅ)</strong>: ๋ถ๋ฌ์จ ๋ธ๋ก๊ทธ ๋ด์ฉ์ด ํ์๋๋ฉฐ, ํ์์ ๋ฐ๋ผ ์ง์ ์์ ํ ์ ์์ต๋๋ค.</li>
|
408 |
-
<li>โ๏ธ <strong>์ต์
์ค์ </strong>:
|
409 |
-
<ul>
|
410 |
-
<li><em>๋น๋์1 ์ ๊ฑฐ</em>: ๊ธฐ๋ณธ ์ ํ๋์ด ์์ผ๋ฉฐ, ๋น๋์๊ฐ 1์ธ ๋จ์ด๋ ๊ฒฐ๊ณผ์์ ์ ์ธํฉ๋๋ค.</li>
|
411 |
-
<li><em>์ง์ ํค์๋ ์
๋ ฅ๋ง ๋ถ์</em>: ์ด ์ต์
์ ์ ํํ๋ฉด, ๋ธ๋ก๊ทธ ๋ณธ๋ฌธ์์ ์ง์ ์
๋ ฅํ ํค์๋๋ง ๋ถ์ํฉ๋๋ค.</li>
|
412 |
-
</ul>
|
413 |
-
</li>
|
414 |
-
<li>๐ค <strong>์ง์ ํค์๋ ์
๋ ฅ</strong>: ์ํฐ ๋๋ ์ผํ(,)๋ก ๊ตฌ๋ถํ์ฌ ๋ถ์ํ ํค์๋๋ฅผ ์
๋ ฅํ์ธ์.</li>
|
415 |
-
<li>๐ <strong>๋ถ์ ์คํ</strong>: ์ค์ ํ ์ต์
์ ๋ฐ๋ผ ํํ์ ๋ถ์ ๋ฐ ํค์๋ ๋ถ์์ด ์ํ๋์ด ๊ฒฐ๊ณผ๊ฐ ํ์ Excel ํ์ผ๋ก ์ถ๋ ฅ๋ฉ๋๋ค.</li>
|
416 |
-
<li>๐ฅ <strong>Excel ๋ค์ด๋ก๋</strong>: ๋ถ์ ๊ฒฐ๊ณผ๋ฅผ Excel ํ์ผ๋ก ๋ค์ด๋ก๋ํ ์ ์์ต๋๋ค.</li>
|
417 |
-
</ul>
|
418 |
-
<p><strong>Tip:</strong> ๋ถ์ ๊ฒฐ๊ณผ๋ ์ค์๊ฐ์ผ๋ก ์
๋ฐ์ดํธ๋๋ฉฐ, ํ์์ ์์ ํ ๋ค์ ๋ถ์ํ ์ ์์ต๋๋ค. ์ฆ๊ฑฐ์ด ๋ถ์ ๋์ธ์! ๐</p>
|
419 |
-
</div>
|
420 |
-
""")
|
421 |
-
# ์ด๋ฒคํธ ์ฐ๊ฒฐ
|
422 |
-
scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
|
423 |
-
analyze_button.click(fn=analysis_handler,
|
424 |
-
inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
|
425 |
-
outputs=[result_df, excel_file])
|
426 |
|
427 |
if __name__ == "__main__":
|
428 |
debug_log("Gradio ์ฑ ์คํ ์์")
|
429 |
demo.launch()
|
430 |
-
debug_log("Gradio ์ฑ ์คํ ์ข
๋ฃ")
|
|
|
17 |
def debug_log(message: str):
|
18 |
print(f"[DEBUG] {message}")
|
19 |
|
20 |
+
# [๊ธฐ๋ณธ์ฝ๋] - ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํ ๊ธฐ๋ฅ
|
21 |
def scrape_naver_blog(url: str) -> str:
|
22 |
debug_log("scrape_naver_blog ํจ์ ์์")
|
23 |
debug_log(f"์์ฒญ๋ฐ์ URL: {url}")
|
24 |
+
|
25 |
headers = {
|
26 |
"User-Agent": (
|
27 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
|
29 |
"Chrome/96.0.4664.110 Safari/537.36"
|
30 |
)
|
31 |
}
|
32 |
+
|
33 |
try:
|
34 |
+
# 1) ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ '๋ฉ์ธ' ํ์ด์ง ์์ฒญ
|
35 |
response = requests.get(url, headers=headers)
|
36 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
37 |
if response.status_code != 200:
|
38 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
39 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
40 |
+
|
41 |
+
# 2) ๋ฉ์ธ ํ์ด์ง ํ์ฑ
|
42 |
soup = BeautifulSoup(response.text, "html.parser")
|
43 |
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
44 |
+
|
45 |
+
# 3) iframe ํ๊ทธ ์ฐพ๊ธฐ
|
46 |
iframe = soup.select_one("iframe#mainFrame")
|
47 |
if not iframe:
|
48 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
|
|
51 |
if not iframe_src:
|
52 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
53 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
54 |
+
|
55 |
+
# 4) iframe src ๋ณด์ (์ ๋๊ฒฝ๋ก ์ฒ๋ฆฌ)
|
56 |
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
57 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
58 |
+
|
59 |
+
# 5) iframe ํ์ด์ง ์์ฒญ ๋ฐ ํ์ฑ
|
60 |
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
61 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
62 |
if iframe_response.status_code != 200:
|
|
|
64 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
65 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
66 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
67 |
+
|
68 |
+
# 6) ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ์ถ์ถ
|
69 |
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
70 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
71 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
|
|
75 |
else:
|
76 |
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
77 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
78 |
+
|
79 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
80 |
+
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ํฉ์ณ ๋ฐํ ์ค๋น ์๋ฃ")
|
81 |
return result
|
82 |
+
|
83 |
except Exception as e:
|
84 |
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
85 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
86 |
|
87 |
+
# [์ฐธ์กฐ์ฝ๋-1] ํํ์ ๋ถ์ ๊ธฐ๋ฅ
|
88 |
def analyze_text(text: str):
|
89 |
logging.basicConfig(level=logging.DEBUG)
|
90 |
logger = logging.getLogger(__name__)
|
91 |
logger.debug("์๋ณธ ํ
์คํธ: %s", text)
|
92 |
+
|
93 |
+
# 1. ํ๊ตญ์ด๋ง ๋จ๊ธฐ๊ธฐ (๊ณต๋ฐฑ, ์์ด, ๊ธฐํธ ๋ฑ ์ ๊ฑฐ)
|
94 |
filtered_text = re.sub(r'[^๊ฐ-ํฃ]', '', text)
|
95 |
+
logger.debug("ํํฐ๋ง๋ ํ
์คํธ (ํ๊ตญ์ด๋ง, ๊ณต๋ฐฑ ์ ๊ฑฐ): %s", filtered_text)
|
96 |
+
|
97 |
if not filtered_text:
|
98 |
logger.debug("์ ํจํ ํ๊ตญ์ด ํ
์คํธ๊ฐ ์์.")
|
99 |
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
100 |
+
|
101 |
+
# 2. Mecab์ ์ด์ฉํ ํํ์ ๋ถ์ (๋ช
์ฌ์ ๋ณตํฉ๋ช
์ฌ๋ง ์ถ์ถ)
|
102 |
mecab_instance = mecab.MeCab()
|
103 |
tokens = mecab_instance.pos(filtered_text)
|
104 |
logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)
|
105 |
+
|
106 |
freq = {}
|
107 |
for word, pos in tokens:
|
108 |
+
if word and word.strip():
|
109 |
+
if pos.startswith("NN"):
|
110 |
+
freq[word] = freq.get(word, 0) + 1
|
111 |
+
logger.debug("๋จ์ด: %s, ํ์ฌ: %s, ํ์ฌ ๋น๋: %d", word, pos, freq[word])
|
112 |
+
|
113 |
+
# 3. ๋น๋์๋ฅผ ๋ด๋ฆผ์ฐจ์ ์ ๋ ฌ
|
114 |
sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
|
115 |
+
logger.debug("๋ด๋ฆผ์ฐจ์ ์ ๋ ฌ๋ ๋จ์ด ๋น๋: %s", sorted_freq)
|
116 |
+
|
117 |
+
# 4. ๊ฒฐ๊ณผ DataFrame ์์ฑ
|
118 |
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
119 |
+
logger.debug("๊ฒฐ๊ณผ DataFrame ์์ฑ๋จ, shape: %s", df.shape)
|
120 |
+
|
121 |
+
# 5. Excel ํ์ผ ์์ฑ (์์ ํ์ผ)
|
122 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
123 |
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
124 |
temp_file.close()
|
125 |
logger.debug("Excel ํ์ผ ์์ฑ๋จ: %s", temp_file.name)
|
126 |
+
|
127 |
return df, temp_file.name
|
128 |
|
129 |
+
# [์ฐธ์กฐ์ฝ๋-2] ๋ค์ด๋ฒ ๊ด๊ณ API ๋ฐ ๊ฒ์๋/๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ๊ธฐ๋ฅ
|
130 |
def generate_signature(timestamp, method, uri, secret_key):
|
131 |
message = f"{timestamp}.{method}.{uri}"
|
132 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
|
148 |
API_KEY = os.environ["NAVER_API_KEY"]
|
149 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
150 |
CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
|
151 |
+
|
152 |
BASE_URL = "https://api.naver.com"
|
153 |
uri = "/keywordstool"
|
154 |
method = "GET"
|
|
|
164 |
df = pd.DataFrame(data["keywordList"])
|
165 |
if len(df) > 100:
|
166 |
df = df.head(100)
|
167 |
+
|
168 |
def parse_count(x):
|
169 |
try:
|
170 |
return int(str(x).replace(",", ""))
|
171 |
except:
|
172 |
return 0
|
173 |
+
|
174 |
df["PC์๊ฒ์๋"] = df["monthlyPcQcCnt"].apply(parse_count)
|
175 |
df["๋ชจ๋ฐ์ผ์๊ฒ์๋"] = df["monthlyMobileQcCnt"].apply(parse_count)
|
176 |
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
|
|
209 |
debug_log(f"process_keyword ํธ์ถ, ํค์๋๋ค: {keywords}, ์ฐ๊ด๊ฒ์์ด ํฌํจ: {include_related}")
|
210 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
211 |
result_dfs = []
|
212 |
+
|
213 |
for idx, kw in enumerate(input_keywords):
|
214 |
df_kw = fetch_related_keywords(kw)
|
215 |
if df_kw.empty:
|
|
|
223 |
df_related = df_kw[df_kw["์ ๋ณดํค์๋"] != kw]
|
224 |
if not df_related.empty:
|
225 |
result_dfs.append(df_related)
|
226 |
+
|
227 |
if result_dfs:
|
228 |
result_df = pd.concat(result_dfs, ignore_index=True)
|
229 |
result_df.drop_duplicates(subset=["์ ๋ณดํค์๋"], inplace=True)
|
230 |
else:
|
231 |
result_df = pd.DataFrame(columns=["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"])
|
232 |
+
|
233 |
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].apply(fetch_blog_count)
|
234 |
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
235 |
debug_log("process_keyword ์๋ฃ")
|
236 |
return result_df, create_excel_file(result_df)
|
237 |
|
238 |
+
# ์๋ก์ด ๊ธฐ๋ฅ: '๋ธ๋ก๊ทธ๋ด์ฉ๊ฐ์ ธ์ค๊ธฐ' ์คํ ์ ๋ธ๋ก๊ทธ ๋งํฌ๋ก๋ถํฐ ์ ๋ชฉ/๋ณธ๋ฌธ ์คํฌ๋ํ
|
239 |
+
def fetch_blog_content(url: str):
|
240 |
+
debug_log("fetch_blog_content ํจ์ ์์")
|
241 |
+
content = scrape_naver_blog(url)
|
242 |
+
debug_log("fetch_blog_content ํจ์ ์๋ฃ")
|
243 |
+
return content
|
244 |
+
|
245 |
+
# ์๋ก์ด ๊ธฐ๋ฅ: ํํ์ ๋ถ์ ๋ฐ ๊ฒ์๋, ๋ธ๋ก๊ทธ๋ฌธ์์ ์ถ๊ฐ
|
246 |
+
def morphological_analysis_and_enrich(text: str):
|
247 |
debug_log("morphological_analysis_and_enrich ํจ์ ์์")
|
248 |
df_freq, _ = analyze_text(text)
|
249 |
if df_freq.empty:
|
250 |
debug_log("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ๊ฐ ๋น ๋ฐ์ดํฐํ๋ ์์
๋๋ค.")
|
251 |
return df_freq, ""
|
252 |
+
|
253 |
+
# ํํ์ ๋ถ์ ๊ฒฐ๊ณผ์์ ํค์๋ ์ถ์ถ (๊ฐ ๋จ์ด๋ฅผ ์ํฐ๋ก ๊ตฌ๋ถ)
|
|
|
|
|
254 |
keywords = "\n".join(df_freq["๋จ์ด"].tolist())
|
255 |
debug_log(f"๋ถ์๋ ํค์๋: {keywords}")
|
256 |
+
|
257 |
+
# [์ฐธ์กฐ์ฝ๋-2]๋ฅผ ํ์ฉํ์ฌ ๊ฐ ํค์๋์ ๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ (์ฐ๊ด๊ฒ์์ด ๋ฏธํฌํจ)
|
258 |
df_keyword_info, _ = process_keyword(keywords, include_related=False)
|
259 |
debug_log("๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ์๋ฃ")
|
260 |
+
|
261 |
+
# ํํ์ ๋ถ์ ๊ฒฐ๊ณผ์ ๊ฒ์๋ ์ ๋ณด๋ฅผ ๋ณํฉ (ํค์๋ ๊ธฐ์ค)
|
262 |
merged_df = pd.merge(df_freq, df_keyword_info, left_on="๋จ์ด", right_on="์ ๋ณดํค์๋", how="left")
|
263 |
merged_df.drop(columns=["์ ๋ณดํค์๋"], inplace=True)
|
264 |
+
|
265 |
+
# ๋ณํฉ ๊ฒฐ๊ณผ Excel ํ์ผ ์์ฑ
|
266 |
merged_excel_path = create_excel_file(merged_df)
|
267 |
debug_log("morphological_analysis_and_enrich ํจ์ ์๋ฃ")
|
268 |
return merged_df, merged_excel_path
|
269 |
|
270 |
+
# Gradio ์ธํฐํ์ด์ค ๊ตฌ์ฑ (Hugging Face Spaces ํ๊ฒฝ์ ์ ํฉ)
|
271 |
+
with gr.Blocks(title="๋ธ๋ก๊ทธ๊ธ ํํ์ ๋ถ์ ์คํ์ด์ค", css=".gradio-container { max-width: 960px; margin: auto; }") as demo:
|
272 |
+
gr.Markdown("# ๋ธ๋ก๊ทธ๊ธ ํํ์ ๋ถ์ ์คํ์ด์ค")
|
273 |
+
|
274 |
+
with gr.Tab("๋ธ๋ก๊ทธ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
275 |
with gr.Row():
|
276 |
blog_url_input = gr.Textbox(label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ", placeholder="์: https://blog.naver.com/ssboost/222983068507", lines=1)
|
277 |
+
fetch_button = gr.Button("๋ธ๋ก๊ทธ๋ด์ฉ๊ฐ์ ธ์ค๊ธฐ")
|
278 |
+
blog_content = gr.Textbox(label="๋ธ๋ก๊ทธ ๋ด์ฉ", lines=10, placeholder="๋ธ๋ก๊ทธ ๋ด์ฉ์ ๊ฐ์ ธ์ค๊ฑฐ๋ ์ง์ ์
๋ ฅํ์ธ์.")
|
279 |
+
fetch_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content)
|
280 |
+
|
281 |
+
with gr.Tab("ํํ์ ๋ถ์"):
|
282 |
+
with gr.Row():
|
283 |
+
analysis_input = gr.Textbox(label="๋ถ์ํ ํ
์คํธ", lines=10, placeholder="๋ถ์ํ ํ
์คํธ๋ฅผ ์
๋ ฅํ๊ฑฐ๋ '๋ธ๋ก๊ทธ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ'์์ ๊ฐ์ ธ์จ ๋ด์ฉ์ ์์ ํ์ธ์.")
|
284 |
with gr.Row():
|
285 |
+
analyze_button = gr.Button("ํํ์๋ถ์")
|
286 |
with gr.Row():
|
287 |
+
analysis_result = gr.Dataframe(label="๋ถ์ ๊ฒฐ๊ณผ (๋จ์ด, ๋น๋์, ๊ฒ์๋, ๋ธ๋ก๊ทธ๋ฌธ์์ ๋ฑ)")
|
288 |
with gr.Row():
|
289 |
+
analysis_excel = gr.File(label="Excel ๋ค์ด๋ก๋")
|
290 |
+
analyze_button.click(fn=morphological_analysis_and_enrich, inputs=analysis_input, outputs=[analysis_result, analysis_excel])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
|
292 |
if __name__ == "__main__":
|
293 |
debug_log("Gradio ์ฑ ์คํ ์์")
|
294 |
demo.launch()
|
295 |
+
debug_log("Gradio ์ฑ ์คํ ์ข
๋ฃ")
|