Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,389 +1,289 @@
|
|
1 |
import gradio as gr
|
2 |
-
import
|
3 |
-
|
4 |
-
import urllib.parse # iframe ๊ฒฝ๋ก ๋ณด์ ์ ์ํ ๋ชจ๋
|
5 |
import re
|
6 |
import logging
|
7 |
-
import
|
8 |
-
import pandas as pd
|
9 |
-
import mecab # pythonโmecabโko ๋ผ์ด๋ธ๋ฌ๋ฆฌ ์ฌ์ฉ
|
10 |
-
import os
|
11 |
-
import time
|
12 |
-
import hmac
|
13 |
-
import hashlib
|
14 |
-
import base64
|
15 |
-
|
16 |
-
# ๋๋ฒ๊น
(๋ก๊ทธ)์ฉ ํจ์
|
17 |
-
def debug_log(message: str):
|
18 |
-
print(f"[DEBUG] {message}")
|
19 |
|
20 |
-
|
21 |
-
def scrape_naver_blog(url: str) -> str:
|
22 |
-
debug_log("scrape_naver_blog ํจ์ ์์")
|
23 |
-
debug_log(f"์์ฒญ๋ฐ์ URL: {url}")
|
24 |
-
headers = {
|
25 |
-
"User-Agent": (
|
26 |
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
27 |
-
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
28 |
-
"Chrome/96.0.4664.110 Safari/537.36"
|
29 |
-
)
|
30 |
-
}
|
31 |
-
try:
|
32 |
-
response = requests.get(url, headers=headers)
|
33 |
-
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
34 |
-
if response.status_code != 200:
|
35 |
-
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
36 |
-
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
37 |
-
soup = BeautifulSoup(response.text, "html.parser")
|
38 |
-
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
39 |
-
iframe = soup.select_one("iframe#mainFrame")
|
40 |
-
if not iframe:
|
41 |
-
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
42 |
-
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."
|
43 |
-
iframe_src = iframe.get("src")
|
44 |
-
if not iframe_src:
|
45 |
-
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
46 |
-
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
47 |
-
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
48 |
-
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
49 |
-
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
50 |
-
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
51 |
-
if iframe_response.status_code != 200:
|
52 |
-
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
53 |
-
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
54 |
-
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
55 |
-
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
56 |
-
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
57 |
-
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
58 |
-
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
59 |
-
content_div = iframe_soup.select_one('.se-main-container')
|
60 |
-
if content_div:
|
61 |
-
content = content_div.get_text("\n", strip=True)
|
62 |
-
else:
|
63 |
-
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
64 |
-
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
65 |
-
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
66 |
-
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ํฉ์นจ ์๋ฃ")
|
67 |
-
return result
|
68 |
-
except Exception as e:
|
69 |
-
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
70 |
-
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
71 |
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
logger = logging.getLogger(__name__)
|
76 |
-
logger.debug("์๋ณธ ํ
์คํธ: %s", text)
|
77 |
-
filtered_text = re.sub(r'[^๊ฐ-ํฃ]', '', text)
|
78 |
-
logger.debug("ํํฐ๋ง๋ ํ
์คํธ: %s", filtered_text)
|
79 |
-
if not filtered_text:
|
80 |
-
logger.debug("์ ํจํ ํ๊ตญ์ด ํ
์คํธ๊ฐ ์์.")
|
81 |
-
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
82 |
-
mecab_instance = mecab.MeCab()
|
83 |
-
tokens = mecab_instance.pos(filtered_text)
|
84 |
-
logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)
|
85 |
-
freq = {}
|
86 |
-
for word, pos in tokens:
|
87 |
-
if word and word.strip() and pos.startswith("NN"):
|
88 |
-
freq[word] = freq.get(word, 0) + 1
|
89 |
-
logger.debug("๋จ์ด: %s, ํ์ฌ: %s, ๋น๋: %d", word, pos, freq[word])
|
90 |
-
sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
|
91 |
-
logger.debug("์ ๋ ฌ๋ ๋จ์ด ๋น๋: %s", sorted_freq)
|
92 |
-
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
93 |
-
logger.debug("ํํ์ ๋ถ์ DataFrame ์์ฑ๋จ, shape: %s", df.shape)
|
94 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
95 |
-
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
96 |
-
temp_file.close()
|
97 |
-
logger.debug("Excel ํ์ผ ์์ฑ๋จ: %s", temp_file.name)
|
98 |
-
return df, temp_file.name
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
|
|
|
|
105 |
|
106 |
-
def
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
121 |
-
CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
|
122 |
-
BASE_URL = "https://api.naver.com"
|
123 |
-
uri = "/keywordstool"
|
124 |
-
method = "GET"
|
125 |
-
headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
|
126 |
-
params = {
|
127 |
-
"hintKeywords": [keyword],
|
128 |
-
"showDetail": "1"
|
129 |
-
}
|
130 |
-
response = requests.get(BASE_URL + uri, params=params, headers=headers)
|
131 |
-
data = response.json()
|
132 |
-
if "keywordList" not in data:
|
133 |
-
return pd.DataFrame()
|
134 |
-
df = pd.DataFrame(data["keywordList"])
|
135 |
-
if len(df) > 100:
|
136 |
-
df = df.head(100)
|
137 |
-
def parse_count(x):
|
138 |
-
try:
|
139 |
-
return int(str(x).replace(",", ""))
|
140 |
-
except:
|
141 |
-
return 0
|
142 |
-
df["PC์๊ฒ์๋"] = df["monthlyPcQcCnt"].apply(parse_count)
|
143 |
-
df["๋ชจ๋ฐ์ผ์๊ฒ์๋"] = df["monthlyMobileQcCnt"].apply(parse_count)
|
144 |
-
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
145 |
-
df.rename(columns={"relKeyword": "์ ๋ณดํค์๋"}, inplace=True)
|
146 |
-
result_df = df[["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"]]
|
147 |
-
debug_log("fetch_related_keywords ์๋ฃ")
|
148 |
-
return result_df
|
149 |
|
150 |
-
def
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
response = requests.get(url, headers=headers, params=params)
|
161 |
-
if response.status_code == 200:
|
162 |
-
data = response.json()
|
163 |
-
debug_log(f"fetch_blog_count ๊ฒฐ๊ณผ: {data.get('total', 0)}")
|
164 |
-
return data.get("total", 0)
|
165 |
-
else:
|
166 |
-
debug_log(f"fetch_blog_count ์ค๋ฅ, ์ํ์ฝ๋: {response.status_code}")
|
167 |
-
return 0
|
168 |
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
def process_keyword(keywords: str, include_related: bool):
|
177 |
-
debug_log(f"process_keyword ํธ์ถ, ํค์๋๋ค: {keywords}, ์ฐ๊ด๊ฒ์์ด ํฌํจ: {include_related}")
|
178 |
-
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
179 |
-
result_dfs = []
|
180 |
-
for idx, kw in enumerate(input_keywords):
|
181 |
-
df_kw = fetch_related_keywords(kw)
|
182 |
-
if df_kw.empty:
|
183 |
-
continue
|
184 |
-
row_kw = df_kw[df_kw["์ ๋ณดํค์๋"] == kw]
|
185 |
-
if not row_kw.empty:
|
186 |
-
result_dfs.append(row_kw)
|
187 |
-
else:
|
188 |
-
result_dfs.append(df_kw.head(1))
|
189 |
-
if include_related and idx == 0:
|
190 |
-
df_related = df_kw[df_kw["์ ๋ณดํค์๋"] != kw]
|
191 |
-
if not df_related.empty:
|
192 |
-
result_dfs.append(df_related)
|
193 |
-
if result_dfs:
|
194 |
-
result_df = pd.concat(result_dfs, ignore_index=True)
|
195 |
-
result_df.drop_duplicates(subset=["์ ๋ณดํค์๋"], inplace=True)
|
196 |
-
else:
|
197 |
-
result_df = pd.DataFrame(columns=["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"])
|
198 |
-
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].apply(fetch_blog_count)
|
199 |
-
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
200 |
-
debug_log("process_keyword ์๋ฃ")
|
201 |
-
return result_df, create_excel_file(result_df)
|
202 |
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
debug_log(f"๋ถ์๋ ํค์๋: {keywords}")
|
216 |
-
df_keyword_info, _ = process_keyword(keywords, include_related=False)
|
217 |
-
debug_log("๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ์๋ฃ")
|
218 |
-
merged_df = pd.merge(df_freq, df_keyword_info, left_on="๋จ์ด", right_on="์ ๋ณดํค์๋", how="left")
|
219 |
-
merged_df.drop(columns=["์ ๋ณดํค์๋"], inplace=True)
|
220 |
-
merged_excel_path = create_excel_file(merged_df)
|
221 |
-
debug_log("morphological_analysis_and_enrich ํจ์ ์๋ฃ")
|
222 |
-
return merged_df, merged_excel_path
|
223 |
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
#
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
debug_log(f"์
๋ ฅ๋ ์ง์ ํค์๋: {direct_keywords}")
|
249 |
-
for dk in direct_keywords:
|
250 |
-
if dk in merged_df["๋จ์ด"].values:
|
251 |
-
merged_df.loc[merged_df["๋จ์ด"] == dk, "์ง์ ์
๋ ฅ"] = "์ง์ ์
๋ ฅ"
|
252 |
-
else:
|
253 |
-
freq = blog_text.count(dk)
|
254 |
-
df_direct, _ = process_keyword(dk, include_related=False)
|
255 |
-
if (not df_direct.empty) and (dk in df_direct["์ ๋ณดํค์๋"].values):
|
256 |
-
row = df_direct[df_direct["์ ๋ณดํค์๋"] == dk].iloc[0]
|
257 |
-
pc = row.get("PC์๊ฒ์๋", None)
|
258 |
-
mobile = row.get("๋ชจ๋ฐ์ผ์๊ฒ์๋", None)
|
259 |
-
total = row.get("ํ ํ์๊ฒ์๋", None)
|
260 |
-
blog_count = row.get("๋ธ๋ก๊ทธ๋ฌธ์์", None)
|
261 |
-
else:
|
262 |
-
pc = mobile = total = blog_count = None
|
263 |
-
new_row = {
|
264 |
-
"๋จ์ด": dk,
|
265 |
-
"๋น๋์": freq,
|
266 |
-
"PC์๊ฒ์๋": pc,
|
267 |
-
"๋ชจ๋ฐ์ผ์๊ฒ์๋": mobile,
|
268 |
-
"ํ ํ์๊ฒ์๋": total,
|
269 |
-
"๋ธ๋ก๊ทธ๋ฌธ์์": blog_count,
|
270 |
-
"์ง์ ์
๋ ฅ": "์ง์ ์
๋ ฅ"
|
271 |
-
}
|
272 |
-
merged_df = pd.concat([merged_df, pd.DataFrame([new_row])], ignore_index=True)
|
273 |
-
merged_df = merged_df.sort_values(by="๋น๋์", ascending=False).reset_index(drop=True)
|
274 |
-
combined_excel = create_excel_file(merged_df)
|
275 |
-
debug_log("combined_analysis ํจ์ ์๋ฃ")
|
276 |
-
return merged_df, combined_excel
|
277 |
-
|
278 |
-
# --- ๋ถ์ ํธ๋ค๋ฌ ---
|
279 |
-
def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
|
280 |
-
debug_log("analysis_handler ํจ์ ์์")
|
281 |
-
if direct_keyword_only:
|
282 |
-
# "์ง์ ํค์๋ ์
๋ ฅ๋ง ๋ถ์" ์ ํ ์ ๋จ๋
๋ถ์ ์ํ
|
283 |
-
return direct_keyword_analysis(blog_text, direct_keyword_input)
|
284 |
else:
|
285 |
-
#
|
286 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
287 |
|
288 |
-
#
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
|
|
294 |
|
295 |
-
#
|
296 |
-
|
297 |
-
|
298 |
-
.
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
305 |
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
font-size: 2.5rem;
|
310 |
-
font-weight: bold;
|
311 |
-
margin-bottom: 1.5rem;
|
312 |
-
color: #333;
|
313 |
-
}
|
314 |
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
334 |
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
}
|
339 |
|
340 |
-
|
341 |
-
|
342 |
-
margin-top: 1.5rem;
|
343 |
-
}
|
344 |
|
345 |
-
/*
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
}
|
351 |
"""
|
352 |
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
367 |
with gr.Row():
|
368 |
-
#
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
379 |
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
385 |
|
386 |
if __name__ == "__main__":
|
387 |
-
debug_log("Gradio ์ฑ ์คํ ์์")
|
388 |
demo.launch()
|
389 |
-
debug_log("Gradio ์ฑ ์คํ ์ข
๋ฃ")
|
|
|
1 |
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import tempfile
|
|
|
4 |
import re
|
5 |
import logging
|
6 |
+
from mecab import MeCab
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
|
8 |
+
logging.basicConfig(level=logging.DEBUG)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
##############################
|
11 |
+
# 1) Common helper functions
|
12 |
+
##############################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
+
def preprocess_text(text: str) -> str:
    """Return *text* with everything except Hangul syllables removed.

    Commas, periods, whitespace, digits, Latin letters and any other
    character outside the precomposed Hangul syllable range
    (U+AC00 through U+D7A3) are deleted, leaving one contiguous run of
    Hangul characters.

    Args:
        text: Raw input text. ``None`` or an empty string yields ``""``.

    Returns:
        The input with every non-Hangul-syllable character removed.
    """
    if not text:
        return ""
    # The range is spelled with \u escapes so the pattern cannot be damaged
    # by a mis-decoded source file; a corrupted range silently matches a
    # different script and strips all Korean text.
    return re.sub(r'[^\uAC00-\uD7A3]', '', text)
|
21 |
|
22 |
+
def expand_columns_if_needed(df, needed_index: int):
    """Grow *df* in place until column position ``needed_index`` exists.

    Empty (``None``-filled) columns are appended on the right until
    ``df.shape[1] > needed_index``. For example, ``needed_index=13`` makes
    sure the 14th column can be addressed positionally.

    Args:
        df: DataFrame to widen; it is modified in place.
        needed_index: Zero-based column position that must be addressable.

    Returns:
        None. The frame is mutated in place.
    """
    next_label = df.shape[1]
    while df.shape[1] <= needed_index:
        # Assigning to a label that already exists overwrites that column
        # instead of adding one, which would leave the width unchanged and
        # spin forever; skip ahead to a label that is not in use.
        while next_label in df.columns:
            next_label += 1
        df[next_label] = None
        next_label += 1
|
32 |
|
33 |
+
##############################
|
34 |
+
# 2) Keyword count function
|
35 |
+
##############################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
+
def _keyword_markdown_table(keywords, counts):
    """Return a Markdown table of the keywords with count > 0, or ``""``."""
    rows = [f"| {kw} | {cnt} |" for kw, cnt in zip(keywords, counts) if cnt > 0]
    if not rows:
        return ""
    return "\n".join(["| 명사 | 빈도수 |", "|---|---|", *rows])


def _frame_to_temp_csv(frame):
    """Write *frame* to a new temporary ``.csv`` file and return its path."""
    # Reserve the name and close the handle before writing: writing by name
    # while the handle is still open fails where open files are locked.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    frame.to_csv(tmp_path, index=False, encoding='utf-8-sig')
    return tmp_path


def count_keywords(main_text, excel_file, direct_input):
    """Count how often each keyword occurs in the Hangul-only body text.

    Keyword source, in priority order:

    * ``direct_input`` - newline-separated keywords. The result CSV has two
      columns (keyword, count).
    * ``excel_file`` - a workbook read with ``header=None`` so the first
      row is kept as data. Keywords are taken from cells A5..A10000
      (row positions 4..9999, column 0) and each count is written to
      column N (position 13) of the same row.

    The body is reduced to Hangul with ``preprocess_text`` and each keyword
    is counted with ``str.count``. Only keywords found at least once appear
    in the Markdown table; the CSV contains every keyword.

    Args:
        main_text: Body text to search. ``None`` is treated as empty.
        excel_file: Uploaded workbook, either an object exposing ``.name``
            or a path string; may be ``None``.
        direct_input: Newline-separated keywords; may be empty or ``None``.

    Returns:
        A ``(markdown_or_message, csv_path_or_None)`` tuple.
    """
    logging.debug("main_text: %s", main_text)
    logging.debug("excel_file: %s", excel_file)
    logging.debug("direct_input: %s", direct_input)

    cleaned_text = preprocess_text(main_text or "")
    direct_input = (direct_input or "").strip()

    if direct_input:
        # A non-empty stripped input always yields at least one keyword,
        # so no separate "no keywords" branch is needed here.
        keywords = [kw.strip() for kw in direct_input.split('\n') if kw.strip()]
        counts = [cleaned_text.count(kw) for kw in keywords]
        result_frame = pd.DataFrame({"명사": keywords, "빈도수": counts})
        empty_msg = "본문에 해당 키워드가 전혀 등장하지 않았습니다."
    else:
        if not excel_file:
            return ("엑셀 파일을 업로드하거나 키워드를 직접 입력하세요.", None)

        # Accept either an object exposing ``.name`` (what the original
        # code assumed) or a plain path string.
        excel_path = getattr(excel_file, "name", excel_file)
        df = pd.read_excel(excel_path, header=None)

        no_keywords = ("A5~A10000 범위에 키워드가 없습니다.", None)
        if df.shape[1] == 0:
            return no_keywords

        # Remember each keyword's own row position. Indexing the counts by
        # "4 + i" over the NaN-dropped list would shift every count that
        # follows a blank cell onto the wrong row.
        max_row = min(df.shape[0], 10000)
        entries = []
        for row_pos, cell in enumerate(df.iloc[4:max_row, 0], start=4):
            if pd.isna(cell):
                continue
            keyword = str(cell).strip()
            # Skip whitespace-only cells: "".count-style matching would
            # report len(text) + 1 occurrences for an empty keyword.
            if keyword:
                entries.append((row_pos, keyword))
        if not entries:
            return no_keywords

        keywords = [kw for _, kw in entries]
        counts = [cleaned_text.count(kw) for kw in keywords]

        expand_columns_if_needed(df, 13)  # column N is position 13
        for (row_pos, _), cnt in zip(entries, counts):
            df.iloc[row_pos, 13] = cnt

        # NOTE(review): to_csv emits a header row of column labels, so the
        # sheet's rows land one line lower in the CSV - confirm intended.
        result_frame = df
        empty_msg = "본문에 해당 키워드가 전혀 등장하지 않았습니다(0회)."

    md_table = _keyword_markdown_table(keywords, counts)
    return (md_table if md_table else empty_msg, _frame_to_temp_csv(result_frame))
|
146 |
|
147 |
+
##############################
|
148 |
+
# 3) Morphological-analysis-based keyword count function
|
149 |
+
##############################
|
|
|
|
|
|
|
|
|
|
|
150 |
|
151 |
+
def morph_analysis_and_count(text: str):
    """Extract noun tokens with MeCab and count them in the body text.

    Steps:
        1. Reduce *text* to Hangul with ``preprocess_text``.
        2. Run ``MeCab().pos`` on the cleaned text.
        3. Keep tokens tagged NNG, NNP, NP or NNB, de-duplicated in order
           of first appearance.
        4. Count each kept token in the cleaned text with ``str.count``
           (a substring count, not a token count).

    Args:
        text: Body text to analyse. ``None`` is treated as empty.

    Returns:
        A ``(markdown_or_message, csv_path_or_None)`` tuple. The CSV holds
        the noun/count table sorted by descending count.
    """
    no_nouns = ("추출된 명사가 없습니다.", None)

    cleaned = preprocess_text(text or "")
    if not cleaned:
        # Nothing to analyse; skip constructing the tagger.
        return no_nouns

    tagger = MeCab()
    noun_tags = {'NNG', 'NNP', 'NP', 'NNB'}

    # dict.fromkeys de-duplicates while keeping first-appearance order, so
    # the result does not depend on per-process string hash ordering.
    unique_nouns = list(dict.fromkeys(
        word for word, pos in tagger.pos(cleaned) if word and pos in noun_tags
    ))

    counted = ((noun, cleaned.count(noun)) for noun in unique_nouns)
    freq_rows = [(noun, cnt) for noun, cnt in counted if cnt > 0]
    if not freq_rows:
        return no_nouns

    freq_df = pd.DataFrame(freq_rows, columns=['명사', '빈도수'])
    # A stable sort keeps tied counts in first-appearance order.
    freq_df = freq_df.sort_values(
        by='빈도수', ascending=False, kind='mergesort'
    ).reset_index(drop=True)

    try:
        md_table = freq_df.to_markdown(index=False)
    except ImportError:
        # to_markdown needs the optional 'tabulate' package; build the
        # table by hand instead of discarding the analysis result.
        body = [
            f"| {noun} | {cnt} |"
            for noun, cnt in zip(freq_df['명사'], freq_df['빈도수'])
        ]
        md_table = "\n".join(["| 명사 | 빈도수 |", "|---|---|", *body])

    # Close the temp handle before writing by name (open files may be
    # locked on some platforms).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    freq_df.to_csv(tmp_path, index=False, encoding='utf-8-sig')

    return md_table, tmp_path
|
197 |
|
198 |
+
########################
|
199 |
+
# 4) Gradio interface #
|
200 |
+
########################
|
|
|
201 |
|
202 |
+
# ๊ธฐ์กด CSS์ ๋ฒํผ ์์ ์ถ๊ฐ
|
203 |
+
# Custom CSS passed to gr.Blocks: colours the two "analyse" buttons.
css = """

/* '분석하기' 버튼 색상 및 글자색 변경 */
#run_analysis_button > button,
#morph_analysis_button > button {
    background-color: #EA580C !important; /* 진한 주황색 */
    color: #FFFFFF !important; /* 흰색 글자 */
}
"""

# Two-tab UI: tab 1 counts user-supplied keywords (count_keywords),
# tab 2 counts nouns found by morphological analysis
# (morph_analysis_and_count).
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue=gr.themes.Color(
            c50="#FFF7ED",
            c100="#FFEDD5",
            c200="#FED7AA",
            c300="#FDBA74",
            c400="#FB923C",
            c500="#F97316",
            c600="#EA580C",
            c700="#C2410C",
            c800="#9A3412",
            c900="#7C2D12",
            c950="#431407",
        ),
        secondary_hue="zinc",
        neutral_hue="zinc",
        font=("Pretendard", "sans-serif")
    ),
    css=css
) as demo:
    with gr.Tab("키워드 카운트"):
        with gr.Row():
            # Left column: inputs
            with gr.Column():
                main_textbox = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요."
                )
                keyword_input = gr.Textbox(
                    label="(선택) 직접 입력 키워드 - 엔터로 구분",
                    lines=6,
                    placeholder="예)\n초음파가습기\n가습기\n..."
                )
                excel_input = gr.File(
                    label="(선택) 엑셀 업로드"
                )
                # elem_id is the hook used by the CSS selector above
                run_button = gr.Button("분석하기", elem_id="run_analysis_button")

            # Right column: outputs
            with gr.Column():
                output_md = gr.Markdown(label="결과 표")
                output_file = gr.File(label="결과 다운로드")

        run_button.click(
            fn=count_keywords,
            inputs=[main_textbox, excel_input, keyword_input],
            outputs=[output_md, output_file]
        )

    with gr.Tab("형태소 분석 기반 카운트"):
        with gr.Row():
            # Left column: inputs
            with gr.Column():
                morph_text_input = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요."
                )
                # elem_id is the hook used by the CSS selector above
                morph_run_button = gr.Button("분석하기", elem_id="morph_analysis_button")

            # Right column: outputs
            with gr.Column():
                morph_result_display = gr.Markdown(label="분석 결과")
                morph_download_button = gr.File(label="결과 다운로드")

        morph_run_button.click(
            fn=morph_analysis_and_count,
            inputs=morph_text_input,
            outputs=[morph_result_display, morph_download_button]
        )

if __name__ == "__main__":
    demo.launch()
|
|