Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,14 +17,11 @@ import base64
|
|
17 |
def debug_log(message: str):
|
18 |
print(f"[DEBUG] {message}")
|
19 |
|
20 |
-
#
|
21 |
-
# [๊ธฐ๋ณธ์ฝ๋]: ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ์์ ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ์ถ์ถํ๋ ํจ์
|
22 |
-
# =============================================================================
|
23 |
def scrape_naver_blog(url: str) -> str:
|
24 |
debug_log("scrape_naver_blog ํจ์ ์์")
|
25 |
debug_log(f"์์ฒญ๋ฐ์ URL: {url}")
|
26 |
|
27 |
-
# ํค๋ ์ธํ
(ํฌ๋กค๋ง ์ฐจ๋จ ๋ฐฉ์ง)
|
28 |
headers = {
|
29 |
"User-Agent": (
|
30 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
@@ -34,57 +31,51 @@ def scrape_naver_blog(url: str) -> str:
|
|
34 |
}
|
35 |
|
36 |
try:
|
37 |
-
# 1) ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋ฉ์ธ ํ์ด์ง ์์ฒญ
|
38 |
response = requests.get(url, headers=headers)
|
39 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
40 |
-
|
41 |
if response.status_code != 200:
|
42 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
43 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
44 |
-
|
|
|
45 |
soup = BeautifulSoup(response.text, "html.parser")
|
46 |
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
47 |
-
|
48 |
-
#
|
49 |
iframe = soup.select_one("iframe#mainFrame")
|
50 |
if not iframe:
|
51 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
52 |
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."
|
53 |
-
|
54 |
iframe_src = iframe.get("src")
|
55 |
if not iframe_src:
|
56 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
57 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
58 |
-
|
59 |
-
#
|
60 |
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
61 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
62 |
-
|
63 |
-
#
|
64 |
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
65 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
66 |
-
|
67 |
if iframe_response.status_code != 200:
|
68 |
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
69 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
70 |
-
|
71 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
72 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
73 |
-
|
74 |
-
#
|
75 |
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
76 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
77 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
78 |
-
|
79 |
-
# ๋ณธ๋ฌธ ์ถ์ถ
|
80 |
content_div = iframe_soup.select_one('.se-main-container')
|
81 |
if content_div:
|
82 |
content = content_div.get_text("\n", strip=True)
|
83 |
else:
|
84 |
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
85 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
86 |
-
|
87 |
-
# ๊ฒฐ๊ณผ ํฉ์น๊ธฐ
|
88 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
89 |
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ํฉ์ณ ๋ฐํ ์ค๋น ์๋ฃ")
|
90 |
return result
|
@@ -93,13 +84,10 @@ def scrape_naver_blog(url: str) -> str:
|
|
93 |
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
94 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
95 |
|
96 |
-
#
|
97 |
-
# [์ฐธ์กฐ์ฝ๋-1]: ํํ์ ๋ถ์ ํจ์ (Mecab ์ด์ฉ)
|
98 |
-
# =============================================================================
|
99 |
-
logging.basicConfig(level=logging.DEBUG)
|
100 |
-
logger = logging.getLogger(__name__)
|
101 |
-
|
102 |
def analyze_text(text: str):
|
|
|
|
|
103 |
logger.debug("์๋ณธ ํ
์คํธ: %s", text)
|
104 |
|
105 |
# 1. ํ๊ตญ์ด๋ง ๋จ๊ธฐ๊ธฐ (๊ณต๋ฐฑ, ์์ด, ๊ธฐํธ ๋ฑ ์ ๊ฑฐ)
|
@@ -111,7 +99,7 @@ def analyze_text(text: str):
|
|
111 |
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
112 |
|
113 |
# 2. Mecab์ ์ด์ฉํ ํํ์ ๋ถ์ (๋ช
์ฌ์ ๋ณตํฉ๋ช
์ฌ๋ง ์ถ์ถ)
|
114 |
-
mecab_instance = mecab.MeCab()
|
115 |
tokens = mecab_instance.pos(filtered_text)
|
116 |
logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)
|
117 |
|
@@ -130,7 +118,7 @@ def analyze_text(text: str):
|
|
130 |
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
131 |
logger.debug("๊ฒฐ๊ณผ DataFrame ์์ฑ๋จ, shape: %s", df.shape)
|
132 |
|
133 |
-
# 5. Excel ํ์ผ ์์ฑ (์์ ํ์ผ
|
134 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
135 |
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
136 |
temp_file.close()
|
@@ -138,9 +126,7 @@ def analyze_text(text: str):
|
|
138 |
|
139 |
return df, temp_file.name
|
140 |
|
141 |
-
#
|
142 |
-
# [์ฐธ์กฐ์ฝ๋-2]: ํค์๋ ๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ ๋ฌธ์์ ์กฐํ ๊ด๋ จ ํจ์
|
143 |
-
# =============================================================================
|
144 |
def generate_signature(timestamp, method, uri, secret_key):
|
145 |
message = f"{timestamp}.{method}.{uri}"
|
146 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
@@ -158,6 +144,7 @@ def get_header(method, uri, api_key, secret_key, customer_id):
|
|
158 |
}
|
159 |
|
160 |
def fetch_related_keywords(keyword):
|
|
|
161 |
API_KEY = os.environ["NAVER_API_KEY"]
|
162 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
163 |
CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
|
@@ -189,9 +176,11 @@ def fetch_related_keywords(keyword):
|
|
189 |
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
190 |
df.rename(columns={"relKeyword": "์ ๋ณดํค์๋"}, inplace=True)
|
191 |
result_df = df[["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"]]
|
|
|
192 |
return result_df
|
193 |
|
194 |
def fetch_blog_count(keyword):
|
|
|
195 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
196 |
client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
|
197 |
url = "https://openapi.naver.com/v1/search/blog.json"
|
@@ -203,23 +192,21 @@ def fetch_blog_count(keyword):
|
|
203 |
response = requests.get(url, headers=headers, params=params)
|
204 |
if response.status_code == 200:
|
205 |
data = response.json()
|
|
|
206 |
return data.get("total", 0)
|
207 |
else:
|
|
|
208 |
return 0
|
209 |
|
210 |
def create_excel_file(df):
|
211 |
with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
|
212 |
excel_path = tmp.name
|
213 |
df.to_excel(excel_path, index=False)
|
|
|
214 |
return excel_path
|
215 |
|
216 |
def process_keyword(keywords: str, include_related: bool):
|
217 |
-
""
|
218 |
-
์ฌ๋ฌ ํค์๋๋ฅผ ์ํฐ๋ก ๊ตฌ๋ถํ์ฌ ๋ฆฌ์คํธ๋ก ๋ง๋ค๊ณ ,
|
219 |
-
๊ฐ ํค์๋์ ๋ํด ๋ค์ด๋ฒ ๊ด๊ณ API๋ก ๊ฒ์๋ ์ ๋ณด๋ฅผ ์กฐํํ๋ฉฐ,
|
220 |
-
์ฒซ ๋ฒ์งธ ํค์๋์ ๊ฒฝ์ฐ ์ต์
์ ๋ฐ๋ผ ์ฐ๊ด๊ฒ์์ด๋ ์ถ๊ฐํ ํ,
|
221 |
-
๊ฐ ์ ๋ณดํค์๋์ ๋ํด ๋ธ๋ก๊ทธ ๋ฌธ์์๋ฅผ ์กฐํํ์ฌ DataFrame๊ณผ Excel ํ์ผ์ ๋ฐํํฉ๋๋ค.
|
222 |
-
"""
|
223 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
224 |
result_dfs = []
|
225 |
|
@@ -245,64 +232,62 @@ def process_keyword(keywords: str, include_related: bool):
|
|
245 |
|
246 |
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].apply(fetch_blog_count)
|
247 |
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
248 |
-
|
249 |
return result_df, create_excel_file(result_df)
|
250 |
|
251 |
-
#
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
# 2. ํํ์ ๋ถ์๋ ๋จ์ด ๋ชฉ๋ก ์ถ์ถ (ํค์๋ ์กฐํ์ฉ)
|
266 |
-
keywords = "\n".join(df_morph["๋จ์ด"].tolist())
|
267 |
-
debug_log(f"์ถ์ถ๋ ๋จ์ด ๋ชฉ๋ก: {keywords}")
|
268 |
|
269 |
-
#
|
270 |
-
|
271 |
-
debug_log("
|
272 |
|
273 |
-
#
|
274 |
-
|
275 |
-
debug_log("
|
276 |
-
df_merged.drop(columns=["์ ๋ณดํค์๋"], inplace=True)
|
277 |
|
278 |
-
#
|
279 |
-
|
280 |
-
|
281 |
|
282 |
-
|
|
|
|
|
|
|
283 |
|
284 |
-
#
|
285 |
-
|
286 |
-
#
|
287 |
-
with gr.Blocks() as demo:
|
288 |
-
gr.Markdown("# ๋ธ๋ก๊ทธ ๊ธ ํํ์ ๋ถ์ ๋ฐ ํค์๋ ์ ๋ณด ์กฐํ")
|
289 |
|
290 |
-
with gr.Tab("๋ธ๋ก๊ทธ ๋ด์ฉ
|
291 |
with gr.Row():
|
292 |
-
|
293 |
fetch_button = gr.Button("๋ธ๋ก๊ทธ๋ด์ฉ๊ฐ์ ธ์ค๊ธฐ")
|
294 |
-
blog_content = gr.Textbox(label="๋ธ๋ก๊ทธ ๋ด์ฉ
|
295 |
-
|
296 |
-
fetch_button.click(fn=scrape_naver_blog, inputs=blog_url, outputs=blog_content)
|
297 |
|
298 |
-
with gr.Tab("ํํ์ ๋ถ์
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
with gr.Row():
|
300 |
-
|
301 |
-
|
302 |
-
output_table = gr.Dataframe(label="๋ถ์ ๊ฒฐ๊ณผ (ํํ์ ๋ฐ ํค์๋ ์ ๋ณด)", interactive=True)
|
303 |
-
output_file = gr.File(label="Excel ๋ค์ด๋ก๋")
|
304 |
-
# 'ํํ์๋ถ์' ๋ฒํผ ํด๋ฆญ ์ process_blog_content ํจ์ ์คํ
|
305 |
-
analysis_button.click(fn=process_blog_content, inputs=blog_content, outputs=[output_table, output_file])
|
306 |
|
307 |
if __name__ == "__main__":
|
308 |
debug_log("Gradio ์ฑ ์คํ ์์")
|
|
|
17 |
def debug_log(message: str):
    """Emit *message* to stdout with a "[DEBUG] " prefix."""
    print("[DEBUG] {}".format(message))
|
19 |
|
20 |
+
# [๊ธฐ๋ณธ์ฝ๋] - ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ์คํฌ๋ํ ๊ธฐ๋ฅ
|
|
|
|
|
21 |
def scrape_naver_blog(url: str) -> str:
|
22 |
debug_log("scrape_naver_blog ํจ์ ์์")
|
23 |
debug_log(f"์์ฒญ๋ฐ์ URL: {url}")
|
24 |
|
|
|
25 |
headers = {
|
26 |
"User-Agent": (
|
27 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
|
31 |
}
|
32 |
|
33 |
try:
|
34 |
+
# 1) ๋ค์ด๋ฒ ๋ธ๋ก๊ทธ '๋ฉ์ธ' ํ์ด์ง ์์ฒญ
|
35 |
response = requests.get(url, headers=headers)
|
36 |
debug_log("HTTP GET ์์ฒญ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
|
|
37 |
if response.status_code != 200:
|
38 |
debug_log(f"์์ฒญ ์คํจ, ์ํ์ฝ๋: {response.status_code}")
|
39 |
return f"์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {response.status_code}"
|
40 |
+
|
41 |
+
# 2) ๋ฉ์ธ ํ์ด์ง ํ์ฑ
|
42 |
soup = BeautifulSoup(response.text, "html.parser")
|
43 |
debug_log("HTML ํ์ฑ(๋ฉ์ธ ํ์ด์ง) ์๋ฃ")
|
44 |
+
|
45 |
+
# 3) iframe ํ๊ทธ ์ฐพ๊ธฐ
|
46 |
iframe = soup.select_one("iframe#mainFrame")
|
47 |
if not iframe:
|
48 |
debug_log("iframe#mainFrame ํ๊ทธ๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค.")
|
49 |
return "๋ณธ๋ฌธ iframe์ ์ฐพ์ ์ ์์ต๋๋ค."
|
|
|
50 |
iframe_src = iframe.get("src")
|
51 |
if not iframe_src:
|
52 |
debug_log("iframe src๊ฐ ์กด์ฌํ์ง ์์ต๋๋ค.")
|
53 |
return "๋ณธ๋ฌธ iframe์ src๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค."
|
54 |
+
|
55 |
+
# 4) iframe src ๋ณด์ (์ ๋๊ฒฝ๋ก ์ฒ๋ฆฌ)
|
56 |
parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
|
57 |
debug_log(f"iframe ํ์ด์ง ์์ฒญ URL: {parsed_iframe_url}")
|
58 |
+
|
59 |
+
# 5) iframe ํ์ด์ง ์์ฒญ ๋ฐ ํ์ฑ
|
60 |
iframe_response = requests.get(parsed_iframe_url, headers=headers)
|
61 |
debug_log("HTTP GET ์์ฒญ(iframe ํ์ด์ง) ์๋ฃ")
|
|
|
62 |
if iframe_response.status_code != 200:
|
63 |
debug_log(f"iframe ์์ฒญ ์คํจ, ์ํ์ฝ๋: {iframe_response.status_code}")
|
64 |
return f"iframe์์ ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค. ์ํ์ฝ๋: {iframe_response.status_code}"
|
|
|
65 |
iframe_soup = BeautifulSoup(iframe_response.text, "html.parser")
|
66 |
debug_log("HTML ํ์ฑ(iframe ํ์ด์ง) ์๋ฃ")
|
67 |
+
|
68 |
+
# 6) ์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ ์ถ์ถ
|
69 |
title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
|
70 |
title = title_div.get_text(strip=True) if title_div else "์ ๋ชฉ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
71 |
debug_log(f"์ถ์ถ๋ ์ ๋ชฉ: {title}")
|
|
|
|
|
72 |
content_div = iframe_soup.select_one('.se-main-container')
|
73 |
if content_div:
|
74 |
content = content_div.get_text("\n", strip=True)
|
75 |
else:
|
76 |
content = "๋ณธ๋ฌธ์ ์ฐพ์ ์ ์์ต๋๋ค."
|
77 |
debug_log("๋ณธ๋ฌธ ์ถ์ถ ์๋ฃ")
|
78 |
+
|
|
|
79 |
result = f"[์ ๋ชฉ]\n{title}\n\n[๋ณธ๋ฌธ]\n{content}"
|
80 |
debug_log("์ ๋ชฉ๊ณผ ๋ณธ๋ฌธ์ ํฉ์ณ ๋ฐํ ์ค๋น ์๋ฃ")
|
81 |
return result
|
|
|
84 |
debug_log(f"์๋ฌ ๋ฐ์: {str(e)}")
|
85 |
return f"์คํฌ๋ํ ์ค ์ค๋ฅ๊ฐ ๋ฐ์ํ์ต๋๋ค: {str(e)}"
|
86 |
|
87 |
+
# [์ฐธ์กฐ์ฝ๋-1] ํํ์ ๋ถ์ ๊ธฐ๋ฅ
|
|
|
|
|
|
|
|
|
|
|
88 |
def analyze_text(text: str):
|
89 |
+
logging.basicConfig(level=logging.DEBUG)
|
90 |
+
logger = logging.getLogger(__name__)
|
91 |
logger.debug("์๋ณธ ํ
์คํธ: %s", text)
|
92 |
|
93 |
# 1. ํ๊ตญ์ด๋ง ๋จ๊ธฐ๊ธฐ (๊ณต๋ฐฑ, ์์ด, ๊ธฐํธ ๋ฑ ์ ๊ฑฐ)
|
|
|
99 |
return pd.DataFrame(columns=["๋จ์ด", "๋น๋์"]), ""
|
100 |
|
101 |
# 2. Mecab์ ์ด์ฉํ ํํ์ ๋ถ์ (๋ช
์ฌ์ ๋ณตํฉ๋ช
์ฌ๋ง ์ถ์ถ)
|
102 |
+
mecab_instance = mecab.MeCab()
|
103 |
tokens = mecab_instance.pos(filtered_text)
|
104 |
logger.debug("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ: %s", tokens)
|
105 |
|
|
|
118 |
df = pd.DataFrame(sorted_freq, columns=["๋จ์ด", "๋น๋์"])
|
119 |
logger.debug("๊ฒฐ๊ณผ DataFrame ์์ฑ๋จ, shape: %s", df.shape)
|
120 |
|
121 |
+
# 5. Excel ํ์ผ ์์ฑ (์์ ํ์ผ)
|
122 |
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
|
123 |
df.to_excel(temp_file.name, index=False, engine='openpyxl')
|
124 |
temp_file.close()
|
|
|
126 |
|
127 |
return df, temp_file.name
|
128 |
|
129 |
+
# [์ฐธ์กฐ์ฝ๋-2] ๋ค์ด๋ฒ ๊ด๊ณ API ๋ฐ ๊ฒ์๋/๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ๊ธฐ๋ฅ
|
|
|
|
|
130 |
def generate_signature(timestamp, method, uri, secret_key):
|
131 |
message = f"{timestamp}.{method}.{uri}"
|
132 |
digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
|
|
|
144 |
}
|
145 |
|
146 |
def fetch_related_keywords(keyword):
|
147 |
+
debug_log(f"fetch_related_keywords ํธ์ถ, ํค์๋: {keyword}")
|
148 |
API_KEY = os.environ["NAVER_API_KEY"]
|
149 |
SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
|
150 |
CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
|
|
|
176 |
df["ํ ํ์๊ฒ์๋"] = df["PC์๊ฒ์๋"] + df["๋ชจ๋ฐ์ผ์๊ฒ์๋"]
|
177 |
df.rename(columns={"relKeyword": "์ ๋ณดํค์๋"}, inplace=True)
|
178 |
result_df = df[["์ ๋ณดํค์๋", "PC์๊ฒ์๋", "๋ชจ๋ฐ์ผ์๊ฒ์๋", "ํ ํ์๊ฒ์๋"]]
|
179 |
+
debug_log("fetch_related_keywords ์๋ฃ")
|
180 |
return result_df
|
181 |
|
182 |
def fetch_blog_count(keyword):
|
183 |
+
debug_log(f"fetch_blog_count ํธ์ถ, ํค์๋: {keyword}")
|
184 |
client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
|
185 |
client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
|
186 |
url = "https://openapi.naver.com/v1/search/blog.json"
|
|
|
192 |
response = requests.get(url, headers=headers, params=params)
|
193 |
if response.status_code == 200:
|
194 |
data = response.json()
|
195 |
+
debug_log(f"fetch_blog_count ๊ฒฐ๊ณผ: {data.get('total', 0)}")
|
196 |
return data.get("total", 0)
|
197 |
else:
|
198 |
+
debug_log(f"fetch_blog_count ์ค๋ฅ, ์ํ์ฝ๋: {response.status_code}")
|
199 |
return 0
|
200 |
|
201 |
def create_excel_file(df):
    """Write *df* to a temporary .xlsx file and return the file path.

    delete=False keeps the file on disk after the context exits; the
    caller (or the hosting platform) is responsible for cleanup.
    """
    # NOTE(review): recovered from a line-mangled diff — the indentation of the
    # statements below relative to the `with` block is assumed; confirm against
    # the original file.
    with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
        excel_path = tmp.name
        df.to_excel(excel_path, index=False)
        debug_log(f"Excel ํ์ผ ์์ฑ๋จ: {excel_path}")
    return excel_path
|
207 |
|
208 |
def process_keyword(keywords: str, include_related: bool):
|
209 |
+
debug_log(f"process_keyword ํธ์ถ, ํค์๋๋ค: {keywords}, ์ฐ๊ด๊ฒ์์ด ํฌํจ: {include_related}")
|
|
|
|
|
|
|
|
|
|
|
210 |
input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
|
211 |
result_dfs = []
|
212 |
|
|
|
232 |
|
233 |
result_df["๋ธ๋ก๊ทธ๋ฌธ์์"] = result_df["์ ๋ณดํค์๋"].apply(fetch_blog_count)
|
234 |
result_df.sort_values(by="ํ ํ์๊ฒ์๋", ascending=False, inplace=True)
|
235 |
+
debug_log("process_keyword ์๋ฃ")
|
236 |
return result_df, create_excel_file(result_df)
|
237 |
|
238 |
+
# New feature: on 'fetch blog content', scrape title/body from the given blog link.
def fetch_blog_content(url: str):
    """Return the scraped title and body text of the Naver blog post at *url*.

    Thin wrapper around scrape_naver_blog with entry/exit debug logging.
    """
    debug_log("fetch_blog_content ํจ์ ์์")
    scraped = scrape_naver_blog(url)
    debug_log("fetch_blog_content ํจ์ ์๋ฃ")
    return scraped
|
244 |
+
|
245 |
+
# New feature: morphological analysis enriched with search volume and blog post counts.
def morphological_analysis_and_enrich(text: str):
    """Analyze *text* morphologically, enrich each extracted word with Naver
    search-volume and blog-document counts, and return a pair
    (merged DataFrame, path to an Excel export).

    Returns (empty DataFrame, "") when analysis yields no words.
    """
    # NOTE(review): recovered from a line-mangled diff; string literals that
    # were split across scraped lines have been re-joined — confirm exact text.
    debug_log("morphological_analysis_and_enrich ํจ์ ์์")
    df_freq, _ = analyze_text(text)
    if df_freq.empty:
        debug_log("ํํ์ ๋ถ์ ๊ฒฐ๊ณผ๊ฐ ๋น ๋ฐ์ดํฐํ๋ ์์๋๋ค.")
        return df_freq, ""

    # Keywords from the analysis result, one word per line, as expected
    # by process_keyword.
    keywords = "\n".join(df_freq["๋จ์ด"].tolist())
    debug_log(f"๋ถ์๋ ํค์๋: {keywords}")

    # Look up search volume and blog-document counts for each keyword
    # via the ad-API helpers (related-keyword expansion disabled).
    df_keyword_info, _ = process_keyword(keywords, include_related=False)
    debug_log("๊ฒ์๋ ๋ฐ ๋ธ๋ก๊ทธ๋ฌธ์์ ์กฐํ ์๋ฃ")

    # Left-merge the frequency table with the keyword info on the word
    # column, then drop the duplicated join key.
    merged_df = pd.merge(df_freq, df_keyword_info, left_on="๋จ์ด", right_on="์ ๋ณดํค์๋", how="left")
    merged_df.drop(columns=["์ ๋ณดํค์๋"], inplace=True)

    # Export the merged result to a temporary Excel file.
    merged_excel_path = create_excel_file(merged_df)
    debug_log("morphological_analysis_and_enrich ํจ์ ์๋ฃ")
    return merged_df, merged_excel_path
|
269 |
|
270 |
+
# Gradio interface layout (sized for the Hugging Face Spaces environment).
# NOTE(review): recovered from a line-mangled diff — the exact nesting of
# widgets inside gr.Row() containers and re-joined split string literals
# are assumed; confirm against the original file.
with gr.Blocks(title="๋ธ๋ก๊ทธ๊ธ ํํ์ ๋ถ์ ์คํ์ด์ค", css=".gradio-container { max-width: 960px; margin: auto; }") as demo:
    gr.Markdown("# ๋ธ๋ก๊ทธ๊ธ ํํ์ ๋ถ์ ์คํ์ด์ค")

    # Tab 1: fetch a post's title/body from a Naver blog URL.
    with gr.Tab("๋ธ๋ก๊ทธ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ"):
        with gr.Row():
            blog_url_input = gr.Textbox(label="๋ค์ด๋ฒ ๋ธ๋ก๊ทธ ๋งํฌ", placeholder="์: https://blog.naver.com/ssboost/222983068507", lines=1)
        fetch_button = gr.Button("๋ธ๋ก๊ทธ๋ด์ฉ๊ฐ์ ธ์ค๊ธฐ")
        blog_content = gr.Textbox(label="๋ธ๋ก๊ทธ ๋ด์ฉ", lines=10, placeholder="๋ธ๋ก๊ทธ ๋ด์ฉ์ ๊ฐ์ ธ์ค๊ฑฐ๋ ์ง์ ์๋ ฅํ์ธ์.")
        # Button click scrapes the URL and fills the content textbox.
        fetch_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content)

    # Tab 2: morphological analysis enriched with search volume / blog counts.
    with gr.Tab("ํํ์ ๋ถ์"):
        with gr.Row():
            analysis_input = gr.Textbox(label="๋ถ์ํ ํ์คํธ", lines=10, placeholder="๋ถ์ํ ํ์คํธ๋ฅผ ์๋ ฅํ๊ฑฐ๋ '๋ธ๋ก๊ทธ ๋ด์ฉ ๊ฐ์ ธ์ค๊ธฐ'์์ ๊ฐ์ ธ์จ ๋ด์ฉ์ ์์ ํ์ธ์.")
        with gr.Row():
            analyze_button = gr.Button("ํํ์๋ถ์")
        with gr.Row():
            analysis_result = gr.Dataframe(label="๋ถ์ ๊ฒฐ๊ณผ (๋จ์ด, ๋น๋์, ๊ฒ์๋, ๋ธ๋ก๊ทธ๋ฌธ์์ ๋ฑ)")
        with gr.Row():
            analysis_excel = gr.File(label="Excel ๋ค์ด๋ก๋")
        # Button click runs analysis + enrichment; outputs table and Excel file.
        analyze_button.click(fn=morphological_analysis_and_enrich, inputs=analysis_input, outputs=[analysis_result, analysis_excel])
|
|
|
|
|
|
|
|
|
291 |
|
292 |
if __name__ == "__main__":
|
293 |
debug_log("Gradio ์ฑ ์คํ ์์")
|