Update app.py
app.py
CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
 import requests
+import aiohttp
+import asyncio
 from bs4 import BeautifulSoup
 import urllib.parse  # module for normalizing the iframe path
 import re
@@ -17,8 +19,8 @@ import base64
 def debug_log(message: str):
     print(f"[DEBUG] {message}")
 
-# --- Naver blog scraping ---
-def scrape_naver_blog(url: str) -> str:
+# --- Naver blog scraping (async version) ---
+async def scrape_naver_blog(url: str) -> str:
     debug_log("scrape_naver_blog started")
     debug_log(f"Requested URL: {url}")
     headers = {
@@ -29,75 +31,51 @@ def scrape_naver_blog(url: str) -> str:
         )
     }
     try:
-        # ... 36 removed lines: the original synchronous (requests-based) scraping body is not preserved in this diff view ...
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers) as response:
+                debug_log("HTTP GET request (main page) finished")
+                if response.status != 200:
+                    debug_log(f"Request failed, status code: {response.status}")
+                    return f"An error occurred. Status code: {response.status}"
+                html = await response.text()
+        soup = BeautifulSoup(html, "html.parser")
+        debug_log("HTML parsing (main page) finished")
+        iframe = soup.select_one("iframe#mainFrame")
+        if not iframe:
+            debug_log("Could not find the iframe#mainFrame tag.")
+            return "Could not find the main-content iframe."
+        iframe_src = iframe.get("src")
+        if not iframe_src:
+            debug_log("The iframe src does not exist.")
+            return "Could not find the src of the main-content iframe."
+        parsed_iframe_url = urllib.parse.urljoin(url, iframe_src)
+        debug_log(f"iframe page request URL: {parsed_iframe_url}")
+        async with aiohttp.ClientSession() as session:
+            async with session.get(parsed_iframe_url, headers=headers) as iframe_response:
+                debug_log("HTTP GET request (iframe page) finished")
+                if iframe_response.status != 200:
+                    debug_log(f"iframe request failed, status code: {iframe_response.status}")
+                    return f"An error occurred in the iframe. Status code: {iframe_response.status}"
+                iframe_html = await iframe_response.text()
+        iframe_soup = BeautifulSoup(iframe_html, "html.parser")
+        debug_log("HTML parsing (iframe page) finished")
+        title_div = iframe_soup.select_one('.se-module.se-module-text.se-title-text')
+        title = title_div.get_text(strip=True) if title_div else "Title not found."
+        debug_log(f"Extracted title: {title}")
+        content_div = iframe_soup.select_one('.se-main-container')
+        if content_div:
+            content = content_div.get_text("\n", strip=True)
+        else:
+            content = "Content not found."
+        debug_log("Content extraction finished")
+        result = f"[Title]\n{title}\n\n[Content]\n{content}"
+        debug_log("Title and content merged")
+        return result
     except Exception as e:
         debug_log(f"Error occurred: {str(e)}")
         return f"An error occurred while scraping: {str(e)}"
 
-# --- Morphological analysis (reference code-1) ---
-def analyze_text(text: str):
-    logging.basicConfig(level=logging.DEBUG)
-    logger = logging.getLogger(__name__)
-    logger.debug("Original text: %s", text)
-    filtered_text = re.sub(r'[^가-힣]', '', text)
-    logger.debug("Filtered text: %s", filtered_text)
-    if not filtered_text:
-        logger.debug("No valid Korean text.")
-        return pd.DataFrame(columns=["Word", "Frequency"]), ""
-    mecab_instance = mecab.MeCab()
-    tokens = mecab_instance.pos(filtered_text)
-    logger.debug("POS tagging result: %s", tokens)
-    freq = {}
-    for word, pos in tokens:
-        if word and word.strip() and pos.startswith("NN"):
-            freq[word] = freq.get(word, 0) + 1
-            logger.debug("Word: %s, POS: %s, count: %d", word, pos, freq[word])
-    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
-    logger.debug("Sorted word frequencies: %s", sorted_freq)
-    df = pd.DataFrame(sorted_freq, columns=["Word", "Frequency"])
-    logger.debug("Morphological-analysis DataFrame created, shape: %s", df.shape)
-    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
-    df.to_excel(temp_file.name, index=False, engine='openpyxl')
-    temp_file.close()
-    logger.debug("Excel file created: %s", temp_file.name)
-    return df, temp_file.name
-
-# --- Naver search and ads API (reference code-2) ---
+# --- Naver search and ads API ---
 def generate_signature(timestamp, method, uri, secret_key):
     message = f"{timestamp}.{method}.{uri}"
     digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
@@ -114,7 +92,8 @@ def get_header(method, uri, api_key, secret_key, customer_id):
         "X-Signature": signature
     }
 
-def fetch_related_keywords(keyword):
+# --- Related-keyword lookup (async) ---
+async def fetch_related_keywords(keyword):
     debug_log(f"fetch_related_keywords called, keyword: {keyword}")
     API_KEY = os.environ["NAVER_API_KEY"]
     SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
@@ -127,8 +106,9 @@ def fetch_related_keywords(keyword):
         "hintKeywords": [keyword],
         "showDetail": "1"
     }
-    response = requests.get(BASE_URL + uri, headers=headers, params=params)
-    data = response.json()
+    async with aiohttp.ClientSession() as session:
+        async with session.get(BASE_URL + uri, headers=headers, params=params) as response:
+            data = await response.json()
     if "keywordList" not in data:
         return pd.DataFrame()
     df = pd.DataFrame(data["keywordList"])
@@ -147,7 +127,8 @@ def fetch_related_keywords(keyword):
     debug_log("fetch_related_keywords finished")
     return result_df
 
-def fetch_blog_count(keyword):
+# --- Blog post count lookup (async) ---
+async def fetch_blog_count(keyword):
     debug_log(f"fetch_blog_count called, keyword: {keyword}")
     client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
     client_secret = os.environ["NAVER_SEARCH_CLIENT_SECRET"]
@@ -157,28 +138,30 @@ def fetch_blog_count(keyword):
         "X-Naver-Client-Secret": client_secret
     }
     params = {"query": keyword, "display": 1}
-    response = requests.get(url, headers=headers, params=params)
-    if response.status_code == 200:
-        data = response.json()
-        debug_log(f"fetch_blog_count result: {data.get('total', 0)}")
-        return data.get("total", 0)
-    else:
-        debug_log(f"fetch_blog_count error, status code: {response.status_code}")
-        return 0
+    async with aiohttp.ClientSession() as session:
+        async with session.get(url, headers=headers, params=params) as response:
+            if response.status == 200:
+                data = await response.json()
+                debug_log(f"fetch_blog_count result: {data.get('total', 0)}")
+                return data.get("total", 0)
+            else:
+                debug_log(f"fetch_blog_count error, status code: {response.status}")
+                return 0
 
 def create_excel_file(df):
     with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp:
         excel_path = tmp.name
-    df.to_excel(excel_path, index=False)
+    df.to_excel(excel_path, index=False, engine='openpyxl')
     debug_log(f"Excel file created: {excel_path}")
     return excel_path
 
-def process_keyword(keywords: str, include_related: bool):
+# --- Keyword search (async) ---
+async def process_keyword(keywords: str, include_related: bool):
     debug_log(f"process_keyword called, keywords: {keywords}, include related terms: {include_related}")
     input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
     result_dfs = []
     for idx, kw in enumerate(input_keywords):
-        df_kw = fetch_related_keywords(kw)
+        df_kw = await fetch_related_keywords(kw)
         if df_kw.empty:
             continue
         row_kw = df_kw[df_kw["Info Keyword"] == kw]
@@ -195,13 +178,44 @@ def process_keyword(keywords: str, include_related: bool):
         result_df.drop_duplicates(subset=["Info Keyword"], inplace=True)
     else:
         result_df = pd.DataFrame(columns=["Info Keyword", "PC Monthly Searches", "Mobile Monthly Searches", "Total Monthly Searches"])
-    result_df["Blog Post Count"] = result_df["Info Keyword"].apply(fetch_blog_count)
+    # fetch the blog post counts in parallel
+    tasks = [fetch_blog_count(kw) for kw in result_df["Info Keyword"]]
+    counts = await asyncio.gather(*tasks)
+    result_df["Blog Post Count"] = counts
     result_df.sort_values(by="Total Monthly Searches", ascending=False, inplace=True)
     debug_log("process_keyword finished")
     return result_df, create_excel_file(result_df)
 
-# --- Merging morphological analysis with search volume and blog counts ---
-def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
+# --- Morphological analysis (reference code-1, sync) ---
+def analyze_text(text: str):
+    logging.basicConfig(level=logging.DEBUG)
+    logger = logging.getLogger(__name__)
+    logger.debug("Original text: %s", text)
+    filtered_text = re.sub(r'[^가-힣]', '', text)
+    logger.debug("Filtered text: %s", filtered_text)
+    if not filtered_text:
+        logger.debug("No valid Korean text.")
+        return pd.DataFrame(columns=["Word", "Frequency"]), ""
+    mecab_instance = mecab.MeCab()
+    tokens = mecab_instance.pos(filtered_text)
+    logger.debug("POS tagging result: %s", tokens)
+    freq = {}
+    for word, pos in tokens:
+        if word and word.strip() and pos.startswith("NN"):
+            freq[word] = freq.get(word, 0) + 1
+            logger.debug("Word: %s, POS: %s, count: %d", word, pos, freq[word])
+    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
+    logger.debug("Sorted word frequencies: %s", sorted_freq)
+    df = pd.DataFrame(sorted_freq, columns=["Word", "Frequency"])
+    logger.debug("Morphological-analysis DataFrame created, shape: %s", df.shape)
+    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
+    df.to_excel(temp_file.name, index=False, engine='openpyxl')
+    temp_file.close()
+    logger.debug("Excel file created: %s", temp_file.name)
+    return df, temp_file.name
+
+# --- Merging morphological analysis with search volume and blog counts (async) ---
+async def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich started")
     df_freq, _ = analyze_text(text)
     if df_freq.empty:
@@ -213,7 +227,7 @@ def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log(f"Frequency-1 removal applied: {before_shape} -> {df_freq.shape}")
     keywords = "\n".join(df_freq["Word"].tolist())
     debug_log(f"Analyzed keywords: {keywords}")
-    df_keyword_info, _ = process_keyword(keywords, include_related=False)
+    df_keyword_info, _ = await process_keyword(keywords, include_related=False)
     debug_log("Search volume and blog post count lookup finished")
     merged_df = pd.merge(df_freq, df_keyword_info, left_on="Word", right_on="Info Keyword", how="left")
     merged_df.drop(columns=["Info Keyword"], inplace=True)
@@ -221,8 +235,8 @@ def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich finished")
     return merged_df, merged_excel_path
 
-# --- Direct keyword analysis (standalone) ---
-def direct_keyword_analysis(text: str, keyword_input: str):
+# --- Direct keyword analysis (standalone, async) ---
+async def direct_keyword_analysis(text: str, keyword_input: str):
     debug_log("direct_keyword_analysis started")
     keywords = re.split(r'[\n,]+', keyword_input)
     keywords = [kw.strip() for kw in keywords if kw.strip()]
@@ -232,15 +246,28 @@ def direct_keyword_analysis(text: str, keyword_input: str):
         count = text.count(kw)
         results.append((kw, count))
         debug_log(f"Frequency of keyword '{kw}': {count}")
+        # if a directly entered keyword is absent from the text, run an extra lookup
+        if kw not in text:
+            df_direct, _ = await process_keyword(kw, include_related=False)
+            if (not df_direct.empty) and (kw in df_direct["Info Keyword"].values):
+                row = df_direct[df_direct["Info Keyword"] == kw].iloc[0]
+                pc = row.get("PC Monthly Searches", None)
+                mobile = row.get("Mobile Monthly Searches", None)
+                total = row.get("Total Monthly Searches", None)
+                blog_count = row.get("Blog Post Count", None)
+            else:
+                pc = mobile = total = blog_count = None
+            # add the row to the results
+            results.append((kw, count))
     df = pd.DataFrame(results, columns=["Keyword", "Frequency"])
     excel_path = create_excel_file(df)
     debug_log("direct_keyword_analysis finished")
     return df, excel_path
 
-# --- Combined analysis (morphological analysis + direct keyword analysis) ---
-def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
+# --- Combined analysis (morphological analysis + direct keyword analysis, async) ---
+async def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input: str):
     debug_log("combined_analysis started")
-    merged_df, _ = morphological_analysis_and_enrich(blog_text, remove_freq1)
+    merged_df, _ = await morphological_analysis_and_enrich(blog_text, remove_freq1)
     if "Direct Input" not in merged_df.columns:
         merged_df["Direct Input"] = ""
     direct_keywords = re.split(r'[\n,]+', direct_keyword_input)
@@ -251,7 +278,7 @@ def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input:
             merged_df.loc[merged_df["Word"] == dk, "Direct Input"] = "Direct Input"
         else:
             freq = blog_text.count(dk)
-            df_direct, _ = process_keyword(dk, include_related=False)
+            df_direct, _ = await process_keyword(dk, include_related=False)
             if (not df_direct.empty) and (dk in df_direct["Info Keyword"].values):
                 row = df_direct[df_direct["Info Keyword"] == dk].iloc[0]
                 pc = row.get("PC Monthly Searches", None)
@@ -275,20 +302,18 @@ def combined_analysis(blog_text: str, remove_freq1: bool, direct_keyword_input:
     debug_log("combined_analysis finished")
     return merged_df, combined_excel
 
-# --- Analysis handler ---
-def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
+# --- Analysis handler (async) ---
+async def analysis_handler(blog_text: str, remove_freq1: bool, direct_keyword_input: str, direct_keyword_only: bool):
     debug_log("analysis_handler started")
     if direct_keyword_only:
-
-        return direct_keyword_analysis(blog_text, direct_keyword_input)
+        return await direct_keyword_analysis(blog_text, direct_keyword_input)
     else:
-
-        return combined_analysis(blog_text, remove_freq1, direct_keyword_input)
+        return await combined_analysis(blog_text, remove_freq1, direct_keyword_input)
 
-# --- Run scraping ---
-def fetch_blog_content(url: str):
+# --- Scraping handler (async) ---
+async def fetch_blog_content(url: str):
     debug_log("fetch_blog_content started")
-    content = scrape_naver_blog(url)
+    content = await scrape_naver_blog(url)
     debug_log("fetch_blog_content finished")
     return content
 
@@ -374,7 +399,6 @@ custom_css = """
 # --- Gradio interface ---
 with gr.Blocks(title="Naver Blog Morphological Analysis Service", css=custom_css) as demo:
     gr.HTML("<div class='custom-header'>Naver Blog Morphological Analysis Service</div>")
-    # place the blog link and the scrape button in one group (button centered)
     with gr.Group(elem_classes="custom-group"):
         with gr.Row():
             blog_url_input = gr.Textbox(label="Naver blog link", placeholder="e.g. https://blog.naver.com/ssboost/222983068507", lines=1)
@@ -396,7 +420,6 @@ with gr.Blocks(title="Naver Blog Morphological Analysis Service", css=custo
         result_df = gr.Dataframe(label="Combined analysis results (Word, Frequency, Search Volume, Blog Post Count, Direct Input)", interactive=True)
     with gr.Group(elem_classes="custom-group"):
         excel_file = gr.File(label="Excel download")
-    # usage-instructions HTML block (placed below)
     with gr.Group(elem_classes="custom-group"):
         usage_html = gr.HTML("""
         <div class="usage-instructions">
@@ -418,7 +441,7 @@ with gr.Blocks(title="Naver Blog Morphological Analysis Service", css=custo
         <p><strong>Tip:</strong> Analysis results update in real time, and you can edit the text and run the analysis again whenever needed. Happy analyzing!</p>
         </div>
         """)
-    # wire up events
+    # wire up events (using async functions)
     scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
     analyze_button.click(fn=analysis_handler,
                          inputs=[blog_content_box, remove_freq_checkbox, direct_keyword_box, direct_keyword_only_checkbox],
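
The substance of this commit is the move from blocking requests calls to aiohttp coroutines: process_keyword now fans the per-keyword blog-count lookups out with asyncio.gather instead of fetching them one at a time. Below is a minimal, self-contained sketch of that fan-out pattern; the endpoint URL and the "total" field here are stand-ins for illustration, not the real Naver API.

    import asyncio
    import aiohttp

    async def fetch_count(session: aiohttp.ClientSession, keyword: str) -> int:
        # Hypothetical endpoint standing in for the Naver blog search API.
        async with session.get("https://api.example.com/search", params={"query": keyword}) as resp:
            if resp.status != 200:
                return 0
            data = await resp.json()
            return data.get("total", 0)

    async def fetch_all(keywords: list[str]) -> list[int]:
        # One shared session; gather() runs the requests concurrently.
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(fetch_count(session, kw) for kw in keywords))

    print(asyncio.run(fetch_all(["first keyword", "second keyword"])))

One difference from the committed code is worth noting: the committed fetch_blog_count opens a fresh ClientSession on every call, so each gathered request pays its own connection setup; sharing a single session across the batch, as in this sketch, is the usual aiohttp idiom if that overhead ever matters.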
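
Nothing in the event wiring had to change for the conversion to work: Gradio's event listeners accept coroutine functions and await them on the app's event loop, so the async handlers are passed to .click() exactly like the old synchronous ones. A minimal sketch, with hypothetical component names and assuming a reasonably recent Gradio:

    import asyncio
    import gradio as gr

    async def slow_echo(text: str) -> str:
        await asyncio.sleep(1)  # stand-in for awaited network I/O such as the blog scrape
        return text

    with gr.Blocks() as demo:
        inp = gr.Textbox(label="Input")
        out = gr.Textbox(label="Output")
        btn = gr.Button("Run")
        btn.click(fn=slow_echo, inputs=inp, outputs=out)

    demo.launch()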