Update app.py

app.py
CHANGED
@@ -126,7 +126,7 @@ def analyze_text(text: str):
 
     return df, temp_file.name
 
-# [Reference code 2] Naver Ads API
+# [Reference code 2] Naver Ads API plus search-volume and blog-document-count lookups
 def generate_signature(timestamp, method, uri, secret_key):
     message = f"{timestamp}.{method}.{uri}"
     digest = hmac.new(secret_key.encode("utf-8"), message.encode("utf-8"), hashlib.sha256).digest()
@@ -143,7 +143,6 @@ def get_header(method, uri, api_key, secret_key, customer_id):
         "X-Signature": signature
     }
 
-# Existing single-keyword function (kept for reference)
 def fetch_related_keywords(keyword):
     debug_log(f"fetch_related_keywords 호출, 키워드: {keyword}")
     API_KEY = os.environ["NAVER_API_KEY"]
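The two hunks above touch the request-signing helpers. Based only on the lines visible in this diff (the `{timestamp}.{method}.{uri}` message and the `X-Signature` header), the scheme looks like the usual Naver SearchAd convention of base64-encoding an HMAC-SHA256 digest. The sketch below is a hedged reconstruction; the `X-Timestamp` header name and the millisecond timestamp are assumptions, not shown in the diff.

```python
# Hedged sketch of the signing flow implied by generate_signature/get_header.
# Assumptions (not visible in the diff): base64 output, an X-Timestamp header,
# and a millisecond epoch timestamp.
import base64
import hashlib
import hmac
import time

def sign_request(method: str, uri: str, secret_key: str) -> dict:
    timestamp = str(round(time.time() * 1000))  # assumed: millisecond epoch
    message = f"{timestamp}.{method}.{uri}"     # matches the diff's message format
    digest = hmac.new(secret_key.encode("utf-8"),
                      message.encode("utf-8"), hashlib.sha256).digest()
    return {
        "X-Timestamp": timestamp,               # assumed header name
        "X-Signature": base64.b64encode(digest).decode("utf-8"),
    }

print(sign_request("GET", "/keywordstool", "example-secret"))
```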
@@ -180,83 +179,6 @@ def fetch_related_keywords(keyword):
     debug_log("fetch_related_keywords 완료")
     return result_df
 
-# Newly added: batch keywords into groups of 10 so each API call covers one group
-# (the groups themselves are still called sequentially)
-def fetch_related_keywords_batch(keywords: list):
-    debug_log(f"fetch_related_keywords_batch 호출, 키워드 그룹: {keywords}")
-    API_KEY = os.environ["NAVER_API_KEY"]
-    SECRET_KEY = os.environ["NAVER_SECRET_KEY"]
-    CUSTOMER_ID = os.environ["NAVER_CUSTOMER_ID"]
-
-    BASE_URL = "https://api.naver.com"
-    uri = "/keywordstool"
-    method = "GET"
-    headers = get_header(method, uri, API_KEY, SECRET_KEY, CUSTOMER_ID)
-    params = {
-        "hintKeywords": keywords,  # pass the list as-is (at most 10 keywords)
-        "showDetail": "1"
-    }
-    response = requests.get(BASE_URL + uri, params=params, headers=headers)
-    data = response.json()
-    if "keywordList" not in data:
-        return pd.DataFrame()
-    df = pd.DataFrame(data["keywordList"])
-    if len(df) > 100:
-        df = df.head(100)
-
-    def parse_count(x):
-        try:
-            return int(str(x).replace(",", ""))
-        except:
-            return 0
-
-    df["PC월검색량"] = df["monthlyPcQcCnt"].apply(parse_count)
-    df["모바일월검색량"] = df["monthlyMobileQcCnt"].apply(parse_count)
-    df["토탈월검색량"] = df["PC월검색량"] + df["모바일월검색량"]
-    df.rename(columns={"relKeyword": "정보키워드"}, inplace=True)
-    result_df = df[["정보키워드", "PC월검색량", "모바일월검색량", "토탈월검색량"]]
-    debug_log("fetch_related_keywords_batch 완료")
-    return result_df
-
-# process_keyword reworked to handle keywords group by group (each group sequentially)
-def process_keyword(keywords: str, include_related: bool):
-    debug_log(f"process_keyword 호출, 키워드들: {keywords}, 연관검색어 포함: {include_related}")
-    input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
-    groups = [input_keywords[i:i+10] for i in range(0, len(input_keywords), 10)]
-    result_dfs = []
-
-    # process each group sequentially (no concurrent calls)
-    for idx, group in enumerate(groups):
-        debug_log(f"그룹 {idx+1} 처리 시작: {group}")
-        df_batch = fetch_related_keywords_batch(group)
-        if df_batch.empty:
-            continue
-        # pull the result row for each keyword in the group
-        for kw in group:
-            row_kw = df_batch[df_batch["정보키워드"] == kw]
-            if not row_kw.empty:
-                result_dfs.append(row_kw)
-            else:
-                result_dfs.append(df_batch.head(1))
-        # apply the related-keywords option to the first group only (first keyword excluded)
-        if include_related and idx == 0:
-            first_keyword = group[0]
-            df_related = df_batch[df_batch["정보키워드"] != first_keyword]
-            if not df_related.empty:
-                result_dfs.append(df_related)
-        debug_log(f"그룹 {idx+1} 처리 완료")
-
-    if result_dfs:
-        result_df = pd.concat(result_dfs, ignore_index=True)
-        result_df.drop_duplicates(subset=["정보키워드"], inplace=True)
-    else:
-        result_df = pd.DataFrame(columns=["정보키워드", "PC월검색량", "모바일월검색량", "토탈월검색량"])
-
-    result_df["블로그문서수"] = result_df["정보키워드"].apply(fetch_blog_count)
-    result_df.sort_values(by="토탈월검색량", ascending=False, inplace=True)
-    debug_log("process_keyword 완료")
-    return result_df, create_excel_file(result_df)
-
 def fetch_blog_count(keyword):
     debug_log(f"fetch_blog_count 호출, 키워드: {keyword}")
     client_id = os.environ["NAVER_SEARCH_CLIENT_ID"]
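Although this commit removes the grouped path, the hunk is built around one idiom worth keeping in mind: slicing the keyword list into fixed-size groups before calling the API. A standalone sketch follows; the group size of 10 comes from the removed comment's claim about the per-call limit, not from verifying the API.

```python
# Fixed-size chunking as used by the removed grouped process_keyword.
# The size of 10 mirrors the removed comment's "at most 10 keywords per call".
def chunk(items: list, size: int = 10) -> list:
    return [items[i:i + size] for i in range(0, len(items), size)]

print(chunk(["a", "b", "c", "d", "e"], 2))  # [['a', 'b'], ['c', 'd'], ['e']]
```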
@@ -283,6 +205,36 @@ def create_excel_file(df):
     debug_log(f"Excel 파일 생성됨: {excel_path}")
     return excel_path
 
+def process_keyword(keywords: str, include_related: bool):
+    debug_log(f"process_keyword 호출, 키워드들: {keywords}, 연관검색어 포함: {include_related}")
+    input_keywords = [k.strip() for k in keywords.splitlines() if k.strip()]
+    result_dfs = []
+
+    for idx, kw in enumerate(input_keywords):
+        df_kw = fetch_related_keywords(kw)
+        if df_kw.empty:
+            continue
+        row_kw = df_kw[df_kw["정보키워드"] == kw]
+        if not row_kw.empty:
+            result_dfs.append(row_kw)
+        else:
+            result_dfs.append(df_kw.head(1))
+        if include_related and idx == 0:
+            df_related = df_kw[df_kw["정보키워드"] != kw]
+            if not df_related.empty:
+                result_dfs.append(df_related)
+
+    if result_dfs:
+        result_df = pd.concat(result_dfs, ignore_index=True)
+        result_df.drop_duplicates(subset=["정보키워드"], inplace=True)
+    else:
+        result_df = pd.DataFrame(columns=["정보키워드", "PC월검색량", "모바일월검색량", "토탈월검색량"])
+
+    result_df["블로그문서수"] = result_df["정보키워드"].apply(fetch_blog_count)
+    result_df.sort_values(by="토탈월검색량", ascending=False, inplace=True)
+    debug_log("process_keyword 완료")
+    return result_df, create_excel_file(result_df)
+
 # [Reference code 1] and [Reference code 2] combined: morphological analysis plus search volume and blog document count (with the remove-frequency-1 option)
 def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich 함수 시작")
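For orientation, here is a hypothetical call of the re-added sequential process_keyword. The keyword strings are invented examples, and it assumes the NAVER_* environment variables and the helper functions defined elsewhere in app.py are available at runtime.

```python
# Hypothetical usage of the new process_keyword; input is newline-separated.
# Requires NAVER_API_KEY / NAVER_SECRET_KEY / NAVER_CUSTOMER_ID plus the
# search-API credentials used by fetch_blog_count.
df, excel_path = process_keyword("맛집\n카페", include_related=True)
print(df[["정보키워드", "토탈월검색량", "블로그문서수"]].head())
print("Excel saved to:", excel_path)
```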
@@ -313,7 +265,32 @@ def morphological_analysis_and_enrich(text: str, remove_freq1: bool):
     debug_log("morphological_analysis_and_enrich 함수 완료")
     return merged_df, merged_excel_path
 
-# Newly added
+# Newly added features 1-3: count how often directly entered keywords (several keywords separated by newlines or ',') appear in the blog body
+def direct_keyword_analysis(text: str, keyword_input: str):
+    debug_log("direct_keyword_analysis 함수 시작")
+    # split on newlines or commas to build the keyword list
+    keywords = re.split(r'[\n,]+', keyword_input)
+    keywords = [kw.strip() for kw in keywords if kw.strip()]
+    debug_log(f"입력된 키워드 목록: {keywords}")
+    results = []
+    for kw in keywords:
+        count = text.count(kw)
+        results.append((kw, count))
+        debug_log(f"키워드 '{kw}'의 빈도수: {count}")
+    df = pd.DataFrame(results, columns=["키워드", "빈도수"])
+    excel_path = create_excel_file(df)
+    debug_log("direct_keyword_analysis 함수 완료")
+    return df, excel_path
+
+# On analyze, run morphological analysis and direct keyword analysis together against the editable blog body
+def analyze_combined(blog_text: str, remove_freq1: bool, keyword_input: str):
+    debug_log("analyze_combined 함수 시작")
+    morph_df, morph_excel = morphological_analysis_and_enrich(blog_text, remove_freq1)
+    direct_df, direct_excel = direct_keyword_analysis(blog_text, keyword_input)
+    debug_log("analyze_combined 함수 완료")
+    return morph_df, morph_excel, direct_df, direct_excel
+
+# Scraping: fetch the post behind the blog link and show it in the editable textbox
 def fetch_blog_content(url: str):
     debug_log("fetch_blog_content 함수 시작")
     content = scrape_naver_blog(url)
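The new direct analysis rests on two standard-library calls: re.split for tokenizing the raw input and str.count for frequencies. Note that str.count matches exact, non-overlapping substrings only, so spacing or spelling variants of a keyword are not counted. A quick standalone illustration with made-up strings:

```python
import re

raw = "키워드1, 키워드2\n키워드3"  # made-up input mixing commas and newlines
keywords = [kw.strip() for kw in re.split(r'[\n,]+', raw) if kw.strip()]
print(keywords)  # ['키워드1', '키워드2', '키워드3']

# str.count is an exact, non-overlapping substring count:
print("서울 맛집 추천 맛집".count("맛집"))  # 2
```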
@@ -325,23 +302,29 @@ with gr.Blocks(title="네이버 블로그 형태소 분석 스페이스", css=".
     gr.Markdown("# 네이버 블로그 형태소 분석 스페이스")
     with gr.Row():
         blog_url_input = gr.Textbox(label="네이버 블로그 링크", placeholder="예: https://blog.naver.com/ssboost/222983068507", lines=1)
-    with gr.Row():
         scrape_button = gr.Button("스크래핑 실행")
     with gr.Row():
         blog_content_box = gr.Textbox(label="블로그 내용 (수정 가능)", lines=10, placeholder="스크래핑된 블로그 내용이 여기에 표시됩니다.")
     with gr.Row():
         remove_freq_checkbox = gr.Checkbox(label="빈도수1 제거", value=False)
+    with gr.Row():
+        keyword_input_box = gr.Textbox(label="직접 키워드 입력 (엔터 또는 ','로 구분)", lines=2, placeholder="예: 키워드1, 키워드2\n키워드3")
     with gr.Row():
         analyze_button = gr.Button("분석 실행")
+    gr.Markdown("### 형태소 분석 결과")
     with gr.Row():
-
+        morph_result_df = gr.Dataframe(label="형태소 분석 결과 (단어, 빈도수, 검색량, 블로그문서수 등)")
+        morph_excel_file = gr.File(label="형태소 분석 Excel 다운로드")
+    gr.Markdown("### 직접 키워드 분석 결과")
     with gr.Row():
-
+        direct_result_df = gr.Dataframe(label="직접 키워드 분석 결과 (키워드, 빈도수)")
+        direct_excel_file = gr.File(label="직접 키워드 분석 Excel 다운로드")
 
-    # Scraping
+    # Scraping: fill the editable textbox from the blog link
     scrape_button.click(fn=fetch_blog_content, inputs=blog_url_input, outputs=blog_content_box)
-    # Analysis
-    analyze_button.click(fn=
+    # Analysis: run both analyses on the edited body, the remove-frequency-1 flag, and the direct keywords
+    analyze_button.click(fn=analyze_combined, inputs=[blog_content_box, remove_freq_checkbox, keyword_input_box],
+                         outputs=[morph_result_df, morph_excel_file, direct_result_df, direct_excel_file])
 
 if __name__ == "__main__":
     debug_log("Gradio 앱 실행 시작")
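The rewired analyze button fans a single click out to four outputs. As a self-contained illustration of that inputs/outputs list pattern, here is a hypothetical toy app; only the `.click(fn=..., inputs=[...], outputs=[...])` shape mirrors the diff, and all component names are invented.

```python
# Toy Gradio app showing the same one-click, multi-output wiring used above.
import gradio as gr

def analyze(text, flag, kws):
    # Stand-in for analyze_combined: returns one value per declared output.
    return f"chars={len(text)}", f"flag={flag}, keywords={kws}"

with gr.Blocks() as demo:
    text_box = gr.Textbox(label="body")
    flag_box = gr.Checkbox(label="remove freq-1")
    kw_box = gr.Textbox(label="keywords")
    out_morph = gr.Textbox(label="morph summary")
    out_direct = gr.Textbox(label="direct summary")
    run_btn = gr.Button("analyze")
    run_btn.click(fn=analyze, inputs=[text_box, flag_box, kw_box],
                  outputs=[out_morph, out_direct])

if __name__ == "__main__":
    demo.launch()
```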