# N_B_analysis-5 / app.py
# Kims12's picture
# Update app.py
# 0b91ba4 verified
import gradio as gr
import pandas as pd
import tempfile
import re
import logging
from mecab import MeCab
logging.basicConfig(level=logging.DEBUG)
##############################
# 1) Common helper functions
##############################
# Matches any character that is NOT a precomposed Hangul syllable
# (U+AC00 through U+D7A3).  The range is spelled with \u escapes so the
# pattern does not depend on how this source file was encoded/decoded: a
# mis-decoded literal range turns into a reversed range and makes `re`
# raise "bad character range".
_NON_HANGUL_RE = re.compile(r'[^\uAC00-\uD7A3]')


def preprocess_text(text: str) -> str:
    """Strip everything except Hangul syllables from *text*.

    Commas, periods, whitespace, digits, Latin letters and every other
    character outside U+AC00-U+D7A3 are removed, so the surviving Hangul
    syllables end up concatenated with no separators between them.

    Args:
        text: Raw input text. ``None`` or an empty string yields ``""``.

    Returns:
        The Hangul-only string.
    """
    if not text:
        return ""
    return _NON_HANGUL_RE.sub('', text)
def expand_columns_if_needed(df, needed_index: int):
    """Grow *df* in place until positional column ``needed_index`` exists.

    Empty (all-``None``) columns are appended on the right until
    ``df.shape[1] > needed_index``.  Example: ``needed_index=13`` guarantees
    at least 14 columns, so the 14th column (Excel column N) can be written
    with ``df.iloc[row, 13]``.

    New columns are labelled with the first integers not already used as a
    column label.  For a frame read with ``header=None`` (labels 0..n-1) this
    yields n, n+1, ...  Skipping labels that are already taken matters:
    assigning to an existing label would overwrite that column's data and
    never increase the column count, so the loop would not terminate.

    Args:
        df: DataFrame to extend; mutated in place.
        needed_index: Zero-based column position that must be addressable.

    Returns:
        None.
    """
    next_label = df.shape[1]
    while df.shape[1] <= needed_index:
        # Find a label that is not in use so the assignment really appends.
        while next_label in df.columns:
            next_label += 1
        df[next_label] = None
        next_label += 1
##############################
# 2) Keyword count function
##############################
def _save_csv_to_temp(frame, header=True):
    """Write *frame* to a new temporary ``.csv`` file and return its path."""
    # Reserve a file name, then close the handle BEFORE pandas opens the path
    # again: writing by name to a still-open NamedTemporaryFile fails on
    # Windows.  delete=False keeps the file around for the download widget.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    frame.to_csv(tmp_path, index=False, header=header, encoding='utf-8-sig')
    return tmp_path


def _render_keyword_table(pairs):
    """Render ``(keyword, count)`` pairs as a two-column Markdown table."""
    rows = ["| 명사 | 빈도수 |", "|---|---|"]
    rows.extend(f"| {keyword} | {count} |" for keyword, count in pairs)
    return "\n".join(rows)


def count_keywords(main_text, excel_file, direct_input):
    """Count how often each keyword occurs in the Hangul-only body text.

    Keyword source, in priority order:

    * ``direct_input`` - newline-separated keywords typed by the user.  The
      result CSV has two named columns (keyword, count).
    * ``excel_file`` - read with ``header=None`` so row 1 is kept as data.
      Keywords are taken from A5..A10000 (row positions 4..9999, column 0)
      and each count is written to column N (position 13) of the SAME row,
      even when column A contains blank cells in between.  The sheet is
      exported without a pandas header line so row positions are unchanged.

    The body is reduced to Hangul syllables with ``preprocess_text`` and
    occurrences are counted with ``str.count`` (non-overlapping substring
    matches).  Blank / whitespace-only keywords are ignored, because
    ``str.count("")`` would report ``len(text) + 1``.

    Args:
        main_text: Body text to search; ``None`` is treated as empty.
        excel_file: Uploaded file - either an object exposing ``.name`` or a
            plain path string; may be ``None``.
        direct_input: Newline-separated keywords; ``None`` is treated as
            empty.

    Returns:
        ``(markdown_or_message, csv_path_or_None)``.  The Markdown table
        lists only keywords found at least once; the CSV contains every
        keyword's count.
    """
    logging.debug("main_text: %s", main_text)
    logging.debug("excel_file: %s", excel_file)
    logging.debug("direct_input: %s", direct_input)

    cleaned_text = preprocess_text(main_text or "")
    direct_input = (direct_input or "").strip()

    if direct_input:
        # ===== keywords typed directly by the user =====
        # A non-empty stripped input always holds at least one keyword.
        keywords = [kw.strip() for kw in direct_input.split('\n') if kw.strip()]
        counts = [cleaned_text.count(kw) for kw in keywords]

        # The CSV always lists every keyword, including those with 0 hits.
        result_path = _save_csv_to_temp(
            pd.DataFrame({"명사": keywords, "빈도수": counts})
        )
        hits = [(kw, cnt) for kw, cnt in zip(keywords, counts) if cnt > 0]
        if not hits:
            return ("본문에 해당 키워드가 전혀 등장하지 않았습니다.", result_path)
        return (_render_keyword_table(hits), result_path)

    # ===== keywords taken from the uploaded Excel file =====
    if not excel_file:
        return ("엑셀 파일을 업로드하거나 키워드를 직접 입력하세요.", None)

    # The upload may arrive as a file-like wrapper or as a bare path string.
    excel_path = getattr(excel_file, "name", excel_file)
    df = pd.read_excel(excel_path, header=None)

    # Rows A5..A10000, capped at the real row count.
    max_row = min(df.shape[0], 10000)
    keyword_rows = []  # (row position in df, keyword)
    if df.shape[1] > 0:  # an empty sheet has no column 0 to index
        first_column = df.iloc[4:max_row, 0].tolist()
        for row_pos, cell in enumerate(first_column, start=4):
            if pd.isna(cell):
                continue
            keyword = str(cell).strip()
            if keyword:
                keyword_rows.append((row_pos, keyword))

    if not keyword_rows:
        return ("A5~A10000 범위에 키워드가 없습니다.", None)

    expand_columns_if_needed(df, 13)  # column N == position 13
    hits = []
    for row_pos, keyword in keyword_rows:
        occurrences = cleaned_text.count(keyword)
        # Write next to the keyword's own row, not the i-th non-blank row.
        df.iloc[row_pos, 13] = occurrences
        if occurrences > 0:
            hits.append((keyword, occurrences))

    # header=False: the frame's integer column labels are not sheet content.
    result_path = _save_csv_to_temp(df, header=False)
    if not hits:
        return ("본문에 해당 키워드가 전혀 등장하지 않았습니다(0회).", result_path)
    return (_render_keyword_table(hits), result_path)
##############################
# 3) Morphological-analysis-based keyword count function
##############################
def morph_analysis_and_count(text: str):
    """Extract nouns with MeCab and count their occurrences in the text.

    Steps:
        1. Reduce *text* to Hangul syllables via ``preprocess_text``.
        2. Run MeCab part-of-speech tagging (python-mecab-ko).
        3. Keep the surface forms tagged NNG, NNP, NP or NNB.
        4. Count each distinct surface form in the cleaned text with
           ``str.count`` (substring matches, so a short noun is also counted
           inside longer words that contain it).

    Results are ordered by descending count; ties keep the order in which
    the nouns first appear in the text, so repeated runs on the same input
    produce the same table.

    Args:
        text: Raw body text; ``None`` is treated as empty.

    Returns:
        ``(markdown_table, csv_path)`` on success, or ``(message, None)``
        when no noun is found or the Markdown backend (``tabulate``) is not
        installed.
    """
    # 1) Preprocess
    cleaned = preprocess_text(text or "")

    # 2) MeCab analysis
    tagger = MeCab()
    parsed = tagger.pos(cleaned)

    # 3) Keep noun-like tags only; dict.fromkeys de-duplicates while
    #    preserving first-appearance order (a set would not).
    noun_tags = {'NNG', 'NNP', 'NP', 'NNB'}
    unique_nouns = dict.fromkeys(
        word for word, pos in parsed if pos in noun_tags
    )

    # 4) Count every noun in the cleaned text and drop zero counts.
    frequencies = [(noun, cleaned.count(noun)) for noun in unique_nouns]
    ranked = sorted(
        (pair for pair in frequencies if pair[1] > 0),
        key=lambda pair: pair[1],
        reverse=True,  # sorted() is stable, so ties keep insertion order
    )
    if not ranked:
        return "추출된 명사가 없습니다.", None

    freq_df = pd.DataFrame(ranked, columns=['명사', '빈도수'])

    try:
        md_table = freq_df.to_markdown(index=False)
    except ImportError:
        # DataFrame.to_markdown needs the optional 'tabulate' package.
        md_table = "Markdown 변환을 위해 'tabulate' 라이브러리가 필요합니다."
        return md_table, None

    # Reserve a name and close the handle before pandas reopens the path;
    # writing by name to a still-open NamedTemporaryFile fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
        tmp_path = tmp.name
    freq_df.to_csv(tmp_path, index=False, encoding='utf-8-sig')

    return md_table, tmp_path
########################
# 4) Gradio interface  #
########################
# Add button colors to the existing CSS
# Custom stylesheet passed to gr.Blocks.  It targets the two buttons created
# with elem_id "run_analysis_button" and "morph_analysis_button" and forces a
# dark-orange background with white text.  The comment text inside the CSS
# was mis-encoded and has been restored; selectors and declarations are
# unchanged.
css = """
/* '분석하기' 버튼 색상 및 글자색 변경 */
#run_analysis_button > button,
#morph_analysis_button > button {
    background-color: #EA580C !important; /* 진한 주황색 */
    color: #FFFFFF !important; /* 흰색 글자 */
}
"""
# Build the two-tab Gradio UI.  User-facing labels, placeholders and tab
# titles were mis-encoded in the source and are restored to Korean here.
with gr.Blocks(
    theme=gr.themes.Soft(
        # Custom orange palette used as the primary hue; c600 is the same
        # value the custom CSS applies to the buttons.
        primary_hue=gr.themes.Color(
            c50="#FFF7ED",
            c100="#FFEDD5",
            c200="#FED7AA",
            c300="#FDBA74",
            c400="#FB923C",
            c500="#F97316",
            c600="#EA580C",
            c700="#C2410C",
            c800="#9A3412",
            c900="#7C2D12",
            c950="#431407",
        ),
        secondary_hue="zinc",
        neutral_hue="zinc",
        font=("Pretendard", "sans-serif")
    ),
    css=css
) as demo:
    # ---- Tab 1: count user-supplied keywords in the body text ----
    with gr.Tab("키워드 카운트"):
        with gr.Row():
            # Left: input area
            with gr.Column():
                main_textbox = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요."
                )
                keyword_input = gr.Textbox(
                    label="(선택) 직접 입력 키워드 - 엔터로 구분",
                    lines=6,
                    placeholder="예)\n초음파가습기\n가습기\n..."
                )
                excel_input = gr.File(
                    label="(선택) 엑셀 업로드"
                )
                # elem_id lets the custom CSS target this button
                run_button = gr.Button("분석하기", elem_id="run_analysis_button")

            # Right: output area
            with gr.Column():
                output_md = gr.Markdown(label="결과 표")
                output_file = gr.File(label="결과 다운로드")

        # Input order must match count_keywords(main_text, excel_file,
        # direct_input).
        run_button.click(
            fn=count_keywords,
            inputs=[main_textbox, excel_input, keyword_input],
            outputs=[output_md, output_file]
        )

    # ---- Tab 2: extract nouns with MeCab and count them ----
    with gr.Tab("형태소 분석 기반 카운트"):
        with gr.Row():
            # Left: input area
            with gr.Column():
                morph_text_input = gr.Textbox(
                    label="본문 텍스트",
                    lines=16,
                    placeholder="여기에 긴 본문을 붙여넣으세요."
                )
                # elem_id lets the custom CSS target this button
                morph_run_button = gr.Button("분석하기", elem_id="morph_analysis_button")

            # Right: output area
            with gr.Column():
                morph_result_display = gr.Markdown(label="분석 결과")
                morph_download_button = gr.File(label="결과 다운로드")

        morph_run_button.click(
            fn=morph_analysis_and_count,
            inputs=morph_text_input,
            outputs=[morph_result_display, morph_download_button]
        )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()