Spaces:
Build error
Build error
import gradio as gr | |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
import re | |
# Model initialization: the tokenizer, model and NER pipeline are built once at
# import time and reused by every call to extract_names().
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# `grouped_entities=True` is deprecated in transformers; with `ignore_subwords`
# left unset it is mapped to `aggregation_strategy="simple"`, so that strategy
# is requested explicitly here.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
def extract_names(text):
    """Return the distinct "PS" entities the NER pipeline finds in ``text``.

    Each entity's ``word`` has WordPiece markers (``##``) removed and is
    stripped of surrounding whitespace. Candidates shorter than two
    characters are ignored, duplicates are dropped, and first-seen order is
    preserved.
    """
    found = []
    for item in ner_pipeline(text):
        # Only the "PS" entity group is treated as a name.
        if item["entity_group"] != "PS":
            continue
        candidate = item["word"].replace("##", "").strip()
        if len(candidate) < 2 or candidate in found:
            continue
        found.append(candidate)
    return found
def refactored_mask_names(original_text, names, start_counter=100):
    """Replace every standalone occurrence of each name with an ``N###`` tag.

    A name is matched when it is not glued to another word character on the
    left and is followed either by nothing, or by exactly one of the listed
    Korean particles (josa), before the next non-word character. The particle
    is kept after the tag, so ``"홍길동은"`` becomes ``"N100은"``.

    All occurrences of one name -- bare or with any particle -- share a single
    tag. (Previously only the first particle form that matched was masked and
    every other form of the same name was left in clear text.)

    Args:
        original_text: Text to mask.
        names: Names to look for. Tags are numbered in this order, counting
            only names that actually occur in the text.
        start_counter: Number used for the first tag.

    Returns:
        A ``(masked_text, mapping)`` tuple where ``mapping`` maps each tag to
        the name it replaced.
    """
    korean_josa = ['이가', '를', '은', '는', '을', '도', '만', '과', '와', '에게', '에서', '으로',
                   '까지', '조차', '마저', '이며', '이다', '이나', '이나마', '밖에', '이든', '이라도',
                   '이', '가', '의']
    # Longest particle first so that e.g. "이나마" is tried before "이나"/"이".
    josa_alternation = "|".join(
        re.escape(josa) for josa in sorted(korean_josa, key=len, reverse=True)
    )

    masked = original_text
    mapping = {}
    counter = start_counter
    for name in names:
        if not name:
            # An empty name would make the pattern match particles on their own.
            continue
        pattern = re.compile(
            rf'(?<![\w가-힣]){re.escape(name)}({josa_alternation})?(?![\w가-힣])'
        )
        if not pattern.search(masked):
            continue
        tag = f"N{counter:03d}"
        mapping[tag] = name
        # A callable replacement keeps the tag literal and re-attaches the
        # particle (if any) that followed this particular occurrence.
        masked = pattern.sub(lambda m, tag=tag: tag + (m.group(1) or ""), masked)
        counter += 1
    return masked, mapping
def to_chosung(text):
    """Replace each precomposed Hangul syllable in ``text`` with its initial consonant.

    Characters in the range ``'가'``..``'힣'`` are mapped to the matching code
    point in U+1100..U+1112; every other character is copied unchanged.
    """
    initials = [chr(code_point) for code_point in range(0x1100, 0x1113)]
    syllable_base = ord('가')

    def convert(ch):
        if '가' <= ch <= '힣':
            # 588 consecutive syllables share one initial consonant.
            return initials[(ord(ch) - syllable_base) // 588]
        return ch

    return "".join(convert(ch) for ch in text)
def mask_school_names(text):
    """Reduce the name in front of a school type to its initial consonants.

    Two passes are applied in order: first names written directly against
    ``초등학교`` / ``중학교`` / ``고등학교``, then names separated from the school
    type by a single whitespace character (which is written back as a space).
    """
    # Pass 1: name attached to the school type.
    text = re.sub(
        r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)",
        lambda hit: to_chosung(hit.group(1)) + hit.group(2),
        text,
    )
    # Pass 2: name and school type separated by one whitespace character.
    text = re.sub(
        r"(\b[가-힣]{2,20})\s(초등학교|중학교|고등학교)",
        lambda hit: to_chosung(hit.group(1)) + " " + hit.group(2),
        text,
    )
    return text
def mask_department(text):
    """Reduce the name in front of ``학과`` to its initial consonants."""

    def abbreviate(hit):
        # Drop the trailing "학과", abbreviate the rest, then put "학과" back.
        stem = hit.group(1)[:-2]
        return to_chosung(stem) + "학과"

    return re.sub(r"([가-힣]{2,20}학과)", abbreviate, text)
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Mask schools, departments, organisation keywords and numeric identifiers.

    Rules are applied in this order; later rules see the output of earlier
    ones, so the order is significant:

    1. School names and department names (``mask_school_names``,
       ``mask_department``).
    2. Grade / class (``N학년``, optionally followed by ``N반``).
    3. Organisation keywords -> ``replace_word``.
    4. 16-digit card numbers, phone numbers, dates, lot/building/unit
       numbers, e-mail addresses, resident registration numbers, road
       addresses, bank-account style numbers, generic long digit runs and
       IPv4-style addresses.

    Args:
        text: Text to sanitise.
        keyword_string: Comma-separated keywords; blanks are ignored.
        replace_word: Literal text substituted for every keyword hit.

    Returns:
        The sanitised text.
    """
    text = mask_school_names(text)
    text = mask_department(text)

    # Grade / class. The class number may have two digits ("3학년 12반").
    text = re.sub(
        r"\d학년(\s?\d{1,2}반)?",
        lambda m: "*학년" + (" *반" if m.group(1) else ""),
        text,
    )

    # Organisation keywords, replaced in one pass:
    # - longest keyword first, so a keyword that contains a shorter one is
    #   replaced as a whole;
    # - only ASCII word characters count as a boundary, because `\b` treats
    #   Hangul as word characters and would never match a keyword that is
    #   followed by a particle ("굿네이버스는");
    # - a callable replacement keeps `replace_word` literal (a plain string
    #   would be parsed as a template and choke on backslashes).
    keywords = [k.strip() for k in (keyword_string or "").split(",") if k.strip()]
    if keywords:
        alternation = "|".join(
            re.escape(kw) for kw in sorted(keywords, key=len, reverse=True)
        )
        text = re.sub(
            rf"(?<![A-Za-z0-9_])(?:{alternation})(?![A-Za-z0-9_])",
            lambda _m: replace_word,
            text,
            flags=re.IGNORECASE,
        )

    # Card numbers go first: the phone rule and the generic digit rule below
    # would otherwise consume part of a 16-digit number before this rule ran.
    text = re.sub(
        r"(?<!\d)(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})(?!\d)",
        lambda m: f"{m.group(1)}-****-****-{m.group(4)}",
        text,
    )
    # Phone numbers: hide the middle group.
    text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
    # Dates: keep the century and the month, hide the rest of the year and
    # the day (the century used to be hard-coded as "19").
    text = re.sub(
        r"(\d{4})년 (\d{1,2})월 (\d{1,2})일",
        lambda m: f"{m.group(1)[:2]}**년 {m.group(2)}월 *일",
        text,
    )
    # Lot, building and unit numbers.
    text = re.sub(r"(\d{1,3})번지", r"***번지", text)
    text = re.sub(r"(\d{1,3})동", r"***동", text)
    text = re.sub(r"(\d{1,4})호", r"****호", text)
    # E-mail addresses.
    text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
    # Resident registration numbers: six stars, hyphen, the first digit of
    # the second half, six stars -- the same layout as the 6-7 digit input.
    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"******-\2******", text)
    # Road addresses: hide the number that follows the road name.
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
    # Digit strings after a bank / district / road word: star every digit but
    # keep the hyphens. Runs before the generic rule, which would otherwise
    # rewrite the digits first and leave the head and tail readable.
    text = re.sub(
        r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})",
        lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)),
        text,
    )

    def mask_digit_run(m):
        # Keep the first two digits of the first group and the last four of
        # the third group; everything in between becomes stars.
        head, middle, tail = m.group(1), m.group(2), m.group(3)
        return head[:2] + "*" * (len(head) - 2 + len(middle)) + tail[-4:]

    # Generic long digit runs (optionally hyphen-separated).
    text = re.sub(r"(\d{2,6})[-]?(\d{2,6})[-]?(\d{2,6})", mask_digit_run, text)
    # IPv4-style addresses: keep the first two octets.
    text = re.sub(
        r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})",
        lambda m: f"{m.group(1)}.{m.group(2)}.*.*",
        text,
    )
    return text
def final_name_remask_exact_only(text, mapping_dict):
    """Replace standalone occurrences of each mapped name with its tag.

    ``mapping_dict`` maps tag -> name. A name is replaced only where it is
    not directly adjacent to another word character on either side, so a name
    with a particle attached is left untouched.
    """
    for tag in mapping_dict:
        name = mapping_dict[tag]
        standalone = re.compile(rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])')
        text = standalone.sub(tag, text)
    return text
def apply_masking(text, keywords, replace_word):
    """Run the full masking pipeline and return ``(masked_text, mapping_table)``.

    Steps: extract names, tag them, sanitise the remaining sensitive
    information, then re-apply the name tags to any standalone name still
    present. ``mapping_table`` has one ``"tag → name"`` line per tag.
    """
    detected_names = extract_names(text)
    name_masked, mapping = refactored_mask_names(text, detected_names)
    result = final_name_remask_exact_only(
        sanitize_sensitive_info(name_masked, keywords, replace_word),
        mapping,
    )
    table = "\n".join(f"{tag} → {name}" for tag, name in mapping.items())
    return result, table
def remask_with_mapping(text, mapping_string):
    """Re-apply a ``"tag → name"`` mapping table to ``text``.

    Each line of ``mapping_string`` is split at its first ``→``. Lines without
    an arrow, or with an empty tag or name, are ignored. Every standalone
    occurrence of a name (not adjacent to another word character) is then
    replaced by its tag.

    Args:
        text: Text to mask.
        mapping_string: Newline-separated ``tag → name`` lines.

    Returns:
        The text with mapped names replaced by their tags.
    """
    mapping = {}
    for line in (mapping_string or "").splitlines():
        # partition() splits at the first arrow only, so a second "→" on the
        # line no longer raises ValueError during unpacking.
        tag, arrow, name = line.partition("→")
        tag, name = tag.strip(), name.strip()
        if not arrow or not tag or not name:
            # An empty name would yield an empty-width pattern that inserts
            # the tag at every position satisfying the lookarounds.
            continue
        mapping[tag] = name
    for tag, name in mapping.items():
        pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        # Callable replacement: the tag comes from user-supplied text and must
        # not be interpreted as a regex replacement template.
        text = re.sub(pattern, lambda _m, tag=tag: tag, text)
    return text
# Gradio UI: components are declared in display order inside the Blocks context.
with gr.Blocks() as demo:
    gr.Markdown("🛡️ 민감정보 마스킹 [땡땡이 마스킹] : 이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)")

    # Inputs: source text, comma-separated keywords, and the replacement word.
    input_text = gr.Textbox(lines=15, label="📥 원본 텍스트 입력")
    keyword_input = gr.Textbox(
        lines=1,
        label="기관 키워드 (쉼표로 구분)",
        value="굿네이버스, good neighbors, gn, 사회복지법인 굿네이버스",
    )
    replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")

    run_button = gr.Button("🚀 마스킹 실행")

    # Outputs: masked text and the tag -> name table (read-only).
    masked_output = gr.Textbox(lines=15, label="🔐 마스킹된 텍스트")
    mapping_output = gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑", interactive=False)

    # Clicking the button runs apply_masking on the three inputs.
    run_button.click(
        fn=apply_masking,
        inputs=[input_text, keyword_input, replace_input],
        outputs=[masked_output, mapping_output],
    )

demo.launch()