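# Spaces:
# Build error
# Build error
# (The three lines above look like page-banner text captured together with the
# source; they are kept as comments because they are not Python.)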
# masking_ver2.py | |
import re | |
import gradio as gr | |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline | |
def dummy(text):
    """Return ``text`` followed by a space and a check-mark emoji."""
    marker = " ✅"
    return "".join((text, marker))
# Minimal UI: one input box, one output box, and a button that runs dummy().
# NOTE: a second `with gr.Blocks() as demo:` further down this file rebinds
# the name `demo`, so the layout built here is not the one that is launched.
with gr.Blocks() as demo:
    inp = gr.Textbox(label="입력")
    out = gr.Textbox(label="출력")
    btn = gr.Button("실행")
    btn.click(
        fn=dummy,
        inputs=inp,
        outputs=out,
    )
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Replace organisation keywords and hide the local part of e-mail addresses.

    NOTE: functions with this same name are defined again further down this
    file; the last definition is the one bound after the module has loaded.

    Args:
        text: Text to sanitise.
        keyword_string: Comma-separated keywords. Blank entries are ignored
            and ``None`` is treated as "no keywords".
        replace_word: Text written in place of every keyword hit. It is
            inserted literally (backslashes are not interpreted).

    Returns:
        The sanitised text.
    """
    keywords = [k.strip() for k in (keyword_string or "").split(",") if k.strip()]
    # Longest keyword first: otherwise a short keyword that is part of a longer
    # one is rewritten first and the longer keyword can no longer match.
    keywords.sort(key=len, reverse=True)
    for kw in keywords:
        # A keyword only counts when followed by a non-word character or the
        # end of the text.
        pattern = rf"{re.escape(kw)}(?=\W|$)"
        # A callable replacement keeps `replace_word` literal; passing the
        # string itself would expand sequences such as "\1" or "\g<name>".
        text = re.sub(pattern, lambda _m: replace_word, text, flags=re.IGNORECASE)
    # Basic masking example: everything before the "@" of an e-mail address.
    text = re.sub(r"\b[\w\.-]+@", "******@", text)
    return text
# ============================================= | |
# Configurable Constants | |
# ============================================= | |
# Prefix of the placeholder tags that replace detected names.
TAG_PREFIX = "N"

# Words that are never accepted as a person name.
NAME_ENTITY_EXCEPTIONS = {
    '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
    '개인정보', '본인', '해당', '현재', '아래', '위치', '소속',
    '상담', '그래도',
}

# Built-in keywords that are replaced together with the user-supplied ones.
REGEX_KEYWORDS_TO_MASK = {
    '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호',
    'IP', 'IP주소', '계좌번호',
}

# Suffix groups, kept separate by category.
FAMILY_TITLES = [
    '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
    '할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
    '고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
    '장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이',
]
ACADEMIC_TITLES = [
    '학생', '초등학생', '중학생', '고등학생', '수험생', '학부모',
]
OCCUPATIONAL_TITLES = [
    '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
    '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
    '사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
    '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
    '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님',
]

# All title suffixes in one list: family, then academic, then occupational.
COMMON_SUFFIXES = [*FAMILY_TITLES, *ACADEMIC_TITLES, *OCCUPATIONAL_TITLES]
# ============================================= | |
# Preload Model | |
# ============================================= | |
# Load the tokenizer and the token-classification model once, while the module
# is being imported, and wrap them in the "ner" pipeline that extract_names()
# calls.
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
# ============================================= | |
# Utility Functions | |
# ============================================= | |
def to_chosung(text):
    """Replace every precomposed Hangul syllable with its initial consonant.

    Characters in the range '가'..'힣' are mapped to the code point
    ``0x1100 + index`` where ``index`` is the syllable offset divided by 588.
    Every other character is copied unchanged.

    Args:
        text: Input string.

    Returns:
        A string with the same number of characters as ``text``.
    """
    syllable_base = ord('가')
    converted = []
    for ch in text:
        if '가' <= ch <= '힣':
            # 588 syllables share each initial consonant.
            lead_index = (ord(ch) - syllable_base) // 588
            converted.append(chr(0x1100 + lead_index))
        else:
            converted.append(ch)
    return "".join(converted)
def postprocess_sensitive_patterns(text):
    """Mask e-mail local parts, separated digit sequences, and 동/호 numbers.

    Steps, in order:
      1. Everything before the "@" of an e-mail address becomes ``******``.
      2. In digit groups joined by "." or "-" (2-4 digits per group, 2-4
         groups), every second group (2nd, 4th) is replaced by asterisks of
         the same length. The separators are written back exactly as found.
      3. A 1-3 digit number before "동" becomes ``***``; a 1-4 digit number
         before "호" becomes ``****``.

    Args:
        text: Input text.

    Returns:
        The masked text.
    """
    # E-mail: hide only the part in front of the "@".
    text = re.sub(r"\b[\w\.-]+@", r"******@", text)

    def mask_sequence(match):
        # Splitting with a capturing group keeps each separator, so a mixed
        # sequence such as "010-1234.5678" keeps its own "-" and ".".
        pieces = re.split(r"([.-])", match.group())
        # Digit groups sit at even indices (0, 2, 4, ...); the 2nd and 4th
        # groups are therefore at indices 2 and 6.
        for idx in range(2, len(pieces), 4):
            pieces[idx] = '*' * len(pieces[idx])
        return ''.join(pieces)

    text = re.sub(r"(?<![\\$\\\\])(?<!\d,)(?:\d{2,4}[.-]){1,3}\d{2,4}(?!\d)", mask_sequence, text)
    text = re.sub(r"(\d{1,3})동", r"***동", text)  # building ("동") number
    text = re.sub(r"(\d{1,4})호", r"****호", text)  # unit ("호") number
    return text
# ============================================= | |
# Masking Core Functions | |
# ============================================= | |
def extract_names(text):
    """Collect person-name candidates from NER output and title-suffix patterns.

    NOTE: a function with this same name is defined again further down this
    file; that later definition is the one bound after the module has loaded.

    Two sources are combined:
      * entities whose ``entity_group`` is ``"PS"`` in the NER pipeline output
        (with "##" removed and surrounding whitespace stripped);
      * 2-4 Hangul characters directly followed by, or separated by whitespace
        from, one of ``COMMON_SUFFIXES`` and an optional particle.

    Candidates listed in ``NAME_ENTITY_EXCEPTIONS`` are dropped, and NER
    results shorter than two characters are ignored.

    Args:
        text: Input text.

    Returns:
        List of name strings; an empty list when the NER pipeline raises.
    """
    try:
        results = ner_pipeline(text)
    except Exception as e:
        # Best effort: report the failure and continue without names.
        print("NER 오류 발생:", e)
        return []

    names = []
    for entity in results:
        if entity.get("entity_group") != "PS":
            continue
        name = entity["word"].replace("##", "").strip()
        if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
            names.append(name)

    KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
    suffix_alternation = '|'.join(COMMON_SUFFIXES)
    attached = r'([가-힣]{2,4})(?:' + suffix_alternation + r')' + KOREAN_JOSA
    spaced = r'([가-힣]{2,4})\s+(?:' + suffix_alternation + r')' + KOREAN_JOSA
    for pattern in (attached, spaced):
        # Each pattern has two groups, so findall yields (name, particle).
        for match in re.findall(pattern, text):
            name = match[0]
            if name not in names and name not in NAME_ENTITY_EXCEPTIONS:
                names.append(name)

    # The former third pass (NER name + suffix + particle) was removed: every
    # name it inspected was already in `names`, so it could never add one.
    return names
def refactored_mask_names(original_text, names, start_counter=100):
    """Replace names with numbered tags and return the text plus tag mapping.

    NOTE: a function with this same name (without ``start_counter``) is defined
    again further down this file; that later definition is the one bound after
    the module has loaded.

    Pass 1 handles names that occur with a particle attached: every such
    occurrence, whichever particle it carries, becomes ``<tag><particle>``.
    Pass 2 handles the remaining names that occur on their own. A name only
    matches when it is not preceded or followed by a word character or Hangul.

    Args:
        original_text: Text to mask.
        names: Name strings to look for, in priority order.
        start_counter: Number used for the first tag.

    Returns:
        ``(masked_text, mapping)`` where ``mapping`` maps each tag to its name.
    """
    korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로','까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도','이','가','의']
    josa_alternation = '|'.join(re.escape(josa) for josa in korean_josa)
    masked = original_text
    mapping = {}
    counter = start_counter
    used_names = set()

    for name in names:
        if name in used_names:
            continue
        # One pattern covers all particles. Stopping at the first particle that
        # matched left the same name unmasked when it appeared with another.
        pattern = rf'(?<![\w가-힣]){re.escape(name)}({josa_alternation})(?![\w가-힣])'
        if re.search(pattern, masked):
            tag = f"{TAG_PREFIX}{counter:03d}"
            mapping[tag] = name
            masked = re.sub(pattern, lambda m, tag=tag: tag + m.group(1), masked)
            counter += 1
            used_names.add(name)

    for name in names:
        if name in used_names:
            continue
        pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        if re.search(pattern, masked):
            tag = f"{TAG_PREFIX}{counter:03d}"
            mapping[tag] = name
            masked = re.sub(pattern, tag, masked)
            counter += 1
    return masked, mapping
def final_name_remask_exact_only(text, mapping_dict):
    """Replace stand-alone occurrences of each mapped name with its tag.

    An occurrence counts only when the name is neither preceded nor followed
    by a word character or a Hangul syllable.

    Args:
        text: Text to process.
        mapping_dict: Mapping of tag -> original name.

    Returns:
        The text with exact name occurrences replaced by their tags.
    """
    not_before = r'(?<![\w가-힣])'
    not_after = r'(?![\w가-힣])'
    result = text
    for tag in mapping_dict:
        exact_name = not_before + re.escape(mapping_dict[tag]) + not_after
        result = re.sub(exact_name, tag, result)
    return result
def mask_department(text):
    """Abbreviate department names that end in "학과" to initial consonants.

    Each run of 2-20 Hangul syllables followed by "학과" is replaced by the
    ``to_chosung`` form of the part before "학과", followed by "학과".

    Args:
        text: Input text.

    Returns:
        The text with department names abbreviated.
    """
    def _abbreviate(match):
        department = match.group(1)
        # The captured group includes the trailing "학과"; drop those two
        # characters before converting.
        return to_chosung(department[:-2]) + "학과"

    return re.sub(r"([가-힣]{2,20}학과)", _abbreviate, text)
def mask_school_names(text):
    """Abbreviate school names to initial consonants.

    Pass 1 rewrites ``<name><초등학교|중학교|고등학교>`` (no space between
    them) to ``<to_chosung(name)><school type>`` and remembers each name.
    Pass 2 looks for the remembered names followed by an optional whitespace
    character and a school type, and rewrites them to
    ``<to_chosung(name)> <school type>``.

    The remembered names are kept in a local list rather than a module-level
    global, so calls do not share state.

    Args:
        text: Input text.

    Returns:
        The text with school names abbreviated.
    """
    found_names = []

    def replacer(match):
        # The pattern already limits the name to 2-20 syllables, so no extra
        # length check is needed.
        name = match.group(1)
        found_names.append(name)
        return to_chosung(name) + match.group(2)

    text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", replacer, text)
    # dict.fromkeys removes repeats while keeping first-seen order.
    for name in dict.fromkeys(found_names):
        pattern = rf"{re.escape(name)}\s?(초등학교|중학교|고등학교)"
        text = re.sub(pattern, to_chosung(name) + " " + r"\1", text)
    return text
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Run pattern masking, then replace keywords that stand as whole words.

    NOTE: a function with this same name is defined again further down this
    file; the last definition is the one bound after the module has loaded.

    Order of operations:
      1. ``postprocess_sensitive_patterns`` (runs first on purpose),
         ``mask_school_names``, ``mask_department``.
      2. Grade / class numbers ("N학년", optionally "N반") become "*학년 *반".
      3. User keywords plus ``REGEX_KEYWORDS_TO_MASK`` are replaced by
         ``replace_word`` where they match between ``\\b`` boundaries.
      4. Resident-registration-number-shaped digits and street-address numbers
         are masked.

    Args:
        text: Text to sanitise.
        keyword_string: Comma-separated keywords. Blank entries are ignored
            and ``None`` is treated as "no keywords".
        replace_word: Text written in place of every keyword hit. It is
            inserted literally (backslashes are not interpreted).

    Returns:
        The sanitised text.
    """
    text = postprocess_sensitive_patterns(text)  # handled first
    text = mask_school_names(text)
    text = mask_department(text)
    # Class numbers may have two digits ("12반").
    text = re.sub(r"(\d)학년(\s?(\d{1,2})반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
    user_keywords = [k.strip() for k in (keyword_string or "").split(",") if k.strip()]
    # Sorting the built-in set first removes the dependence on set iteration
    # order. Longest-first stops a short keyword from rewriting part of a
    # longer one before the longer one can match.
    keywords = sorted(
        dict.fromkeys(user_keywords + sorted(REGEX_KEYWORDS_TO_MASK)),
        key=len,
        reverse=True,
    )
    for kw in keywords:
        pattern = rf"\b{re.escape(kw)}\b"
        # A callable replacement keeps `replace_word` literal.
        text = re.sub(pattern, lambda _m: replace_word, text, flags=re.IGNORECASE)
    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
    return text
# 🔹 Masking functions (cleaned-up final versions)
def extract_names(text):
    """Return the person names reported by the NER pipeline for ``text``.

    Only entities whose ``entity_group`` is ``"PS"`` are used. "##" is removed
    from the entity word and surrounding whitespace is stripped. Results that
    are shorter than two characters or listed in ``NAME_ENTITY_EXCEPTIONS``
    are discarded.

    Args:
        text: Input text.

    Returns:
        List of name strings in pipeline order; an empty list when the
        pipeline call raises.
    """
    try:
        entities = ner_pipeline(text)
    except Exception as e:
        # Best effort: report the failure and continue without names.
        print("NER 오류 발생:", e)
        return []

    person_words = (
        entity["word"].replace("##", "").strip()
        for entity in entities
        if entity.get("entity_group") == "PS"
    )
    return [
        word
        for word in person_words
        if len(word) >= 2 and word not in NAME_ENTITY_EXCEPTIONS
    ]
def refactored_mask_names(text, names):
    """Replace names with numbered tags and return the text plus tag mapping.

    Pass 1 handles names that occur with a particle attached: every such
    occurrence, whichever particle it carries, becomes ``<tag><particle>``.
    Bare occurrences of those names are left in place here;
    ``apply_masking`` replaces them afterwards through
    ``final_name_remask_exact_only``. Pass 2 handles the remaining names that
    occur on their own. A name only matches when it is not preceded or
    followed by a word character or Hangul.

    Tags are ``TAG_PREFIX`` followed by a zero-padded counter starting at 1.

    Args:
        text: Text to mask.
        names: Name strings to look for, in priority order.

    Returns:
        ``(masked_text, mapping)`` where ``mapping`` maps each tag to its name.
    """
    josa_list = ["은", "는", "이", "가", "을", "를", "께서", "도", "만", "의", "에서"]
    josa_alternation = "|".join(re.escape(josa) for josa in josa_list)
    counter = 1
    mapping = {}
    used_names = set()
    masked = text

    for name in names:
        if name in used_names:
            continue
        # One pattern covers all particles. Stopping at the first particle that
        # matched left "<name>이" unmasked whenever "<name>은" had been found.
        pattern = rf'(?<![\w가-힣]){re.escape(name)}({josa_alternation})(?![\w가-힣])'
        if re.search(pattern, masked):
            tag = f"{TAG_PREFIX}{counter:03d}"
            mapping[tag] = name
            masked = re.sub(pattern, lambda m, tag=tag: tag + m.group(1), masked)
            counter += 1
            used_names.add(name)

    for name in names:
        if name in used_names:
            continue
        pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        if re.search(pattern, masked):
            tag = f"{TAG_PREFIX}{counter:03d}"
            mapping[tag] = name
            masked = re.sub(pattern, tag, masked)
            counter += 1
    return masked, mapping
def final_name_remask_exact_only(text, mapping_dict):
    """Swap any remaining stand-alone name for the tag it was assigned.

    A name is replaced only where it is not directly preceded or followed by
    a word character or a Hangul syllable.

    Args:
        text: Text to process.
        mapping_dict: Mapping of tag -> original name.

    Returns:
        The text with exact name occurrences replaced by their tags.
    """
    remasked = text
    for tag, original_name in mapping_dict.items():
        escaped = re.escape(original_name)
        remasked = re.sub(
            rf'(?<![\w가-힣]){escaped}(?![\w가-힣])',
            tag,
            remasked,
        )
    return remasked
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Run pattern masking, then replace organisation and built-in keywords.

    Order of operations:
      1. ``postprocess_sensitive_patterns``, ``mask_school_names``,
         ``mask_department``.
      2. Grade / class numbers ("N학년", optionally "N반") become "*학년 *반".
      3. User keywords plus ``REGEX_KEYWORDS_TO_MASK`` are replaced by
         ``replace_word`` wherever the keyword is followed by a non-word
         character or the end of the text.
      4. Resident-registration-number-shaped digits and street-address numbers
         are masked.

    Args:
        text: Text to sanitise.
        keyword_string: Comma-separated keywords. Blank entries are ignored
            and ``None`` is treated as "no keywords".
        replace_word: Text written in place of every keyword hit. It is
            inserted literally (backslashes are not interpreted).

    Returns:
        The sanitised text.
    """
    text = postprocess_sensitive_patterns(text)
    text = mask_school_names(text)
    text = mask_department(text)
    # Class numbers may have two digits ("12반").
    text = re.sub(r"(\d)학년(\s?(\d{1,2})반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
    user_keywords = [k.strip() for k in (keyword_string or "").split(",") if k.strip()]
    # Sorting the built-in set first removes the dependence on set iteration
    # order. Longest-first stops a short keyword (e.g. "굿네이버스") from
    # rewriting part of a longer one (e.g. "사회복지법인 굿네이버스") before
    # the longer one can match.
    keywords = sorted(
        dict.fromkeys(user_keywords + sorted(REGEX_KEYWORDS_TO_MASK)),
        key=len,
        reverse=True,
    )
    for kw in keywords:
        pattern = rf"{re.escape(kw)}(?=\W|$)"
        # A callable replacement keeps `replace_word` literal; passing the
        # string itself would expand sequences such as "\1" or "\g<name>".
        text = re.sub(pattern, lambda _m: replace_word, text, flags=re.IGNORECASE)
    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
    return text
def apply_masking(text, keyword_str, replace_word):
    """Run the full masking pipeline and return the result with its tag table.

    Steps: extract names, replace them with tags, sanitise the remaining
    sensitive patterns and keywords, then re-apply the tags to any exact name
    occurrences that are still present.

    Args:
        text: Original text.
        keyword_str: Comma-separated organisation keywords, passed through to
            ``sanitize_sensitive_info``.
        replace_word: Replacement text for keyword hits.

    Returns:
        ``(final_text, mapping_table)`` where ``mapping_table`` has one
        ``"<tag> → <name>"`` line per masked name.
    """
    names = extract_names(text)
    masked_text, name_mapping = refactored_mask_names(text, names)
    sanitized_text = sanitize_sensitive_info(masked_text, keyword_str, replace_word)
    final_text = final_name_remask_exact_only(sanitized_text, name_mapping)
    mapping_table = "\n".join(f"{k} → {v}" for k, v in name_mapping.items())
    return final_text, mapping_table
# 📦 PART 4: organisation-keyword replacement + Gradio UI launcher
import gradio as gr

# ✅ The run button calls apply_masking(), the pipeline entry point defined in
# this file.
with gr.Blocks() as demo:
    gr.Markdown("🧠 **v5.0 마스킹 통합 시스템** — 키워드, 이름, 개인정보, 학교 마스킹")
    input_text = gr.Textbox(lines=15, label="📄 원문 텍스트")
    keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
    replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
    run_button = gr.Button("🚀 실행")
    masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
    mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
    run_button.click(
        fn=apply_masking,  # must be a function that is actually defined
        inputs=[input_text, keyword_input, replace_input],
        outputs=[masked_output, mapping_output],
    )

# ✅ Required: start the Gradio app.
# `log` is not a keyword accepted by launch(), so it has been removed; passing
# it raises TypeError before the server starts.
demo.launch(share=True)