# masking_ver2.py
#
# Korean PII-masking demo: NER-based name detection (KoELECTRA) plus regex
# masking for e-mails / numeric sequences, wired to a small Gradio UI.
#
# NOTE(review): this file was recovered from a whitespace-collapsed paste in
# which every "<...>" span (regex lookbehinds such as "(?<!..." together with
# all code up to the next literal ">") was eaten by HTML stripping, and the
# tail of the file was truncated.  Every span that had to be reconstructed is
# marked with "TODO(recovery)" — confirm each against version control before
# relying on this module.

import re

import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline


def dummy(text):
    """Echo *text* with a check-mark suffix (UI wiring smoke test)."""
    return text + " ✅"


# Minimal Gradio UI: one input box, one output box, one button.
with gr.Blocks() as demo:
    inp = gr.Textbox(label="입력")
    out = gr.Textbox(label="출력")
    btn = gr.Button("실행")
    btn.click(fn=dummy, inputs=inp, outputs=out)


def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Replace institution keywords and mask e-mail local parts.

    Args:
        text: input text to sanitise.
        keyword_string: comma-separated keywords; blanks are ignored.
        replace_word: replacement token for every matched keyword.

    Returns:
        The sanitised text.
    """
    # Institution-keyword substitution (case-insensitive; the lookahead keeps
    # the keyword from matching inside a longer word to its right).
    keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
    for kw in keywords:
        pattern = rf"{re.escape(kw)}(?=\W|$)"
        text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
    # Basic sensitive-info masking example: hide the part before '@'.
    text = re.sub(r"\b[\w\.-]+@", "******@", text)
    return text


# =============================================
# Configurable Constants
# =============================================
TAG_PREFIX = "N"

# Words the NER model tends to mislabel as person names — never treated as names.
NAME_ENTITY_EXCEPTIONS = {
    '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
    '개인정보', '본인', '해당', '현재', '아래', '위치', '소속', '상담', '그래도',
}

# Keywords whose neighbouring values should be masked by regex rules.
REGEX_KEYWORDS_TO_MASK = {
    '이메일', '전화번호', '연락처', '주소', '센터', '카드번호',
    '주민등록번호', 'IP', 'IP주소', '계좌번호',
}

# Title suffixes, split by domain, used to spot "<name> + title" mentions.
FAMILY_TITLES = [
    '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생',
    '아들', '딸', '할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모',
    '삼촌', '숙모', '외삼촌', '고모부', '이모부', '조카', '사촌', '남편', '아내',
    '부인', '와이프', '신랑', '장모', '장인', '사위', '며느리', '올케', '형수',
    '제수씨', '매형', '처제', '시누이',
]
ACADEMIC_TITLES = ['학생', '초등학생', '중학생', '고등학생', '수험생', '학부모']
OCCUPATIONAL_TITLES = [
    '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
    '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
    '사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
    '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
    '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민',
    '어르신', '기사님',
]
COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES

# =============================================
# Preload Model
# =============================================
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple"
)


# =============================================
# Utility Functions
# =============================================
def to_chosung(text):
    """Map each Hangul syllable in *text* to its leading consonant (choseong).

    Non-Hangul characters pass through unchanged.
    """
    # U+1100..U+1112 are the 19 choseong jamo; each choseong spans 588
    # syllables (21 vowels x 28 finals) in the precomposed block.
    CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
    result = ""
    for ch in text:
        if '가' <= ch <= '힣':
            code = ord(ch) - ord('가')
            cho = code // 588
            result += CHOSUNG_LIST[cho]
        else:
            result += ch
    return result


def postprocess_sensitive_patterns(text):
    """Mask e-mails and dot/dash separated numeric sequences in *text*."""
    text = re.sub(r"\b[\w\.-]+@", r"******@", text)  # e-mail: hide local part only

    def mask_sequence(match):
        # Mask every other numeric group of a '.'/'-' separated sequence,
        # keeping even-indexed groups visible; rejoin with the original sep.
        parts = re.split(r'[.-]', match.group())
        masked = []
        for i, part in enumerate(parts):
            if part.isdigit():
                if i % 2 == 0:
                    masked.append(part)
                else:
                    masked.append('*' * len(part))
            else:
                masked.append(part)
        return '.'.join(masked) if '.' in match.group() else '-'.join(masked)

    # TODO(recovery): the original pattern began with "(?<" and was destroyed
    # by HTML stripping; reconstructed to match phone/account-style digit
    # groups separated by '.' or '-'.  Confirm against the original source.
    text = re.sub(r"(?<!\d)\d{2,4}(?:[.-]\d{2,4}){1,3}(?!\d)", mask_sequence, text)
    return text


def extract_names(text):
    """Collect candidate person names via NER plus title/josa heuristics."""
    names = []
    base_names = set()
    # TODO(recovery): the NER loop below was destroyed by HTML stripping; only
    # "...= 2 and name not in NAME_ENTITY_EXCEPTIONS: names.append(name)
    # base_names.add(name)" survived.  'PS' is presumably the person label of
    # the modu-ner scheme — confirm against the original source.
    for entity in ner_pipeline(text):
        if entity.get('entity_group') == 'PS':
            name = entity['word'].replace('##', '').strip()
            if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
                names.append(name)
                base_names.add(name)

    # Heuristic pass: 2-4 Hangul chars directly before (or space-separated
    # from) a known title, optionally followed by a particle (josa).
    KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
    attached = r'([가-힣]{2,4})(?:' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
    spaced = r'([가-힣]{2,4})\s+(?:' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
    for pattern in [attached, spaced]:
        for match in re.findall(pattern, text):
            name = match[0]
            if name not in names and name not in NAME_ENTITY_EXCEPTIONS:
                names.append(name)

    # 🧠 Post-pass: re-detect NER names that appear with a title and/or josa.
    for name in base_names:
        for suffix in COMMON_SUFFIXES:
            for josa in ["", "은", "는", "이", "가", "을", "를", "도", "과", "와",
                         "께서", "에서", "으로"]:
                pattern = rf'{re.escape(name)}\s?{suffix}{josa}'
                if re.search(pattern, text):
                    if name not in names:
                        names.append(name)
    return names


def refactored_mask_names(original_text, names, start_counter=100):
    """Replace each detected name(+josa) with a numbered tag (e.g. 'N100').

    Returns the masked text and a tag->name mapping.
    """
    korean_josa = ['이가', '를', '은', '는', '을', '도', '만', '과', '와', '에게',
                   '에서', '으로', '까지', '조차', '마저', '이며', '이다', '이나',
                   '이나마', '밖에', '이든', '이라도', '이', '가', '의']
    masked = original_text
    mapping = {}
    counter = start_counter
    used_names = set()
    for name in names:
        for josa in korean_josa:
            full = name + josa
            # TODO(recovery): the pattern began "rf'(?<" and everything from
            # here to the end of this function was destroyed by HTML
            # stripping; reconstructed to tag the name while preserving the
            # josa.  Confirm against the original source.
            pattern = rf'(?<![가-힣]){re.escape(full)}(?![가-힣])'
            if name not in used_names and re.search(pattern, masked):
                tag = f'{TAG_PREFIX}{counter}'
                masked = re.sub(pattern, tag + josa, masked)
                mapping[tag] = name
                used_names.add(name)
                counter += 1
    return masked, mapping


# ---------------------------------------------------------------------------
# TODO(recovery): the original file continued with a SECOND, simpler pair of
# extract_names / refactored_mask_names definitions that shadowed the ones
# above (later module-level defs win).  Their text was garbled and then
# truncated mid-pattern ("pattern = rf'(?").  The surviving fragments are
# reconstructed below with the missing parts hedged — restore from version
# control before shipping.
# ---------------------------------------------------------------------------
def extract_names(text):  # noqa: F811 — intentional redefinition in source
    """Simpler NER-only name extraction (second version from the source)."""
    names = []
    # TODO(recovery): loop header lost; only the length/exception filter and
    # "names.append(name) / return names" survived.
    for entity in ner_pipeline(text):
        if entity.get('entity_group') == 'PS':
            name = entity['word'].replace('##', '').strip()
            if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
                names.append(name)
    return names


def refactored_mask_names(text, names):  # noqa: F811 — intentional redefinition
    """Mask *names* in *text* with sequential tags starting at 1 (second version)."""
    counter = 1
    mapping = {}
    used_names = set()
    masked = text
    for name in names:
        # Case: josa attached to the name.
        for josa in ["은", "는", "이", "가", "을", "를", "께서", "도", "만", "의", "에서"]:
            # TODO(recovery): the source was truncated at "pattern = rf'(?";
            # the remainder mirrors the first refactored_mask_names above and
            # is a reconstruction — confirm against the original source.
            pattern = rf'(?<![가-힣]){re.escape(name + josa)}(?![가-힣])'
            if name not in used_names and re.search(pattern, masked):
                tag = f'{TAG_PREFIX}{counter}'
                masked = re.sub(pattern, tag + josa, masked)
                mapping[tag] = name
                used_names.add(name)
                counter += 1
    return masked, mapping