# app_updated_with_filter_sets.py
import re

import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# School names captured by the most recent mask_school_names() call.
# Rebound to a fresh list on every call.
school_name_candidates = []


def mask_school_names(text: str) -> str:
    """Replace the name part of school names with its initial consonants.

    Pass 1 handles names written directly against the school-type suffix
    (초등학교 / 중학교 / 고등학교) and records each name in the module-level
    ``school_name_candidates`` list. Pass 2 re-scans for every recorded name
    followed by an optional single whitespace and a school-type suffix, so
    spaced spellings of an already-seen name are masked too.

    Args:
        text: Input text.

    Returns:
        The text with matched school names reduced to initial consonants.
    """
    global school_name_candidates
    school_name_candidates = []

    def replacer(match):
        # The pattern already limits the name to 2-20 Hangul syllables, so no
        # extra length check is needed here.
        name = match.group(1)
        school_name_candidates.append(name)
        return to_chosung(name) + match.group(2)

    text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", replacer, text)

    # dict.fromkeys keeps first-seen order while skipping duplicate names;
    # repeating the substitution for the same name would be a no-op.
    for name in dict.fromkeys(school_name_candidates):
        pattern = rf"{re.escape(name)}\s?(초등학교|중학교|고등학교)"
        text = re.sub(pattern, to_chosung(name) + " " + r"\1", text)
    return text


model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Exception filter: strings that must never be treated as a person name.
NAME_ENTITY_EXCEPTIONS = {
    '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
    '개인정보', '본인', '해당', '현재', '아래', '위치', '소속',
}

# Keywords that sanitize_sensitive_info() always replaces, in addition to the
# caller-supplied ones.
REGEX_KEYWORDS_TO_MASK = {
    '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소',
}

# Titles / roles / kinship terms; a 2-4 syllable Hangul run directly before
# one of these is collected as a name candidate by extract_names().
# The order is preserved from the original because it is the regex
# alternation order.
COMMON_SUFFIXES = [
    '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '실장', '팀장', '소장',
    '국장', '본부장', '주임', '총무', '회장', '부회장', '사무장', '직원', '매니저', '지점장',
    '선생님', '선생', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사',
    '석사', '학사', '의사', '간호사', '간병인', '학생', '수험생', '초등학생', '중학생', '고등학생',
    '학부모', '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
    '할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
    '고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
    '장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이', '보호자',
    '피해자', '당사자', '대상자', '주민', '어르신', '기사님',
]

# Optional trailing particle.
# NOTE(review): `이[가]` only matches the two-character string "이가"; the
# intent was probably `[이가]`. Left unchanged because this group is optional
# and never contributes to the extracted name.
KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'

# Compiled once at import time instead of being rebuilt on every call.
_ATTACHED_NAME_RE = re.compile(
    r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
)
_SPACED_NAME_RE = re.compile(
    r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
)


def extract_names(text: str) -> list:
    """Collect name candidates from NER output and title-suffix patterns.

    First runs ``ner_pipeline`` and keeps entities whose ``entity_group`` is
    ``"PS"`` (presumably the person tag of this model - confirm against the
    model card). Then adds every 2-4 syllable Hangul run that is followed,
    with or without whitespace, by one of ``COMMON_SUFFIXES``.

    Args:
        text: Input text.

    Returns:
        Unique candidates in first-seen order, excluding anything listed in
        ``NAME_ENTITY_EXCEPTIONS``. If the NER pipeline raises, the error is
        printed and an empty list is returned (the regex pass is skipped).
    """
    try:
        results = ner_pipeline(text)
    except Exception as e:
        print("NER 오류 발생:", e)
        return []

    names = []
    for entity in results:
        if entity.get("entity_group") == "PS":
            # Strip WordPiece continuation markers left in the aggregated word.
            name = entity["word"].replace("##", "").strip()
            if len(name) >= 2 and name not in names and name not in NAME_ENTITY_EXCEPTIONS:
                names.append(name)

    for pattern in (_ATTACHED_NAME_RE, _SPACED_NAME_RE):
        for match in pattern.findall(text):
            name = match[0]  # group 1: the syllables before the title
            if name not in names and name not in NAME_ENTITY_EXCEPTIONS:
                names.append(name)
    return names


# The 19 leading consonants U+1100..U+1112, indexed by initial-consonant number.
_CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]


def to_chosung(text: str) -> str:
    """Replace each Hangul syllable in *text* with its initial consonant.

    Characters outside the range 가..힣 are copied through unchanged.
    """
    base = ord('가')
    # Each initial consonant spans 588 consecutive syllables (21 vowels x 28
    # finals), so integer division yields the consonant index.
    return "".join(
        _CHOSUNG_LIST[(ord(ch) - base) // 588] if '가' <= ch <= '힣' else ch
        for ch in text
    )


def mask_department(text: str) -> str:
    """Mask department names of the form ``<name>학과``.

    The part before the trailing "학과" is reduced to initial consonants; the
    "학과" suffix itself is kept.
    """
    return re.sub(
        r"([가-힣]{2,20}학과)",
        lambda m: to_chosung(m.group(1)[:-2]) + "학과",
        text,
    )


def sanitize_sensitive_info(text: str, keyword_string: str, replace_word: str) -> str:
    """Mask school/department names, keywords and numeric identifiers.

    Args:
        text: Input text.
        keyword_string: Comma-separated extra keywords to replace. Empty
            entries are ignored; ``None`` is treated as no extra keywords.
        replace_word: Literal text substituted for every keyword hit.

    Returns:
        The masked text.
    """
    text = mask_school_names(text)
    text = mask_department(text)

    # "<d>학년" with an optional "<d>반". One pass covers both forms, so no
    # second grade/class substitution is needed.
    text = re.sub(
        r"(\d)학년(\s?(\d)반)?",
        lambda m: "*학년" + (" *반" if m.group(3) else ""),
        text,
    )

    keywords = [k.strip() for k in (keyword_string or "").split(",") if k.strip()]
    # Sorted so the built-in keywords are applied in a deterministic order.
    keywords += sorted(REGEX_KEYWORDS_TO_MASK)
    for kw in keywords:
        pattern = rf"\b{re.escape(kw)}\b"
        # A callable replacement inserts replace_word literally; passing the
        # string itself would make re.sub interpret backslashes in user input.
        text = re.sub(pattern, lambda _m: replace_word, text, flags=re.IGNORECASE)

    # 16-digit card-style numbers must be masked BEFORE the phone pattern:
    # otherwise the phone pattern rewrites part of a dashed card number and
    # the card pattern no longer matches, leaving the third group visible.
    text = re.sub(
        r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
        lambda m: f"{m.group(1)}-****-****-{m.group(4)}",
        text,
    )
    # Phone-style 3-4-4 digit groups: hide the middle group.
    text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
    # Full dates: year and day are hidden, month is kept.
    text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
    # Address fragments: lot number, building (동) and unit (호).
    text = re.sub(r"(\d{1,3})번지", r"***번지", text)
    text = re.sub(r"(\d{1,3})동", r"***동", text)
    text = re.sub(r"(\d{1,4})호", r"****호", text)
    # E-mail addresses.
    text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
    # 6 digits, dash, 7 digits: only the first digit after the dash is kept.
    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
    # Road-name addresses: keep the road name, hide the number after it.
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
    return text
text def final_name_remask_exact_only(text, mapping_dict): for tag, name in mapping_dict.items(): pattern = rf'(?