Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -1,32 +1,13 @@
|
|
1 |
|
|
|
|
|
2 |
import re
|
3 |
import gradio as gr
|
4 |
import threading
|
5 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
6 |
|
7 |
TAG_PREFIX = "N"
|
8 |
-
|
9 |
-
NAME_ENTITY_EXCEPTIONS = set([
|
10 |
-
'법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
|
11 |
-
'개인정보', '본인', '해당', '현재', '아래', '위치', '소속',
|
12 |
-
'상담', '그래도'
|
13 |
-
])
|
14 |
-
|
15 |
-
REGEX_KEYWORDS_TO_MASK = set([
|
16 |
-
'이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소', '계좌번호'
|
17 |
-
])
|
18 |
-
|
19 |
-
FAMILY_TITLES = ['어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
|
20 |
-
'할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
|
21 |
-
'고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
|
22 |
-
'장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이']
|
23 |
-
ACADEMIC_TITLES = ['학생', '초등학생', '중학생', '고등학생', '수험생', '학부모']
|
24 |
-
OCCUPATIONAL_TITLES = ['대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
|
25 |
-
'실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
|
26 |
-
'사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
|
27 |
-
'교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
|
28 |
-
'의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']
|
29 |
-
COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
|
30 |
COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
|
31 |
|
32 |
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
@@ -34,25 +15,13 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
34 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
35 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
36 |
|
37 |
-
def to_chosung(text):
|
38 |
-
CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
|
39 |
-
result = ""
|
40 |
-
for ch in text:
|
41 |
-
if '가' <= ch <= '힣':
|
42 |
-
code = ord(ch) - ord('가')
|
43 |
-
cho = code // 588
|
44 |
-
result += CHOSUNG_LIST[cho]
|
45 |
-
else:
|
46 |
-
result += ch
|
47 |
-
return result
|
48 |
-
|
49 |
def extract_names(text):
|
50 |
results = ner_pipeline(text)
|
51 |
names = []
|
52 |
for entity in results:
|
53 |
if entity.get("entity_group") == "PS":
|
54 |
name = entity["word"].replace("##", "").strip()
|
55 |
-
if len(name) >= 2
|
56 |
names.append(name)
|
57 |
return list(set(names))
|
58 |
|
@@ -69,72 +38,107 @@ def apply_name_tags(text, names, start=100):
|
|
69 |
counter += 1
|
70 |
return tagged, mapping
|
71 |
|
72 |
-
def
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
updated[tag] = name
|
78 |
-
continue
|
79 |
-
context = tagged_text[max(0, idx - 50): idx + 50]
|
80 |
-
pattern = re.compile(rf'([가-힣])?{re.escape(name)}({"|".join(COMMON_SUFFIXES)})?({"|".join(COMMON_JOSA)})?')
|
81 |
-
matches = pattern.findall(context)
|
82 |
-
if matches:
|
83 |
-
longest = max(matches, key=lambda x: len(''.join(x)))
|
84 |
-
updated[tag] = ''.join(longest)
|
85 |
-
else:
|
86 |
-
updated[tag] = name
|
87 |
-
return updated
|
88 |
-
|
89 |
-
def mask_school_names(text):
|
90 |
-
def replace_school(m):
|
91 |
-
return to_chosung(m.group(1)) + m.group(2)
|
92 |
-
return re.sub(r"([가-힣]{2,20})(초등학교|중학교|고등학교)", replace_school, text)
|
93 |
|
94 |
-
def
|
95 |
-
|
|
|
|
|
|
|
96 |
|
97 |
def postprocess_sensitive_patterns(text):
|
98 |
-
text = re.sub(r"
|
99 |
text = re.sub(r"(\d{6})[- ]?(\d{7})", "******-*******", text)
|
100 |
text = re.sub(r"(\d{3})[- ]?(\d{4})[- ]?(\d{4})", "***-****-****", text)
|
101 |
text = re.sub(r"(\d{1,3})동", "***동", text)
|
102 |
text = re.sub(r"(\d{1,4})호", "****호", text)
|
103 |
return text
|
104 |
|
105 |
-
def sanitize_sensitive_info(text, keyword_string, replace_word):
|
106 |
-
text = postprocess_sensitive_patterns(text)
|
107 |
-
text = mask_school_names(text)
|
108 |
-
text = mask_department(text)
|
109 |
-
text = re.sub(r"(\d)학년(\s?(\d)반)?", "*학년 *반", text)
|
110 |
-
keywords = [k.strip() for k in keyword_string.split(",") if k.strip()] + list(REGEX_KEYWORDS_TO_MASK)
|
111 |
-
for kw in keywords:
|
112 |
-
text = re.sub(rf"{re.escape(kw)}", replace_word, text, flags=re.IGNORECASE)
|
113 |
-
return text
|
114 |
-
|
115 |
def apply_masking(text, keyword_string, replace_word):
|
116 |
-
|
117 |
-
text =
|
|
|
118 |
names = extract_names(text)
|
119 |
tagged, mapping = apply_name_tags(text, names)
|
120 |
|
121 |
def finalize():
|
122 |
-
|
123 |
-
|
124 |
-
masked_output.update(value=
|
125 |
-
mapping_output.update(value=
|
126 |
|
127 |
threading.Timer(0.2, finalize).start()
|
128 |
-
|
129 |
-
return tagged, initial_map
|
130 |
|
|
|
131 |
with gr.Blocks() as demo:
|
132 |
-
gr.Markdown("🧠 **v4.
|
133 |
-
input_text = gr.Textbox(lines=15, label="📄
|
134 |
-
keyword_input = gr.Textbox(lines=1, label="기관 키워드 (
|
135 |
replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
|
136 |
-
run_button = gr.Button("🚀
|
137 |
masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
|
138 |
mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
|
139 |
run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
demo.launch()
|
|
|
1 |
|
2 |
+
# ▶️ Part 1: 이름 태깅 + 파생 표현 후처리 + 우리기관 강화치환
|
3 |
+
|
4 |
import re
|
5 |
import gradio as gr
|
6 |
import threading
|
7 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
8 |
|
9 |
# Prefix for anonymisation tags assigned to detected names (tags look like N100, N101, ...).
TAG_PREFIX = "N"

# Title suffixes that may directly follow a person's name (e.g. "김철수 학생" / "...씨").
COMMON_SUFFIXES = ['학생', '선생님', '씨', '님']

# Korean josa (case particles) that may be attached right after a name or title.
COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
|
12 |
|
13 |
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
|
|
15 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
16 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def extract_names(text):
    """Run the shared NER pipeline over *text* and return unique person names.

    Only entities whose group is "PS" (person) are kept.  WordPiece
    continuation markers ("##") are stripped and the surface is trimmed;
    names shorter than two characters are discarded.  The order of the
    returned list is unspecified (set-based deduplication).
    """
    unique_names = set()
    for ent in ner_pipeline(text):
        if ent.get("entity_group") != "PS":
            continue
        cleaned = ent["word"].replace("##", "").strip()
        if len(cleaned) >= 2:
            unique_names.add(cleaned)
    return list(unique_names)
|
27 |
|
|
|
38 |
counter += 1
|
39 |
return tagged, mapping
|
40 |
|
41 |
+
def expand_variation_patterns(tagged_text, mapping):
    """Replace leftover occurrences of each mapped name with its tag.

    For every ``tag -> name`` pair in *mapping*, any occurrence of the name —
    optionally preceded by opening brackets/quotes/whitespace and optionally
    followed by a title suffix and/or a josa particle — is rewritten so that
    the name itself becomes the tag while the surrounding characters survive.

    Args:
        tagged_text: text already processed by apply_name_tags.
        mapping: dict mapping tags (e.g. "N100") to original names.

    Returns:
        The text with all residual name variants replaced by their tags.
    """
    # BUGFIX: the original rf'…["'‘“…]…' literal nested a single quote inside
    # a single-quoted f-string — a SyntaxError on Python < 3.12 (the Space's
    # build error).  The pattern is now assembled from plain string pieces.
    suffix_alt = "|".join(COMMON_SUFFIXES)
    josa_alt = "|".join(COMMON_JOSA)
    for tag, base in mapping.items():
        # The opener class (brackets/quotes/whitespace) is captured in
        # group 0 so it is preserved by the group(0)-based substitution.
        pattern = re.compile(
            r"([\(\[\"'‘“\s]*)"
            + re.escape(base)
            + f"(?:{suffix_alt})?(?:{josa_alt})?",
            re.IGNORECASE,
        )
        # Bind base/tag as defaults so the callback is self-contained.
        tagged_text = pattern.sub(
            lambda m, b=base, t=tag: m.group(0).replace(b, t), tagged_text
        )
    return tagged_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
+
def replace_institution_keywords(text, keywords, replace_word):
    """Replace each institution keyword in *text* with *replace_word*.

    Opening brackets/quotes/whitespace before the keyword (group 1) and the
    Korean/punctuation run after it (group 2) are captured and re-emitted
    unchanged, so only the keyword itself is swapped out.

    Args:
        text: the input text.
        keywords: iterable of institution names to replace.
        replace_word: the replacement (e.g. "우리기관").

    Returns:
        The text with every keyword occurrence replaced.
    """
    for kw in keywords:
        # BUGFIX: the original rf'…["'‘“]…' literal nested a single quote
        # inside a single-quoted f-string — a SyntaxError on Python < 3.12.
        pattern = re.compile(
            r"([\s\(\[\"'‘“]*)"
            + re.escape(kw)
            + r"([가-힣\s.,;:!?()\"'”’]*)",
            re.IGNORECASE,
        )
        text = pattern.sub(lambda m: m.group(1) + replace_word + m.group(2), text)
    return text
|
52 |
|
53 |
def postprocess_sensitive_patterns(text):
    """Mask common sensitive patterns in *text* with asterisk placeholders.

    Covered, in order: e-mail addresses, Korean resident registration
    numbers (6-7 digits), phone numbers (3-4-4 digits), and apartment
    building/unit numbers ("…동" / "…호").
    """
    # (pattern, replacement) pairs applied sequentially; order matters —
    # the 13-digit RRN rule must run before the 11-digit phone rule.
    rules = (
        (r"[\w\.-]+@[\w\.-]+", "******@***.***"),
        (r"(\d{6})[- ]?(\d{7})", "******-*******"),
        (r"(\d{3})[- ]?(\d{4})[- ]?(\d{4})", "***-****-****"),
        (r"(\d{1,3})동", "***동"),
        (r"(\d{1,4})호", "****호"),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def apply_masking(text, keyword_string, replace_word):
    """Full masking pipeline for the Gradio button.

    Steps: replace institution keywords, mask sensitive number/e-mail
    patterns, detect person names via NER, tag them, then expand tag
    coverage to name variants (name + suffix/josa).

    Args:
        text: raw input text.
        keyword_string: comma-separated institution keywords.
        replace_word: replacement for institution keywords.

    Returns:
        (masked_text, mapping_str) — the fully tagged text and a
        "tag → name" listing, one pair per line.
    """
    keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
    text = replace_institution_keywords(text, keywords, replace_word)
    text = postprocess_sensitive_patterns(text)
    names = extract_names(text)
    tagged, mapping = apply_name_tags(text, names)
    # BUGFIX: the original deferred expand_variation_patterns to a
    # threading.Timer callback that called masked_output.update(value=...).
    # Component.update() outside an event handler only builds a config dict
    # and discards it, so the expanded result never reached the UI.  Run the
    # expansion synchronously and return it instead.
    final_tagged = expand_variation_patterns(tagged, mapping)
    mapping_str = "\n".join(f"{k} → {v}" for k, v in mapping.items())
    return final_tagged, mapping_str
|
|
|
76 |
|
77 |
+
# Part 1 UI — Gradio front-end: paste text, supply institution keywords and a
# replacement word, press the button to run the masking pipeline.
with gr.Blocks() as demo:
    gr.Markdown("🧠 **v4.3A: 이름 + 파생 표현 + 기관 치환 (1단계)**")
    # Source text to anonymise.
    input_text = gr.Textbox(lines=15, label="📄 원문 텍스트")
    # Comma-separated institution keywords, replaced by the value of replace_input.
    keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
    replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
    run_button = gr.Button("🚀 실행")
    # Outputs: masked text and the tag → original-name mapping (read-only).
    masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
    mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
    # Wire the button: apply_masking(text, keywords, replacement) -> (masked, mapping).
    run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
|
87 |
+
|
88 |
+
|
89 |
+
# ▶️ Part 2: 학과/학교 초성 변환 + 학년/반 마스킹 + mapping 보정
|
90 |
+
# - 이 부분은 이어서 별도 실행 환경 or 연동 코드로 붙일 수 있습니다.
|
91 |
+
# - 필요 시 한 파일로 통합 가능
|
92 |
+
|
93 |
+
|
94 |
+
# ▶️ Part 2: 학과/학교 초성 변환 + 학년/반 마스킹 + mapping 보정
|
95 |
+
|
96 |
+
def to_chosung(text):
    """Reduce each Hangul syllable in *text* to its leading consonant.

    Precomposed syllables (가..힣) are decomposed arithmetically: each block
    of 588 code points shares one choseong, emitted from the Hangul Jamo
    range starting at U+1100.  Every other character passes through as-is.
    """
    pieces = []
    for ch in text:
        if '가' <= ch <= '힣':
            syllable_index = ord(ch) - ord('가')
            # 588 = 21 medial vowels * 28 final consonants per choseong.
            pieces.append(chr(0x1100 + syllable_index // 588))
        else:
            pieces.append(ch)
    return "".join(pieces)
|
107 |
+
|
108 |
+
def mask_school_names(text):
    """Mask school names: the proper-name part of "…초등학교/중학교/고등학교"
    is converted to its choseong (initial consonants); the school-type
    suffix is kept verbatim."""
    school_re = re.compile(r"([가-힣]{2,20})(초등학교|중학교|고등학교)")
    return school_re.sub(lambda m: to_chosung(m.group(1)) + m.group(2), text)
|
112 |
+
|
113 |
+
def mask_department_names(text):
    """Mask department mentions: the name part of "…학과" is converted to
    choseong while the "학과" suffix is preserved."""
    def _redact(match):
        return to_chosung(match.group(1)) + "학과"
    return re.sub(r"([가-힣]{2,20})학과", _redact, text)
|
115 |
+
|
116 |
+
def mask_grade_class(text):
    """Mask grade/class numbers: "3학년 2반" -> "*학년 *반", "3학년" -> "*학년".

    BUGFIX: the original replaced unconditionally with "*학년 *반", appending
    a class marker even when the source text contained no "…반" — fabricating
    information.  The class part is now masked only when it was present.
    """
    def _mask(m):
        return "*학년 *반" if m.group(2) else "*학년"
    return re.sub(r"(\d)학년(\s?(\d)반)?", _mask, text)
|
118 |
+
|
119 |
+
def mapping_boost_context(text, mapping_dict):
    """Enrich each tag's mapped name with its surface form found near the tag.

    For every ``tag -> name`` pair, a ±100-character window around the tag's
    first occurrence in *text* is searched for the name, optionally preceded
    by one Hangul character and followed by a title suffix and/or josa; the
    matched span becomes the new mapping value.  Falls back to the bare name
    when the tag or a match is absent.
    """
    # NOTE(review): these local lists shadow the module-level constants and
    # use a different suffix set; values kept as-is to preserve behavior.
    title_suffixes = ['학생', '선생님', '씨', '님']
    josa_particles = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
    suffix_alt = '|'.join(title_suffixes)
    josa_alt = '|'.join(josa_particles)
    boosted = {}
    for tag, base in mapping_dict.items():
        position = text.find(tag)
        if position == -1:
            boosted[tag] = base
            continue
        window = text[max(0, position - 100): position + 100]
        finder = re.compile(
            rf"([가-힣])?{re.escape(base)}(?:{suffix_alt})?(?:{josa_alt})?"
        )
        hit = finder.search(window)
        boosted[tag] = hit.group(0) if hit else base
    return boosted
|
136 |
+
|
137 |
+
# 👇 예시: 후처리 최종 적용
|
138 |
+
# final_output = mask_school_names(tagged_text)
|
139 |
+
# final_output = mask_department_names(final_output)
|
140 |
+
# final_output = mask_grade_class(final_output)
|
141 |
+
# mapping = mapping_boost_context(final_output, mapping)
|
142 |
+
|
143 |
+
|
144 |
# Start the Gradio app (blocking call; serves the Space).
demo.launch()
|