Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -1,36 +1,13 @@
|
|
1 |
-
#
|
2 |
import re
|
3 |
import gradio as gr
|
4 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
5 |
|
6 |
-
|
|
|
|
|
|
|
7 |
|
8 |
-
def mask_school_names(text):
    """Abbreviate school names that precede 초등학교 / 중학교 / 고등학교.

    The 2-20 Hangul characters directly attached to one of the three school
    suffixes are passed through ``to_chosung`` while the suffix is kept.
    Every name found that way is also recorded in the module-level list
    ``school_name_candidates`` (reset on each call) and then replaced again
    where it appears with an optional single whitespace before the suffix.

    Args:
        text: Input string.

    Returns:
        The string with the detected school names abbreviated.
    """
    global school_name_candidates
    school_name_candidates = []

    def _shorten(hit):
        whole = hit.group(0)
        stem = hit.group(1)
        # The pattern already limits the stem to 2-20 characters; the guard
        # is kept so the behavior stays the same if the pattern changes.
        if not (2 <= len(stem) <= 20):
            return whole
        school_name_candidates.append(stem)
        return to_chosung(stem) + hit.group(2)

    text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", _shorten, text)

    # Second pass: the same names written with an optional space before the suffix.
    for stem in school_name_candidates:
        loose = rf"{re.escape(stem)}\s?(초등학교|중학교|고등학교)"
        text = re.sub(loose, to_chosung(stem) + " " + r"\1", text)
    return text
|
27 |
-
|
28 |
-
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
29 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
30 |
-
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
31 |
-
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
32 |
-
|
33 |
-
# ✅ Exception filter
|
34 |
NAME_ENTITY_EXCEPTIONS = set([
|
35 |
'법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
|
36 |
'개인정보', '본인', '해당', '현재', '아래', '위치', '소속'
|
@@ -40,47 +17,33 @@ REGEX_KEYWORDS_TO_MASK = set([
|
|
40 |
'이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소'
|
41 |
])
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
return []
|
49 |
|
50 |
-
|
51 |
-
for entity in results:
|
52 |
-
if entity.get("entity_group") == "PS":
|
53 |
-
name = entity["word"].replace("##", "").strip()
|
54 |
-
if len(name) >= 2 and name not in names and name not in NAME_ENTITY_EXCEPTIONS:
|
55 |
-
names.append(name)
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
'박사', '석사', '학사', '의사', '간호사', '간병인',
|
63 |
-
'학생', '수험생', '초등학생', '중학생', '고등학생', '학부모',
|
64 |
-
'어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생',
|
65 |
-
'아들', '딸', '할머니', '할아버지', '외할머니', '외할아버지',
|
66 |
-
'이모', '고모', '삼촌', '숙모', '외삼촌', '고모부', '이모부', '조카', '사촌',
|
67 |
-
'남편', '아내', '부인', '와이프', '신랑', '장모', '장인', '사위', '며느리',
|
68 |
-
'올케', '형수', '제수씨', '매형', '처제', '시누이',
|
69 |
-
'보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님'
|
70 |
-
]
|
71 |
|
72 |
-
|
73 |
-
attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
|
74 |
-
spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
|
|
|
|
|
|
|
84 |
def to_chosung(text):
|
85 |
CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
|
86 |
result = ""
|
@@ -93,73 +56,53 @@ def to_chosung(text):
|
|
93 |
result += ch
|
94 |
return result
|
95 |
|
96 |
-
def mask_department(text):
    """Abbreviate department names ending in 학과.

    For every run of 2-20 Hangul characters followed by ``학과``, the part
    before ``학과`` is passed through ``to_chosung`` and the suffix is kept.

    Args:
        text: Input string.

    Returns:
        The string with department names abbreviated.
    """
    def _abbreviate(found):
        department = found.group(1)
        stem = department[:-2]  # drop the trailing "학과"
        return to_chosung(stem) + "학과"

    return re.sub(r"([가-힣]{2,20}학과)", _abbreviate, text)
|
99 |
-
|
100 |
-
|
101 |
def postprocess_sensitive_patterns(text):
    """Partially mask IP addresses, card numbers and phone numbers.

    Rewrites performed:
        192.168.35.201       -> 192.168.*.*
        1234-5678-9012-3456  -> 1234-****-****-3456
        031-987-6543         -> 031-***-6543

    Args:
        text: Input string.

    Returns:
        The string with the patterns above masked.
    """
    def _mask_card_or_phone(match):
        # Groups 1-4 belong to the card alternative, 5-7 to the phone one.
        if match.group(1) is not None:
            return f"{match.group(1)}-****-****-{match.group(4)}"
        return f"{match.group(5)}-***-{match.group(7)}"

    # IP address: keep the first two octets, hide the last two.
    text = re.sub(r"\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b", r"\1.\2.*.*", text)

    # Card and phone numbers are handled in ONE pass with the card alternative
    # first. Running the phone rule as a separate earlier pass rewrote the
    # first three groups of a hyphenated card number ("1234-***-9012-3456"),
    # after which the card rule could no longer match and 8 digits leaked.
    text = re.sub(
        r"\b(?:(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})|(\d{2,4})-(\d{3,4})-(\d{4}))\b",
        _mask_card_or_phone,
        text,
    )
    return text
|
112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
|
120 |
-
text = re.sub(r"(\d)학년\s?(\d)반", r"*학년 *반", text)
|
121 |
-
|
122 |
-
keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
|
123 |
-
keywords += list(REGEX_KEYWORDS_TO_MASK)
|
124 |
-
for kw in keywords:
|
125 |
-
pattern = rf"\b{re.escape(kw)}\b"
|
126 |
-
text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
|
127 |
-
|
128 |
-
text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
|
129 |
-
text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
|
130 |
-
text = re.sub(r"(\d{1,3})번지", r"***번지", text)
|
131 |
-
text = re.sub(r"(\d{1,3})동", r"***동", text)
|
132 |
-
text = re.sub(r"(\d{1,4})호", r"****호", text)
|
133 |
-
text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
|
134 |
-
text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
|
135 |
-
text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
|
136 |
-
text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
|
137 |
-
lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
|
138 |
-
# 📌 후처리 추가
|
139 |
-
text = postprocess_sensitive_patterns(text)
|
140 |
-
|
141 |
-
return text
|
142 |
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
148 |
|
149 |
def refactored_mask_names(original_text, names, start_counter=100):
|
150 |
-
korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로',
|
151 |
-
'까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도',
|
152 |
-
'이','가','의']
|
153 |
masked = original_text
|
154 |
mapping = {}
|
155 |
counter = start_counter
|
156 |
used_names = set()
|
|
|
157 |
for name in names:
|
158 |
for josa in korean_josa:
|
159 |
full = name + josa
|
160 |
pattern = rf'(?<![\w가-힣]){re.escape(full)}(?![\w가-힣])'
|
161 |
if re.search(pattern, masked):
|
162 |
-
tag = f"
|
163 |
mapping[tag] = name
|
164 |
masked = re.sub(pattern, tag + josa, masked)
|
165 |
counter += 1
|
@@ -170,12 +113,53 @@ def refactored_mask_names(original_text, names, start_counter=100):
|
|
170 |
continue
|
171 |
pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
|
172 |
if re.search(pattern, masked):
|
173 |
-
tag = f"
|
174 |
mapping[tag] = name
|
175 |
masked = re.sub(pattern, tag, masked)
|
176 |
counter += 1
|
177 |
return masked, mapping
|
178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
179 |
def apply_masking(text, keywords, replace_word):
|
180 |
names = extract_names(text)
|
181 |
masked, mapping = refactored_mask_names(text, names)
|
@@ -184,17 +168,17 @@ def apply_masking(text, keywords, replace_word):
|
|
184 |
mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
|
185 |
return sanitized, mapping_table
|
186 |
|
|
|
187 |
with gr.Blocks() as demo:
|
188 |
gr.Markdown("""
|
189 |
-
🛡️ **민감정보 마스킹 [
|
190 |
-
이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)
|
191 |
-
|
192 |
-
""
|
193 |
-
|
194 |
-
keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, good neighbors, gn, 사회복지법인 굿네이버스")
|
195 |
replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
|
196 |
run_button = gr.Button("🚀 마스킹 실행")
|
197 |
-
masked_output = gr.Textbox(lines=15, label="🔐
|
198 |
mapping_output = gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑", interactive=False)
|
199 |
|
200 |
run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
|
|
|
1 |
+
# masking_ver2.py
|
2 |
import re
|
3 |
import gradio as gr
|
4 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
5 |
|
6 |
+
# =============================================
|
7 |
+
# Configurable Constants
|
8 |
+
# =============================================
|
9 |
+
TAG_PREFIX = "N"
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
NAME_ENTITY_EXCEPTIONS = set([
|
12 |
'법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
|
13 |
'개인정보', '본인', '해당', '현재', '아래', '위치', '소속'
|
|
|
17 |
'이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소'
|
18 |
])
|
19 |
|
20 |
+
# Suffix groups, split by category
|
21 |
+
FAMILY_TITLES = ['어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
|
22 |
+
'할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
|
23 |
+
'고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
|
24 |
+
'장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이']
|
|
|
25 |
|
26 |
+
ACADEMIC_TITLES = ['학생', '초등학생', '중학생', '고등학생', '수험생', '학부모']
|
|
|
|
|
|
|
|
|
|
|
27 |
|
28 |
+
OCCUPATIONAL_TITLES = ['대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
|
29 |
+
'실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
|
30 |
+
'사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
|
31 |
+
'교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
|
32 |
+
'의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
+
COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
|
|
|
|
|
35 |
|
36 |
+
# =============================================
|
37 |
+
# Preload Model
|
38 |
+
# =============================================
|
39 |
+
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
40 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
41 |
+
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
42 |
+
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
43 |
|
44 |
+
# =============================================
|
45 |
+
# Utility Functions
|
46 |
+
# =============================================
|
47 |
def to_chosung(text):
|
48 |
CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
|
49 |
result = ""
|
|
|
56 |
result += ch
|
57 |
return result
|
58 |
|
|
|
|
|
|
|
|
|
|
|
59 |
def postprocess_sensitive_patterns(text):
    """Partially mask IP addresses, card numbers and phone numbers.

    Rewrites performed:
        192.168.35.201       -> 192.168.*.*
        1234-5678-9012-3456  -> 1234-****-****-3456
        031-987-6543         -> 031-***-6543

    Args:
        text: Input string.

    Returns:
        The string with the patterns above masked.
    """
    def _mask_card_or_phone(match):
        # Groups 1-4 belong to the card alternative, 5-7 to the phone one.
        if match.group(1) is not None:
            return f"{match.group(1)}-****-****-{match.group(4)}"
        return f"{match.group(5)}-***-{match.group(7)}"

    # IP address: keep the first two octets, hide the last two.
    text = re.sub(r"\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b", r"\1.\2.*.*", text)

    # Card and phone numbers are handled in ONE pass with the card alternative
    # first. Running the phone rule as a separate earlier pass rewrote the
    # first three groups of a hyphenated card number ("1234-***-9012-3456"),
    # after which the card rule could no longer match and 8 digits leaked.
    text = re.sub(
        r"\b(?:(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})|(\d{2,4})-(\d{3,4})-(\d{4}))\b",
        _mask_card_or_phone,
        text,
    )
    return text
|
64 |
|
65 |
+
# =============================================
|
66 |
+
# Masking Core Functions
|
67 |
+
# =============================================
|
68 |
+
def extract_names(text):
    """Collect candidate person names from *text*.

    Two sources are combined, in this order:
      1. entities returned by ``ner_pipeline`` whose ``entity_group`` is
         ``"PS"`` (after stripping ``##`` markers and whitespace, and only
         when at least 2 characters long);
      2. 2-4 Hangul characters that directly precede, or are separated by
         whitespace from, one of the titles in ``COMMON_SUFFIXES``.

    Entries listed in ``NAME_ENTITY_EXCEPTIONS`` and duplicates are dropped.
    If the NER call raises, the error is printed and an empty list is
    returned without running the title heuristic.

    Args:
        text: Input string.

    Returns:
        List of unique names in order of first discovery.
    """
    try:
        entities = ner_pipeline(text)
    except Exception as err:
        print("NER 오류 발생:", err)
        return []

    found = []

    def _remember(candidate):
        # Keep first-seen order; skip duplicates and known non-names.
        if candidate in found or candidate in NAME_ENTITY_EXCEPTIONS:
            return
        found.append(candidate)

    for ent in entities:
        if ent.get("entity_group") != "PS":
            continue
        cleaned = ent["word"].replace("##", "").strip()
        if len(cleaned) < 2:
            continue
        _remember(cleaned)

    # Optional trailing particle; it does not influence the captured name.
    josa = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
    titles = '|'.join(COMMON_SUFFIXES)
    head = r'([가-힣]{2,4})'
    tail = '(' + titles + r')' + josa

    # "" -> title attached to the name, r'\s+' -> title separated by whitespace.
    for gap in ('', r'\s+'):
        for hit in re.finditer(head + gap + tail, text):
            _remember(hit.group(1))
    return found
|
92 |
|
93 |
def refactored_mask_names(original_text, names, start_counter=100):
|
94 |
+
korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로','까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도','이','가','의']
|
|
|
|
|
95 |
masked = original_text
|
96 |
mapping = {}
|
97 |
counter = start_counter
|
98 |
used_names = set()
|
99 |
+
|
100 |
for name in names:
|
101 |
for josa in korean_josa:
|
102 |
full = name + josa
|
103 |
pattern = rf'(?<![\w가-힣]){re.escape(full)}(?![\w가-힣])'
|
104 |
if re.search(pattern, masked):
|
105 |
+
tag = f"{TAG_PREFIX}{counter:03d}"
|
106 |
mapping[tag] = name
|
107 |
masked = re.sub(pattern, tag + josa, masked)
|
108 |
counter += 1
|
|
|
113 |
continue
|
114 |
pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
|
115 |
if re.search(pattern, masked):
|
116 |
+
tag = f"{TAG_PREFIX}{counter:03d}"
|
117 |
mapping[tag] = name
|
118 |
masked = re.sub(pattern, tag, masked)
|
119 |
counter += 1
|
120 |
return masked, mapping
|
121 |
|
122 |
+
def final_name_remask_exact_only(text, mapping_dict):
    """Replace stand-alone occurrences of mapped names with their tags.

    A name is only replaced when it is not directly preceded or followed by
    a word character (Hangul included), i.e. exact stand-alone matches.

    Args:
        text: String to process.
        mapping_dict: Mapping of ``tag -> name``.

    Returns:
        The string with every exact occurrence of each name replaced by
        its tag.
    """
    # Longest names first, so a multi-word name is tagged as a whole before
    # a shorter name contained in it can claim part of it. ``sorted`` is
    # stable, so equally long names keep the mapping's order.
    ordered = sorted(mapping_dict.items(), key=lambda item: len(item[1]), reverse=True)
    for tag, name in ordered:
        if not name:
            # An empty name would match at every gap between non-word characters.
            continue
        pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        # Callable replacement: the tag is inserted literally instead of
        # being parsed as a template (backslashes, \g<...> references).
        text = re.sub(pattern, lambda _m, _tag=tag: _tag, text)
    return text
|
127 |
+
|
128 |
+
def mask_department(text):
    """Abbreviate department names ending in 학과.

    For every run of 2-20 Hangul characters followed by ``학과``, the part
    before ``학과`` is passed through ``to_chosung`` and the suffix is kept.

    Args:
        text: Input string.

    Returns:
        The string with department names abbreviated.
    """
    def _abbreviate(found):
        department = found.group(1)
        stem = department[:-2]  # drop the trailing "학과"
        return to_chosung(stem) + "학과"

    return re.sub(r"([가-힣]{2,20}학과)", _abbreviate, text)
|
130 |
+
|
131 |
+
def mask_school_names(text):
    """Abbreviate school names that precede 초등학교 / 중학교 / 고등학교.

    The 2-20 Hangul characters directly attached to one of the three school
    suffixes are passed through ``to_chosung`` while the suffix is kept.
    Every name found that way is also recorded in the module-level list
    ``school_name_candidates`` (reset on each call) and then replaced again
    where it appears with an optional single whitespace before the suffix.

    Args:
        text: Input string.

    Returns:
        The string with the detected school names abbreviated.
    """
    global school_name_candidates
    school_name_candidates = []

    def _shorten(hit):
        stem = hit.group(1)
        # The pattern already limits the stem to 2-20 characters; the guard
        # is kept so the behavior stays the same if the pattern changes.
        if not (2 <= len(stem) <= 20):
            return hit.group(0)
        school_name_candidates.append(stem)
        return to_chosung(stem) + hit.group(2)

    text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", _shorten, text)

    # Second pass: the same names written with an optional space before the suffix.
    for stem in school_name_candidates:
        loose = rf"{re.escape(stem)}\s?(초등학교|중학교|고등학교)"
        text = re.sub(loose, to_chosung(stem) + " " + r"\1", text)
    return text
|
147 |
+
|
148 |
+
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Mask sensitive patterns, school/department names, grades and keywords.

    Steps, in order:
      1. ``postprocess_sensitive_patterns`` (runs first, before other rules);
      2. ``mask_school_names`` and ``mask_department``;
      3. grade / class numbers (``N학년`` and optional ``N반``) -> ``*``;
      4. every keyword from *keyword_string* and ``REGEX_KEYWORDS_TO_MASK``
         is replaced by *replace_word* (case-insensitive, whole word);
      5. ``dddddd-ddddddd`` numbers and street-address numbers are masked.

    Args:
        text: Input string.
        keyword_string: Comma-separated keywords; ``None`` or empty means
            no user keywords.
        replace_word: Literal text substituted for each keyword; ``None`` is
            treated as an empty string.

    Returns:
        The sanitized string.
    """
    text = postprocess_sensitive_patterns(text)
    text = mask_school_names(text)
    text = mask_department(text)
    text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)

    user_keywords = [k.strip() for k in (keyword_string or "").split(",") if k.strip()]
    # Longest keyword first: otherwise a short keyword is replaced inside a
    # longer keyword that contains it and the longer one never matches.
    keywords = sorted(user_keywords + list(REGEX_KEYWORDS_TO_MASK), key=len, reverse=True)

    replacement = "" if replace_word is None else str(replace_word)
    for kw in keywords:
        # NOTE(review): \b needs a non-word character on both sides, so a
        # keyword with a particle attached directly after it is not matched.
        pattern = rf"\b{re.escape(kw)}\b"
        # Callable replacement keeps user text literal; as a template string
        # a backslash in replace_word would raise or be expanded.
        text = re.sub(pattern, lambda _m: replacement, text, flags=re.IGNORECASE)

    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
    return text
|
162 |
+
|
163 |
def apply_masking(text, keywords, replace_word):
|
164 |
names = extract_names(text)
|
165 |
masked, mapping = refactored_mask_names(text, names)
|
|
|
168 |
mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
|
169 |
return sanitized, mapping_table
|
170 |
|
171 |
+
# UI
|
172 |
with gr.Blocks() as demo:
|
173 |
gr.Markdown("""
|
174 |
+
🛡️ **민감정보 마스킹 [ver2]**
|
175 |
+
이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반 + 예외 필터 + 후처리 강화)
|
176 |
+
""")
|
177 |
+
input_text = gr.Textbox(lines=15, label="📅 원본 텍스트 입력")
|
178 |
+
keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
|
|
|
179 |
replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
|
180 |
run_button = gr.Button("🚀 마스킹 실행")
|
181 |
+
masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
|
182 |
mapping_output = gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑", interactive=False)
|
183 |
|
184 |
run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
|