Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import re
|
2 |
import gradio as gr
|
3 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
@@ -29,6 +30,15 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
29 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
30 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
def extract_names(text):
|
33 |
try:
|
34 |
results = ner_pipeline(text)
|
@@ -37,17 +47,10 @@ def extract_names(text):
|
|
37 |
return []
|
38 |
|
39 |
names = []
|
40 |
-
|
41 |
-
NAME_FILTER_EXCEPTIONS = set([
|
42 |
-
'법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
|
43 |
-
'개인정보', '본인', '직통번호', '이메일', '전화번호', '연락처', '주소',
|
44 |
-
'해당', '현재', '상담', '예약', '센터', '아래', '위치', '소속',
|
45 |
-
])
|
46 |
-
|
47 |
for entity in results:
|
48 |
if entity.get("entity_group") == "PS":
|
49 |
name = entity["word"].replace("##", "").strip()
|
50 |
-
if len(name) >= 2 and name not in names and name not in
|
51 |
names.append(name)
|
52 |
|
53 |
COMMON_SUFFIXES = [
|
@@ -66,21 +69,15 @@ def extract_names(text):
|
|
66 |
]
|
67 |
|
68 |
KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
|
69 |
-
|
70 |
attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
|
71 |
-
attached_matches = re.findall(attached_pattern, text)
|
72 |
-
for match in attached_matches:
|
73 |
-
name = match[0]
|
74 |
-
if name not in names and name not in NAME_FILTER_EXCEPTIONS:
|
75 |
-
names.append(name)
|
76 |
-
|
77 |
spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
|
78 |
-
spaced_matches = re.findall(spaced_pattern, text)
|
79 |
-
for match in spaced_matches:
|
80 |
-
name = match[0]
|
81 |
-
if name not in names and name not in NAME_FILTER_EXCEPTIONS:
|
82 |
-
names.append(name)
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
return names
|
85 |
|
86 |
def to_chosung(text):
|
@@ -95,13 +92,10 @@ def to_chosung(text):
|
|
95 |
result += ch
|
96 |
return result
|
97 |
|
98 |
-
|
99 |
-
|
100 |
def mask_department(text):
|
101 |
text = re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
|
102 |
return text
|
103 |
|
104 |
-
|
105 |
def sanitize_sensitive_info(text, keyword_string, replace_word):
|
106 |
text = mask_school_names(text)
|
107 |
text = mask_department(text)
|
@@ -110,9 +104,11 @@ def sanitize_sensitive_info(text, keyword_string, replace_word):
|
|
110 |
text = re.sub(r"(\d)학년\s?(\d)반", r"*학년 *반", text)
|
111 |
|
112 |
keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
|
|
|
113 |
for kw in keywords:
|
114 |
pattern = rf"\b{re.escape(kw)}\b"
|
115 |
text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
|
|
|
116 |
text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
|
117 |
text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
|
118 |
text = re.sub(r"(\d{1,3})번지", r"***번지", text)
|
@@ -121,19 +117,45 @@ def sanitize_sensitive_info(text, keyword_string, replace_word):
|
|
121 |
text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
|
122 |
text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
|
123 |
text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
|
124 |
-
text = re.sub(r"(\d{2,6})[-]?(\d{2,6})[-]?(\d{2,6})",
|
125 |
-
lambda m: f"{m.group(1)[:2]}{'*'*(len(m.group(1))-2)}{'*'*len(m.group(2))}{m.group(3)[-4:]}", text)
|
126 |
text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
|
127 |
lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
|
128 |
return text
|
129 |
|
130 |
-
|
131 |
def final_name_remask_exact_only(text, mapping_dict):
|
132 |
for tag, name in mapping_dict.items():
|
133 |
pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
|
134 |
text = re.sub(pattern, tag, text)
|
135 |
return text
|
136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
def apply_masking(text, keywords, replace_word):
|
139 |
names = extract_names(text)
|
@@ -143,7 +165,6 @@ def apply_masking(text, keywords, replace_word):
|
|
143 |
mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
|
144 |
return sanitized, mapping_table
|
145 |
|
146 |
-
|
147 |
with gr.Blocks() as demo:
|
148 |
gr.Markdown("""
|
149 |
🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
|
|
|
1 |
+
# app_updated_with_filter_sets.py
|
2 |
import re
|
3 |
import gradio as gr
|
4 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
|
|
30 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
31 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
32 |
|
33 |
+
# Exception / keyword filter sets.
# Terms the NER model tends to mislabel as person names (entity_group "PS");
# these must never be collected as names.
NAME_ENTITY_EXCEPTIONS = {
    '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
    '개인정보', '본인', '해당', '현재', '아래', '위치', '소속',
}
# Keywords that are always masked by the keyword-regex pass,
# in addition to whatever the user supplies.
REGEX_KEYWORDS_TO_MASK = {
    '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소',
}
|
41 |
+
|
42 |
def extract_names(text):
|
43 |
try:
|
44 |
results = ner_pipeline(text)
|
|
|
47 |
return []
|
48 |
|
49 |
names = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
for entity in results:
|
51 |
if entity.get("entity_group") == "PS":
|
52 |
name = entity["word"].replace("##", "").strip()
|
53 |
+
if len(name) >= 2 and name not in names and name not in NAME_ENTITY_EXCEPTIONS:
|
54 |
names.append(name)
|
55 |
|
56 |
COMMON_SUFFIXES = [
|
|
|
69 |
]
|
70 |
|
71 |
KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
|
|
|
72 |
attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
+
for pattern in [attached_pattern, spaced_pattern]:
|
76 |
+
matches = re.findall(pattern, text)
|
77 |
+
for match in matches:
|
78 |
+
name = match[0]
|
79 |
+
if name not in names and name not in NAME_ENTITY_EXCEPTIONS:
|
80 |
+
names.append(name)
|
81 |
return names
|
82 |
|
83 |
def to_chosung(text):
|
|
|
92 |
result += ch
|
93 |
return result
|
94 |
|
|
|
|
|
95 |
def mask_department(text):
    """Mask Korean department names ("...학과") in *text*.

    Each match of 2–20 Hangul syllables ending in "학과" has its name part
    converted to initial consonants (초성) via to_chosung, keeping the
    "학과" suffix readable.
    """
    def _obscure(match):
        dept = match.group(1)
        # Strip the two-character "학과" suffix before converting.
        return to_chosung(dept[:-2]) + "학과"

    return re.sub(r"([가-힣]{2,20}학과)", _obscure, text)
|
98 |
|
|
|
99 |
def sanitize_sensitive_info(text, keyword_string, replace_word):
|
100 |
text = mask_school_names(text)
|
101 |
text = mask_department(text)
|
|
|
104 |
text = re.sub(r"(\d)학년\s?(\d)반", r"*학년 *반", text)
|
105 |
|
106 |
keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
|
107 |
+
keywords += list(REGEX_KEYWORDS_TO_MASK)
|
108 |
for kw in keywords:
|
109 |
pattern = rf"\b{re.escape(kw)}\b"
|
110 |
text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
|
111 |
+
|
112 |
text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
|
113 |
text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
|
114 |
text = re.sub(r"(\d{1,3})번지", r"***번지", text)
|
|
|
117 |
text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
|
118 |
text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
|
119 |
text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
|
|
|
|
|
120 |
text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
|
121 |
lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
|
122 |
return text
|
123 |
|
|
|
124 |
def final_name_remask_exact_only(text, mapping_dict):
    """Replace exact, standalone occurrences of each mapped name with its tag.

    mapping_dict maps tag -> name (e.g. "N100" -> "홍길동"). A name is only
    replaced when it is not embedded inside a longer word or Hangul run,
    enforced by lookbehind/lookahead on [\\w가-힣].
    """
    for label, person in mapping_dict.items():
        boundary_pattern = rf'(?<![\w가-힣]){re.escape(person)}(?![\w가-힣])'
        text = re.sub(boundary_pattern, label, text)
    return text
|
129 |
|
130 |
+
def refactored_mask_names(original_text, names, start_counter=100):
    """Mask each detected name with a sequential tag (N100, N101, ...).

    Two passes over *names*:
      1. Try "<name><josa>" forms first so the particle stays visible
         (e.g. "홍길동은" -> "N100은"); stop at the first matching particle.
      2. Names not matched with a particle are masked as bare tokens.
    Word boundaries are enforced with lookbehind/lookahead on [\\w가-힣].

    Returns (masked_text, mapping) where mapping maps tag -> original name.
    """
    # Korean particles that may be attached directly to a person's name.
    particles = (
        '이가', '를', '은', '는', '을', '도', '만', '과', '와', '에게', '에서', '으로',
        '까지', '조차', '마저', '이며', '이다', '이나', '이나마', '밖에', '이든', '이라도',
        '이', '가', '의',
    )

    def _boundary(token):
        # Token must not be embedded in a longer word / Hangul run.
        return rf'(?<![\w가-힣]){re.escape(token)}(?![\w가-힣])'

    result = original_text
    tag_to_name = {}
    next_id = start_counter
    matched_with_josa = set()

    # Pass 1: name + particle, first particle wins.
    for candidate in names:
        for particle in particles:
            pat = _boundary(candidate + particle)
            if re.search(pat, result):
                label = f"N{next_id:03d}"
                tag_to_name[label] = candidate
                result = re.sub(pat, label + particle, result)
                next_id += 1
                matched_with_josa.add(candidate)
                break

    # Pass 2: bare occurrences of names not already handled above.
    for candidate in names:
        if candidate in matched_with_josa:
            continue
        pat = _boundary(candidate)
        if re.search(pat, result):
            label = f"N{next_id:03d}"
            tag_to_name[label] = candidate
            result = re.sub(pat, label, result)
            next_id += 1

    return result, tag_to_name
|
159 |
|
160 |
def apply_masking(text, keywords, replace_word):
|
161 |
names = extract_names(text)
|
|
|
165 |
mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
|
166 |
return sanitized, mapping_table
|
167 |
|
|
|
168 |
with gr.Blocks() as demo:
|
169 |
gr.Markdown("""
|
170 |
🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
|