Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -108,3 +108,87 @@ def refactored_mask_names(original_text, names, start_counter=100):
|
|
108 |
|
109 |
def to_chosung(text):
|
110 |
CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
def to_chosung(text):
    """Replace each precomposed Hangul syllable in ``text`` with its initial consonant.

    Characters in the range '가'..'힣' are mapped to the corresponding
    conjoining jamo in U+1100..U+1112; every other character is copied
    through unchanged.

    Args:
        text: String to convert.

    Returns:
        A new string of the same length as ``text``.
    """
    initials = [chr(code_point) for code_point in range(0x1100, 0x1113)]
    syllable_base = ord('가')

    def _initial_of(ch):
        if '가' <= ch <= '힣':
            # Syllables are laid out in runs of 588 code points per initial
            # consonant, so integer division yields the initial's index.
            return initials[(ord(ch) - syllable_base) // 588]
        return ch

    return "".join(_initial_of(ch) for ch in text)
|
120 |
+
|
121 |
+
def mask_department(text):
    """Abbreviate department names that end in "학과".

    A run of 2-20 Hangul syllables followed by "학과" is rewritten so that
    the part in front of "학과" goes through ``to_chosung`` while the
    "학과" suffix itself stays readable.

    Args:
        text: String to scan.

    Returns:
        ``text`` with every matching department name abbreviated.
    """
    def _abbreviate(match):
        department = match.group(1)
        # The last two characters are the literal "학과" suffix.
        return to_chosung(department[:-2]) + "학과"

    return re.sub(r"([가-힣]{2,20}학과)", _abbreviate, text)
|
124 |
+
|
125 |
+
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Mask school, organisation, contact, address and number details in ``text``.

    The rules are applied one after another, so their order matters: the
    specific number formats (phone, 6-7 digit ID, bank account, 16-digit
    card) are handled before the catch-all digit-run rule, which would
    otherwise consume them first.

    Args:
        text: Text to sanitize.
        keyword_string: Comma-separated keywords. Blank entries are ignored;
            every remaining keyword is replaced case-insensitively.
        replace_word: Text inserted in place of each keyword. It is inserted
            literally (backslashes are not interpreted).

    Returns:
        The sanitized text.
    """
    text = mask_school_names(text)
    text = mask_department(text)

    # Grade, optionally followed by a class ("3학년", "3학년 2반", "3학년2반").
    text = re.sub(
        r"(\d)학년(\s?(\d)반)?",
        lambda m: "*학년" + (" *반" if m.group(3) else ""),
        text,
    )

    keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
    for kw in keywords:
        # NOTE(review): \b treats Hangul syllables as word characters, so a
        # keyword directly followed by a particle (e.g. "굿네이버스는") is not
        # replaced. Confirm whether that is intended.
        pattern = rf"\b{re.escape(kw)}\b"
        # A callable replacement keeps user-typed backslashes literal; a
        # plain string would be parsed as a template and could raise re.error.
        text = re.sub(pattern, lambda _m: replace_word, text, flags=re.IGNORECASE)

    # 3-4-4 digit groups (phone-number shaped): hide the middle group.
    text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
    # Full dates: the month is kept, the day is hidden, and the year is always
    # rendered as "19**" regardless of the original value.
    text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
    # Lot, building and unit numbers.
    text = re.sub(r"(\d{1,3})번지", r"***번지", text)
    text = re.sub(r"(\d{1,3})동", r"***동", text)
    text = re.sub(r"(\d{1,4})호", r"****호", text)
    # E-mail addresses.
    text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
    # 6 digits, hyphen, 7 digits: only the first digit after the hyphen is kept.
    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
    # Road-name addresses: keep the road name, hide the number that follows.
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)

    # Digit/hyphen runs right after a name ending in 은행/동/로/길 (e.g. bank
    # accounts): every digit is hidden. Must run before the catch-all rule.
    text = re.sub(
        r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})",
        lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)),
        text,
    )
    # Four groups of four digits (card-number shaped): keep first and last
    # group. Must run before the catch-all rule.
    text = re.sub(
        r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
        lambda m: f"{m.group(1)}-****-****-{m.group(4)}",
        text,
    )
    # Catch-all for remaining digit runs made of three groups: keep the first
    # two digits and up to the last four.
    text = re.sub(
        r"(\d{2,6})[-]?(\d{2,6})[-]?(\d{2,6})",
        lambda m: f"{m.group(1)[:2]}{'*'*(len(m.group(1))-2)}{'*'*len(m.group(2))}{m.group(3)[-4:]}",
        text,
    )
    # Dotted quads (IPv4 shaped): hide the last two components.
    text = re.sub(
        r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})",
        lambda m: f"{m.group(1)}.{m.group(2)}.*.*",
        text,
    )
    return text
|
153 |
+
|
154 |
+
def final_name_remask_exact_only(text, mapping_dict):
    """Replace every stand-alone occurrence of a mapped name with its tag.

    Args:
        text: String to process.
        mapping_dict: Mapping iterated as ``tag -> name``.

    Returns:
        ``text`` where each name has been replaced by its tag, but only at
        positions where the name is neither preceded nor followed by a word
        character or Hangul syllable.
    """
    not_preceded = r'(?<![\w가-힣])'
    not_followed = r'(?![\w가-힣])'
    for tag, name in mapping_dict.items():
        exact_name = re.compile(not_preceded + re.escape(name) + not_followed)
        text = exact_name.sub(tag, text)
    return text
|
159 |
+
|
160 |
+
def apply_masking(text, keywords, replace_word):
    """Run the whole masking pipeline on ``text``.

    Names are extracted and tagged first, then the sensitive-information
    rules run, and finally the name mapping is applied once more to the
    sanitized text.

    Args:
        text: Original text.
        keywords: Comma-separated keyword string forwarded to
            ``sanitize_sensitive_info``.
        replace_word: Replacement text forwarded to
            ``sanitize_sensitive_info``.

    Returns:
        A ``(sanitized_text, mapping_table)`` tuple, where ``mapping_table``
        holds one ``"key → value"`` line per mapping entry.
    """
    detected_names = extract_names(text)
    name_masked, tag_map = refactored_mask_names(text, detected_names)
    result = final_name_remask_exact_only(
        sanitize_sensitive_info(name_masked, keywords, replace_word),
        tag_map,
    )
    table_rows = [f"{key} → {value}" for key, value in tag_map.items()]
    return result, "\n".join(table_rows)
|
167 |
+
|
168 |
+
def remask_with_mapping(text, mapping_string):
    """Apply a textual ``tag → name`` mapping to ``text``.

    Args:
        text: String to process.
        mapping_string: One ``"tag → name"`` entry per line. Lines without
            an arrow, or with nothing after it, are ignored. Only the first
            arrow on a line separates tag from name. When a tag appears more
            than once, the last line wins.

    Returns:
        ``text`` where each stand-alone occurrence of a name (not preceded or
        followed by a word character or Hangul syllable) has been replaced by
        its tag, inserted literally.
    """
    mapping = {}
    for line in mapping_string.strip().split("\n"):
        # partition() splits on the first arrow only, so an extra arrow in
        # the name no longer raises ValueError on unpacking.
        tag, arrow, name = line.partition("→")
        name = name.strip()
        # An empty name would build a zero-width pattern that inserts the tag
        # between punctuation and whitespace, so such lines are skipped.
        if not arrow or not name:
            continue
        mapping[tag.strip()] = name

    for tag, name in mapping.items():
        pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        # Callable replacement: backslashes in a user-supplied tag stay
        # literal instead of being parsed as escapes or group references.
        text = re.sub(pattern, lambda _m, tag=tag: tag, text)
    return text
|
178 |
+
|
179 |
+
# Gradio front end: widgets are created in display order, then the button is
# wired to apply_masking and the app is launched.
with gr.Blocks() as demo:
    # Header text; the literal's lines stay at column 0 so the rendered
    # string carries no extra leading whitespace.
    gr.Markdown("""
🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)
⚠️ *완벽하지 않을 수 있습니다. 반드시 직접 최종 점검하세요.*
""")

    # Inputs: source text, comma-separated keywords, replacement text.
    input_text = gr.Textbox(
        lines=15,
        label="📥 원본 텍스트 입력",
    )
    keyword_input = gr.Textbox(
        lines=1,
        label="기관 키워드 (쉼표로 구분)",
        value="굿네이버스, good neighbors, gn, 사회복지법인 굿네이버스",
    )
    replace_input = gr.Textbox(
        lines=1,
        label="치환할 텍스트",
        value="우리기관",
    )
    run_button = gr.Button("🚀 마스킹 실행")

    # Outputs: masked text and the read-only tag/name table.
    masked_output = gr.Textbox(
        lines=15,
        label="🔐 마스킹된 텍스트",
    )
    mapping_output = gr.Textbox(
        lines=10,
        label="🏷️ 이름 태그 매핑",
        interactive=False,
    )

    # apply_masking returns (text, table), matching the two outputs in order.
    run_button.click(
        fn=apply_masking,
        inputs=[input_text, keyword_input, replace_input],
        outputs=[masked_output, mapping_output],
    )

demo.launch()
|