Spaces:
Build error
Build error
Upload 4 files
Browse files
app.py
CHANGED
@@ -49,7 +49,11 @@ def refactored_mask_names(original_text, names, start_counter=100):
|
|
49 |
counter += 1
|
50 |
return masked, mapping
|
51 |
|
52 |
-
def sanitize_sensitive_info(text):
|
|
|
|
|
|
|
|
|
53 |
text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
|
54 |
text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
|
55 |
text = re.sub(r"(\d{1,3})번지", r"***번지", text)
|
@@ -62,7 +66,6 @@ def sanitize_sensitive_info(text):
|
|
62 |
text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})", lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
|
63 |
text = re.sub(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})", lambda m: f"{m.group(1)}.{m.group(2)}.*.*", text)
|
64 |
text = re.sub(r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})", lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)), text)
|
65 |
-
text = re.sub(r"\b(good neighbors|굿네이버스|사회복지법인 굿네이버스|gn)\b", "우리기관", text, flags=re.IGNORECASE)
|
66 |
return text
|
67 |
|
68 |
def final_name_remask_exact_only(text, mapping_dict):
|
@@ -71,17 +74,20 @@ def final_name_remask_exact_only(text, mapping_dict):
|
|
71 |
text = re.sub(pattern, tag, text)
|
72 |
return text
|
73 |
|
74 |
-
def full_pipeline(text):
    """Run the masking stages on ``text`` and report the name mapping.

    Calls, in order, ``extract_names``, ``refactored_mask_names``,
    ``sanitize_sensitive_info`` and ``final_name_remask_exact_only``.

    Args:
        text: Raw input text.

    Returns:
        A 2-tuple ``(processed_text, mapping_table)``. ``mapping_table`` is a
        newline-joined string with one ``"key → value"`` line per entry of
        the mapping returned by ``refactored_mask_names``.
    """
    found_names = extract_names(text)
    name_masked, name_map = refactored_mask_names(text, found_names)

    # The mapping is applied once more after sanitising, via the
    # exact-match re-mask helper.
    scrubbed = sanitize_sensitive_info(name_masked)
    result = final_name_remask_exact_only(scrubbed, name_map)

    table_rows = (f"{key} → {value}" for key, value in name_map.items())
    return result, "\n".join(table_rows)
|
81 |
|
82 |
gr.Interface(
|
83 |
fn=full_pipeline,
|
84 |
-
inputs=
|
|
|
|
|
|
|
85 |
outputs=[
|
86 |
gr.Textbox(lines=15, label="🔐 마스킹된 텍스트"),
|
87 |
gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑 (NXXX → 이름)")
|
|
|
49 |
counter += 1
|
50 |
return masked, mapping
|
51 |
|
52 |
+
def sanitize_sensitive_info(text, keyword_string):
|
53 |
+
keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
|
54 |
+
for kw in keywords:
|
55 |
+
pattern = rf"\b{re.escape(kw)}\b"
|
56 |
+
text = re.sub(pattern, "우리기관", text, flags=re.IGNORECASE)
|
57 |
text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
|
58 |
text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
|
59 |
text = re.sub(r"(\d{1,3})번지", r"***번지", text)
|
|
|
66 |
text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})", lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
|
67 |
text = re.sub(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})", lambda m: f"{m.group(1)}.{m.group(2)}.*.*", text)
|
68 |
text = re.sub(r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})", lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)), text)
|
|
|
69 |
return text
|
70 |
|
71 |
def final_name_remask_exact_only(text, mapping_dict):
|
|
|
74 |
text = re.sub(pattern, tag, text)
|
75 |
return text
|
76 |
|
77 |
+
def full_pipeline(text, keywords):
    """Run the masking stages on ``text`` and report the name mapping.

    Calls, in order, ``extract_names``, ``refactored_mask_names``,
    ``sanitize_sensitive_info`` and ``final_name_remask_exact_only``.

    Args:
        text: Raw input text.
        keywords: Passed through unchanged as the second argument of
            ``sanitize_sensitive_info`` (the UI supplies a comma-separated
            keyword string here).

    Returns:
        A 2-tuple ``(processed_text, mapping_table)``. ``mapping_table`` is a
        newline-joined string with one ``"key → value"`` line per entry of
        the mapping returned by ``refactored_mask_names``.
    """
    name_masked, name_map = refactored_mask_names(text, extract_names(text))

    # The mapping is applied once more after sanitising, via the
    # exact-match re-mask helper.
    scrubbed = sanitize_sensitive_info(name_masked, keywords)
    result = final_name_remask_exact_only(scrubbed, name_map)

    table_rows = (f"{key} → {value}" for key, value in name_map.items())
    return result, "\n".join(table_rows)
|
84 |
|
85 |
gr.Interface(
|
86 |
fn=full_pipeline,
|
87 |
+
inputs=[
|
88 |
+
gr.Textbox(lines=15, label="📥 원본 텍스트 입력"),
|
89 |
+
gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, good neighbors, gn, 사회복지법인 굿네이버스")
|
90 |
+
],
|
91 |
outputs=[
|
92 |
gr.Textbox(lines=15, label="🔐 마스킹된 텍스트"),
|
93 |
gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑 (NXXX → 이름)")
|