blueradiance committed on
Commit
c21f1e8
·
verified ·
1 Parent(s): ed684ca

Upload 4 files

Browse files
Files changed (1) hide show
  1. app.py +11 -5
app.py CHANGED
@@ -49,7 +49,11 @@ def refactored_mask_names(original_text, names, start_counter=100):
49
  counter += 1
50
  return masked, mapping
51
 
52
- def sanitize_sensitive_info(text):
 
 
 
 
53
  text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
54
  text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
55
  text = re.sub(r"(\d{1,3})번지", r"***번지", text)
@@ -62,7 +66,6 @@ def sanitize_sensitive_info(text):
62
  text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})", lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
63
  text = re.sub(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})", lambda m: f"{m.group(1)}.{m.group(2)}.*.*", text)
64
  text = re.sub(r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})", lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)), text)
65
- text = re.sub(r"\b(good neighbors|굿네이버스|사회복지법인 굿네이버스|gn)\b", "우리기관", text, flags=re.IGNORECASE)
66
  return text
67
 
68
  def final_name_remask_exact_only(text, mapping_dict):
@@ -71,17 +74,20 @@ def final_name_remask_exact_only(text, mapping_dict):
71
  text = re.sub(pattern, tag, text)
72
  return text
73
 
74
- def full_pipeline(text):
75
  names = extract_names(text)
76
  masked, mapping = refactored_mask_names(text, names)
77
- sanitized = sanitize_sensitive_info(masked)
78
  sanitized = final_name_remask_exact_only(sanitized, mapping)
79
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
80
  return sanitized, mapping_table
81
 
82
  gr.Interface(
83
  fn=full_pipeline,
84
- inputs=gr.Textbox(lines=15, label="📥 원본 텍스트 입력"),
 
 
 
85
  outputs=[
86
  gr.Textbox(lines=15, label="🔐 마스킹된 텍스트"),
87
  gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑 (NXXX → 이름)")
 
49
  counter += 1
50
  return masked, mapping
51
 
52
+ def sanitize_sensitive_info(text, keyword_string):
53
+ keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
54
+ for kw in keywords:
55
+ pattern = rf"\b{re.escape(kw)}\b"
56
+ text = re.sub(pattern, "우리기관", text, flags=re.IGNORECASE)
57
  text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
58
  text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
59
  text = re.sub(r"(\d{1,3})번지", r"***번지", text)
 
66
  text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})", lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
67
  text = re.sub(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})", lambda m: f"{m.group(1)}.{m.group(2)}.*.*", text)
68
  text = re.sub(r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})", lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)), text)
 
69
  return text
70
 
71
  def final_name_remask_exact_only(text, mapping_dict):
 
74
  text = re.sub(pattern, tag, text)
75
  return text
76
 
77
+ def full_pipeline(text, keywords):
78
  names = extract_names(text)
79
  masked, mapping = refactored_mask_names(text, names)
80
+ sanitized = sanitize_sensitive_info(masked, keywords)
81
  sanitized = final_name_remask_exact_only(sanitized, mapping)
82
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
83
  return sanitized, mapping_table
84
 
85
  gr.Interface(
86
  fn=full_pipeline,
87
+ inputs=[
88
+ gr.Textbox(lines=15, label="📥 원본 텍스트 입력"),
89
+ gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, good neighbors, gn, 사회복지법인 굿네이버스")
90
+ ],
91
  outputs=[
92
  gr.Textbox(lines=15, label="🔐 마스킹된 텍스트"),
93
  gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑 (NXXX → 이름)")