blueradiance committed on
Commit
b27703e
·
verified ·
1 Parent(s): bf2143b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -27
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import re
2
  import gradio as gr
3
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
@@ -29,6 +30,15 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
29
  model = AutoModelForTokenClassification.from_pretrained(model_name)
30
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
31
 
 
 
 
 
 
 
 
 
 
32
  def extract_names(text):
33
  try:
34
  results = ner_pipeline(text)
@@ -37,17 +47,10 @@ def extract_names(text):
37
  return []
38
 
39
  names = []
40
-
41
- NAME_FILTER_EXCEPTIONS = set([
42
- '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
43
- '개인정보', '본인', '직통번호', '이메일', '전화번호', '연락처', '주소',
44
- '해당', '현재', '상담', '예약', '센터', '아래', '위치', '소속',
45
- ])
46
-
47
  for entity in results:
48
  if entity.get("entity_group") == "PS":
49
  name = entity["word"].replace("##", "").strip()
50
- if len(name) >= 2 and name not in names and name not in NAME_FILTER_EXCEPTIONS:
51
  names.append(name)
52
 
53
  COMMON_SUFFIXES = [
@@ -66,21 +69,15 @@ def extract_names(text):
66
  ]
67
 
68
  KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
69
-
70
  attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
71
- attached_matches = re.findall(attached_pattern, text)
72
- for match in attached_matches:
73
- name = match[0]
74
- if name not in names and name not in NAME_FILTER_EXCEPTIONS:
75
- names.append(name)
76
-
77
  spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
78
- spaced_matches = re.findall(spaced_pattern, text)
79
- for match in spaced_matches:
80
- name = match[0]
81
- if name not in names and name not in NAME_FILTER_EXCEPTIONS:
82
- names.append(name)
83
 
 
 
 
 
 
 
84
  return names
85
 
86
  def to_chosung(text):
@@ -95,13 +92,10 @@ def to_chosung(text):
95
  result += ch
96
  return result
97
 
98
-
99
-
100
  def mask_department(text):
101
  text = re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
102
  return text
103
 
104
-
105
  def sanitize_sensitive_info(text, keyword_string, replace_word):
106
  text = mask_school_names(text)
107
  text = mask_department(text)
@@ -110,9 +104,11 @@ def sanitize_sensitive_info(text, keyword_string, replace_word):
110
  text = re.sub(r"(\d)학년\s?(\d)반", r"*학년 *반", text)
111
 
112
  keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
 
113
  for kw in keywords:
114
  pattern = rf"\b{re.escape(kw)}\b"
115
  text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
 
116
  text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
117
  text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
118
  text = re.sub(r"(\d{1,3})번지", r"***번지", text)
@@ -121,19 +117,45 @@ def sanitize_sensitive_info(text, keyword_string, replace_word):
121
  text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
122
  text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
123
  text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
124
- text = re.sub(r"(\d{2,6})[-]?(\d{2,6})[-]?(\d{2,6})",
125
- lambda m: f"{m.group(1)[:2]}{'*'*(len(m.group(1))-2)}{'*'*len(m.group(2))}{m.group(3)[-4:]}", text)
126
  text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
127
  lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
128
  return text
129
 
130
-
131
  def final_name_remask_exact_only(text, mapping_dict):
132
  for tag, name in mapping_dict.items():
133
  pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
134
  text = re.sub(pattern, tag, text)
135
  return text
136
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
  def apply_masking(text, keywords, replace_word):
139
  names = extract_names(text)
@@ -143,7 +165,6 @@ def apply_masking(text, keywords, replace_word):
143
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
144
  return sanitized, mapping_table
145
 
146
-
147
  with gr.Blocks() as demo:
148
  gr.Markdown("""
149
  🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
 
1
+ # app_updated_with_filter_sets.py
2
  import re
3
  import gradio as gr
4
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
30
  model = AutoModelForTokenClassification.from_pretrained(model_name)
31
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
32
 
33
+ # ✅ 예외 필터
34
+ NAME_ENTITY_EXCEPTIONS = set([
35
+ '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
36
+ '개인정보', '본인', '해당', '현재', '아래', '위치', '소속'
37
+ ])
38
+ REGEX_KEYWORDS_TO_MASK = set([
39
+ '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소'
40
+ ])
41
+
42
  def extract_names(text):
43
  try:
44
  results = ner_pipeline(text)
 
47
  return []
48
 
49
  names = []
 
 
 
 
 
 
 
50
  for entity in results:
51
  if entity.get("entity_group") == "PS":
52
  name = entity["word"].replace("##", "").strip()
53
+ if len(name) >= 2 and name not in names and name not in NAME_ENTITY_EXCEPTIONS:
54
  names.append(name)
55
 
56
  COMMON_SUFFIXES = [
 
69
  ]
70
 
71
  KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
 
72
  attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
 
 
 
 
 
 
73
  spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
 
 
 
 
 
74
 
75
+ for pattern in [attached_pattern, spaced_pattern]:
76
+ matches = re.findall(pattern, text)
77
+ for match in matches:
78
+ name = match[0]
79
+ if name not in names and name not in NAME_ENTITY_EXCEPTIONS:
80
+ names.append(name)
81
  return names
82
 
83
  def to_chosung(text):
 
92
  result += ch
93
  return result
94
 
 
 
95
def mask_department(text):
    """Mask Korean department names by converting the name part to chosung.

    Any run of 2-20 Hangul syllables ending in "학과" has the leading
    portion replaced by its initial consonants while keeping the "학과"
    suffix intact (e.g. "컴퓨터공학과" -> chosung + "학과").
    """
    def _to_masked_dept(match):
        dept = match.group(1)
        # Strip the two-character "학과" suffix, mask the rest, re-attach.
        return to_chosung(dept[:-2]) + "학과"

    return re.sub(r"([가-힣]{2,20}학과)", _to_masked_dept, text)
98
 
 
99
  def sanitize_sensitive_info(text, keyword_string, replace_word):
100
  text = mask_school_names(text)
101
  text = mask_department(text)
 
104
  text = re.sub(r"(\d)학년\s?(\d)반", r"*학년 *반", text)
105
 
106
  keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
107
+ keywords += list(REGEX_KEYWORDS_TO_MASK)
108
  for kw in keywords:
109
  pattern = rf"\b{re.escape(kw)}\b"
110
  text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
111
+
112
  text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
113
  text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
114
  text = re.sub(r"(\d{1,3})번지", r"***번지", text)
 
117
  text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
118
  text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
119
  text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
 
 
120
  text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
121
  lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
122
  return text
123
 
 
124
def final_name_remask_exact_only(text, mapping_dict):
    """Replace every standalone occurrence of each mapped name with its tag.

    A match must not be preceded or followed by a word character or a
    Hangul syllable, so names embedded inside longer words are left alone.

    Args:
        text: the text to re-mask.
        mapping_dict: {tag -> name} pairs produced by the masking step.

    Returns:
        The text with each exact, standalone name replaced by its tag.
    """
    result = text
    for tag, name in mapping_dict.items():
        standalone = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        result = re.sub(standalone, tag, result)
    return result
129
 
130
def refactored_mask_names(original_text, names, start_counter=100):
    """Mask each personal name in *original_text* with a numbered tag.

    For every name, all standalone occurrences — both bare and followed by
    a Korean particle (josa) — are replaced by one shared tag of the form
    "N###" (josa is kept after the tag). Matches are bounded so the name
    is not replaced inside a longer word.

    Fix over the previous version: it stopped at the first matching josa
    per name (`break`) and then skipped the name in the bare-occurrence
    pass, so e.g. "철수를 … 철수는" left the second occurrence unmasked.
    Now every josa form and bare occurrence of a name is masked under the
    same tag.

    Args:
        original_text: text to mask.
        names: iterable of name strings to mask.
        start_counter: first tag number (default 100 -> "N100").

    Returns:
        (masked_text, mapping) where mapping is {tag -> name} for each
        name that actually occurred in the text.
    """
    korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로',
                   '까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도',
                   '이','가','의']
    masked = original_text
    mapping = {}
    counter = start_counter
    for name in names:
        tag = None
        # Josa-attached occurrences first, so the particle survives
        # the substitution (tag + josa).
        for josa in korean_josa:
            pattern = rf'(?<![\w가-힣]){re.escape(name + josa)}(?![\w가-힣])'
            if re.search(pattern, masked):
                if tag is None:
                    tag = f"N{counter:03d}"
                masked = re.sub(pattern, tag + josa, masked)
        # Bare occurrences (no particle attached).
        bare = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        if re.search(bare, masked):
            if tag is None:
                tag = f"N{counter:03d}"
            masked = re.sub(bare, tag, masked)
        # One tag (and one counter step) per name that occurred.
        if tag is not None:
            mapping[tag] = name
            counter += 1
    return masked, mapping
159
 
160
  def apply_masking(text, keywords, replace_word):
161
  names = extract_names(text)
 
165
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
166
  return sanitized, mapping_table
167
 
 
168
  with gr.Blocks() as demo:
169
  gr.Markdown("""
170
  🛡️ **민감정보 마스킹 [땡땡이 마스킹]**