blueradiance committed on
Commit
5f9191a
·
verified ·
1 Parent(s): c9cedce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -106
app.py CHANGED
@@ -1,8 +1,8 @@
1
 
2
  import re
3
  import gradio as gr
4
- from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
5
  import threading
 
6
 
7
  TAG_PREFIX = "N"
8
 
@@ -27,6 +27,7 @@ OCCUPATIONAL_TITLES = ['대표', '이사', '전무', '상무', '부장', '차장
27
  '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
28
  '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']
29
  COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
 
30
 
31
  model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
32
  tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -45,149 +46,95 @@ def to_chosung(text):
45
  result += ch
46
  return result
47
 
48
- def expand_name_with_prefix_suffix(text, base_names):
49
- detected = set()
50
- for name in base_names:
51
- pattern1 = re.compile(rf'([가-힣]){re.escape(name)}(학생|선생|씨|님)?(이|가|은|는|을|를|께서|에게|에서)?')
52
- for m in pattern1.finditer(text):
53
- detected.add(m.group(0))
54
- pattern2 = re.compile(rf'{re.escape(name)}(씨|님)?(이|가|은|는|을|를|께서|에게|에서)?')
55
- for m in pattern2.finditer(text):
56
- detected.add(m.group(0))
57
- return list(detected)
58
-
59
  def extract_names(text):
60
- try:
61
- results = ner_pipeline(text)
62
- except Exception as e:
63
- print("NER 오류 발생:", e)
64
- return []
65
  names = []
66
- base_names = set()
67
  for entity in results:
68
  if entity.get("entity_group") == "PS":
69
  name = entity["word"].replace("##", "").strip()
70
  if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
71
  names.append(name)
72
- base_names.add(name)
73
- extended = expand_name_with_prefix_suffix(text, base_names)
74
- for name in extended:
75
- if name not in names:
76
- names.append(name)
77
- return names
78
-
79
- def refactored_mask_names(original_text, names, start_counter=100):
80
- korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로','까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도','이','가','의']
81
- masked = original_text
82
  mapping = {}
83
- counter = start_counter
84
- used_names = set()
85
  for name in names:
86
- for josa in korean_josa:
87
- full = name + josa
88
- pattern = rf'(?<![\w가-힣]){re.escape(full)}(?![\w가-힣])'
89
- if re.search(pattern, masked):
90
- tag = f"{TAG_PREFIX}{counter:03d}"
91
- mapping[tag] = name
92
- masked = re.sub(pattern, tag + josa, masked)
93
- counter += 1
94
- used_names.add(name)
95
- break
96
- for name in names:
97
- if name in used_names:
98
- continue
99
- pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
100
- if re.search(pattern, masked):
101
- tag = f"{TAG_PREFIX}{counter:03d}"
102
  mapping[tag] = name
103
- masked = re.sub(pattern, tag, masked)
104
  counter += 1
105
- return masked, mapping
106
-
107
- def final_name_remask_exact_only(text, mapping_dict):
108
- for tag, name in mapping_dict.items():
109
- pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
110
- text = re.sub(pattern, tag, text)
111
- return text
112
 
113
- def expand_suffix_name_mapping(text, mapping_dict):
114
  updated = {}
115
- for tag, name in mapping_dict.items():
116
- candidates = []
117
- pattern1 = rf"([가-힣]){re.escape(name)}(?:{'|'.join(COMMON_SUFFIXES)})?(이|가|은|는|을|를|께서|에게|에서)?"
118
- for m in re.finditer(pattern1, text):
119
- candidates.append(m.group(0))
120
- pattern2 = rf"{re.escape(name)}(씨|님)?(이|가|은|는|을|를|께서|에게|에서)?"
121
- for m in re.finditer(pattern2, text):
122
- candidates.append(m.group(0))
123
- updated[tag] = max(candidates, key=len) if candidates else name
 
 
 
 
124
  return updated
125
 
126
- def postprocess_sensitive_patterns(text):
127
- text = re.sub(r"\b[\w\.-]+@", r"******@", text)
128
- def mask_sequence(match):
129
- parts = re.split(r'[.-]', match.group())
130
- masked = [(part if i % 2 == 0 else '*' * len(part)) if part.isdigit() else part for i, part in enumerate(parts)]
131
- return '.'.join(masked) if '.' in match.group() else '-'.join(masked)
132
- text = re.sub(r"(?<![\\$\\\\])(?<!\d,)(?:\d{2,4}[.-]){1,3}\d{2,4}(?!\d)", mask_sequence, text)
133
- text = re.sub(r"(\d{1,3})동", r"***동", text)
134
- text = re.sub(r"(\d{1,4})호", r"****호", text)
135
- return text
136
 
137
  def mask_department(text):
138
- return re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
139
 
140
- def mask_school_names(text):
141
- global school_name_candidates
142
- school_name_candidates = []
143
- def replacer(match):
144
- name = match.group(1)
145
- if 2 <= len(name) <= 20:
146
- school_name_candidates.append(name)
147
- return to_chosung(name) + match.group(2)
148
- return match.group(0)
149
- text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", replacer, text)
150
- for name in school_name_candidates:
151
- pattern = rf"{re.escape(name)}\s?(초등학교|중학교|고등학교)"
152
- text = re.sub(pattern, to_chosung(name) + " " + r"\1", text)
153
  return text
154
 
155
  def sanitize_sensitive_info(text, keyword_string, replace_word):
156
  text = postprocess_sensitive_patterns(text)
157
  text = mask_school_names(text)
158
  text = mask_department(text)
159
- text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
160
  keywords = [k.strip() for k in keyword_string.split(",") if k.strip()] + list(REGEX_KEYWORDS_TO_MASK)
161
  for kw in keywords:
162
- pattern = rf"\b{re.escape(kw)}\b"
163
- text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
164
- text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
165
- text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
166
  return text
167
 
168
- def apply_masking(text, keywords, replace_word):
 
 
169
  names = extract_names(text)
170
- masked, mapping = refactored_mask_names(text, names)
171
- sanitized = sanitize_sensitive_info(masked, keywords, replace_word)
172
 
173
- def delayed_postprocess():
174
- updated_mapping = expand_suffix_name_mapping(text, mapping)
175
- final_output = final_name_remask_exact_only(text, updated_mapping) # 원본 기준 재적용!!
176
  final_map = "\n".join([f"{k} → {v}" for k, v in updated_mapping.items()])
177
- masked_output.update(value=final_output)
178
  mapping_output.update(value=final_map)
179
 
180
- threading.Timer(0.2, delayed_postprocess).start()
181
- mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
182
- return sanitized, mapping_table
183
 
184
  with gr.Blocks() as demo:
185
- gr.Markdown("🛡️ **민감정보 마스킹 [v3 FIXED: 태그 대응 완성]**")
186
- input_text = gr.Textbox(lines=15, label="📅 원본 텍스트 입력")
187
  keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
188
  replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
189
  run_button = gr.Button("🚀 마스킹 실행")
190
  masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
191
- mapping_output = gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑", interactive=False)
192
  run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
193
  demo.launch()
 
1
 
2
  import re
3
  import gradio as gr
 
4
  import threading
5
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
6
 
7
  TAG_PREFIX = "N"
8
 
 
27
  '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
28
  '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']
29
  COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
30
+ COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
31
 
32
  model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
33
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
46
  result += ch
47
  return result
48
 
 
 
 
 
 
 
 
 
 
 
 
def extract_names(text):
    """Return the distinct names the NER pipeline reports for *text*.

    Only entities whose ``entity_group`` is ``"PS"`` are kept (this function
    treats them as person names). WordPiece continuation markers (``##``) are
    stripped, names shorter than two characters are dropped, and anything
    listed in ``NAME_ENTITY_EXCEPTIONS`` is ignored.

    Args:
        text: Text to scan.

    Returns:
        Names in order of first appearance, without duplicates. The order is
        stable on purpose: tags are numbered from this list, so an unordered
        ``set`` would make the tag assigned to a person change between runs.
    """
    # Nothing to analyse; skip the model call entirely.
    if not text or not text.strip():
        return []

    results = ner_pipeline(text)

    names = []
    seen = set()
    for entity in results:
        if entity.get("entity_group") != "PS":
            continue
        name = entity["word"].replace("##", "").strip()
        if len(name) < 2 or name in NAME_ENTITY_EXCEPTIONS or name in seen:
            continue
        seen.add(name)
        names.append(name)
    return names
58
+
59
def apply_name_tags(text, names, start=100, tag_prefix=None, josa=None):
    """Replace every standalone occurrence of each name with a numbered tag.

    A name counts as standalone when it is not preceded by a word character
    and is followed either by a non-word character / end of text, or by one
    grammatical particle that is itself followed by a non-word character.
    The particle is left in place, so "홍길동이 왔다" becomes "N100이 왔다".
    Without the particle allowance a name carrying a particle would never be
    masked at all.

    Args:
        text: Text to tag.
        names: Names to replace. Empty entries and duplicates are ignored.
        start: Number used for the first tag.
        tag_prefix: Tag prefix; defaults to the module-level ``TAG_PREFIX``.
        josa: Particles allowed directly after a name; defaults to the
            module-level ``COMMON_JOSA``.

    Returns:
        ``(tagged_text, mapping)`` where ``mapping`` maps each tag that was
        actually used to the name it replaced. Names that never occur in
        *text* consume no tag number.
    """
    prefix = TAG_PREFIX if tag_prefix is None else tag_prefix
    particles = COMMON_JOSA if josa is None else josa

    # Longest particle first so the alternation never settles for a prefix
    # of a longer particle.
    josa_alt = "|".join(
        re.escape(p)
        for p in sorted({p for p in particles if p}, key=lambda p: (-len(p), p))
    )
    if josa_alt:
        right_edge = rf"(?=(?:{josa_alt})?(?![\w가-힣]))"
    else:
        right_edge = r"(?![\w가-힣])"

    # Longer names first: once particles are tolerated, a short name could
    # otherwise claim the front of a longer one ("김철" + "수..." style
    # overlaps). The sort is stable, so equal-length names keep input order.
    ordered = sorted(dict.fromkeys(n for n in names if n), key=len, reverse=True)

    mapping = {}
    tagged = text
    counter = start
    for name in ordered:
        tag = f"{prefix}{counter:03d}"
        pattern = re.compile(rf"(?<![\w가-힣]){re.escape(name)}{right_edge}")
        # A callable replacement keeps the tag literal even if the prefix
        # contains characters that are special in replacement templates.
        tagged, hits = pattern.subn(lambda _m, tag=tag: tag, tagged)
        if hits:
            mapping[tag] = name
            counter += 1
    return tagged, mapping
 
 
 
 
 
 
71
 
72
def expand_from_tag_context(tagged_text, mapping, suffixes=None, josa=None):
    """Extend each mapped name with the title/particle that follows it.

    For every ``tag -> name`` pair the text is searched for the tag (or the
    raw name, should an occurrence have been left untagged) directly followed
    by an optional title suffix and an optional particle. The longest trailing
    part found is appended to the name.

    The result is always ``name + trailing part``. Joining only the regex
    capture groups would drop the name itself, because the name is not one of
    the groups.

    Args:
        tagged_text: Text in which names have been replaced by tags.
        mapping: ``{tag: name}`` as produced by the tagging step.
        suffixes: Title suffixes to recognise; defaults to the module-level
            ``COMMON_SUFFIXES``.
        josa: Particles to recognise; defaults to the module-level
            ``COMMON_JOSA``.

    Returns:
        A new ``{tag: expanded_name}`` dict with the same keys as *mapping*.
        A tag with no trailing suffix or particle maps to the plain name.
    """
    suffix_words = COMMON_SUFFIXES if suffixes is None else suffixes
    josa_words = COMMON_JOSA if josa is None else josa

    def _alternation(words):
        # Escaped, de-duplicated, longest first so "조교수" wins over "교수".
        unique = sorted({w for w in words if w}, key=lambda w: (-len(w), w))
        return "|".join(re.escape(w) for w in unique)

    tail = ""
    suffix_alt = _alternation(suffix_words)
    if suffix_alt:
        tail += f"(?:{suffix_alt})?"
    josa_alt = _alternation(josa_words)
    if josa_alt:
        tail += f"(?:{josa_alt})?"

    updated = {}
    for tag, name in mapping.items():
        pattern = re.compile(
            rf"(?<![\w가-힣])(?:{re.escape(tag)}|{re.escape(name)})({tail})"
        )
        longest_tail = ""
        for found in pattern.finditer(tagged_text or ""):
            if len(found.group(1)) > len(longest_tail):
                longest_tail = found.group(1)
        updated[tag] = name + longest_tail
    return updated
88
 
89
def mask_school_names(text):
    """Obscure the name part of elementary/middle/high school names.

    Any run of 2-20 Hangul syllables directly followed by 초등학교, 중학교 or
    고등학교 has the run passed through ``to_chosung``; the school-type word
    is kept unchanged.
    """
    school_pattern = re.compile(r"([가-힣]{2,20})(초등학교|중학교|고등학교)")
    return school_pattern.sub(
        lambda found: to_chosung(found.group(1)) + found.group(2),
        text,
    )
 
 
 
 
 
 
93
 
def mask_department(text):
    """Obscure department names that end in 학과.

    The 2-20 Hangul syllables in front of 학과 are passed through
    ``to_chosung``; the 학과 ending itself is kept.
    """
    def _obscure(found):
        return to_chosung(found.group(1)) + "학과"

    return re.sub(r"([가-힣]{2,20})학과", _obscure, text)
96
 
97
def postprocess_sensitive_patterns(text):
    """Mask e-mail local parts, ID numbers, phone numbers and unit numbers.

    Applied in this order:

    1. The local part of an e-mail address (everything before ``@``).
    2. 6+7 digit sequences, optionally separated by ``-`` or a space.
    3. Phone numbers: numbers starting with ``0`` in 2/3-3/4-4 digit groups
       (e.g. 02-123-4567, 010-123-4567, 010-1234-5678), plus any 3-4-4 digit
       sequence. Separators are optional.
    4. Building numbers (``<digits>동``) and unit numbers (``<digits>호``).

    Args:
        text: Text to mask.

    Returns:
        The masked text. Masks have a fixed width, so they do not reveal how
        many digits the original value had.
    """
    text = re.sub(r"[\w\.-]+@", "******@", text)
    text = re.sub(r"(\d{6})[- ]?(\d{7})", "******-*******", text)
    # First alternative: numbers with a leading 0 and 3- or 4-digit middle
    # groups, fenced by digit boundaries so it cannot start or stop inside a
    # longer number. Second alternative: the plain 3-4-4 shape.
    text = re.sub(
        r"(?<!\d)0\d{1,2}[- ]?\d{3,4}[- ]?\d{4}(?!\d)|\d{3}[- ]?\d{4}[- ]?\d{4}",
        "***-****-****",
        text,
    )
    # \d+ rather than a bounded count: a bounded count would leave the
    # leading digits of a longer number visible (e.g. "1234동" -> "1***동").
    text = re.sub(r"\d+동", "***동", text)
    text = re.sub(r"\d+호", "****호", text)
    return text
104
 
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Mask structured identifiers, school details and organisation keywords.

    Steps, in order: ``postprocess_sensitive_patterns``, ``mask_school_names``,
    ``mask_department``, grade/class masking, then keyword replacement.

    Args:
        text: Text to sanitise.
        keyword_string: Comma-separated keywords supplied by the user. They
            are combined with ``REGEX_KEYWORDS_TO_MASK``.
        replace_word: Text substituted for every keyword occurrence
            (matched case-insensitively).

    Returns:
        The sanitised text.
    """
    text = postprocess_sensitive_patterns(text)
    text = mask_school_names(text)
    text = mask_department(text)

    # Emit " *반" only when a class number was actually present; otherwise
    # "3학년" would become "*학년 *반" and invent a detail the input lacked.
    text = re.sub(
        r"(\d)학년(\s?(\d)반)?",
        lambda m: "*학년" + (" *반" if m.group(3) else ""),
        text,
    )

    user_keywords = [k.strip() for k in (keyword_string or "").split(",") if k.strip()]
    keywords = user_keywords + list(REGEX_KEYWORDS_TO_MASK)

    # Longest keyword first: replacing "굿네이버스" before
    # "사회복지법인 굿네이버스" would leave "사회복지법인 " behind.
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    for kw in sorted(dict.fromkeys(keywords), key=len, reverse=True):
        # A callable replacement inserts replace_word verbatim; passing the
        # string directly would interpret backslashes and \1-style group
        # references typed by the user.
        text = re.sub(re.escape(kw), lambda _m: replace_word, text, flags=re.IGNORECASE)
    return text
114
 
115
def apply_masking(text, keyword_string, replace_word):
    """Run the full masking pipeline for the "run" button.

    The text is sanitised first, names are then detected in the sanitised
    text and replaced by tags, and finally each tag's mapping entry is
    extended with the title/particle found next to it.

    Everything is computed synchronously and returned. Gradio fills the
    output components from the handler's return value; calling
    ``component.update(...)`` from a background timer thread has no effect
    on what the user sees, so a deferred refinement step would never be
    displayed.

    Args:
        text: Raw input text.
        keyword_string: Comma-separated organisation keywords.
        replace_word: Replacement for the keywords.

    Returns:
        ``(masked_text, mapping_table)`` where ``mapping_table`` has one
        ``"TAG → name"`` line per tag.
    """
    sanitized = sanitize_sensitive_info(text or "", keyword_string or "", replace_word)
    names = extract_names(sanitized)
    tagged, mapping = apply_name_tags(sanitized, names)

    expanded = expand_from_tag_context(tagged, mapping)
    rows = []
    for tag, name in mapping.items():
        value = expanded.get(tag, name)
        # An expansion that no longer contains the base name cannot identify
        # the person, so fall back to the plain name.
        if name not in value:
            value = name
        rows.append(f"{tag} → {value}")
    return tagged, "\n".join(rows)
130
 
# Gradio UI: one input box, two option fields, a run button and two outputs.
with gr.Blocks() as demo:
    gr.Markdown("🧠 **v4.2 ULTIMATE FULL: 태그 기반 확장 + 민감정보 마스킹 완전체**")

    # Inputs
    input_text = gr.Textbox(
        lines=15,
        label="📄 입력 텍스트",
    )
    keyword_input = gr.Textbox(
        lines=1,
        label="기관 키워드 (쉼표 구분)",
        value="굿네이버스, 사회복지법인 굿네이버스",
    )
    replace_input = gr.Textbox(
        lines=1,
        label="치환할 텍스트",
        value="우리기관",
    )
    run_button = gr.Button("🚀 마스킹 실행")

    # Outputs
    masked_output = gr.Textbox(
        lines=15,
        label="🔐 마스킹 결과",
    )
    mapping_output = gr.Textbox(
        lines=10,
        label="🏷️ 태그 매핑",
        interactive=False,
    )

    # apply_masking receives the three input values in this order and its
    # two return values fill the two output boxes.
    run_button.click(
        fn=apply_masking,
        inputs=[input_text, keyword_input, replace_input],
        outputs=[masked_output, mapping_output],
    )

demo.launch()