blueradiance committed on
Commit
078f44f
·
verified ·
1 Parent(s): 04a745e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +180 -108
app.py CHANGED
@@ -1,144 +1,216 @@
1
-
2
- # ▶️ Part 1: 이름 태깅 + 파생 표현 후처리 + 우리기관 강화치환
3
-
4
  import re
5
  import gradio as gr
6
- import threading
7
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
8
 
 
 
 
9
  TAG_PREFIX = "N"
10
- COMMON_SUFFIXES = ['학생', '선생님', '씨', '님']
11
- COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
14
  tokenizer = AutoTokenizer.from_pretrained(model_name)
15
  model = AutoModelForTokenClassification.from_pretrained(model_name)
16
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def extract_names(text):
19
- results = ner_pipeline(text)
 
 
 
 
 
20
  names = []
 
21
  for entity in results:
22
  if entity.get("entity_group") == "PS":
23
  name = entity["word"].replace("##", "").strip()
24
- if len(name) >= 2:
25
  names.append(name)
26
- return list(set(names))
27
 
28
- def apply_name_tags(text, names, start=100):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  mapping = {}
30
- tagged = text
31
- counter = start
 
32
  for name in names:
33
- tag = f"{TAG_PREFIX}{counter:03d}"
34
- pattern = re.compile(rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])')
35
- tagged, n = pattern.subn(tag, tagged)
36
- if n > 0:
 
 
 
 
 
 
 
 
 
 
 
 
37
  mapping[tag] = name
 
38
  counter += 1
39
- return tagged, mapping
40
 
41
- def expand_variation_patterns(tagged_text, mapping):
42
- for tag, base in mapping.items():
43
- suffix_pattern = re.compile(rf'([\(\["'‘“\s]*){re.escape(base)}(?:{"|".join(COMMON_SUFFIXES)})?(?:{"|".join(COMMON_JOSA)})?', re.IGNORECASE)
44
- tagged_text = suffix_pattern.sub(lambda m: m.group(0).replace(base, tag), tagged_text)
45
- return tagged_text
46
-
47
- def replace_institution_keywords(text, keywords, replace_word):
48
- for kw in keywords:
49
- pattern = re.compile(rf'([\s\(\["'‘“]*){re.escape(kw)}([가-힣\s.,;:!?()"'”’]*)', re.IGNORECASE)
50
- text = pattern.sub(lambda m: m.group(1) + replace_word + m.group(2), text)
51
  return text
52
 
53
- def postprocess_sensitive_patterns(text):
54
- text = re.sub(r"[\w\.-]+@[\w\.-]+", "******@***.***", text)
55
- text = re.sub(r"(\d{6})[- ]?(\d{7})", "******-*******", text)
56
- text = re.sub(r"(\d{3})[- ]?(\d{4})[- ]?(\d{4})", "***-****-****", text)
57
- text = re.sub(r"(\d{1,3})동", "***동", text)
58
- text = re.sub(r"(\d{1,4})호", "****호", text)
 
 
 
 
 
 
 
 
 
 
 
 
59
  return text
60
 
61
- def apply_masking(text, keyword_string, replace_word):
62
- keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
63
- text = replace_institution_keywords(text, keywords, replace_word)
64
- text = postprocess_sensitive_patterns(text)
65
- names = extract_names(text)
66
- tagged, mapping = apply_name_tags(text, names)
67
 
68
- def finalize():
69
- final_tagged = expand_variation_patterns(tagged, mapping)
70
- mapping_str = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
71
- masked_output.update(value=final_tagged)
72
- mapping_output.update(value=mapping_str)
73
 
74
- threading.Timer(0.2, finalize).start()
75
- return tagged, "\n".join([f"{k} {v}" for k, v in mapping.items()])
 
76
 
77
- # Part 1 UI
 
 
 
 
 
 
 
 
78
  with gr.Blocks() as demo:
79
- gr.Markdown("🧠 **v4.3A: 이름 + 파생 표현 + 기관 치환 (1단계)**")
80
- input_text = gr.Textbox(lines=15, label="📄 원문 텍스트")
81
- keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
 
 
 
82
  replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
83
- run_button = gr.Button("🚀 실행")
84
  masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
85
- mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
86
- run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
87
-
88
-
89
- # ▶️ Part 2: 학과/학교 초성 변환 + 학년/반 마스킹 + mapping 보정
90
- # - 이 부분은 이어서 별도 실행 환경 or 연동 코드로 붙일 수 있습니다.
91
- # - 필요 시 한 파일로 통합 가능
92
-
93
-
94
- # ▶️ Part 2: 학과/학교 초성 변환 + 학년/반 마스킹 + mapping 보정
95
-
96
- def to_chosung(text):
97
- CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
98
- result = ""
99
- for ch in text:
100
- if '가' <= ch <= '힣':
101
- code = ord(ch) - ord('가')
102
- cho = code // 588
103
- result += CHOSUNG_LIST[cho]
104
- else:
105
- result += ch
106
- return result
107
-
108
- def mask_school_names(text):
109
- def replace_school(m):
110
- return to_chosung(m.group(1)) + m.group(2)
111
- return re.sub(r"([가-힣]{2,20})(초등학교|중학교|고등학교)", replace_school, text)
112
-
113
- def mask_department_names(text):
114
- return re.sub(r"([가-힣]{2,20})학과", lambda m: to_chosung(m.group(1)) + "학과", text)
115
-
116
- def mask_grade_class(text):
117
- return re.sub(r"(\d)학년(\s?(\d)반)?", "*학년 *반", text)
118
-
119
- def mapping_boost_context(text, mapping_dict):
120
- COMMON_SUFFIXES = ['학생', '선생님', '씨', '님']
121
- COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
122
- updated = {}
123
- for tag, base in mapping_dict.items():
124
- idx = text.find(tag)
125
- if idx == -1:
126
- updated[tag] = base
127
- continue
128
- window = text[max(0, idx - 100): idx + 100]
129
- pattern = re.compile(rf"([가-힣])?{re.escape(base)}(?:{'|'.join(COMMON_SUFFIXES)})?(?:{'|'.join(COMMON_JOSA)})?")
130
- match = pattern.search(window)
131
- if match:
132
- updated[tag] = match.group(0)
133
- else:
134
- updated[tag] = base
135
- return updated
136
-
137
- # 👇 예시: 후처리 최종 적용
138
- # final_output = mask_school_names(tagged_text)
139
- # final_output = mask_department_names(final_output)
140
- # final_output = mask_grade_class(final_output)
141
- # mapping = mapping_boost_context(final_output, mapping)
142
 
 
143
 
144
  demo.launch()
 
1
+ # masking_ver2.py
 
 
2
  import re
3
  import gradio as gr
 
4
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
5
 
6
# =============================================
# Configurable Constants
# =============================================
# Prefix of the placeholder tags that replace detected names (e.g. "N100").
TAG_PREFIX = "N"

# Candidate names that extract_names() must discard even when the NER model
# or the suffix heuristic proposes them.
NAME_ENTITY_EXCEPTIONS = {
    '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
    '개인정보', '본인', '해당', '현재', '아래', '위치', '소속',
    '상담', '그래도',
}

# Extra keywords that sanitize_sensitive_info() appends to the user-supplied
# keyword list before replacing each of them with the replacement word.
REGEX_KEYWORDS_TO_MASK = {
    '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소', '계좌번호',
}

# Suffix groups, kept separate by category and concatenated below.
FAMILY_TITLES = ['어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
                 '할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
                 '고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
                 '장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이']

ACADEMIC_TITLES = ['학생', '초등학생', '중학생', '고등학생', '수험생', '학부모']

OCCUPATIONAL_TITLES = ['대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
                       '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
                       '사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
                       '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
                       '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']

# Every title/role word that may follow a name; used to build the
# "name + suffix" regular expressions in extract_names().
COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
36
+
37
# =============================================
# Preload Model
# =============================================
# The tokenizer, the token-classification model and the NER pipeline are
# created once at import time and shared by every call to extract_names().
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
44
 
45
+ # =============================================
46
+ # Utility Functions
47
+ # =============================================
48
def to_chosung(text):
    """Replace each precomposed Hangul syllable in *text* with its initial consonant.

    Characters in the range '가'..'힣' are mapped to one of the 19 code points
    U+1100..U+1112, selected by ``(syllable offset) // 588``. Every other
    character is copied through unchanged.

    Args:
        text: Input string (may be empty).

    Returns:
        A string of the same length as *text*.
    """
    syllable_base = ord('가')
    pieces = []
    for ch in text:
        if '가' <= ch <= '힣':
            # 588 syllables share each initial consonant.
            pieces.append(chr(0x1100 + (ord(ch) - syllable_base) // 588))
        else:
            pieces.append(ch)
    return "".join(pieces)
59
+
60
def postprocess_sensitive_patterns(text):
    """Mask e-mail local parts, separated digit sequences and 동/호 numbers.

    Applied in this order:
      1. The part of an e-mail-like token before "@" becomes "******".
      2. In a run of 2-4 digit groups joined by "." or "-", every second
         group (2nd, 4th) is replaced by asterisks of the same length.
         Separators are kept exactly as written.
      3. Up to three digits directly before "동" become "***" and up to four
         digits directly before "호" become "****".

    Args:
        text: Text to mask.

    Returns:
        The masked text.
    """
    # E-mail: hide only what precedes the at-sign.
    text = re.sub(r"\b[\w\.-]+@", r"******@", text)

    def mask_sequence(match):
        # Splitting with a capturing group keeps the separators, so the
        # result alternates digits, separator, digits, separator, ...
        tokens = re.split(r'([.-])', match.group())
        # Digit groups sit at even token indexes; every second group is
        # therefore at token index 2, 6, 10, ...
        for idx in range(2, len(tokens), 4):
            tokens[idx] = '*' * len(tokens[idx])
        return ''.join(tokens)

    # Skipped when preceded by a backslash or "$", or by a digit plus comma.
    text = re.sub(r"(?<![\\$\\\\])(?<!\d,)(?:\d{2,4}[.-]){1,3}\d{2,4}(?!\d)", mask_sequence, text)
    text = re.sub(r"(\d{1,3})동", r"***동", text)  # building (동) number
    text = re.sub(r"(\d{1,4})호", r"****호", text)  # unit (호) number
    return text
80
+
81
+ # =============================================
82
+ # Masking Core Functions
83
+ # =============================================
84
def extract_names(text):
    """Collect candidate person names from *text*.

    Two sources are combined, in this order:
      1. Entities that the NER pipeline labels "PS", at least two characters
         long and not listed in NAME_ENTITY_EXCEPTIONS.
      2. Two to four Hangul characters that directly precede (with or without
         whitespace) one of COMMON_SUFFIXES, optionally followed by a
         particle, and not listed in NAME_ENTITY_EXCEPTIONS.

    Args:
        text: Raw input text.

    Returns:
        A list of unique names in order of first discovery. An empty list is
        returned when the NER pipeline raises.
    """
    try:
        results = ner_pipeline(text)
    except Exception as e:
        # Best effort: a model failure must not break the whole request.
        print("NER 오류 발생:", e)
        return []

    names = []
    seen = set()

    def remember(candidate):
        # Keep first-seen order while rejecting duplicates and exceptions.
        if candidate in seen or candidate in NAME_ENTITY_EXCEPTIONS:
            return
        seen.add(candidate)
        names.append(candidate)

    for entity in results:
        if entity.get("entity_group") == "PS":
            name = entity["word"].replace("##", "").strip()
            if len(name) >= 2:
                remember(name)

    KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
    suffix_alternation = '|'.join(COMMON_SUFFIXES)
    attached = r'([가-힣]{2,4})(?:' + suffix_alternation + r')' + KOREAN_JOSA
    spaced = r'([가-힣]{2,4})\s+(?:' + suffix_alternation + r')' + KOREAN_JOSA

    for pattern in (attached, spaced):
        # Each pattern has two groups, so findall yields (name, particle).
        for match in re.findall(pattern, text):
            remember(match[0])

    # Every NER name is already in `names`, so re-scanning the text for
    # "NER name + suffix + particle" could not contribute anything further.
    return names
119
+
120
+
121
+
122
+
123
def refactored_mask_names(original_text, names, start_counter=100, tag_prefix=None):
    """Replace every standalone occurrence of each name with a numbered tag.

    A name is matched when it is not glued to a preceding word character and
    is followed either by nothing word-like or by exactly one of the known
    particles and then nothing word-like. The particle, if any, is kept after
    the tag ("김철수는" -> "N100는"). All occurrences of one name, whatever
    particle follows, share one tag.

    Args:
        original_text: Text to mask.
        names: Candidate names, processed in the given order.
        start_counter: Number used for the first tag that is assigned.
        tag_prefix: Tag prefix; defaults to the module-level TAG_PREFIX.

    Returns:
        ``(masked_text, mapping)`` where *mapping* maps each tag that was
        actually used to the name it replaced.
    """
    if tag_prefix is None:
        tag_prefix = TAG_PREFIX

    korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로','까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도','이','가','의']
    # Longest first so that e.g. "이나마" is tried before "이나" and "이".
    josa_alternation = "|".join(
        re.escape(josa) for josa in sorted(korean_josa, key=len, reverse=True)
    )

    masked = original_text
    mapping = {}
    counter = start_counter

    for name in names:
        if not name:
            # An empty name would match at every word boundary.
            continue
        tag = f"{tag_prefix}{counter:03d}"
        pattern = re.compile(
            rf'(?<![\w가-힣]){re.escape(name)}({josa_alternation})?(?![\w가-힣])'
        )
        # Function replacement: the tag is inserted literally and the
        # captured particle (if present) is preserved.
        masked, hits = pattern.subn(
            lambda m, tag=tag: tag + (m.group(1) or ""), masked
        )
        if hits:
            mapping[tag] = name
            counter += 1

    # NOTE(review): a name glued to a title ("김철수학생") is still not
    # matched here because of the trailing boundary check — confirm whether
    # that form should be masked as well.
    return masked, mapping
151
 
152
def final_name_remask_exact_only(text, mapping_dict):
    """Replace remaining standalone occurrences of mapped names with their tags.

    Only exact occurrences are replaced: the name must not be preceded or
    followed by a word character or Hangul syllable, so a name with an
    attached particle is left untouched by this pass.

    Args:
        text: Text that may still contain raw names.
        mapping_dict: Mapping of tag -> original name.

    Returns:
        The text with exact name occurrences replaced by their tag.
    """
    for tag, name in mapping_dict.items():
        if not name:
            # An empty name would match at zero-width positions and scatter
            # the tag through the text.
            continue
        pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
        # Function replacement inserts the tag literally; a plain string
        # would be parsed as a template and choke on backslashes.
        text = re.sub(pattern, lambda _m, tag=tag: tag, text)
    return text
157
 
158
def mask_department(text):
    """Reduce the Hangul run in front of "학과" to initial consonants.

    A run of 2-20 Hangul syllables immediately followed by "학과" is passed
    through to_chosung(); the literal "학과" itself is kept.
    """
    department = re.compile(r"([가-힣]{2,20}학과)")

    def shorten(found):
        whole = found.group(1)
        stem = whole[:-2]  # strip the trailing "학과"
        return to_chosung(stem) + "학과"

    return department.sub(shorten, text)
160
+
161
def mask_school_names(text):
    """Reduce school names to initial consonants, keeping the school type.

    Pass 1 rewrites "<name><type>" where <name> is 2-20 Hangul syllables
    starting at a word boundary and <type> is 초등학교, 중학교 or 고등학교.
    Pass 2 looks for every name found in pass 1 once more, now also allowing
    one whitespace character before the type, and rewrites those occurrences
    as "<initials> <type>".

    Args:
        text: Text to mask.

    Returns:
        The masked text.
    """
    # Kept only for backward compatibility with code that may read this
    # module-level name; the work below uses a call-local list so that
    # overlapping calls cannot reset each other's matches.
    global school_name_candidates
    found = []

    def replacer(match):
        name = match.group(1)
        found.append(name)
        return to_chosung(name) + match.group(2)

    text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", replacer, text)

    # dict.fromkeys: de-duplicate while keeping first-seen order, so each
    # name is scanned for only once.
    for name in dict.fromkeys(found):
        pattern = rf"{re.escape(name)}\s?(초등학교|중학교|고등학교)"
        initials = to_chosung(name)
        text = re.sub(
            pattern,
            lambda m, initials=initials: initials + " " + m.group(1),
            text,
        )

    school_name_candidates = found
    return text
177
 
178
def sanitize_sensitive_info(text, keyword_string, replace_word):
    """Mask non-name sensitive details in *text*.

    Steps, in order:
      1. postprocess_sensitive_patterns() (e-mail, digit sequences, 동/호).
      2. mask_school_names() and mask_department().
      3. "<d>학년" and an optional "<d>반" become "*학년" / "*학년 *반".
      4. Every keyword (user-supplied, comma separated, plus
         REGEX_KEYWORDS_TO_MASK) found as a whole word is replaced by
         *replace_word*, case-insensitively.
      5. Resident-registration-style numbers (6 digits, "-", 7 digits) keep
         only the first digit after the dash.
      6. Numbers following a road name ending in 대로/로/길 become "***".

    Args:
        text: Text to sanitize.
        keyword_string: Comma-separated institution keywords.
        replace_word: Literal text substituted for every keyword.

    Returns:
        The sanitized text.
    """
    text = postprocess_sensitive_patterns(text)  # must run first
    text = mask_school_names(text)
    text = mask_department(text)
    text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)

    user_keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
    # De-duplicate, then try longer keywords first; otherwise a short keyword
    # ("굿네이버스") rewrites part of a longer one ("사회복지법인 굿네이버스")
    # before the longer one gets a chance to match.
    keywords = sorted(
        dict.fromkeys(user_keywords + list(REGEX_KEYWORDS_TO_MASK)),
        key=len,
        reverse=True,
    )
    for kw in keywords:
        pattern = rf"\b{re.escape(kw)}\b"
        # Function replacement keeps user-typed backslashes literal.
        text = re.sub(pattern, lambda _m: replace_word, text, flags=re.IGNORECASE)

    # Six stars for the six leading digits, six for the six trailing ones.
    text = re.sub(r"(\d{6})[-](\d)\d{6}", r"******-\2******", text)
    text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
    return text
192
 
193
def apply_masking(text, keywords, replace_word):
    """Run the full masking pipeline for one piece of text.

    Names are extracted and tagged first, the remaining sensitive details are
    sanitized next, and finally any exact name that is still present is
    replaced by its tag.

    Args:
        text: Original text.
        keywords: Comma-separated institution keywords.
        replace_word: Replacement for every keyword.

    Returns:
        ``(masked_text, mapping_table)`` where *mapping_table* has one
        "tag → name" line per assigned tag.
    """
    detected = extract_names(text)
    tagged_text, mapping = refactored_mask_names(text, detected)

    result = sanitize_sensitive_info(tagged_text, keywords, replace_word)
    result = final_name_remask_exact_only(result, mapping)

    rows = []
    for tag, original_name in mapping.items():
        rows.append(f"{tag} → {original_name}")
    return result, "\n".join(rows)
200
+
201
+ # UI
202
# Gradio front end: three inputs (text, keywords, replacement word), one
# button, and two outputs (masked text, tag-to-name table).
with gr.Blocks() as demo:
    gr.Markdown("""
🛡️ **민감정보 마스킹 [ver2]**
이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반 + 예외 필터 + 후처리 강화)
""")
    input_text = gr.Textbox(label="📅 원본 텍스트 입력", lines=15)
    keyword_input = gr.Textbox(
        label="기관 키워드 (쉼표 구분)",
        value="굿네이버스, 사회복지법인 굿네이버스",
        lines=1,
    )
    replace_input = gr.Textbox(label="치환할 텍스트", value="우리기관", lines=1)
    run_button = gr.Button("🚀 마스킹 실행")
    masked_output = gr.Textbox(label="🔐 마스킹 결과", lines=15)
    mapping_output = gr.Textbox(label="🏷️ 이름 태그 매핑", lines=10, interactive=False)

    # The button feeds the three inputs to apply_masking and shows its two
    # return values in the two output boxes.
    run_button.click(
        fn=apply_masking,
        inputs=[input_text, keyword_input, replace_input],
        outputs=[masked_output, mapping_output],
    )

demo.launch()