blueradiance committed on
Commit
ef5ddf0
·
verified ·
1 Parent(s): d290d68

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -119
app.py CHANGED
@@ -1,36 +1,13 @@
1
- # app_updated_with_filter_sets.py
2
  import re
3
  import gradio as gr
4
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
5
 
6
- school_name_candidates = []
 
 
 
7
 
8
- def mask_school_names(text):
9
- global school_name_candidates
10
- school_name_candidates = []
11
-
12
- def replacer(match):
13
- name = match.group(1)
14
- full = match.group(0)
15
- if 2 <= len(name) <= 20:
16
- school_name_candidates.append(name)
17
- return to_chosung(name) + match.group(2)
18
- else:
19
- return full
20
-
21
- text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", replacer, text)
22
-
23
- for name in school_name_candidates:
24
- pattern = rf"{re.escape(name)}\s?(초등학교|중학교|고등학교)"
25
- text = re.sub(pattern, to_chosung(name) + " " + r"\1", text)
26
- return text
27
-
28
- model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
29
- tokenizer = AutoTokenizer.from_pretrained(model_name)
30
- model = AutoModelForTokenClassification.from_pretrained(model_name)
31
- ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
32
-
33
- # ✅ 예외 필터
34
  NAME_ENTITY_EXCEPTIONS = set([
35
  '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
36
  '개인정보', '본인', '해당', '현재', '아래', '위치', '소속'
@@ -40,47 +17,33 @@ REGEX_KEYWORDS_TO_MASK = set([
40
  '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소'
41
  ])
42
 
43
- def extract_names(text):
44
- try:
45
- results = ner_pipeline(text)
46
- except Exception as e:
47
- print("NER 오류 발생:", e)
48
- return []
49
 
50
- names = []
51
- for entity in results:
52
- if entity.get("entity_group") == "PS":
53
- name = entity["word"].replace("##", "").strip()
54
- if len(name) >= 2 and name not in names and name not in NAME_ENTITY_EXCEPTIONS:
55
- names.append(name)
56
 
57
- COMMON_SUFFIXES = [
58
- '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
59
- '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장', '사무장',
60
- '직원', '매니저', '지점장',
61
- '선생님', '선생', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '강사',
62
- '박사', '석사', '학사', '의사', '간호사', '간병인',
63
- '학생', '수험생', '초등학생', '중학생', '고등학생', '학부모',
64
- '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생',
65
- '아들', '딸', '할머니', '할아버지', '외할머니', '외할아버지',
66
- '이모', '고모', '삼촌', '숙모', '외삼촌', '고모부', '이모부', '조카', '사촌',
67
- '남편', '아내', '부인', '와이프', '신랑', '장모', '장인', '사위', '며느리',
68
- '올케', '형수', '제수씨', '매형', '처제', '시누이',
69
- '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님'
70
- ]
71
 
72
- KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
73
- attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
74
- spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
75
 
76
- for pattern in [attached_pattern, spaced_pattern]:
77
- matches = re.findall(pattern, text)
78
- for match in matches:
79
- name = match[0]
80
- if name not in names and name not in NAME_ENTITY_EXCEPTIONS:
81
- names.append(name)
82
- return names
83
 
 
 
 
84
  def to_chosung(text):
85
  CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
86
  result = ""
@@ -93,73 +56,53 @@ def to_chosung(text):
93
  result += ch
94
  return result
95
 
96
- def mask_department(text):
97
- text = re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
98
- return text
99
-
100
-
101
  def postprocess_sensitive_patterns(text):
102
- # IP 주소: 192.168.35.201 → 192.168.*.*
103
  text = re.sub(r"\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b", r"\1.\2.*.*", text)
104
-
105
- # 전화번호: 031-987-6543 → 031-***-6543
106
  text = re.sub(r"\b(\d{2,4})-(\d{3,4})-(\d{4})\b", r"\1-***-\3", text)
107
-
108
- # 카드번호: 1234-5678-9012-3456 → 1234-****-****-3456
109
  text = re.sub(r"\b(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})\b", r"\1-****-****-\4", text)
110
-
111
  return text
112
 
 
 
 
 
 
 
 
 
 
113
 
 
 
 
 
 
 
114
 
115
- def sanitize_sensitive_info(text, keyword_string, replace_word):
116
- text = mask_school_names(text)
117
- text = mask_department(text)
118
-
119
- text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
120
- text = re.sub(r"(\d)학년\s?(\d)반", r"*학년 *반", text)
121
-
122
- keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
123
- keywords += list(REGEX_KEYWORDS_TO_MASK)
124
- for kw in keywords:
125
- pattern = rf"\b{re.escape(kw)}\b"
126
- text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
127
-
128
- text = re.sub(r"(\d{3})-(\d{4})-(\d{4})", r"\1-****-\3", text)
129
- text = re.sub(r"(\d{4})년 (\d{1,2})월 (\d{1,2})일", r"19**년 \2월 *일", text)
130
- text = re.sub(r"(\d{1,3})번지", r"***번지", text)
131
- text = re.sub(r"(\d{1,3})동", r"***동", text)
132
- text = re.sub(r"(\d{1,4})호", r"****호", text)
133
- text = re.sub(r"[\w\.-]+@[\w\.-]+", r"******@****", text)
134
- text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
135
- text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
136
- text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
137
- lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
138
- # 📌 후처리 추가
139
- text = postprocess_sensitive_patterns(text)
140
-
141
- return text
142
 
143
- def final_name_remask_exact_only(text, mapping_dict):
144
- for tag, name in mapping_dict.items():
145
- pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
146
- text = re.sub(pattern, tag, text)
147
- return text
 
148
 
149
  def refactored_mask_names(original_text, names, start_counter=100):
150
- korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로',
151
- '까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도',
152
- '이','가','의']
153
  masked = original_text
154
  mapping = {}
155
  counter = start_counter
156
  used_names = set()
 
157
  for name in names:
158
  for josa in korean_josa:
159
  full = name + josa
160
  pattern = rf'(?<![\w가-힣]){re.escape(full)}(?![\w가-힣])'
161
  if re.search(pattern, masked):
162
- tag = f"N{counter:03d}"
163
  mapping[tag] = name
164
  masked = re.sub(pattern, tag + josa, masked)
165
  counter += 1
@@ -170,12 +113,53 @@ def refactored_mask_names(original_text, names, start_counter=100):
170
  continue
171
  pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
172
  if re.search(pattern, masked):
173
- tag = f"N{counter:03d}"
174
  mapping[tag] = name
175
  masked = re.sub(pattern, tag, masked)
176
  counter += 1
177
  return masked, mapping
178
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  def apply_masking(text, keywords, replace_word):
180
  names = extract_names(text)
181
  masked, mapping = refactored_mask_names(text, names)
@@ -184,17 +168,17 @@ def apply_masking(text, keywords, replace_word):
184
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
185
  return sanitized, mapping_table
186
 
 
187
  with gr.Blocks() as demo:
188
  gr.Markdown("""
189
- 🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
190
- 이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)
191
- ⚠️ *완벽하지 않을 수 있습니다. 반드시 직접 최종 점검하세요.*
192
- """)
193
- input_text = gr.Textbox(lines=15, label="📥 원본 텍스트 입력")
194
- keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, good neighbors, gn, 사회복지법인 굿네이버스")
195
  replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
196
  run_button = gr.Button("🚀 마스킹 실행")
197
- masked_output = gr.Textbox(lines=15, label="🔐 마스킹된 텍스트")
198
  mapping_output = gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑", interactive=False)
199
 
200
  run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
 
1
+ # masking_ver2.py
2
  import re
3
  import gradio as gr
4
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
5
 
6
+ # =============================================
7
+ # Configurable Constants
8
+ # =============================================
9
+ TAG_PREFIX = "N"
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  NAME_ENTITY_EXCEPTIONS = set([
12
  '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
13
  '개인정보', '본인', '해당', '현재', '아래', '위치', '소속'
 
17
  '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소'
18
  ])
19
 
20
+ # 분리된 suffix 그룹
21
+ FAMILY_TITLES = ['어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
22
+ '할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
23
+ '고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
24
+ '장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이']
 
25
 
26
+ ACADEMIC_TITLES = ['학생', '초등학생', '중학생', '고등학생', '수험생', '학부모']
 
 
 
 
 
27
 
28
+ OCCUPATIONAL_TITLES = ['대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
29
+ '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
30
+ '사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
31
+ '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
32
+ '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']
 
 
 
 
 
 
 
 
 
33
 
34
+ COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
 
 
35
 
36
+ # =============================================
37
+ # Preload Model
38
+ # =============================================
39
+ model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
40
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
41
+ model = AutoModelForTokenClassification.from_pretrained(model_name)
42
+ ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
43
 
44
+ # =============================================
45
+ # Utility Functions
46
+ # =============================================
47
  def to_chosung(text):
48
  CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
49
  result = ""
 
56
  result += ch
57
  return result
58
 
 
 
 
 
 
59
  def postprocess_sensitive_patterns(text):
 
60
  text = re.sub(r"\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b", r"\1.\2.*.*", text)
 
 
61
  text = re.sub(r"\b(\d{2,4})-(\d{3,4})-(\d{4})\b", r"\1-***-\3", text)
 
 
62
  text = re.sub(r"\b(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})\b", r"\1-****-****-\4", text)
 
63
  return text
64
 
65
+ # =============================================
66
+ # Masking Core Functions
67
+ # =============================================
68
+ def extract_names(text):
69
+ try:
70
+ results = ner_pipeline(text)
71
+ except Exception as e:
72
+ print("NER 오류 발생:", e)
73
+ return []
74
 
75
+ names = []
76
+ for entity in results:
77
+ if entity.get("entity_group") == "PS":
78
+ name = entity["word"].replace("##", "").strip()
79
+ if len(name) >= 2 and name not in names and name not in NAME_ENTITY_EXCEPTIONS:
80
+ names.append(name)
81
 
82
+ KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
83
+ attached = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
84
+ spaced = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ for pattern in [attached, spaced]:
87
+ for match in re.findall(pattern, text):
88
+ name = match[0]
89
+ if name not in names and name not in NAME_ENTITY_EXCEPTIONS:
90
+ names.append(name)
91
+ return names
92
 
93
  def refactored_mask_names(original_text, names, start_counter=100):
94
+ korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로','까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도','이','가','의']
 
 
95
  masked = original_text
96
  mapping = {}
97
  counter = start_counter
98
  used_names = set()
99
+
100
  for name in names:
101
  for josa in korean_josa:
102
  full = name + josa
103
  pattern = rf'(?<![\w가-힣]){re.escape(full)}(?![\w가-힣])'
104
  if re.search(pattern, masked):
105
+ tag = f"{TAG_PREFIX}{counter:03d}"
106
  mapping[tag] = name
107
  masked = re.sub(pattern, tag + josa, masked)
108
  counter += 1
 
113
  continue
114
  pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
115
  if re.search(pattern, masked):
116
+ tag = f"{TAG_PREFIX}{counter:03d}"
117
  mapping[tag] = name
118
  masked = re.sub(pattern, tag, masked)
119
  counter += 1
120
  return masked, mapping
121
 
122
+ def final_name_remask_exact_only(text, mapping_dict):
123
+ for tag, name in mapping_dict.items():
124
+ pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
125
+ text = re.sub(pattern, tag, text)
126
+ return text
127
+
128
+ def mask_department(text):
129
+ return re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
130
+
131
+ def mask_school_names(text):
132
+ global school_name_candidates
133
+ school_name_candidates = []
134
+
135
+ def replacer(match):
136
+ name = match.group(1)
137
+ if 2 <= len(name) <= 20:
138
+ school_name_candidates.append(name)
139
+ return to_chosung(name) + match.group(2)
140
+ return match.group(0)
141
+
142
+ text = re.sub(r"(\b[가-힣]{2,20})(초등학교|중학교|고등학교)", replacer, text)
143
+ for name in school_name_candidates:
144
+ pattern = rf"{re.escape(name)}\s?(초등학교|중학교|고등학교)"
145
+ text = re.sub(pattern, to_chosung(name) + " " + r"\1", text)
146
+ return text
147
+
148
+ def sanitize_sensitive_info(text, keyword_string, replace_word):
149
+ text = postprocess_sensitive_patterns(text) # 먼저 처리
150
+ text = mask_school_names(text)
151
+ text = mask_department(text)
152
+ text = re.sub(r"(\d)학년(\s?(\d)반)?", lambda m: "*학년" + (" *반" if m.group(3) else ""), text)
153
+
154
+ keywords = [k.strip() for k in keyword_string.split(",") if k.strip()] + list(REGEX_KEYWORDS_TO_MASK)
155
+ for kw in keywords:
156
+ pattern = rf"\b{re.escape(kw)}\b"
157
+ text = re.sub(pattern, replace_word, text, flags=re.IGNORECASE)
158
+
159
+ text = re.sub(r"(\d{6})[-](\d)\d{6}", r"*******-\2*****", text)
160
+ text = re.sub(r"([가-힣]+(대로|로|길))\s?(\d+)(호|번길|가)?", r"\1 ***", text)
161
+ return text
162
+
163
  def apply_masking(text, keywords, replace_word):
164
  names = extract_names(text)
165
  masked, mapping = refactored_mask_names(text, names)
 
168
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
169
  return sanitized, mapping_table
170
 
171
+ # UI
172
  with gr.Blocks() as demo:
173
  gr.Markdown("""
174
+ 🛡️ **민감정보 마스킹 [ver2]**
175
+ 이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반 + 예외 필터 + 후처리 강화)
176
+ """)
177
+ input_text = gr.Textbox(lines=15, label="📅 원본 텍스트 입력")
178
+ keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
 
179
  replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
180
  run_button = gr.Button("🚀 마스킹 실행")
181
+ masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
182
  mapping_output = gr.Textbox(lines=10, label="🏷️ 이름 태그 매핑", interactive=False)
183
 
184
  run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])