blueradiance committed on
Commit
7131d84
·
verified ·
1 Parent(s): 4f1ff3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -57
app.py CHANGED
@@ -37,84 +37,52 @@ def extract_names(text):
37
  return []
38
 
39
  names = []
 
 
 
 
 
 
 
40
  for entity in results:
41
  if entity.get("entity_group") == "PS":
42
  name = entity["word"].replace("##", "").strip()
43
- if len(name) >= 2 and name not in names:
44
  names.append(name)
45
 
46
  COMMON_SUFFIXES = [
47
- # 직함/직책
48
  '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
49
  '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장', '사무장',
50
  '직원', '매니저', '지점장',
51
- # 교육/전문가
52
  '선생님', '선생', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '강사',
53
  '박사', '석사', '학사', '의사', '간호사', '간병인',
54
- # 학생 관련
55
  '학생', '수험생', '초등학생', '중학생', '고등학생', '학부모',
56
- # 가족/친척
57
  '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생',
58
  '아들', '딸', '할머니', '할아버지', '외할머니', '외할아버지',
59
  '이모', '고모', '삼촌', '숙모', '외삼촌', '고모부', '이모부', '조카', '사촌',
60
  '남편', '아내', '부인', '와이프', '신랑', '장모', '장인', '사위', '며느리',
61
  '올케', '형수', '제수씨', '매형', '처제', '시누이',
62
- # 그 외 지칭
63
  '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님'
64
  ]
65
 
66
- # 조사 포함 패턴
67
  KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
68
 
69
- # 붙여쓰기: 이민지선생님, 김대리와
70
  attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
71
  attached_matches = re.findall(attached_pattern, text)
72
  for match in attached_matches:
73
  name = match[0]
74
- if name not in names:
75
  names.append(name)
76
 
77
- # 띄어쓰기: 이민지 선생님, 김대리 와
78
  spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
79
  spaced_matches = re.findall(spaced_pattern, text)
80
  for match in spaced_matches:
81
  name = match[0]
82
- if name not in names:
83
  names.append(name)
84
 
85
  return names
86
 
87
-
88
- def refactored_mask_names(original_text, names, start_counter=100):
89
- korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로',
90
- '까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도',
91
- '이','가','의']
92
- masked = original_text
93
- mapping = {}
94
- counter = start_counter
95
- used_names = set()
96
- for name in names:
97
- for josa in korean_josa:
98
- full = name + josa
99
- pattern = rf'(?<![\w가-힣]){re.escape(full)}(?![\w가-힣])'
100
- if re.search(pattern, masked):
101
- tag = f"N{counter:03d}"
102
- mapping[tag] = name
103
- masked = re.sub(pattern, tag + josa, masked)
104
- counter += 1
105
- used_names.add(name)
106
- break
107
- for name in names:
108
- if name in used_names:
109
- continue
110
- pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
111
- if re.search(pattern, masked):
112
- tag = f"N{counter:03d}"
113
- mapping[tag] = name
114
- masked = re.sub(pattern, tag, masked)
115
- counter += 1
116
- return masked, mapping
117
-
118
  def to_chosung(text):
119
  CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
120
  result = ""
@@ -127,10 +95,13 @@ def to_chosung(text):
127
  result += ch
128
  return result
129
 
 
 
130
  def mask_department(text):
131
  text = re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
132
  return text
133
 
 
134
  def sanitize_sensitive_info(text, keyword_string, replace_word):
135
  text = mask_school_names(text)
136
  text = mask_department(text)
@@ -154,18 +125,16 @@ def sanitize_sensitive_info(text, keyword_string, replace_word):
154
  lambda m: f"{m.group(1)[:2]}{'*'*(len(m.group(1))-2)}{'*'*len(m.group(2))}{m.group(3)[-4:]}", text)
155
  text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
156
  lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
157
- text = re.sub(r"(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})",
158
- lambda m: f"{m.group(1)}.{m.group(2)}.*.*", text)
159
- text = re.sub(r"([가-힣]{1,10})(은행|동|로|길)\s?([\d\-]{4,})",
160
- lambda m: m.group(1) + m.group(2) + " " + re.sub(r"\d", "*", m.group(3)), text)
161
  return text
162
 
 
163
  def final_name_remask_exact_only(text, mapping_dict):
164
  for tag, name in mapping_dict.items():
165
  pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
166
  text = re.sub(pattern, tag, text)
167
  return text
168
 
 
169
  def apply_masking(text, keywords, replace_word):
170
  names = extract_names(text)
171
  masked, mapping = refactored_mask_names(text, names)
@@ -174,21 +143,11 @@ def apply_masking(text, keywords, replace_word):
174
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
175
  return sanitized, mapping_table
176
 
177
- def remask_with_mapping(text, mapping_string):
178
- mapping = {}
179
- for line in mapping_string.strip().split("\n"):
180
- if "→" in line:
181
- tag, name = line.split("→")
182
- mapping[tag.strip()] = name.strip()
183
- for tag, name in mapping.items():
184
- pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
185
- text = re.sub(pattern, tag, text)
186
- return text
187
 
188
  with gr.Blocks() as demo:
189
  gr.Markdown("""
190
  🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
191
- 이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)
192
  ⚠️ *완벽하지 않을 수 있습니다. 반드시 직접 최종 점검하세요.*
193
  """)
194
  input_text = gr.Textbox(lines=15, label="📥 원본 텍스트 입력")
 
37
  return []
38
 
39
  names = []
40
+
41
+ NAME_FILTER_EXCEPTIONS = set([
42
+ '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
43
+ '개인정보', '본인', '직통번호', '이메일', '전화번호', '연락처', '주소',
44
+ '해당', '현재', '상담', '예약', '센터', '아래', '위치', '소속', '보호자',
45
+ ])
46
+
47
  for entity in results:
48
  if entity.get("entity_group") == "PS":
49
  name = entity["word"].replace("##", "").strip()
50
+ if len(name) >= 2 and name not in names and name not in NAME_FILTER_EXCEPTIONS:
51
  names.append(name)
52
 
53
  COMMON_SUFFIXES = [
 
54
  '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
55
  '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장', '사무장',
56
  '직원', '매니저', '지점장',
 
57
  '선생님', '선생', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '강사',
58
  '박사', '석사', '학사', '의사', '간호사', '간병인',
 
59
  '학생', '수험생', '초등학생', '중학생', '고등학생', '학부모',
 
60
  '어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생',
61
  '아들', '딸', '할머니', '할아버지', '외할머니', '외할아버지',
62
  '이모', '고모', '삼촌', '숙모', '외삼촌', '고모부', '이모부', '조카', '사촌',
63
  '남편', '아내', '부인', '와이프', '신랑', '장모', '장인', '사위', '며느리',
64
  '올케', '형수', '제수씨', '매형', '처제', '시누이',
 
65
  '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님'
66
  ]
67
 
 
68
  KOREAN_JOSA = r'(이[가]|은|는|을|를|과|와|의|도|만|께서|에서|으로|에게|한테|보다|까지|부터)?'
69
 
 
70
  attached_pattern = r'([가-힣]{2,4})(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
71
  attached_matches = re.findall(attached_pattern, text)
72
  for match in attached_matches:
73
  name = match[0]
74
+ if name not in names and name not in NAME_FILTER_EXCEPTIONS:
75
  names.append(name)
76
 
 
77
  spaced_pattern = r'([가-힣]{2,4})\s+(' + '|'.join(COMMON_SUFFIXES) + r')' + KOREAN_JOSA
78
  spaced_matches = re.findall(spaced_pattern, text)
79
  for match in spaced_matches:
80
  name = match[0]
81
+ if name not in names and name not in NAME_FILTER_EXCEPTIONS:
82
  names.append(name)
83
 
84
  return names
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def to_chosung(text):
87
  CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
88
  result = ""
 
95
  result += ch
96
  return result
97
 
98
+
99
+
100
  def mask_department(text):
101
  text = re.sub(r"([가-힣]{2,20}학과)", lambda m: to_chosung(m.group(1)[:-2]) + "학과", text)
102
  return text
103
 
104
+
105
  def sanitize_sensitive_info(text, keyword_string, replace_word):
106
  text = mask_school_names(text)
107
  text = mask_department(text)
 
125
  lambda m: f"{m.group(1)[:2]}{'*'*(len(m.group(1))-2)}{'*'*len(m.group(2))}{m.group(3)[-4:]}", text)
126
  text = re.sub(r"(\d{4})[- ]?(\d{4})[- ]?(\d{4})[- ]?(\d{4})",
127
  lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)
 
 
 
 
128
  return text
129
 
130
+
131
  def final_name_remask_exact_only(text, mapping_dict):
132
  for tag, name in mapping_dict.items():
133
  pattern = rf'(?<![\w가-힣]){re.escape(name)}(?![\w가-힣])'
134
  text = re.sub(pattern, tag, text)
135
  return text
136
 
137
+
138
  def apply_masking(text, keywords, replace_word):
139
  names = extract_names(text)
140
  masked, mapping = refactored_mask_names(text, names)
 
143
  mapping_table = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
144
  return sanitized, mapping_table
145
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  with gr.Blocks() as demo:
148
  gr.Markdown("""
149
  🛡️ **민감정보 마스킹 [땡땡이 마스킹]**
150
+ 이름 + 민감정보 + 초/중/고 마스킹기 (초성 기반)
151
  ⚠️ *완벽하지 않을 수 있습니다. 반드시 직접 최종 점검하세요.*
152
  """)
153
  input_text = gr.Textbox(lines=15, label="📥 원본 텍스트 입력")