blueradiance committed
Commit 04a745e · verified · 1 Parent(s): 5f9191a

Update app.py

Files changed (1)
  1. app.py +85 -81
app.py CHANGED
@@ -1,32 +1,13 @@
 
+# ▶️ Part 1: name tagging + variant-expression post-processing + strengthened replacement with "우리기관"
+
 import re
 import gradio as gr
 import threading
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
 TAG_PREFIX = "N"
-
-NAME_ENTITY_EXCEPTIONS = set([
-    '법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
-    '개인정보', '본인', '해당', '현재', '아래', '위치', '소속',
-    '상담', '그래도'
-])
-
-REGEX_KEYWORDS_TO_MASK = set([
-    '이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소', '계좌번호'
-])
-
-FAMILY_TITLES = ['어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
-                 '할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
-                 '고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
-                 '장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이']
-ACADEMIC_TITLES = ['학생', '초등학생', '중학생', '고등학생', '수험생', '학부모']
-OCCUPATIONAL_TITLES = ['대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
-                       '실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
-                       '사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
-                       '교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
-                       '의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']
-COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
+COMMON_SUFFIXES = ['학생', '선생님', '씨', '님']
 COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
 
 model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
@@ -34,25 +15,13 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForTokenClassification.from_pretrained(model_name)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
-def to_chosung(text):
-    CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
-    result = ""
-    for ch in text:
-        if '가' <= ch <= '힣':
-            code = ord(ch) - ord('가')
-            cho = code // 588
-            result += CHOSUNG_LIST[cho]
-        else:
-            result += ch
-    return result
-
 def extract_names(text):
     results = ner_pipeline(text)
     names = []
     for entity in results:
         if entity.get("entity_group") == "PS":
             name = entity["word"].replace("##", "").strip()
-            if len(name) >= 2 and name not in NAME_ENTITY_EXCEPTIONS:
+            if len(name) >= 2:
                 names.append(name)
     return list(set(names))
 
@@ -69,72 +38,107 @@ def apply_name_tags(text, names, start=100):
         counter += 1
     return tagged, mapping
 
-def expand_from_tag_context(tagged_text, mapping):
-    updated = {}
-    for tag, name in mapping.items():
-        idx = tagged_text.find(tag)
-        if idx == -1:
-            updated[tag] = name
-            continue
-        context = tagged_text[max(0, idx - 50): idx + 50]
-        pattern = re.compile(rf'([가-힣])?{re.escape(name)}({"|".join(COMMON_SUFFIXES)})?({"|".join(COMMON_JOSA)})?')
-        matches = pattern.findall(context)
-        if matches:
-            longest = max(matches, key=lambda x: len(''.join(x)))
-            updated[tag] = ''.join(longest)
-        else:
-            updated[tag] = name
-    return updated
-
-def mask_school_names(text):
-    def replace_school(m):
-        return to_chosung(m.group(1)) + m.group(2)
-    return re.sub(r"([가-힣]{2,20})(초등학교|중학교|고등학교)", replace_school, text)
+def expand_variation_patterns(tagged_text, mapping):
+    for tag, base in mapping.items():
+        suffix_pattern = re.compile(rf'([\(\["\'‘“\s]*){re.escape(base)}(?:{"|".join(COMMON_SUFFIXES)})?(?:{"|".join(COMMON_JOSA)})?', re.IGNORECASE)
+        tagged_text = suffix_pattern.sub(lambda m: m.group(0).replace(base, tag), tagged_text)
+    return tagged_text
 
-def mask_department(text):
-    return re.sub(r"([가-힣]{2,20})학과", lambda m: to_chosung(m.group(1)) + "학과", text)
+def replace_institution_keywords(text, keywords, replace_word):
+    for kw in keywords:
+        pattern = re.compile(rf'([\s\(\["\'‘“]*){re.escape(kw)}([가-힣\s.,;:!?()"\'”’]*)', re.IGNORECASE)
+        text = pattern.sub(lambda m: m.group(1) + replace_word + m.group(2), text)
+    return text
 
 def postprocess_sensitive_patterns(text):
-    text = re.sub(r"[\w\.-]+@", "******@", text)
+    text = re.sub(r"[\w\.-]+@[\w\.-]+", "******@***.***", text)
     text = re.sub(r"(\d{6})[- ]?(\d{7})", "******-*******", text)
     text = re.sub(r"(\d{3})[- ]?(\d{4})[- ]?(\d{4})", "***-****-****", text)
    text = re.sub(r"(\d{1,3})동", "***동", text)
     text = re.sub(r"(\d{1,4})호", "****호", text)
     return text
 
-def sanitize_sensitive_info(text, keyword_string, replace_word):
-    text = postprocess_sensitive_patterns(text)
-    text = mask_school_names(text)
-    text = mask_department(text)
-    text = re.sub(r"(\d)학년(\s?(\d)반)?", "*학년 *반", text)
-    keywords = [k.strip() for k in keyword_string.split(",") if k.strip()] + list(REGEX_KEYWORDS_TO_MASK)
-    for kw in keywords:
-        text = re.sub(rf"{re.escape(kw)}", replace_word, text, flags=re.IGNORECASE)
-    return text
-
 def apply_masking(text, keyword_string, replace_word):
-    original = text
-    text = sanitize_sensitive_info(text, keyword_string, replace_word)
+    keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
+    text = replace_institution_keywords(text, keywords, replace_word)
+    text = postprocess_sensitive_patterns(text)
     names = extract_names(text)
     tagged, mapping = apply_name_tags(text, names)
 
     def finalize():
-        updated_mapping = expand_from_tag_context(tagged, mapping)
-        final_map = "\n".join([f"{k} → {v}" for k, v in updated_mapping.items()])
-        masked_output.update(value=tagged)
-        mapping_output.update(value=final_map)
+        final_tagged = expand_variation_patterns(tagged, mapping)
+        mapping_str = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
+        masked_output.update(value=final_tagged)
+        mapping_output.update(value=mapping_str)
 
     threading.Timer(0.2, finalize).start()
-    initial_map = "\n".join([f"{k} → {v}" for k, v in mapping.items()])
-    return tagged, initial_map
+    return tagged, "\n".join([f"{k} → {v}" for k, v in mapping.items()])
 
+# Part 1 UI
 with gr.Blocks() as demo:
-    gr.Markdown("🧠 **v4.2 ULTIMATE FULL: 태그 기반 확장 + 민감정보 마스킹 완전체**")
-    input_text = gr.Textbox(lines=15, label="📄 입력 텍스트")
-    keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
+    gr.Markdown("🧠 **v4.3A: 이름 + 파생 표현 + 기관 치환 (1단계)**")
+    input_text = gr.Textbox(lines=15, label="📄 원문 텍스트")
+    keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
     replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
-    run_button = gr.Button("🚀 마스킹 실행")
+    run_button = gr.Button("🚀 실행")
     masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
     mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
     run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
+
+
+# ▶️ Part 2: school/department chosung (initial-consonant) conversion + grade/class masking + mapping correction
+# - This part can be attached later in a separate runtime or via integration code.
+# - It can be merged into a single file if needed.
+
+
+# ▶️ Part 2: school/department chosung (initial-consonant) conversion + grade/class masking + mapping correction
+
+def to_chosung(text):
+    CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
+    result = ""
+    for ch in text:
+        if '가' <= ch <= '힣':
+            code = ord(ch) - ord('가')
+            cho = code // 588
+            result += CHOSUNG_LIST[cho]
+        else:
+            result += ch
+    return result
+
+def mask_school_names(text):
+    def replace_school(m):
+        return to_chosung(m.group(1)) + m.group(2)
+    return re.sub(r"([가-힣]{2,20})(초등학교|중학교|고등학교)", replace_school, text)
+
+def mask_department_names(text):
+    return re.sub(r"([가-힣]{2,20})학과", lambda m: to_chosung(m.group(1)) + "학과", text)
+
+def mask_grade_class(text):
+    return re.sub(r"(\d)학년(\s?(\d)반)?", "*학년 *반", text)
+
+def mapping_boost_context(text, mapping_dict):
+    COMMON_SUFFIXES = ['학생', '선생님', '씨', '님']
+    COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
+    updated = {}
+    for tag, base in mapping_dict.items():
+        idx = text.find(tag)
+        if idx == -1:
+            updated[tag] = base
+            continue
+        window = text[max(0, idx - 100): idx + 100]
+        pattern = re.compile(rf"([가-힣])?{re.escape(base)}(?:{'|'.join(COMMON_SUFFIXES)})?(?:{'|'.join(COMMON_JOSA)})?")
+        match = pattern.search(window)
+        if match:
+            updated[tag] = match.group(0)
+        else:
+            updated[tag] = base
+    return updated
+
+# 👇 Example: final post-processing application
+# final_output = mask_school_names(tagged_text)
+# final_output = mask_department_names(final_output)
+# final_output = mask_grade_class(final_output)
+# mapping = mapping_boost_context(final_output, mapping)
+
+
 demo.launch()