Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -1,32 +1,13 @@
|
|
1 |
|
|
|
|
|
2 |
import re
|
3 |
import gradio as gr
|
4 |
import threading
|
5 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
6 |
|
7 |
TAG_PREFIX = "N"
|
8 |
-
|
9 |
-
NAME_ENTITY_EXCEPTIONS = set([
|
10 |
-
'법적', '군의', '사회적', '심리적', '행정적', '의료적', '법률적',
|
11 |
-
'개인정보', '본인', '해당', '현재', '아래', '위치', '소속',
|
12 |
-
'상담', '그래도'
|
13 |
-
])
|
14 |
-
|
15 |
-
REGEX_KEYWORDS_TO_MASK = set([
|
16 |
-
'이메일', '전화번호', '연락처', '주소', '센터', '카드번호', '주민등록번호', 'IP', 'IP주소', '계좌번호'
|
17 |
-
])
|
18 |
-
|
19 |
-
FAMILY_TITLES = ['어머니', '아버지', '엄마', '아빠', '형', '누나', '언니', '오빠', '동생', '아들', '딸',
|
20 |
-
'할머니', '할아버지', '외할머니', '외할아버지', '이모', '고모', '삼촌', '숙모', '외삼촌',
|
21 |
-
'고모부', '이모부', '조카', '사촌', '남편', '아내', '부인', '와이프', '신랑', '장모',
|
22 |
-
'장인', '사위', '며느리', '올케', '형수', '제수씨', '매형', '처제', '시누이']
|
23 |
-
ACADEMIC_TITLES = ['학생', '초등학생', '중학생', '고등학생', '수험생', '학부모']
|
24 |
-
OCCUPATIONAL_TITLES = ['대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원',
|
25 |
-
'실장', '팀장', '소장', '국장', '본부장', '주임', '총무', '회장', '부회장',
|
26 |
-
'사무장', '직원', '매니저', '지점장', '선생님', '선생', '교사', '교장',
|
27 |
-
'교감', '부교장', '조교수', '교수', '연구원', '강사', '박사', '석사', '학사',
|
28 |
-
'의사', '간호사', '간병인', '보호자', '피해자', '당사자', '대상자', '주민', '어르신', '기사님']
|
29 |
-
COMMON_SUFFIXES = FAMILY_TITLES + ACADEMIC_TITLES + OCCUPATIONAL_TITLES
|
30 |
COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
|
31 |
|
32 |
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
@@ -34,25 +15,13 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
34 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
35 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
36 |
|
37 |
-
def to_chosung(text):
|
38 |
-
CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
|
39 |
-
result = ""
|
40 |
-
for ch in text:
|
41 |
-
if '가' <= ch <= '힣':
|
42 |
-
code = ord(ch) - ord('가')
|
43 |
-
cho = code // 588
|
44 |
-
result += CHOSUNG_LIST[cho]
|
45 |
-
else:
|
46 |
-
result += ch
|
47 |
-
return result
|
48 |
-
|
49 |
def extract_names(text):
|
50 |
results = ner_pipeline(text)
|
51 |
names = []
|
52 |
for entity in results:
|
53 |
if entity.get("entity_group") == "PS":
|
54 |
name = entity["word"].replace("##", "").strip()
|
55 |
-
if len(name) >= 2
|
56 |
names.append(name)
|
57 |
return list(set(names))
|
58 |
|
@@ -69,72 +38,107 @@ def apply_name_tags(text, names, start=100):
|
|
69 |
counter += 1
|
70 |
return tagged, mapping
|
71 |
|
72 |
-
def
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
updated[tag] = name
|
78 |
-
continue
|
79 |
-
context = tagged_text[max(0, idx - 50): idx + 50]
|
80 |
-
pattern = re.compile(rf'([가-힣])?{re.escape(name)}({"|".join(COMMON_SUFFIXES)})?({"|".join(COMMON_JOSA)})?')
|
81 |
-
matches = pattern.findall(context)
|
82 |
-
if matches:
|
83 |
-
longest = max(matches, key=lambda x: len(''.join(x)))
|
84 |
-
updated[tag] = ''.join(longest)
|
85 |
-
else:
|
86 |
-
updated[tag] = name
|
87 |
-
return updated
|
88 |
-
|
89 |
-
def mask_school_names(text):
|
90 |
-
def replace_school(m):
|
91 |
-
return to_chosung(m.group(1)) + m.group(2)
|
92 |
-
return re.sub(r"([가-힣]{2,20})(초등학교|중학교|고등학교)", replace_school, text)
|
93 |
|
94 |
-
def
|
95 |
-
|
|
|
|
|
|
|
96 |
|
97 |
def postprocess_sensitive_patterns(text):
|
98 |
-
text = re.sub(r"
|
99 |
text = re.sub(r"(\d{6})[- ]?(\d{7})", "******-*******", text)
|
100 |
text = re.sub(r"(\d{3})[- ]?(\d{4})[- ]?(\d{4})", "***-****-****", text)
|
101 |
text = re.sub(r"(\d{1,3})동", "***동", text)
|
102 |
text = re.sub(r"(\d{1,4})호", "****호", text)
|
103 |
return text
|
104 |
|
105 |
-
def sanitize_sensitive_info(text, keyword_string, replace_word):
|
106 |
-
text = postprocess_sensitive_patterns(text)
|
107 |
-
text = mask_school_names(text)
|
108 |
-
text = mask_department(text)
|
109 |
-
text = re.sub(r"(\d)학년(\s?(\d)반)?", "*학년 *반", text)
|
110 |
-
keywords = [k.strip() for k in keyword_string.split(",") if k.strip()] + list(REGEX_KEYWORDS_TO_MASK)
|
111 |
-
for kw in keywords:
|
112 |
-
text = re.sub(rf"{re.escape(kw)}", replace_word, text, flags=re.IGNORECASE)
|
113 |
-
return text
|
114 |
-
|
115 |
def apply_masking(text, keyword_string, replace_word):
|
116 |
-
|
117 |
-
text =
|
|
|
118 |
names = extract_names(text)
|
119 |
tagged, mapping = apply_name_tags(text, names)
|
120 |
|
121 |
def finalize():
|
122 |
-
|
123 |
-
|
124 |
-
masked_output.update(value=
|
125 |
-
mapping_output.update(value=
|
126 |
|
127 |
threading.Timer(0.2, finalize).start()
|
128 |
-
|
129 |
-
return tagged, initial_map
|
130 |
|
|
|
131 |
with gr.Blocks() as demo:
|
132 |
-
gr.Markdown("🧠 **v4.
|
133 |
-
input_text = gr.Textbox(lines=15, label="📄
|
134 |
-
keyword_input = gr.Textbox(lines=1, label="기관 키워드 (
|
135 |
replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
|
136 |
-
run_button = gr.Button("🚀
|
137 |
masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
|
138 |
mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
|
139 |
run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
demo.launch()
|
|
|
1 |
|
2 |
+
# ▶️ Part 1: 이름 태깅 + 파생 표현 후처리 + 우리기관 강화치환
|
3 |
+
|
4 |
import re
|
5 |
import gradio as gr
|
6 |
import threading
|
7 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
8 |
|
9 |
# Prefix for anonymisation tags assigned to detected names (tags look like N100, N101, ...).
TAG_PREFIX = "N"

# Title suffixes that may directly follow a person's name (e.g. "김철수 학생" / "...씨").
COMMON_SUFFIXES = ['학생', '선생님', '씨', '님']

# Korean josa (case particles) that may be attached right after a name or title.
COMMON_JOSA = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
|
12 |
|
13 |
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
|
|
15 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
16 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
def extract_names(text):
    """Run the shared NER pipeline over *text* and return unique person names.

    Only entities whose group is "PS" (person) are kept.  WordPiece
    continuation markers ("##") are stripped and the surface is trimmed;
    names shorter than two characters are discarded.  The order of the
    returned list is unspecified (set-based deduplication).
    """
    unique_names = set()
    for ent in ner_pipeline(text):
        if ent.get("entity_group") != "PS":
            continue
        cleaned = ent["word"].replace("##", "").strip()
        if len(cleaned) >= 2:
            unique_names.add(cleaned)
    return list(unique_names)
|
27 |
|
|
|
38 |
counter += 1
|
39 |
return tagged, mapping
|
40 |
|
41 |
+
def expand_variation_patterns(tagged_text, mapping):
    """Replace leftover occurrences of each mapped name with its tag.

    For every ``tag -> name`` pair in *mapping*, any occurrence of the name —
    optionally preceded by opening brackets/quotes/whitespace and optionally
    followed by a title suffix and/or a josa particle — is rewritten so that
    the name itself becomes the tag while the surrounding characters survive.

    Args:
        tagged_text: text already processed by apply_name_tags.
        mapping: dict mapping tags (e.g. "N100") to original names.

    Returns:
        The text with all residual name variants replaced by their tags.
    """
    # BUGFIX: the original rf'…["'‘“…]…' literal nested a single quote inside
    # a single-quoted f-string — a SyntaxError on Python < 3.12 (the Space's
    # build error).  The pattern is now assembled from plain string pieces.
    suffix_alt = "|".join(COMMON_SUFFIXES)
    josa_alt = "|".join(COMMON_JOSA)
    for tag, base in mapping.items():
        # The opener class (brackets/quotes/whitespace) is captured in
        # group 0 so it is preserved by the group(0)-based substitution.
        pattern = re.compile(
            r"([\(\[\"'‘“\s]*)"
            + re.escape(base)
            + f"(?:{suffix_alt})?(?:{josa_alt})?",
            re.IGNORECASE,
        )
        # Bind base/tag as defaults so the callback is self-contained.
        tagged_text = pattern.sub(
            lambda m, b=base, t=tag: m.group(0).replace(b, t), tagged_text
        )
    return tagged_text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
+
def replace_institution_keywords(text, keywords, replace_word):
    """Replace each institution keyword in *text* with *replace_word*.

    Opening brackets/quotes/whitespace before the keyword (group 1) and the
    Korean/punctuation run after it (group 2) are captured and re-emitted
    unchanged, so only the keyword itself is swapped out.

    Args:
        text: the input text.
        keywords: iterable of institution names to replace.
        replace_word: the replacement (e.g. "우리기관").

    Returns:
        The text with every keyword occurrence replaced.
    """
    for kw in keywords:
        # BUGFIX: the original rf'…["'‘“]…' literal nested a single quote
        # inside a single-quoted f-string — a SyntaxError on Python < 3.12.
        pattern = re.compile(
            r"([\s\(\[\"'‘“]*)"
            + re.escape(kw)
            + r"([가-힣\s.,;:!?()\"'”’]*)",
            re.IGNORECASE,
        )
        text = pattern.sub(lambda m: m.group(1) + replace_word + m.group(2), text)
    return text
|
52 |
|
53 |
def postprocess_sensitive_patterns(text):
    """Mask common sensitive patterns in *text* with asterisk placeholders.

    Covered, in order: e-mail addresses, Korean resident registration
    numbers (6-7 digits), phone numbers (3-4-4 digits), and apartment
    building/unit numbers ("…동" / "…호").
    """
    # (pattern, replacement) pairs applied sequentially; order matters —
    # the 13-digit RRN rule must run before the 11-digit phone rule.
    rules = (
        (r"[\w\.-]+@[\w\.-]+", "******@***.***"),
        (r"(\d{6})[- ]?(\d{7})", "******-*******"),
        (r"(\d{3})[- ]?(\d{4})[- ]?(\d{4})", "***-****-****"),
        (r"(\d{1,3})동", "***동"),
        (r"(\d{1,4})호", "****호"),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
def apply_masking(text, keyword_string, replace_word):
    """Full masking pipeline for the Gradio button.

    Steps: replace institution keywords, mask sensitive number/e-mail
    patterns, detect person names via NER, tag them, then expand tag
    coverage to name variants (name + suffix/josa).

    Args:
        text: raw input text.
        keyword_string: comma-separated institution keywords.
        replace_word: replacement for institution keywords.

    Returns:
        (masked_text, mapping_str) — the fully tagged text and a
        "tag → name" listing, one pair per line.
    """
    keywords = [k.strip() for k in keyword_string.split(",") if k.strip()]
    text = replace_institution_keywords(text, keywords, replace_word)
    text = postprocess_sensitive_patterns(text)
    names = extract_names(text)
    tagged, mapping = apply_name_tags(text, names)
    # BUGFIX: the original deferred expand_variation_patterns to a
    # threading.Timer callback that called masked_output.update(value=...).
    # Component.update() outside an event handler only builds a config dict
    # and discards it, so the expanded result never reached the UI.  Run the
    # expansion synchronously and return it instead.
    final_tagged = expand_variation_patterns(tagged, mapping)
    mapping_str = "\n".join(f"{k} → {v}" for k, v in mapping.items())
    return final_tagged, mapping_str
|
|
|
76 |
|
77 |
+
# Part 1 UI — Gradio front-end: paste text, supply institution keywords and a
# replacement word, press the button to run the masking pipeline.
with gr.Blocks() as demo:
    gr.Markdown("🧠 **v4.3A: 이름 + 파생 표현 + 기관 치환 (1단계)**")
    # Source text to anonymise.
    input_text = gr.Textbox(lines=15, label="📄 원문 텍스트")
    # Comma-separated institution keywords, replaced by the value of replace_input.
    keyword_input = gr.Textbox(lines=1, label="기관 키워드 (쉼표로 구분)", value="굿네이버스, 사회복지법인 굿네이버스")
    replace_input = gr.Textbox(lines=1, label="치환할 텍스트", value="우리기관")
    run_button = gr.Button("🚀 실행")
    # Outputs: masked text and the tag → original-name mapping (read-only).
    masked_output = gr.Textbox(lines=15, label="🔐 마스킹 결과")
    mapping_output = gr.Textbox(lines=10, label="🏷️ 태그 매핑", interactive=False)
    # Wire the button: apply_masking(text, keywords, replacement) -> (masked, mapping).
    run_button.click(fn=apply_masking, inputs=[input_text, keyword_input, replace_input], outputs=[masked_output, mapping_output])
|
87 |
+
|
88 |
+
|
89 |
+
# ▶️ Part 2: 학과/학교 초성 변환 + 학년/반 마스킹 + mapping 보정
|
90 |
+
# - 이 부분은 이어서 별도 실행 환경 or 연동 코드로 붙일 수 있습니다.
|
91 |
+
# - 필요 시 한 파일로 통합 가능
|
92 |
+
|
93 |
+
|
94 |
+
# ▶️ Part 2: 학과/학교 초성 변환 + 학년/반 마스킹 + mapping 보정
|
95 |
+
|
96 |
+
def to_chosung(text):
    """Reduce each Hangul syllable in *text* to its leading consonant.

    Precomposed syllables (가..힣) are decomposed arithmetically: each block
    of 588 code points shares one choseong, emitted from the Hangul Jamo
    range starting at U+1100.  Every other character passes through as-is.
    """
    pieces = []
    for ch in text:
        if '가' <= ch <= '힣':
            syllable_index = ord(ch) - ord('가')
            # 588 = 21 medial vowels * 28 final consonants per choseong.
            pieces.append(chr(0x1100 + syllable_index // 588))
        else:
            pieces.append(ch)
    return "".join(pieces)
|
107 |
+
|
108 |
+
def mask_school_names(text):
    """Mask school names: the proper-name part of "…초등학교/중학교/고등학교"
    is converted to its choseong (initial consonants); the school-type
    suffix is kept verbatim."""
    school_re = re.compile(r"([가-힣]{2,20})(초등학교|중학교|고등학교)")
    return school_re.sub(lambda m: to_chosung(m.group(1)) + m.group(2), text)
|
112 |
+
|
113 |
+
def mask_department_names(text):
    """Mask department mentions: the name part of "…학과" is converted to
    choseong while the "학과" suffix is preserved."""
    def _redact(match):
        return to_chosung(match.group(1)) + "학과"
    return re.sub(r"([가-힣]{2,20})학과", _redact, text)
|
115 |
+
|
116 |
+
def mask_grade_class(text):
    """Mask grade/class numbers: "3학년 2반" -> "*학년 *반", "3학년" -> "*학년".

    BUGFIX: the original replaced unconditionally with "*학년 *반", appending
    a class marker even when the source text contained no "…반" — fabricating
    information.  The class part is now masked only when it was present.
    """
    def _mask(m):
        return "*학년 *반" if m.group(2) else "*학년"
    return re.sub(r"(\d)학년(\s?(\d)반)?", _mask, text)
|
118 |
+
|
119 |
+
def mapping_boost_context(text, mapping_dict):
    """Enrich each tag's mapped name with its surface form found near the tag.

    For every ``tag -> name`` pair, a ±100-character window around the tag's
    first occurrence in *text* is searched for the name, optionally preceded
    by one Hangul character and followed by a title suffix and/or josa; the
    matched span becomes the new mapping value.  Falls back to the bare name
    when the tag or a match is absent.
    """
    # NOTE(review): these local lists shadow the module-level constants and
    # use a different suffix set; values kept as-is to preserve behavior.
    title_suffixes = ['학생', '선생님', '씨', '님']
    josa_particles = ['이', '가', '은', '는', '을', '를', '께서', '에게', '에서']
    suffix_alt = '|'.join(title_suffixes)
    josa_alt = '|'.join(josa_particles)
    boosted = {}
    for tag, base in mapping_dict.items():
        position = text.find(tag)
        if position == -1:
            boosted[tag] = base
            continue
        window = text[max(0, position - 100): position + 100]
        finder = re.compile(
            rf"([가-힣])?{re.escape(base)}(?:{suffix_alt})?(?:{josa_alt})?"
        )
        hit = finder.search(window)
        boosted[tag] = hit.group(0) if hit else base
    return boosted
|
136 |
+
|
137 |
+
# 👇 예시: 후처리 최종 적용
|
138 |
+
# final_output = mask_school_names(tagged_text)
|
139 |
+
# final_output = mask_department_names(final_output)
|
140 |
+
# final_output = mask_grade_class(final_output)
|
141 |
+
# mapping = mapping_boost_context(final_output, mapping)
|
142 |
+
|
143 |
+
|
144 |
# Start the Gradio app (blocking call; serves the Space).
demo.launch()
|