Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
@@ -27,18 +27,22 @@ def mask_school_names(text):
|
|
27 |
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
28 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
29 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
30 |
-
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer,
|
31 |
|
32 |
def extract_names(text):
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
34 |
names = []
|
35 |
for entity in results:
|
36 |
-
if entity
|
37 |
name = entity["word"].replace("##", "").strip()
|
38 |
if len(name) >= 2 and name not in names:
|
39 |
names.append(name)
|
40 |
|
41 |
-
# 붙임형 직함 기반
|
42 |
title_suffixes = [
|
43 |
'대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '실장', '팀장', '소장', '국장', '본부장',
|
44 |
'선생님', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '박사', '석사', '학사',
|
@@ -52,7 +56,6 @@ def extract_names(text):
|
|
52 |
if name not in names:
|
53 |
names.append(name)
|
54 |
|
55 |
-
# 띄어쓰기 있는 지칭어 형태에서도 이름 추출
|
56 |
honorific_suffixes = [
|
57 |
'어머니', '아버지', '엄마', '아빠', '할머니', '할아버지', '외할머니', '외할아버지',
|
58 |
'형', '누나', '언니', '오빠', '동생', '아들', '딸',
|
@@ -105,12 +108,3 @@ def refactored_mask_names(original_text, names, start_counter=100):
|
|
105 |
|
106 |
def to_chosung(text):
|
107 |
CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
|
108 |
-
result = ""
|
109 |
-
for ch in text:
|
110 |
-
if '가' <= ch <= '힣':
|
111 |
-
code = ord(ch) - ord('가')
|
112 |
-
cho = code // 588
|
113 |
-
result += CHOSUNG_LIST[cho]
|
114 |
-
else:
|
115 |
-
result += ch
|
116 |
-
return result
|
|
|
27 |
model_name = "Leo97/KoELECTRA-small-v3-modu-ner"
|
28 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
29 |
model = AutoModelForTokenClassification.from_pretrained(model_name)
|
30 |
+
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
31 |
|
32 |
def extract_names(text):
|
33 |
+
try:
|
34 |
+
results = ner_pipeline(text)
|
35 |
+
except Exception as e:
|
36 |
+
print("NER 오류 발생:", e)
|
37 |
+
return []
|
38 |
+
|
39 |
names = []
|
40 |
for entity in results:
|
41 |
+
if entity.get("entity_group") == "PS":
|
42 |
name = entity["word"].replace("##", "").strip()
|
43 |
if len(name) >= 2 and name not in names:
|
44 |
names.append(name)
|
45 |
|
|
|
46 |
title_suffixes = [
|
47 |
'대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '실장', '팀장', '소장', '국장', '본부장',
|
48 |
'선생님', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '박사', '석사', '학사',
|
|
|
56 |
if name not in names:
|
57 |
names.append(name)
|
58 |
|
|
|
59 |
honorific_suffixes = [
|
60 |
'어머니', '아버지', '엄마', '아빠', '할머니', '할아버지', '외할머니', '외할아버지',
|
61 |
'형', '누나', '언니', '오빠', '동생', '아들', '딸',
|
|
|
108 |
|
109 |
def to_chosung(text):
|
110 |
CHOSUNG_LIST = [chr(i) for i in range(0x1100, 0x1113)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|