blueradiance committed on
Commit
9510376
·
verified ·
1 Parent(s): 7c61b5f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -0
app.py CHANGED
@@ -35,6 +35,7 @@ model = AutoModelForTokenClassification.from_pretrained(model_name)
35
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
36
 
37
  def extract_names(text):
 
38
  results = ner_pipeline(text)
39
  names = []
40
  for entity in results:
@@ -42,8 +43,30 @@ def extract_names(text):
42
  name = entity["word"].replace("##", "").strip()
43
  if len(name) >= 2 and name not in names:
44
  names.append(name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  return names
46
 
 
47
  def refactored_mask_names(original_text, names, start_counter=100):
48
  korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로',
49
  '까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도',
 
35
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)
36
 
37
  def extract_names(text):
38
+ # 1. 기존 NER 기반 추출
39
  results = ner_pipeline(text)
40
  names = []
41
  for entity in results:
 
43
  name = entity["word"].replace("##", "").strip()
44
  if len(name) >= 2 and name not in names:
45
  names.append(name)
46
+
47
# 2. Supplement the NER results with title-based name extraction.
title_suffixes = [
    # Corporate titles
    '대표', '이사', '전무', '상무', '부장', '차장', '과장', '대리', '사원', '실장', '팀장', '소장', '국장', '본부장',
    # Education-related titles
    '선생님', '교사', '교장', '교감', '부교장', '조교수', '교수', '연구원', '박사', '석사', '학사',
    # Student-related terms
    '학생', '고등학생', '중학생', '초등학생', '학부모', '수험생',
    # Other social forms of address
    '주임', '총무', '회장', '부회장', '사무장', '간호사', '의사', '원장', '기사님', '매니저', '지점장',
]

# Matches a 2-4 syllable Hangul name immediately followed by a title, with
# a word boundary on both sides (e.g. "김철수과장", "홍길동선생님").
# A single-syllable surname plus title (e.g. "김과장") does NOT match,
# because the name group requires at least two syllables.
#
# The name group is lazy and the titles are tried longest-first so that a
# longer title is never split with its first syllable absorbed into the
# name: "김철수부회장" must yield "김철수" + "부회장", not "김철수부" + "회장".
# NOTE(review): the trailing \b means a title followed directly by a
# particle (e.g. "김철수과장은") is not matched — confirm this is intended.
title_alternation = '|'.join(sorted(title_suffixes, key=len, reverse=True))
pattern = r'\b([가-힣]{2,4}?)(' + title_alternation + r')\b'
for name, _title in re.findall(pattern, text):
    # Keep first-seen order and avoid duplicating names already collected.
    if name not in names:
        names.append(name)
66
+
67
  return names
68
 
69
+
70
  def refactored_mask_names(original_text, names, start_counter=100):
71
  korean_josa = ['이가','를','은','는','을','도','만','과','와','에게','에서','으로',
72
  '까지','조차','마저','이며','이다','이나','이나마','밖에','이든','이라도',