import re

import torch
from transformers import RobertaTokenizerFast, AutoModelForTokenClassification

# Load the fine-tuned CV/resume NER checkpoint and its tokenizer.
# add_prefix_space=True is required so the fast RoBERTa tokenizer accepts
# pre-split (is_split_into_words=True) input in chunked_inference below.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "mrfirdauss/robert-base-finetuned-cv", add_prefix_space=True
)
model = AutoModelForTokenClassification.from_pretrained("mrfirdauss/robert-base-finetuned-cv")

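# Optional (assumption: this script keeps everything on CPU). To use a GPU, the
# model and every input tensor would need moving, e.g.:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)
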
id2label = {
    0: 'O',
    1: 'B-NAME', 2: 'I-NAME',
    3: 'B-NATION', 4: 'I-NATION',
    5: 'B-EMAIL', 6: 'I-EMAIL',
    7: 'B-URL', 8: 'I-URL',
    9: 'B-CAMPUS', 10: 'I-CAMPUS',
    11: 'B-MAJOR', 12: 'I-MAJOR',
    13: 'B-COMPANY', 14: 'I-COMPANY',
    15: 'B-DESIGNATION', 16: 'I-DESIGNATION',
    17: 'B-GPA', 18: 'I-GPA',
    19: 'B-PHONE NUMBER', 20: 'I-PHONE NUMBER',
    21: 'B-ACHIEVEMENT', 22: 'I-ACHIEVEMENT',
    23: 'B-EXPERIENCES DESC', 24: 'I-EXPERIENCES DESC',
    25: 'B-SKILLS', 26: 'I-SKILLS',
    27: 'B-PROJECTS', 28: 'I-PROJECTS',
}

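# The checkpoint's config may already carry this mapping; if it was exported with
# its labels (an assumption worth verifying), the hand-written dict above could be
# replaced with:
#   id2label = {int(k): v for k, v in model.config.id2label.items()}
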
def merge_subwords(tokens, labels):
    """Merge RoBERTa BPE sub-tokens back into whole words, keeping each word's first label."""
    merged_tokens = []
    merged_labels = []

    current_token = ""
    current_label = ""

    for token, label in zip(tokens, labels):
        # "Ġ" marks a sub-token that starts a new (space-preceded) word.
        if token.startswith("Ġ"):
            if current_token:
                merged_tokens.append(current_token)
                merged_labels.append(current_label)
            current_token = token[1:]
            current_label = label
        else:
            # Continuation sub-token: glue it onto the current word. A sequence-initial
            # token without "Ġ" starts a fresh word, so take its label as a fallback.
            if not current_token:
                current_label = label
            current_token += token

    # Flush the final accumulated word.
    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)

    return merged_tokens, merged_labels

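# Illustrative example (assumed sub-tokens, not from a real tokenizer run):
#   merge_subwords(['ĠJohn', 'ĠDo', 'e'], ['B-NAME', 'I-NAME', 'I-NAME'])
#   -> (['John', 'Doe'], ['B-NAME', 'I-NAME'])
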
def chunked_inference(text, tokenizer, model, max_length=512):
    """Tag arbitrarily long text by running the model over max_length-sized chunks."""
    # Pre-split into words and punctuation so the tokenizer sees word boundaries.
    words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    tokens = tokenizer.tokenize(words, is_split_into_words=True)

    # Reserve two positions per chunk for the <s> and </s> special tokens.
    input_ids_chunks = []
    for i in range(0, len(tokens), max_length - 2):
        chunk = tokens[i:i + max_length - 2]
        chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
        chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
        input_ids_chunks.append(chunk_ids)

    input_ids_chunks = [torch.tensor(chunk_ids).unsqueeze(0) for chunk_ids in input_ids_chunks]
    if not input_ids_chunks:
        return [], []

    predictions = []
    with torch.no_grad():
        for input_ids in input_ids_chunks:
            attention_mask = torch.ones_like(input_ids)
            output = model(input_ids, attention_mask=attention_mask)
            logits = output[0] if isinstance(output, tuple) else output.logits
            predictions_chunk = torch.argmax(logits, dim=-1).squeeze(0)
            # Drop the predictions for the <s> and </s> positions.
            predictions.append(predictions_chunk[1:-1])

    predictions = torch.cat(predictions, dim=0)
    predicted_labels = [id2label[pred.item()] for pred in predictions]
    return merge_subwords(tokens, predicted_labels)

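# Note: chunks are cut at hard token boundaries, so an entity straddling a chunk
# edge can be split in two. If that matters, the tokenizer's built-in overflow
# handling (return_overflowing_tokens=True with a stride) is one way to give
# chunks overlapping context.
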
def process_tokens(tokens, tag_prefix):
    """Group (token, tag) pairs into entities whose tags end with tag_prefix."""
    entities = []
    current_entity = {}

    for token, tag in tokens:
        if tag.startswith('B-') and tag.endswith(tag_prefix):
            # A new entity begins; flush the previous one, if any.
            if current_entity:
                entities.append(current_entity)
                current_entity = {}
            current_entity['text'] = token
            current_entity['type'] = tag
        elif tag.startswith('I-') and tag.endswith(tag_prefix) and current_entity:
            # GPA and URL fragments are rejoined without spaces
            # (e.g. '3', '.', '5' -> '3.5'); everything else is space-joined.
            separator = '' if tag_prefix in ('GPA', 'URL') else ' '
            current_entity['text'] += separator + token

    if current_entity:
        entities.append(current_entity)
    return entities

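# Illustrative example (assumed tagged input):
#   process_tokens([('unit', 'B-SKILLS'), ('testing', 'I-SKILLS'),
#                   ('Django', 'B-SKILLS')], 'SKILLS')
#   -> [{'text': 'unit testing', 'type': 'B-SKILLS'}, {'text': 'Django', 'type': 'B-SKILLS'}]
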
def predict(text):
    """Extract a structured profile (name, links, skills, experiences, educations) from CV text."""
    tokens, predictions = chunked_inference(text, tokenizer, model)
    data = list(zip(tokens, predictions))

    profile = {
        "name": "",
        "links": [],
        "skills": [],
        "experiences": [],
        "educations": []
    }

    profile['name'] = ' '.join(t for t, p in data if p.endswith('NAME'))

    for skill in process_tokens(data, 'SKILLS'):
        profile['skills'].append(skill['text'])

    for link in process_tokens(data, 'URL'):
        profile['links'].append(link['text'])

    # zip() pairs the i-th designation with the i-th company and description, so the
    # three entity streams are assumed to appear in matching order and number.
    for designation, company, experience_desc in zip(process_tokens(data, 'DESIGNATION'),
                                                     process_tokens(data, 'COMPANY'),
                                                     process_tokens(data, 'EXPERIENCES DESC')):
        profile['experiences'].append({
            "start": None,
            "end": None,
            "designation": designation['text'],
            "company": company['text'],
            "experience_description": experience_desc['text']
        })

    for major, gpa, campus in zip(process_tokens(data, 'MAJOR'),
                                  process_tokens(data, 'GPA'),
                                  process_tokens(data, 'CAMPUS')):
        profile['educations'].append({
            "start": None,
            "end": None,
            "major": major['text'],
            "campus": campus['text'],
            "GPA": gpa['text']
        })

    return profile
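
# Minimal usage sketch. The sample text is illustrative only; running this
# downloads the checkpoint from the Hugging Face Hub on first use.
if __name__ == "__main__":
    sample_cv = (
        "John Doe\n"
        "https://github.com/johndoe\n"
        "Skills: Python, PyTorch, SQL\n"
        "Software Engineer at Acme Corp: built and maintained data pipelines."
    )
    print(predict(sample_cv))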