mrfirdauss committed
Commit c795ebb · verified · 1 parent: e090d55

Update extractor.py

Files changed (1)
  1. extractor.py +158 -158
extractor.py CHANGED
@@ -1,159 +1,159 @@
- from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
- import re
- import torch
-
- tokenizer = RobertaTokenizerFast.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
- model = AutoModelForTokenClassification.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
-
- id2label = {0: 'O',
-             1: 'B-NAME',
-             3: 'B-NATION',
-             5: 'B-EMAIL',
-             7: 'B-URL',
-             9: 'B-CAMPUS',
-             11: 'B-MAJOR',
-             13: 'B-COMPANY',
-             15: 'B-DESIGNATION',
-             17: 'B-GPA',
-             19: 'B-PHONE NUMBER',
-             21: 'B-ACHIEVEMENT',
-             23: 'B-EXPERIENCES DESC',
-             25: 'B-SKILLS',
-             27: 'B-PROJECTS',
-             2: 'I-NAME',
-             4: 'I-NATION',
-             6: 'I-EMAIL',
-             8: 'I-URL',
-             10: 'I-CAMPUS',
-             12: 'I-MAJOR',
-             14: 'I-COMPANY',
-             16: 'I-DESIGNATION',
-             18: 'I-GPA',
-             20: 'I-PHONE NUMBER',
-             22: 'I-ACHIEVEMENT',
-             24: 'I-EXPERIENCES DESC',
-             26: 'I-SKILLS',
-             28: 'I-PROJECTS'}
-
- def merge_subwords(tokens, labels):
-     merged_tokens = []
-     merged_labels = []
-
-     current_token = ""
-     current_label = ""
-
-     for token, label in zip(tokens, labels):
-         if token.startswith("Ġ"):
-             if current_token:
-                 # Append the accumulated subwords as a new token and label
-                 merged_tokens.append(current_token)
-                 merged_labels.append(current_label)
-             # Start a new token and label
-             current_token = token[1:] # Remove the 'Ġ'
-             current_label = label
-         else:
-             # Continue accumulating subwords into the current token
-             current_token += token
-
-     # Append the last token and label
-     if current_token:
-         merged_tokens.append(current_token)
-         merged_labels.append(current_label)
-
-     return merged_tokens, merged_labels
-
- def chunked_inference(text, tokenizer, model, max_length=512):
-     # Tokenize the text with truncation=False to get the full list of tokens
-     tok = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
-     tokens = tokenizer.tokenize(tok, is_split_into_words=True)
-     # Initialize containers for tokenized inputs
-     input_ids_chunks = []
-     # Decode and print each token
-     print(tokens)
-     # Create chunks of tokens that fit within the model's maximum input size
-     for i in range(0, len(tokens), max_length - 2): # -2 accounts for special tokens [CLS] and [SEP]
-         chunk = tokens[i:i + max_length - 2]
-         # Encode the chunks. Add special tokens via the tokenizer
-         chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
-         chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
-         input_ids_chunks.append(chunk_ids)
-
-     # Convert list of token ids into a tensor
-     input_ids_chunks = [torch.tensor(chunk_ids).unsqueeze(0) for chunk_ids in input_ids_chunks]
-
-     # Predictions container
-     predictions = []
-
-     # Process each chunk
-     for input_ids in input_ids_chunks:
-         attention_mask = torch.ones_like(input_ids) # Create an attention mask for the inputs
-         output = model(input_ids, attention_mask=attention_mask)
-         logits = output[0] if isinstance(output, tuple) else output.logits
-         predictions_chunk = torch.argmax(logits, dim=-1).squeeze(0)
-         predictions.append(predictions_chunk[1:-1])
-
-     # Optionally, you can convert predictions to labels here
-     # Flatten the list of tensors into one long tensor for label mapping
-     predictions = torch.cat(predictions, dim=0)
-     predicted_labels = [id2label[pred.item()] for pred in predictions]
-     return merge_subwords(tokens,predicted_labels)
-
- def process_tokens(tokens, tag_prefix):
-     # Process tokens to extract entities based on the tag prefix
-     entities = []
-     current_entity = {}
-     for token, tag in tokens:
-         if tag.startswith('B-') and tag.endswith(tag_prefix):
-             # Start a new entity
-             if current_entity:
-                 # Append the current entity before starting a new one
-                 entities.append(current_entity)
-                 current_entity = {}
-             current_entity['text'] = token
-             current_entity['type'] = tag
-         elif tag.startswith('I-') and (tag.endswith('GPA') or tag.endswith('URL')) and current_entity:
-             current_entity['text'] += '' + token
-         elif tag.startswith('I-') and tag.endswith(tag_prefix) and current_entity:
-             # Continue the current entity
-             current_entity['text'] += ' ' + token
-     # Append the last entity if there is one
-     if current_entity:
-         entities.append(current_entity)
-     return entities
-
- def predict(text):
-     tokens, predictions = chunked_inference(text, tokenizer, model)
-     data = list(zip(tokens, predictions))
-     profile = {
-         "name": "",
-         "links": [],
-         "skills": [],
-         "experiences": [],
-         "educations": []
-     }
-     profile['name'] = ' '.join([t for t, p in data if p.endswith('NAME')])
-
-     for skills in process_tokens(data, 'SKILLS'):
-         profile['skills'].append(skills['text'])
-     #Links
-     for links in process_tokens(data, 'URL'):
-         profile['links'].append(links['text'])
-     # Process experiences and education
-     for designation, company, experience_desc in zip(process_tokens(data, 'DESIGNATION'),process_tokens(data, 'CAMPUS'),process_tokens(data, 'EXPERIENCES DESC') ):
-         profile['experiences'].append({
-             "start": None,
-             "end": None,
-             "designation": designation['text'],
-             "company": company['text'], # To be filled in similarly
-             "experience_description": experience_desc['text'] # To be filled in similarly
-         })
-     for major, gpa, campus in zip(process_tokens(data, 'MAJOR'), process_tokens(data, 'GPA'), process_tokens(data, 'CAMPUS')):
-         profile['educations'].append({
-             "start": None,
-             "end": None,
-             "major": major['text'],
-             "campus": campus['text'], # To be filled in similarly
-             "GPA": gpa['text'] # To be filled in similarly
-         })
-
 
+ from transformers import RobertaTokenizerFast, AutoModelForTokenClassification
+ import re
+ import torch
+
+ tokenizer = RobertaTokenizerFast.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
+ model = AutoModelForTokenClassification.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
+
+ id2label = {0: 'O',
+             1: 'B-NAME',
+             3: 'B-NATION',
+             5: 'B-EMAIL',
+             7: 'B-URL',
+             9: 'B-CAMPUS',
+             11: 'B-MAJOR',
+             13: 'B-COMPANY',
+             15: 'B-DESIGNATION',
+             17: 'B-GPA',
+             19: 'B-PHONE NUMBER',
+             21: 'B-ACHIEVEMENT',
+             23: 'B-EXPERIENCES DESC',
+             25: 'B-SKILLS',
+             27: 'B-PROJECTS',
+             2: 'I-NAME',
+             4: 'I-NATION',
+             6: 'I-EMAIL',
+             8: 'I-URL',
+             10: 'I-CAMPUS',
+             12: 'I-MAJOR',
+             14: 'I-COMPANY',
+             16: 'I-DESIGNATION',
+             18: 'I-GPA',
+             20: 'I-PHONE NUMBER',
+             22: 'I-ACHIEVEMENT',
+             24: 'I-EXPERIENCES DESC',
+             26: 'I-SKILLS',
+             28: 'I-PROJECTS'}
+
+ def merge_subwords(tokens, labels):
+     merged_tokens = []
+     merged_labels = []
+
+     current_token = ""
+     current_label = ""
+
+     for token, label in zip(tokens, labels):
+         if token.startswith("Ġ"):
+             if current_token:
+                 # Append the accumulated subwords as a new token and label
+                 merged_tokens.append(current_token)
+                 merged_labels.append(current_label)
+             # Start a new token and label
+             current_token = token[1:] # Remove the 'Ġ'
+             current_label = label
+         else:
+             # Continue accumulating subwords into the current token
+             current_token += token
+
+     # Append the last token and label
+     if current_token:
+         merged_tokens.append(current_token)
+         merged_labels.append(current_label)
+
+     return merged_tokens, merged_labels
+
+ def chunked_inference(text, tokenizer, model, max_length=512):
+     # Tokenize the text with truncation=False to get the full list of tokens
+     tok = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
+     tokens = tokenizer.tokenize(tok, is_split_into_words=True)
+     # Initialize containers for tokenized inputs
+     input_ids_chunks = []
+     # Decode and print each token
+     print(tokens)
+     # Create chunks of tokens that fit within the model's maximum input size
+     for i in range(0, len(tokens), max_length - 2): # -2 accounts for special tokens [CLS] and [SEP]
+         chunk = tokens[i:i + max_length - 2]
+         # Encode the chunks. Add special tokens via the tokenizer
+         chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
+         chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
+         input_ids_chunks.append(chunk_ids)
+
+     # Convert list of token ids into a tensor
+     input_ids_chunks = [torch.tensor(chunk_ids).unsqueeze(0) for chunk_ids in input_ids_chunks]
+
+     # Predictions container
+     predictions = []
+
+     # Process each chunk
+     for input_ids in input_ids_chunks:
+         attention_mask = torch.ones_like(input_ids) # Create an attention mask for the inputs
+         output = model(input_ids, attention_mask=attention_mask)
+         logits = output[0] if isinstance(output, tuple) else output.logits
+         predictions_chunk = torch.argmax(logits, dim=-1).squeeze(0)
+         predictions.append(predictions_chunk[1:-1])
+
+     # Optionally, you can convert predictions to labels here
+     # Flatten the list of tensors into one long tensor for label mapping
+     predictions = torch.cat(predictions, dim=0)
+     predicted_labels = [id2label[pred.item()] for pred in predictions]
+     return merge_subwords(tokens,predicted_labels)
+
+ def process_tokens(tokens, tag_prefix):
+     # Process tokens to extract entities based on the tag prefix
+     entities = []
+     current_entity = {}
+     for token, tag in tokens:
+         if tag.startswith('B-') and tag.endswith(tag_prefix):
+             # Start a new entity
+             if current_entity:
+                 # Append the current entity before starting a new one
+                 entities.append(current_entity)
+                 current_entity = {}
+             current_entity['text'] = token
+             current_entity['type'] = tag
+         elif tag.startswith('I-') and (('GPA') == tag_prefix or tag_prefix == ('URL')) and tag.endswith(tag_prefix) and current_entity:
+             current_entity['text'] += '' + token
+         elif tag.startswith('I-') and tag.endswith(tag_prefix) and current_entity:
+             # Continue the current entity
+             current_entity['text'] += ' ' + token
+     # Append the last entity if there is one
+     if current_entity:
+         entities.append(current_entity)
+     return entities
+
+ def predict(text):
+     tokens, predictions = chunked_inference(text, tokenizer, model)
+     data = list(zip(tokens, predictions))
+     profile = {
+         "name": "",
+         "links": [],
+         "skills": [],
+         "experiences": [],
+         "educations": []
+     }
+     profile['name'] = ' '.join([t for t, p in data if p.endswith('NAME')])
+
+     for skills in process_tokens(data, 'SKILLS'):
+         profile['skills'].append(skills['text'])
+     #Links
+     for links in process_tokens(data, 'URL'):
+         profile['links'].append(links['text'])
+     # Process experiences and education
+     for designation, company, experience_desc in zip(process_tokens(data, 'DESIGNATION'),process_tokens(data, 'COMPANY'),process_tokens(data, 'EXPERIENCES DESC') ):
+         profile['experiences'].append({
+             "start": None,
+             "end": None,
+             "designation": designation['text'],
+             "company": company['text'], # To be filled in similarly
+             "experience_description": experience_desc['text'] # To be filled in similarly
+         })
+     for major, gpa, campus in zip(process_tokens(data, 'MAJOR'), process_tokens(data, 'GPA'), process_tokens(data, 'CAMPUS')):
+         profile['educations'].append({
+             "start": None,
+             "end": None,
+             "major": major['text'],
+             "campus": campus['text'], # To be filled in similarly
+             "GPA": gpa['text'] # To be filled in similarly
+         })
+
  return profile
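
For reference, a minimal usage sketch of the updated module (illustrative only, not part of the commit; it assumes `extractor.py` is importable from the working directory, and the sample CV text below is a made-up placeholder):

```python
# Hypothetical usage of extractor.predict(); the CV text is a placeholder.
from extractor import predict

cv_text = (
    "Jane Doe\n"
    "jane.doe@example.com | https://github.com/janedoe\n"
    "Universitas Indonesia, Computer Science, GPA 3.8\n"
    "Software Engineer at Acme Corp: built NER pipelines with PyTorch.\n"
)

profile = predict(cv_text)
print(profile["name"])         # NAME tokens joined with spaces
print(profile["links"])        # URL entity texts
print(profile["skills"])       # SKILLS entity texts
print(profile["experiences"])  # dicts with designation / company / experience_description
print(profile["educations"])   # dicts with major / campus / GPA
```

Note that the tokenizer and model are created at module level, so importing `extractor` downloads the `mrfirdauss/robert-base-finetuned-cv` checkpoint from the Hub on first use; inference runs on CPU unless the model is moved to a GPU explicitly.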