llm tagging & training functions done
Files changed:
- config.yaml +4 -0
- data/data.jsonl +35 -0
- few-shot-extract.py +10 -11
- few_shot.txt +299 -0
- llm-tagging.py +21 -92
- train.py +113 -129
config.yaml (ADDED)

training:
  epochs: 3
  batch_size: 16
  learning_rate: 0.00005
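For reference, train.py (below) reads these hyperparameters with yaml.safe_load; a minimal sketch of the lookup, assuming PyYAML is installed:

import yaml

with open("config.yaml") as f:
    config = yaml.safe_load(f)

print(config["training"]["epochs"])         # 3
print(config["training"]["learning_rate"])  # 5e-05
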
data/data.jsonl (ADDED)

{"tokens": ["About", "the", "job"], "tags_knowledge": ["O", "O", "O"]}
{"tokens": ["G", "##row", "with", "us"], "tags_knowledge": ["O", "O", "O", "O"]}
{"tokens": ["About", "This", "Op", "##port", "##unity"], "tags_knowledge": ["O", "O", "O", "O", "O"]}
{"tokens": ["Eric", "##sson", "is", "a", "world", "-", "leading", "provider", "of", "telecommunications", "equipment", "and", "services", "to", "mobile", "and", "fixed", "network", "operators", "."], "tags_knowledge": ["B", "I", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
{"tokens": ["Over", "1", ",", "000", "networks", "in", "more", "than", "180", "countries", "use", "Eric", "##sson", "equipment", ",", "and", "more", "than", "40", "percent", "of", "the", "world", "'", "s", "mobile", "traffic", "passes", "through", "Eric", "##sson", "networks", "."], "tags_knowledge": ["O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "B", "I", "O"]}
{"tokens": ["Using", "innovation", "to", "em", "##power", "people", ",", "business", "and", "society", ",", "Eric", "##sson", "is", "working", "towards", "the", "Network", "##ed", "Society", ":", "a", "world", "connected", "in", "real", "time", "that", "will", "open", "opportunities", "to", "create", "freedom", ",", "transform", "society", "and", "drive", "solutions", "to", "some", "of", "our", "planet", "\u2019", "s", "greatest", "challenges", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "B", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["Eric", "##sson", "'", "s", "6", "##G", "vision", ",", "first", "introduced", "in", "2020", ",", "remains", "pivotal", "for", "transforming", "business", "and", "society", "in", "the", "203", "##0s", "through", "secure", ",", "efficient", ",", "and", "sustainable", "communication", "services", "."], "tags_knowledge": ["B", "I", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O"]}
{"tokens": ["As", "6", "##G", "development", "progresses", "into", "a", "more", "concrete", "phase", "of", "regulation", "and", "standard", "##ization", "we", "are", "looking", "for", "researchers", "that", "would", "like", "to", "join", "us", ",", "co", "-", "creating", "a", "c", "##y", "##ber", "-", "physical", "world"], "tags_knowledge": ["O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["Within", "Eric", "##sson", ",", "Eric", "##sson", "Research", "develops", "new", "communication", "solutions", "and", "standards", "which", "have", "made", "Eric", "##sson", "the", "industry", "leader", "in", "defining", "five", "generations", "of", "mobile", "communication", "."], "tags_knowledge": ["O", "B", "I", "O", "B", "I", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O"]}
{"tokens": ["As", "we", "gear", "up", "for", "the", "6th", "generation", ",", "we", "would", "like", "to", "fully", "embrace", "and", "utilize", "cloud", "native", "principles", ",", "h", "##yper", "##sca", "##lers", "and", "internal", "cloud", "infrastructure", "in", "our", "research", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
{"tokens": ["We", "are", "now", "looking", "for", "a", "M", "##L", "##O", "##ps", "research", "engineer", "to", "develop", "and", "support", "our", "work", "##flow", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["In", "this", "role", ",", "you", "will"], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
{"tokens": ["Con", "##tri", "##but", "##e", "to", "the", "direction", "and", "implementation", "of", "M", "##L", "-", "based", "ways", "of", "working"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O", "O"]}
{"tokens": ["Study", ",", "design", "and", "develop", "work", "##flow", "##s", "and", "solutions", "for", "AI", "based", "R", "&", "D"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "O", "O", "O", "O"]}
{"tokens": ["Work", "across", "internal", "com", "##pute", "and", "external", "cloud", "platforms"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "B", "O"]}
{"tokens": ["Working", "closely", "with", "researchers", "driving", "6", "##G", "standard", "##ization"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "I", "B", "I"]}
{"tokens": ["Jo", "##in", "our", "Team"], "tags_knowledge": ["O", "O", "O", "O"]}
{"tokens": ["Qualification", "##s"], "tags_knowledge": ["O", "O"]}
{"tokens": ["MS", "##c", "in", "Data", "Science", "or", "related", "field", ",", "or", "have", "equivalent", "practical", "experience"], "tags_knowledge": ["B", "I", "O", "B", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["Technical", "skills", "and", "/", "or", "professional", "experience", ",", "particularly", "in", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["Programming", "in", "various", "languages", "(", "Python", ",", "Go", ",", "etc", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "B", "O", "B", "O", "O", "O"]}
{"tokens": ["M", "##L", "##O", "##ps", "technologies", "and", "tool", "##ing", "(", "e", ".", "g", ".", "M", "##LF", "##low", ",", "Ku", "##be", "##flow", ")"], "tags_knowledge": ["B", "I", "I", "I", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O"]}
{"tokens": ["Di", "##sp", "##atch", "##ing", "and", "computational", "Python", "packages", "(", "H", "##yd", "##ra", ",", "n", "##ump", "##y", ",", "Ten", "##sor", "##F", "##low", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "B", "O", "O", "B", "I", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "O", "O", "O"]}
{"tokens": ["Dev", "##O", "##ps", "and", "C", "##I", "/", "CD", "experience", ",", "runner", "deployment", "&", "management", ",", "pipeline", "creation", ",", "testing", "etc", ".", "for", "valid", "##ating", "M", "##L", "-", "driven", "code"], "tags_knowledge": ["B", "I", "I", "O", "B", "I", "O", "B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I", "O", "O", "O", "O"]}
{"tokens": ["F", "##ami", "##lia", "##rity", "in", "the", "following", "is", "a", "plus", ":"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["M", "##L", "framework", "##s", "(", "P", "##y", "##T", "##or", "##ch", ",", "Ten", "##sor", "##F", "##low", ",", "or", "Jax", ")"], "tags_knowledge": ["B", "I", "O", "O", "O", "B", "I", "I", "I", "I", "O", "B", "I", "I", "I", "O", "O", "B", "O"]}
{"tokens": ["Con", "##tain", "##ers", "technologies", "(", "engines", ",", "orchestra", "##tion", "tools", "and", "framework", "##s", "such", "as", "Dock", "##er", ",", "Ka", "##nik", "##o", ",", "Ku", "##ber", "##net", "##es", ",", "He", "##lm", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "I", "O", "O", "O", "O", "O", "O", "B", "I", "O", "B", "I", "I", "O", "B", "I", "I", "I", "O", "B", "I", "O", "O", "O", "O"]}
{"tokens": ["Cloud", "ecosystems", "along", "with", "the", "respective", "infrastructure", ",", "in", "particular", "A", "##WS"], "tags_knowledge": ["B", "O", "O", "O", "O", "O", "O", "O", "O", "O", "B", "I"]}
{"tokens": ["Infrastructure", "management", "(", "An", "##sible", ",", "Terra", "##form", ",", "etc", ".", ")"], "tags_knowledge": ["O", "O", "O", "B", "I", "O", "B", "I", "O", "O", "O", "O"]}
{"tokens": ["Team", "skills", "is", "a", "necessity", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O"]}
{"tokens": ["Daily", "cross", "-", "functional", "collaboration", "and", "interaction", "with", "other", "skilled", "researchers", "are", "the", "basis", "for", "our", "ways", "of", "working", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["You", "should", "enjoy", "working", "with", "people", "having", "diverse", "backgrounds", "and", "competence", "##s", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["It", "is", "important", "that", "you", "have", "strong", "personal", "drive", "and", "a", "strong", "focus", "on", "the", "tasks", "at", "hand", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["A", "##bility", "to", "translate", "high", "-", "level", "objectives", "into", "detailed", "tasks", "and", "action", "##able", "steps", "."], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O", "O"]}
{"tokens": ["Location", ":", "Lu", "##le", "##\u00e5", ",", "Sweden"], "tags_knowledge": ["O", "O", "O", "O", "O", "O", "O"]}
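Each record pairs WordPiece tokens with one BIO knowledge tag per token (note that a few records above carry one tag fewer than tokens). A minimal sanity-check sketch, not part of the commit:

import json

with open("data/data.jsonl") as f:
    for i, line in enumerate(f, start=1):
        rec = json.loads(line)
        if len(rec["tokens"]) != len(rec["tags_knowledge"]):
            print(f'line {i}: {len(rec["tokens"])} tokens vs {len(rec["tags_knowledge"])} tags')
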
few-shot-extract.py (CHANGED)

@@ -1,6 +1,7 @@
 import requests
 import os
-repo_dir = os.
+repo_dir = os.getcwd()
+print(repo_dir)

 def show_examples(n = 10):

@@ -10,16 +11,14 @@ def show_examples(n = 10):
     if response.status_code == 200:

         data = response.json()
-
-        file.write(f'
-        file.write(f'
-        file.write(f'Skill Labels: {str(skill_labels)}\n')
-        file.write(f'Knowledge Labels: {str(knowledge_labels)}\n')
+
+        tags_knowledge = [str(a['row']['tags_knowledge']) for a in data['rows']]
+        tokens = [str(a['row']['tokens']) for a in data['rows']]
+
+        with open(f"{repo_dir}/few_shot.txt", 'w') as file:
+            for i in range(n):
+                file.write(f'tags_knowledge: {tags_knowledge[i]}\n')
+                file.write(f'tokens: {tokens[i]}\n')
                 file.write('\n')
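For context, the two list comprehensions above assume what looks like a Hugging Face datasets-server /rows response: a top-level "rows" list whose entries nest each record under "row". A minimal illustration with hypothetical values borrowed from data/data.jsonl:

data = {
    "rows": [
        {"row_idx": 0,
         "row": {"tokens": ["About", "the", "job"],
                 "tags_knowledge": ["O", "O", "O"]}}
    ]
}
tags_knowledge = [str(a['row']['tags_knowledge']) for a in data['rows']]
tokens = [str(a['row']['tokens']) for a in data['rows']]
print(tags_knowledge[0])  # "['O', 'O', 'O']"
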
few_shot.txt (ADDED)

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Senior', 'QA', 'Engineer', '(', 'm/f/d', ')', '<ORGANIZATION>']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
Tokens: ['<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<ADDRESS>', '<LOCATION>']

Tags Knowledge: ['O', 'O', 'O']
Tokens: ['Date', 'posted:', '2021-07-14']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']

Tags Knowledge: ['O', 'O']
Tokens: ['Job', 'description:']

Tags Knowledge: ['O', 'O']
Tokens: ['Location', 'options:']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
Tokens: ['Remote', 'Visa', 'sponsor', 'Paid', 'relocation']

Tags Knowledge: ['O', 'O']
Tokens: ['Job', 'type:']

Tags Knowledge: ['O']
Tokens: ['Full-time']

Tags Knowledge: ['O', 'O']
Tokens: ['Experience', 'level:']

Tags Knowledge: ['O']
Tokens: ['Senior']

Tags Knowledge: ['O']
Tokens: ['Role:']

Tags Knowledge: ['O', 'O']
Tokens: ['QA/Test', 'Developer']

Tags Knowledge: ['O']
Tokens: ['Industry:']

Tags Knowledge: ['B', 'I', 'I', 'B', 'I', 'B', 'I']
Tokens: ['Business', 'to', 'Business', 'Information', 'Technology', 'Web', 'Technology']

Tags Knowledge: ['O', 'O']
Tokens: ['Company', 'size:']

Tags Knowledge: ['O', 'O']
Tokens: ['501-1k', 'people']

Tags Knowledge: ['O', 'O']
Tokens: ['Company', 'type:']

Tags Knowledge: ['O']
Tokens: ['Private']

Tags Knowledge: ['O']
Tokens: ['Technologies']

Tags Knowledge: ['B', 'B', 'B', 'B', 'B']
Tokens: ['docker', 'agile', 'selenium', 'circleci', 'jenkins']

Tags Knowledge: ['O', 'O']
Tokens: ['Job', 'description']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['In', 'order', 'to', 'support', 'our', 'ongoing', 'international', 'growth', 'we', 'are', 'looking', 'for', 'a', 'Senior', 'QA', 'Engineer', 'to', 'join', 'our', 'Engineering', 'department', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['You', 'will', 'be', 'working', 'in', 'an', 'end-to-end', 'cross-functional', 'team', 'being', 'responsible', 'for', 'implementing', 'and', 'promoting', 'all', 'QA', 'relevant', 'topics', 'on', 'team', 'level', '.']

Tags Knowledge: ['O']
Tokens: ['Responsibilities']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Design', 'and', 'implement', 'complex', 'end-to-end', 'tests', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Work', 'hands-on', 'together', 'with', 'the', 'other', 'engineers', 'within', 'the', 'Agile', 'team', '-', 'to', 'ensure', 'continuous', 'quality', 'delivery', 'of', 'automated', 'acceptance', 'API', 'and', 'performance', 'tests', '-', 'while', 'constantly', 'collaborating', 'with', 'the', 'QA', 'Engineers', 'of', 'the', 'other', 'teams', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Own', 'a', 'thought-leadership', 'influence', 'regarding', 'QA', 'relevant', 'topics', 'within', 'the', 'Agile', 'team', '.']

Tags Knowledge: ['O']
Tokens: ['Requirements']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'B', 'O', 'O', 'O', 'B', 'I', 'O', 'B', 'B', 'O', 'O']
Tokens: ['At', 'least', '5', 'years', 'of', 'combined', 'experience', 'in', 'Java', 'or', 'Kotlin', 'and', 'JavaScript', 'or', 'TypeScript', 'programming', 'and', 'related', 'test', 'frameworks', '(', 'Selenium', 'TestCafe', 'etc.)', '.']

Tags Knowledge: ['O', 'O', 'O', 'B', 'I', 'O', 'B', 'I', 'O']
Tokens: ['Good', 'understanding', 'of', 'Agile', 'methodologies', 'and', 'Continuous', 'Delivery', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Experience', 'in', 'testing', 'applications', 'on', 'every', 'level', 'of', 'the', 'testing', 'pyramid', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Great', 'communicator', 'being', 'able', 'to', 'relate', 'to', 'the', 'different', 'challenges', 'that', 'developers', 'product', 'managers', 'and', 'other', 'stakeholders', 'within', 'the', 'engineering', 'department', 'face', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O']
Tokens: ['Experience', 'in', 'working', 'on', 'a', 'cloud-based', 'application', 'running', 'on', 'Docker', '.']

Tags Knowledge: ['O', 'B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['A', 'degree', 'in', 'Computer', 'Science', 'or', 'related', 'fields', 'or', 'equivalent', 'practical', 'experience', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'O', 'O', 'O', 'O']
Tokens: ['Experience', 'in', 'working', 'with', 'CircleCI', 'pipelines', 'on', 'running', 'tests', 'automatically', 'prior', 'to', 'the', 'deployment;', 'Jenkins', 'is', 'a', 'plus', '.']

Tags Knowledge: ['B', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Performance', 'and', 'security', 'testing', 'experience', 'is', 'a', 'plus', '.']

Tags Knowledge: ['O', 'O', 'O']
Tokens: ['What', 'we', 'offer']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['We', 'keep', 'things', 'open', 'agile', 'and', 'communicative', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['It', 'is', 'all', 'based', 'on', 'trust', 'not', 'micromanaging', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['The', 'whole', 'department', 'is', 'located', 'together', 'in', 'one', 'office', 'in', 'beautiful', '<LOCATION>', 'however', 'due', 'to', 'the', 'current', 'situation', 'we', 'work', 'and', 'onboard', '100%', 'remotely', 'to', 'keep', 'our', 'employees', 'safe', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Our', 'team', 'members', 'are', 'self-organized', 'within', 'their', 'teams', 'working', 'on', 'independent', 'projects', 'or', 'closely', 'with', 'Product', 'Leads', 'developers', 'and', 'UX', 'designers', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['We', 'value', 'your', 'thoughts', 'and', 'ideas', 'and', 'will', 'give', 'you', 'the', 'freedom', 'to', 'push', 'and', 'implement', 'them!']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['We', 'offer', 'competitive', 'salaries', 'and', 'support', 'personal', 'growth', 'with', 'functional', 'in-house', 'coaching', 'and', 'a', 'personal', 'development', 'budget', 'that', 'includes', 'three', 'days', 'off', 'per', 'year', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['You', 'will', 'gain', '–', 'and', 'share', '–', 'knowledge', 'during', 'recurring', 'learning', 'groups', 'jours', 'fixes', 'and', 'our', 'annual', 'Code', 'Camp', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['You', 'are', 'free', 'to', 'use', 'the', 'OS', 'of', 'your', 'choice', 'the', 'tooling', 'you', 'are', 'comfortable', 'with', 'and', 'set', 'up', 'your', 'workspace', 'the', 'way', 'you', 'like', 'it', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['<ORGANIZATION>', 'will', 'support', 'you', 'with', 'all', 'the', 'necessary', 'office', 'equipment', 'even', 'when', 'working', 'from', 'home!']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['We', 'get', 'that', 'balancing', 'a', 'family', 'and', 'work', 'can', 'be', 'a', 'challenge', 'so', 'everyone', 'gets', 'flexible', 'working', 'hours', 'and', '30', 'days', 'of', 'holidays', 'per', 'year', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Moreover', '<ORGANIZATION>', 'will', 'support', 'you', 'in', 'case', 'of', 'relocation', 'and', 'visa', 'application', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Note:', 'We', 'support', 'your', 'relocation', 'but', 'due', 'to', 'tax', 'reason', 'you’d', 'be', 'required', 'to', 'be', 'resident', 'in', 'one', 'of', 'the', 'following', 'countries:', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Visa', 'support', 'can', 'currently', 'be', 'offered', 'only', 'for', '<LOCATION>', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['*Do', 'I', 'need', 'to', 'meet', 'all', 'the', 'requirements', 'to', 'apply?']

Tags Knowledge: ['O']
Tokens: ['*']

Tags Knowledge: ['O']
Tokens: ['Studies']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['by', 'several', 'different', 'sources', 'have', 'shown', 'that', 'on', 'average', 'men', 'will', 'apply', 'for', 'a', 'job', 'if', 'they', 'meet', '60%', 'of', 'the', 'application', 'requirements', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['In', 'contrast', 'women/non-binary', 'people', 'will', 'seek', 'to', 'match', 'a', 'much', 'higher', 'percentage', 'of', 'the', 'requirements', 'before', 'applying', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['We', 'encourage', 'everyone', 'to', 'apply', 'and', 'give', 'us', 'a', 'chance', 'to', 'evaluate', 'your', 'skills', 'and', 'experience', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['We', 'are', 'all', 'learning', 'on', 'the', 'job', 'and', 'although', 'the', 'listing', 'above', 'has', 'been', 'carefully', 'compiled', 'we', 'are', 'also', 'open-minded', 'and', 'interested', 'to', 'hear', 'about', 'the', 'value', 'you', 'can', 'bring', 'to', 'the', 'role', 'and', '<ORGANIZATION>', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['*How', 'can', 'I', 'demonstrate', 'that', 'I', 'have', 'particular', 'needs', 'in', 'the', 'application', 'process?']

Tags Knowledge: ['O']
Tokens: ['*']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['For', 'people', 'living', 'with', 'disabilities', 'chronic', 'illnesses', 'or', 'neurodiversity', 'adjustments', 'and', 'support', 'can', 'make', 'a', 'decisive', 'difference', 'in', 'the', 'application', 'process', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['If', 'you', 'need', 'any', 'specific', 'accommodations', '(', 'tools', 'time', 'etc.', ')', 'and', 'feel', 'comfortable', 'disclosing', 'this', 'please', 'let', 'us', 'know', '.']

Tags Knowledge: ['O', 'O']
Tokens: ['Job', 'benefits:']

Tags Knowledge: ['O', 'O', 'O']
Tokens: ['Flexible', 'working', 'hours']

Tags Knowledge: ['O', 'O']
Tokens: ['Flat', 'hierarchies']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
Tokens: ['Mentoring', '&', 'personal', 'development', 'program']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
Tokens: ['Fruits', '&', 'drinks', 'for', 'free']

Tags Knowledge: ['O', 'O', 'O']
Tokens: ['Excellent', 'transport', 'connections']

Tags Knowledge: ['O', 'O']
Tokens: ['Sports', 'offers']

Tags Knowledge: ['O', 'O']
Tokens: ['Subsidised', 'lunches']

Tags Knowledge: ['O', 'O', 'O', 'O']
Tokens: ['30', 'days', 'of', 'holidays']

Tags Knowledge: ['O', 'O']
Tokens: ['Child-care', 'support']

Tags Knowledge: ['O', 'O', 'O', 'O']
Tokens: ['30', 'days', 'of', 'holiday']

Tags Knowledge: ['O', 'O']
Tokens: ['Company', 'description:']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['<ORGANIZATION>', 'is', 'the', 'leading', 'SaaS-based', 'business', 'process', 'management', 'application', 'suite', 'in', 'the', 'world', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['<ORGANIZATION>', 'enables', 'organisations', 'to', 'keep', 'up', 'with', 'the', 'pace', 'volume', 'and', 'complexity', 'of', 'change', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Our', 'Business', 'Transformation', 'Suite', 'is', 'the', 'smarter', 'way', 'to', 'continuously', 'translate', 'between', 'strategy', 'and', 'execution', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['With', '<ORGANIZATION>', 'companies', 'of', 'all', 'sizes', 'can', 'document', 'automate', 'and', 'analyse', 'processes', 'which', 'allows', 'them', 'to', 'make', 'smarter', 'business', 'decisions', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Headquartered', 'in', '<LOCATION>', 'with', 'offices', 'in', 'the', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', '<LOCATION>', 'and', '<LOCATION>', '<ORGANIZATION>', 'serves', 'more', 'than', '1,300', 'customers', 'around', 'the', 'globe', 'across', 'all', 'industries', 'and', 'employs', '300', 'employees', 'globally', '.']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Are', 'you', 'interested', 'in', 'joining', 'one', 'of', 'the', 'world’s', 'leading', 'Business', 'Process', 'Management', 'companies?', 'As', 'we', 'expand', 'our', 'presence', 'into', 'new', 'markets', 'across', 'the', 'globe', 'we', 'are', 'looking', 'to', 'add', 'to', 'our', 'team!', 'across', 'all', 'departments.']

Tags Knowledge: ['O', 'O', 'O']
Tokens: ['Cloud', 'DevOps', 'Engineer']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
Tokens: ['<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>', '<ORGANIZATION>']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O']
Tokens: ['<ADDRESS>', '<ADDRESS>', '<LOCATION>', '-', '<LOCATION>']

Tags Knowledge: ['O', 'O', 'O']
Tokens: ['Date', 'posted:', '2021-01-21']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['Likes:', '0', 'Dislikes:', '0', 'Love:', '0']

Tags Knowledge: ['O', 'O']
Tokens: ['Job', 'description:']

Tags Knowledge: ['O', 'O']
Tokens: ['Job', 'type:']

Tags Knowledge: ['O']
Tokens: ['Full-time']

Tags Knowledge: ['O']
Tokens: ['Role:']

Tags Knowledge: ['O']
Tokens: ['DevOps']

Tags Knowledge: ['O']
Tokens: ['Industry:']

Tags Knowledge: ['B', 'I']
Tokens: ['Financial', 'Services']

Tags Knowledge: ['O', 'O']
Tokens: ['Company', 'size:']

Tags Knowledge: ['O', 'O']
Tokens: ['10k+', 'people']

Tags Knowledge: ['O', 'O']
Tokens: ['Company', 'type:']

Tags Knowledge: ['O']
Tokens: ['Public']

Tags Knowledge: ['O']
Tokens: ['Technologies']

Tags Knowledge: ['B', 'B', 'B']
Tokens: ['cloud', 'java', 'amazon-web-services']

Tags Knowledge: ['O', 'O']
Tokens: ['Job', 'description']

Tags Knowledge: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['As', 'a', 'member', 'of', 'our', 'Software', 'Engineering', 'Group', 'we', 'look', 'first', 'and', 'foremost', 'for', 'people', 'who', 'are', 'passionate', 'about', 'solving', 'business', 'problems', 'through', 'innovation', 'and', 'engineering', 'practices', '.']
llm-tagging.py (CHANGED)

@@ -15,9 +15,9 @@ import sys
 from tabulate import tabulate
 import spacy
 import re
+import json

 load_dotenv(".env")
-
 nlp = spacy.load("en_core_web_sm")

 def split_text_recursively(text):

@@ -46,7 +46,6 @@ def tokenize_to_sent(path):
     for line in str_list:
         doc = nlp(line)
         for sent in doc.sents:
-            # print(f"{sent.text}")
             sents.append(sent.text)

     return sents

@@ -58,13 +57,15 @@ model = ChatOpenAI(temperature=0)

 class TokenTaggingResult(BaseModel):
     tokens: List[str]
-
-
+    tags_knowledge: List[str]
+
+class Results(BaseModel):
+    results: List[TokenTaggingResult]


 model = ChatOpenAI(model_name="gpt-4o", temperature=0.0, api_key=os.getenv('OPENAI_API_KEY'))
 tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_skill_extraction")
-parser = JsonOutputParser(pydantic_object=
+parser = JsonOutputParser(pydantic_object=Results)

 # Definitions

@@ -81,23 +82,20 @@ with open('few-shot.txt', 'r') as file:
     few_shot_examples = file.read()

 prompt = PromptTemplate(
-    template="""You are an expert in tagging tokens with
-    Skill definition:{skill_definition}
+    template="""You are an expert in tagging tokens with knowledge labels. Use the following definitions to tag the input tokens:
     Knowledge definition:{knowledge_definition}
     Use the examples below to tag the input text into relevant knowledge or skills categories.\n{few_shot_examples}\n{format_instructions}\n{input}\n""",
     input_variables=["input"],
     partial_variables={"format_instructions": parser.get_format_instructions(),
                        "few_shot_examples": few_shot_examples,
-
+                       # "skill_definition": skill_definition,
                        "knowledge_definition": knowledge_definition},
 )

-def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
+def extract_tags(text: str, tokenize = True) -> Results:

     if tokenize:
-        inputs = tokenizer(text, return_tensors="pt")
-        tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
+        tokens = [tokenizer.tokenize(t) for t in text]

     prompt_and_model = prompt | model
     output = prompt_and_model.invoke({"input": tokens})

@@ -105,90 +103,21 @@ def extract_tags(text: str, tokenize = True) -> TokenTaggingResult:
     return tokens, output


-mapping = {0: 'B', 1: 'I', 2: 'O'}
-token_skill_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_skill_extraction")
-token_knowledge_classifier = AutoModelForTokenClassification.from_pretrained("jjzha/jobbert_knowledge_extraction")
-
-def convert(text):
-    inputs = tokenizer(text, return_tensors="pt")
-
-    with torch.no_grad():
-        skill_outputs = token_skill_classifier(**inputs)
-        knowledge_outputs = token_knowledge_classifier(**inputs)
-
-    decoded_tokens = tokenizer.decode(inputs['input_ids'].squeeze()).split()[1:-1]
-    skill_cls = skill_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-    knowledge_cls = knowledge_outputs.logits.argmax(dim=2).squeeze()[1:-1]
-
-    skill_cls = [mapping[i.item()] for i in skill_cls]
-    knowledge_cls = [mapping[i.item()] for i in knowledge_cls]
-
-    if len(decoded_tokens) != len(skill_cls) or len(decoded_tokens) != len(knowledge_cls):
-        raise ValueError("Error: Length mismatch")
-
-    return skill_cls, knowledge_cls, decoded_tokens
-
-
-from transformers import pipeline
-pipe = pipeline("token-classification", model="jjzha/jobbert_knowledge_extraction")
-
-def convert2(text):
-    output = pipe(text)
-    tokens = [i['word'] for i in output]
-    skill_cls = [i['entity'] for i in output]
-    knowledge_cls = [i['entity'] for i in output]
-
-    return skill_cls, knowledge_cls, tokens
-
-
-def tag_posting(path, llm_extract = True):
+def tag_posting(job_path, output_path):

     # Reading & sentence tokenization
-    sents = tokenize_to_sent(
-
-    for sent in sents:
-        # print(f"Sent: {sent}")
-        skill_cls, knowledge_cls, tokens = convert(sent)
-
-        # Pre-trained
-        # skill_cls, knowledge_cls, _ = convert(text)
-
-        if llm_extract:
-
-            # LLM-based tag extraction
-            tokens, output = extract_tags(text, tokenize=True)
-            table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-            headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-            print(tabulate(table, headers=headers, tablefmt="pretty"))
-
-        else:
-
-            # Only pre-trained
-            table = zip(tokens, output['skill_labels'], output['knowledge_labels'])
-            headers = ["Token", "Skill Label", "Knowledge Label"]
-            print(tabulate(table, headers=headers, tablefmt="pretty"))
+    sents = tokenize_to_sent(job_path)

+    # LLM-based tag extraction
+    tokens, output = extract_tags(sents, tokenize=True)

+    with open("./data/data.jsonl", "w") as file:
+        for entry in output['results']:
+            json.dump(entry, file)
+            file.write("\n")

 if __name__ == "__main__":

-    quit()
-    text = input('Enter text: ')
-
-    # LLM-based tag extraction
-    tokens, output = extract_tags(text, tokenize=True)
-
-    # Pre-trained
-    skill_cls, knowledge_cls = convert(text)
-
-    table = zip(tokens, output['skill_labels'], output['knowledge_labels'], skill_cls, knowledge_cls)
-    headers = ["Token", "Skill Label", "Knowledge Label", "Pred Skill Label", "Pred Knowledge Label"]
-    print(tabulate(table, headers=headers, tablefmt="pretty"))
+    job_path = './job-postings/03-01-2024/1.txt'
+    output_path = './data/data.json'
+    tag_posting(job_path, output_path)
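The new Results schema constrains the LLM to emit one token/tag pair list per sentence. A minimal standalone sketch of how JsonOutputParser turns a raw JSON reply into the dict that tag_posting iterates over (assumes langchain-core and pydantic are installed; the raw string is a hypothetical model reply):

from typing import List
from pydantic import BaseModel
from langchain_core.output_parsers import JsonOutputParser

class TokenTaggingResult(BaseModel):
    tokens: List[str]
    tags_knowledge: List[str]

class Results(BaseModel):
    results: List[TokenTaggingResult]

parser = JsonOutputParser(pydantic_object=Results)
raw = '{"results": [{"tokens": ["Python"], "tags_knowledge": ["B"]}]}'
output = parser.parse(raw)          # parsed into a plain dict
print(output['results'][0]['tags_knowledge'])  # ['B']
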
train.py (CHANGED)

@@ -2,177 +2,161 @@ from transformers import AutoTokenizer, BertForTokenClassification, TrainingArgu
 import torch
 from tabulate import tabulate
 import wandb
+import os
+import yaml
+from datetime import datetime

-model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
-
-artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
-
-text = 'Experience with Unreal and/or Unity and/or native IOS/Android 3D development and/or Web based 3D engines '
-
-# Tokenize
-inputs = tokenizer(
-    text, add_special_tokens=False, return_tensors="pt"
-)
-
-# Inference
-
-# with torch.no_grad():
-#     output = model(**inputs)
-
-# # Post-process
-# predicted_token_class_ids = output.logits.argmax(-1)
-# predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
-# tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
-
-# # Display
-# table = zip(tokens, predicted_tokens_classes)
-# print(tabulate(table, headers=["Token", "Predicted Class"], tablefmt="pretty"))
-
-from torch.utils.data import DataLoader
-import torch.nn as nn
-from transformers import DataCollatorForTokenClassification
-from typing import List, Tuple
-
-    padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
-    attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
-
-    return torch.tensor(padded_lists), torch.tensor(attention_masks)
-
-    input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
-    tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
-    return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
-
-batch_size = 32
-train_dataloader = DataLoader(dataset['train'], shuffle=True, batch_size=batch_size, collate_fn=collate_fn)
-eval_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
-
-model.train()
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-
-id2label = model.config.id2label
-label2id = model.config.label2id
-
-lr_scheduler = get_scheduler(
-    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
-)
-
-current_time = datetime.now()
-
-    config={
-        "learning_rate": lr,
-        "architecture": "BERT",
-        "epochs": num_epochs,
-        "batch_size": batch_size,
-        "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
-    }
-)
-
-from datetime import datetime
-logging.info("Initiating training")
-
-    logging.info(f"Epoch #{epoch}")
-    print(f"Epoch #{epoch}")
-
-        pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
-        label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
-
-        _, predicted_labels = torch.max(pred, dim=1)
-        non_pad_elements = label != IGNORE_INDEX
-        correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
-        total_predictions = non_pad_elements.sum().item()
-        accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-
-        loss.backward()
-        optimizer.step()
-        lr_scheduler.step()
-        optimizer.zero_grad()
-
-        wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})
-
-        batch_count += 1
-
-with artifact.new_file('model.pth', mode='wb') as f:
-    torch.save(state_dict, f)
+def train(json_path: str):
+
+    ### Model & tokenizer loading
+
+    tokenizer = AutoTokenizer.from_pretrained("jjzha/jobbert_knowledge_extraction")
+    model = BertForTokenClassification.from_pretrained("Robzy/jobbert_knowledge_extraction")
+
+    with open("./config.yaml", "r") as file:
+        config = yaml.safe_load(file)
+
+    num_epochs = config['training']['epochs']
+    batch_size = config['training']['batch_size']
+    lr = config['training']['learning_rate']
+    current_time = datetime.now()
+
+    run = wandb.init(
+        # set the wandb project where this run will be logged
+        project="in-demand",
+
+        # track hyperparameters and run metadata
+        config={
+            "learning_rate": lr,
+            "architecture": "BERT",
+            "epochs": num_epochs,
+            "batch_size": batch_size,
+            "notes": "Datetime: " + current_time.strftime("%m/%d/%Y, %H:%M:%S")
+        }
+    )
+
+    ### Data loading and preprocessing
+
+    from torch.utils.data import DataLoader
+    import torch.nn as nn
+    from transformers import DataCollatorForTokenClassification
+    from typing import List, Tuple
+    from datasets import load_dataset
+
+    # dataset = load_dataset("json", data_files="data/test-short.json")
+    dataset = load_dataset("json", data_files=json_path)
+    dataset = dataset.map(
+        lambda x: {"input_ids": torch.tensor(tokenizer.convert_tokens_to_ids(x["tokens"]))}
+    )
+
+    def pad(list_of_lists, pad_value=0):
+
+        max_len = max(len(lst) for lst in list_of_lists)
+
+        # Pad shorter lists with the specified value
+        padded_lists = [lst + [pad_value] * (max_len - len(lst)) for lst in list_of_lists]
+        attention_masks = [[1] * len(lst) + [0] * (max_len - len(lst)) for lst in list_of_lists]
+
+        return torch.tensor(padded_lists), torch.tensor(attention_masks)
+
+    def collate_fn(batch: List[List[torch.Tensor]]):
+
+        input_ids, attention_mask = pad(list(map(lambda x: tokenizer.convert_tokens_to_ids(x['tokens']),batch)))
+        tags_knowledge, _ = pad([list(map(lambda x: label2id[x],o)) for o in [b['tags_knowledge'] for b in batch]])
+        return {"input_ids": input_ids, "tags_knowledge": tags_knowledge, "attention_mask": attention_mask}
+
+    ### Training settings
+    train_dataloader = DataLoader(dataset['train'], batch_size=batch_size, collate_fn=collate_fn)
+
+    from tqdm.auto import tqdm
+    from torch.optim import AdamW
+    from transformers import get_scheduler
+
+    model.train()
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+    IGNORE_INDEX = -100
+    criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
+    id2label = model.config.id2label
+    label2id = model.config.label2id
+
+    optimizer = AdamW(model.parameters(), lr=lr)
+
+    num_training_steps = num_epochs * len(train_dataloader)
+    lr_scheduler = get_scheduler(
+        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+    )
+
+    ### Training
+
+    from dotenv import load_dotenv
+    import os
+    load_dotenv(".env")
+    import logging
+    logging.info("Initiating training")
+
+    progress_bar = tqdm(range(num_epochs), desc="Epochs")
+    for epoch in range(num_epochs):
+        logging.info(f"Epoch #{epoch}")
+        # print(f"Epoch #{epoch}")
+
+        batch_count = 1
+
+        for batch in train_dataloader:
+
+            logging.info(f"Batch #{batch_count} / {len(train_dataloader)}")
+            # print(f"Batch #{batch_count} / {len(train_dataloader)}")
+
+            tokens = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            tags_knowledge = batch['tags_knowledge'].to(device)
+
+            outputs = model(tokens, attention_mask=attention_mask)
+
+            # Batch
+            pred = outputs.logits.reshape(-1, model.config.num_labels) # Logits
+            label = torch.where(attention_mask==0, torch.tensor(IGNORE_INDEX).to(device), tags_knowledge).reshape(-1) # Labels, padding set to class idx -100
+
+            # Compute accuracy ignoring padding idx
+            _, predicted_labels = torch.max(pred, dim=1)
+            non_pad_elements = label != IGNORE_INDEX
+            correct_predictions = (predicted_labels[non_pad_elements] == label[non_pad_elements]).sum().item()
+            total_predictions = non_pad_elements.sum().item()
+            accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+
+            loss = criterion(pred, label)
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+            optimizer.zero_grad()
+
+            wandb.log({"epoch": epoch, "accuracy": accuracy, "loss": loss})

+            batch_count += 1
+
+        progress_bar.update(1)
+
+    print("Training complete")
+
+    ### Pushing model
+
+    # Hugging Face
+    model.push_to_hub("Robzy/jobbert_knowledge_extraction")
+
+    # W&B
+    artifact = wandb.Artifact(name="jobbert-knowledge-extraction", type="BERT")
+    state_dict = model.state_dict()
+    with artifact.new_file('model.pth', mode='wb') as f:
+        torch.save(state_dict, f)
+
+    # Log the artifact to W&B
+    wandb.log_artifact(artifact)
+
+if __name__ == "__main__":
+
+    train(json_path="./data/data.jsonl")