Spaces:
Sleeping
Sleeping
import re | |
import string | |
import numpy as np | |
from fastapi import FastAPI | |
from pydantic import BaseModel | |
from transformers import pipeline, AutoTokenizer | |
from semantic_similarity.semantic_similarity import model as embedding_model | |
from sentence_transformers import util | |
# ASGI application object (the target of `uvicorn main:app`).
app = FastAPI()


def _load_extractor(model_name):
    """Load the tokenizer and token-classification pipeline for one model id.

    Returns a ``(tokenizer, pipeline)`` pair. The pipeline is created with
    ``aggregation_strategy="first"`` so that it reports grouped entities
    (dicts carrying ``entity_group``, ``word`` and ``score``) rather than
    individual sub-word tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    extractor = pipeline(
        model=model_name,
        tokenizer=tokenizer,
        aggregation_strategy="first",
    )
    return tokenizer, extractor


# Both models are loaded once, at import time, and shared by every call.
knowledge_model_name = "jjzha/jobbert_knowledge_extraction"
knowledge_tokenizer, knowledge_nlp = _load_extractor(knowledge_model_name)

skill_model_name = "jjzha/jobbert_skill_extraction"
skill_tokenizer, skill_nlp = _load_extractor(skill_model_name)
# Request body schema: carries the raw job description text that the
# predict_* functions in this file read via `input_data.jobDescription`.
class TextInput(BaseModel):
    jobDescription: str  # free-text job description to analyse
def convert_from_numpy(predictions):
    """Replace NumPy scalar values in prediction dicts with plain ``float``.

    Every value that is a NumPy floating-point or integer scalar, of any
    width (``float16``, ``float32``, ``float64``, ``int8`` ... ``int64``,
    unsigned variants), is converted with ``float()``. All other values are
    left untouched.

    Args:
        predictions: Iterable of dicts (e.g. pipeline outputs). The dicts
            are modified in place.

    Returns:
        The same ``predictions`` object that was passed in.
    """
    for pred in predictions:
        for key, value in pred.items():
            # Match on the abstract scalar base classes so no NumPy width is
            # missed; integers are still returned as float, as before.
            if isinstance(value, (np.floating, np.integer)):
                # Overwriting an existing key does not resize the dict, so
                # this is safe while iterating over items().
                pred[key] = float(value)
    return predictions
def merge_BI_and_get_results(predictions):
    """Join consecutive B/I-tagged predictions into named spans.

    A prediction whose ``entity_group`` is ``"B"`` opens a new span; any
    other prediction is appended (space-separated) to the span currently
    being built. Each finished span is reported as
    ``{"name": <joined words, stripped>, "confidence": <mean score>}``.

    Args:
        predictions: Sequence of dicts with ``entity_group``, ``word`` and
            ``score`` keys.

    Returns:
        List of span dicts in order of appearance.
    """
    spans = []
    phrase = ""
    score_sum = 0
    word_count = 0

    for pred in predictions:
        if pred["entity_group"] != "B":
            # Continuation: extend the span in progress (this also covers a
            # sequence that starts without a "B").
            phrase = phrase + " " + pred["word"]
            score_sum = score_sum + pred["score"]
            word_count = word_count + 1
            continue

        # A new span begins: emit the one collected so far, if any.
        if phrase:
            spans.append(
                {"name": phrase.strip(), "confidence": score_sum / word_count}
            )
        phrase = pred["word"]
        score_sum = pred["score"]
        word_count = 1

    # Emit the trailing span that no later "B" closed.
    if phrase:
        spans.append({"name": phrase.strip(), "confidence": score_sum / word_count})
    return spans
def chunk_text(text, tokenizer, max_length=500, overlap=100):
    """Split ``text`` into overlapping, token-bounded string chunks.

    The tokenizer is invoked once with truncation and
    ``return_overflowing_tokens=True`` so that it produces one ``input_ids``
    sequence per window of at most ``max_length`` tokens, with ``overlap``
    passed as the stride between windows. Each window is then decoded back
    to a string with special tokens skipped.

    Args:
        text: The text to split.
        tokenizer: Callable tokenizer that also provides ``decode``.
        max_length: Maximum number of tokens per window.
        overlap: Number of tokens shared by consecutive windows.

    Returns:
        List of decoded chunk strings, in window order.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        stride=overlap,
        return_overflowing_tokens=True,
        return_special_tokens_mask=False,
    )
    return [
        tokenizer.decode(window_ids, skip_special_tokens=True)
        for window_ids in encoded["input_ids"]
    ]
def deduplicate_by_similarity(items, embeddings, threshold=0.7):
    """Greedily drop items that are near-duplicates of an earlier kept item.

    Items are visited in order. Each item that has not been marked as a
    duplicate is kept, and every later item whose cosine similarity to it
    exceeds ``threshold`` is marked as a duplicate and skipped.

    Args:
        items: Sequence of items to deduplicate.
        embeddings: One embedding per item, in the same order as ``items``.
        threshold: Similarity above which a later item counts as a
            duplicate of a kept one.

    Returns:
        List of the kept items, in their original order. Empty input yields
        an empty list.
    """
    total = len(items)
    if total == 0:
        # Nothing to compare; avoid handing an empty batch to cos_sim.
        return []

    sim_matrix = util.cos_sim(embeddings, embeddings)
    keep = []
    duplicates = set()
    for i, item in enumerate(items):
        if i in duplicates:
            continue
        keep.append(item)
        # Only later items can be duplicates of this one; earlier ones were
        # already either kept or dropped.
        for j in range(i + 1, total):
            if sim_matrix[i][j] > threshold:
                duplicates.add(j)
    return keep
def filter_knowledge(results, *, max_words=3, min_chars=3, min_confidence=0.95):
    """Normalise span names and keep only short, confident ones.

    For every entry the name has all characters that are neither word
    characters nor whitespace removed, runs of whitespace collapsed to a
    single space, and leading/trailing whitespace stripped. The strip is
    done *after* punctuation removal so that a name such as ``"- python"``
    becomes ``"python"`` rather than ``" python"``.

    An entry is discarded when its cleaned name has more than ``max_words``
    words, fewer than ``min_chars`` characters, or when its confidence is
    below ``min_confidence``.

    Args:
        results: Iterable of dicts with ``name`` and ``confidence`` keys.
            The input dicts are not modified.
        max_words: Maximum number of whitespace-separated words allowed.
        min_chars: Minimum length of the cleaned name.
        min_confidence: Minimum confidence required to keep an entry.

    Returns:
        List of new dicts (copies of the kept entries with the cleaned
        ``name``), in input order.
    """
    filtered_results = []
    for result in results:
        name = re.sub(r"[^\w\s]", "", result["name"])
        name = re.sub(r"\s+", " ", name).strip()
        if (
            len(name.split()) > max_words
            or len(name) < min_chars
            or result["confidence"] < min_confidence
        ):
            continue
        # Copy instead of mutating so the caller's dicts stay as they were.
        filtered_results.append({**result, "name": name})
    return filtered_results
def predict_knowledge(input_data: TextInput):
    """Extract deduplicated knowledge terms from a job description.

    Steps: drop characters outside ``string.printable``, split the text
    into overlapping chunks, run the knowledge pipeline on each chunk,
    merge B/I predictions into spans, filter the spans, and finally remove
    near-duplicate names by embedding similarity.

    NOTE(review): no route decorator is visible on this function in this
    view of the file; confirm how it is exposed through ``app``.

    Args:
        input_data: Request payload; only ``jobDescription`` is read.

    Returns:
        ``{"knowledge_predictions": [...]}`` where each element is a dict
        with ``name`` and ``confidence``; the list is empty when nothing
        survives filtering.
    """
    # string.printable is ASCII-only, so besides control characters this
    # also removes every non-ASCII character from the description.
    text = "".join(ch for ch in input_data.jobDescription if ch in string.printable)

    spans = []
    for chunk in chunk_text(text, knowledge_tokenizer):
        preds = convert_from_numpy(knowledge_nlp(chunk))
        # Merge within each chunk separately: chunks overlap, so a leading
        # "I" prediction in one chunk is not adjacent to the last span of
        # the previous chunk and must not be appended to it.
        spans.extend(merge_BI_and_get_results(preds))

    result = filter_knowledge(spans)
    # Check for emptiness *after* filtering so an all-filtered result never
    # reaches the embedding model as an empty batch.
    if not result:
        return {"knowledge_predictions": []}

    knowledge_names = [r["name"] for r in result]
    embeddings_tensor = embedding_model.encode(knowledge_names, convert_to_tensor=True)
    embeddings = embeddings_tensor.cpu().numpy()
    deduped_results = deduplicate_by_similarity(result, embeddings)
    return {"knowledge_predictions": deduped_results}
def predict_skills(input_data: TextInput):
    """Extract skill spans from a job description.

    Steps: drop characters outside ``string.printable``, split the text
    into overlapping chunks, run the skill pipeline on each chunk, and
    merge B/I predictions into spans. Unlike knowledge extraction, no
    filtering or deduplication is applied here.

    NOTE(review): no route decorator is visible on this function in this
    view of the file; confirm how it is exposed through ``app``.

    Args:
        input_data: Request payload; only ``jobDescription`` is read.

    Returns:
        ``{"skills_predictions": [...]}`` where each element is a dict with
        ``name`` and ``confidence``.
    """
    # string.printable is ASCII-only, so besides control characters this
    # also removes every non-ASCII character from the description.
    text = "".join(ch for ch in input_data.jobDescription if ch in string.printable)

    spans = []
    for chunk in chunk_text(text, skill_tokenizer):
        preds = convert_from_numpy(skill_nlp(chunk))
        # Merge within each chunk separately: chunks overlap, so a leading
        # "I" prediction in one chunk is not adjacent to the last span of
        # the previous chunk and must not be appended to it.
        spans.extend(merge_BI_and_get_results(preds))
    return {"skills_predictions": spans}
# Run with:
# uvicorn main:app --host 0.0.0.0 --port 8000