# File size: 5,370 Bytes
# 6e31617
import streamlit as st
from pyngrok import ngrok
import google.generativeai as genai
import fitz # PyMuPDF for PDF text extraction
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from docx import Document
import re
from nltk.corpus import words
import dateparser
from datetime import datetime
import os
# Optionally expose the local app through an ngrok tunnel on port 8501
# (Streamlit's default port).
#
# The auth token is read from the NGROK_AUTH_TOKEN environment variable so that
# no credential lives in the source file. NOTE: any token that was ever
# committed to this file should be treated as compromised and rotated.
_ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN")
if _ngrok_auth_token:
    ngrok.set_auth_token(_ngrok_auth_token)
    url = ngrok.connect(8501)
    print(f"Public URL: {url}")
else:
    # Without a token only the tunnel is skipped; the rest of the script runs.
    url = None
    print("NGROK_AUTH_TOKEN is not set; skipping ngrok tunnel.")

# Load SpaCy model
nlp_spacy = spacy.load('en_core_web_sm')

# Load Babelscape NER model
tokenizer_ner = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model_ner = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
# aggregation_strategy="simple" makes the pipeline return one result per
# grouped entity rather than one per word piece.
nlp_ner = pipeline('ner', model=model_ner, tokenizer=tokenizer_ner, aggregation_strategy="simple")

# Load GLinER model
# NOTE(review): GLiNER checkpoints are normally loaded through the `gliner`
# package rather than as seq2seq models -- confirm that this repo id resolves
# and that the loaded model really supports `.generate()`.
gliner_tokenizer = AutoTokenizer.from_pretrained("DAMO-NLP-SG/gliner-large")
gliner_model = AutoModelForSeq2SeqLM.from_pretrained("DAMO-NLP-SG/gliner-large")
class EnhancedNERPipeline:
    """Run spaCy, a Hugging Face NER pipeline and GLiNER prompts in one call.

    Calling an instance with a text returns the spaCy ``Doc`` for that text,
    with:

    * ``doc.ents`` holding the spaCy entities merged with the Hugging Face
      entities, minus everything labelled ``"ORG"``;
    * ``doc._.gliner_companies`` / ``doc._.gliner_education`` holding lists of
      names parsed from the GLiNER answers;
    * ``doc._.gliner_experience`` holding the raw GLiNER answer string.
    """

    # Custom ``Doc._`` attributes written by ``__call__``.
    _EXTENSIONS = ("gliner_companies", "gliner_experience", "gliner_education")

    def __init__(self, nlp_spacy, nlp_ner, gliner_model, gliner_tokenizer):
        """Store the component models and register the custom Doc attributes.

        Args:
            nlp_spacy: Callable that turns a text into a spaCy ``Doc``.
            nlp_ner: Hugging Face token-classification pipeline.
            gliner_model: GLiNER model object (stored on the instance).
            gliner_tokenizer: Tokenizer for ``gliner_model`` (stored on the
                instance).
        """
        # Local import: the file only imports the top-level ``spacy`` package.
        from spacy.tokens import Doc

        self.nlp_spacy = nlp_spacy
        self.nlp_ner = nlp_ner
        self.gliner_model = gliner_model
        self.gliner_tokenizer = gliner_tokenizer

        # spaCy raises AttributeError on ``doc._.name = ...`` unless the
        # extension was registered first; guard so re-instantiation is safe.
        for name in self._EXTENSIONS:
            if not Doc.has_extension(name):
                Doc.set_extension(name, default=None)

    @staticmethod
    def _split_items(raw):
        """Split a comma-separated answer into stripped, non-empty items."""
        return [item.strip() for item in (raw or "").split(",") if item.strip()]

    @staticmethod
    def _hf_spans(doc, ner_results):
        """Convert Hugging Face entity dicts into spaCy spans on ``doc``."""
        spans = []
        for result in ner_results:
            label = result.get("entity_group") or result.get("entity")
            start, end = result.get("start"), result.get("end")
            if label is None or start is None or end is None:
                continue
            # "expand" snaps character offsets that fall inside a spaCy token
            # outwards to token boundaries instead of discarding the entity.
            span = doc.char_span(start, end, label=label, alignment_mode="expand")
            if span is not None:
                spans.append(span)
        return spans

    def __call__(self, text):
        """Analyse ``text`` and return the enriched spaCy ``Doc``.

        Args:
            text: Text to analyse.

        Returns:
            The ``Doc`` produced by ``nlp_spacy`` with merged entities and the
            ``gliner_*`` extension attributes filled in.
        """
        from spacy.util import filter_spans

        doc = self.nlp_spacy(text)
        ner_results = self.nlp_ner(text)

        gliner_companies = extract_info_with_gliner(text, "company names")
        gliner_experience = extract_info_with_gliner(text, "years of experience")
        gliner_education = extract_info_with_gliner(text, "educational institutions")

        doc._.gliner_companies = self._split_items(gliner_companies)
        doc._.gliner_experience = gliner_experience
        doc._.gliner_education = self._split_items(gliner_education)

        # ORG entities are dropped here, as in the original implementation.
        # NOTE(review): this means any later search for "ORG" in doc.ents
        # finds nothing -- confirm that is intended.
        candidates = [
            ent
            for ent in list(doc.ents) + self._hf_spans(doc, ner_results)
            if ent.label_ != "ORG"
        ]
        # doc.ents rejects overlapping spans, and the two models can tag the
        # same text; filter_spans keeps the longest span of each overlap.
        doc.ents = filter_spans(candidates)
        return doc
def extract_info_with_gliner(text, info_type):
    """Prompt the GLiNER model to extract one kind of information from text.

    Uses the module-level ``gliner_tokenizer`` and ``gliner_model``.

    Args:
        text: Text to search.
        info_type: Plain-language description of what to extract, inserted
            into the prompt (e.g. "company names").

    Returns:
        The decoded model output as a string, or ``""`` when ``text`` is empty
        or whitespace-only (the model is not called in that case).
    """
    # With nothing to search, skip the model rather than prompting it with an
    # empty document.
    if not text or not text.strip():
        return ""

    prompt = f"Extract {info_type} from: {text}"
    # truncation=True keeps long inputs within the tokenizer's configured
    # maximum length instead of passing an over-long sequence to the model.
    encoded = gliner_tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = gliner_model.generate(encoded.input_ids, max_length=100)
    return gliner_tokenizer.decode(outputs[0], skip_special_tokens=True)
# Build the single pipeline instance from the models loaded above.
enhanced_nlp = EnhancedNERPipeline(
    nlp_spacy=nlp_spacy,
    nlp_ner=nlp_ner,
    gliner_model=gliner_model,
    gliner_tokenizer=gliner_tokenizer,
)
def extract_companies(doc):
    """Return the company names found for ``doc``.

    Combines the names in ``doc._.gliner_companies`` with the text of every
    entity in ``doc.ents`` labelled ``"ORG"``.

    Args:
        doc: Analysed document exposing ``_.gliner_companies`` and ``ents``.

    Returns:
        A sorted list of unique, non-blank, whitespace-stripped names.
    """
    gliner_names = doc._.gliner_companies or []
    org_names = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    # Blank names (e.g. from splitting an empty model answer) are dropped so
    # they do not show up as empty items in the output.
    unique_names = {
        name.strip()
        for name in (*gliner_names, *org_names)
        if name and name.strip()
    }
    # Sorted so the result does not depend on set iteration order.
    return sorted(unique_names)
def extract_experience(doc):
    """Estimate years of experience for ``doc``.

    Two estimates are computed and the larger one is returned:

    * the first integer found in ``doc._.gliner_experience`` (0 if the answer
      is empty or contains no digits);
    * the largest difference between the current year and the year of any
      ``"DATE"`` entity in ``doc.ents`` that ``dateparser`` can parse and that
      is not in a future year (0 if there is none).

    NOTE(review): the second estimate measures time since the earliest date
    mentioned, which is not necessarily employment-related.

    Args:
        doc: Analysed document exposing ``_.gliner_experience`` and ``ents``.

    Returns:
        The estimated number of years as an int.
    """
    answer = doc._.gliner_experience or ""
    # The answer is free text and may contain no number at all.
    first_number = re.search(r'\d+', answer)
    gliner_years = int(first_number.group()) if first_number else 0

    # Read the clock once so every date is compared against the same year.
    current_year = datetime.now().year
    date_years = 0
    for ent in doc.ents:
        if ent.label_ != "DATE":
            continue
        parsed = dateparser.parse(ent.text)
        if parsed and parsed.year <= current_year:
            date_years = max(date_years, current_year - parsed.year)

    return max(gliner_years, date_years)
def extract_education(doc):
    """Return the educational institutions found for ``doc``.

    Combines the names in ``doc._.gliner_education`` with the text of every
    ``"ORG"`` entity in ``doc.ents`` whose lower-cased text contains
    "university", "college", "institute" or "school".

    Args:
        doc: Analysed document exposing ``_.gliner_education`` and ``ents``.

    Returns:
        A sorted list of unique, non-blank, whitespace-stripped names.
    """
    keywords = ("university", "college", "institute", "school")
    gliner_names = doc._.gliner_education or []
    org_names = [
        ent.text
        for ent in doc.ents
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in keywords)
    ]
    # Blank names (e.g. from splitting an empty model answer) are dropped so
    # they do not show up as empty items in the output.
    unique_names = {
        name.strip()
        for name in (*gliner_names, *org_names)
        if name and name.strip()
    }
    # Sorted so the result does not depend on set iteration order.
    return sorted(unique_names)
def main():
    """Render the Streamlit page and analyse an uploaded resume.

    Asks for a Gemini API key and a PDF/DOCX upload. Once both are present it
    extracts the resume text, runs ``enhanced_nlp`` on it, and displays the
    extracted fields followed by a generated summary. Any exception raised
    during processing is reported with ``st.error``.

    NOTE(review): ``authenticate_gemini``, ``extract_text_from_pdf``,
    ``extract_text_from_doc`` and ``generate_summary`` are not defined in the
    visible part of this file -- confirm they exist elsewhere.
    """
    pdf_mime = "application/pdf"
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    # (display heading, GLiNER query) for each contact field, in display order.
    contact_fields = (
        ("**Phone Number:**", "phone number"),
        ("**Email:**", "email address"),
        ("**LinkedIn:**", "LinkedIn profile"),
    )

    st.title("Enhanced Resume Analyzer with GLinER Focus")
    api_key = st.text_input("Enter your Google Gemini API key", type="password")
    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx"])

    # Nothing to process until both inputs have been supplied.
    if uploaded_file is None or not api_key:
        return

    try:
        model = authenticate_gemini(api_key)
        if model is None:
            return

        if uploaded_file.type == pdf_mime:
            resume_text = extract_text_from_pdf(uploaded_file)
        elif uploaded_file.type == docx_mime:
            resume_text = extract_text_from_doc(uploaded_file)
        else:
            st.error("Unsupported file format.")
            return

        # Run every extraction before rendering, so a failure shows only the
        # error message and no partial results.
        analysed = enhanced_nlp(resume_text)
        companies = extract_companies(analysed)
        experience = extract_experience(analysed)
        education = extract_education(analysed)
        contact_values = [
            (heading, extract_info_with_gliner(resume_text, query))
            for heading, query in contact_fields
        ]

        st.subheader("Extracted Information")
        st.write(f"**Years of Experience:** {experience}")
        st.write("**Companies:**", ", ".join(companies))
        st.write("**Education:**", ", ".join(education))
        for heading, value in contact_values:
            st.write(f"{heading} {value}")

        summary = generate_summary(resume_text, model)
        st.subheader("Resume Summary")
        st.write(summary)
    except Exception as e:
        st.error(f"Error during processing: {e}")
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|