Update app.py

app.py CHANGED
@@ -1,3 +1,4 @@
+
 import google.generativeai as genai
 import fitz  # PyMuPDF for PDF text extraction
 import streamlit as st
@@ -8,8 +9,14 @@ import re
 import dateparser
 from datetime import datetime
 import os
+from typing import List, Dict
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-# Load SpaCy model for dependency parsing
+# Load SpaCy model for dependency parsing and NER
 nlp_spacy = spacy.load('en_core_web_sm')
 
 # Load the NER model
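Side note on the two models loaded here: `en_core_web_sm` supplies the dependency parses and entities used for college extraction, while the `wikineural` pipeline with `aggregation_strategy="simple"` merges word-piece tokens into whole entity spans, returning one dict per span with `entity_group`, `word`, `score`, `start`, and `end` keys; `extract_orgs` below depends on that shape. A minimal sketch of the expected output (the sample sentence is illustrative):

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Each item is a dict such as {'entity_group': 'ORG', 'word': 'Acme Corp', 'score': ...}
for ent in ner("Jane Doe worked at Acme Corp in Berlin."):
    print(ent['entity_group'], ent['word'])
```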
@@ -17,9 +24,8 @@ tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
 model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
 nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
-
-def authenticate_gemini():
-    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
+def authenticate_gemini() -> genai.GenerativeModel:
+    # Read the key from the environment (Hugging Face Spaces secrets); never hard-code credentials
+    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
     if not api_key:
         st.error("Google Gemini API key not found. Please set it in the Hugging Face Spaces secrets.")
         return None
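The lines elided between this hunk and the next presumably perform the actual `genai.configure(...)` and `GenerativeModel(...)` calls. For orientation, a sketch of what that sequence typically looks like with this SDK (the `gemini-pro` model name is an assumption, not taken from this diff):

```python
import os
import google.generativeai as genai

def authenticate_gemini():
    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
    if not api_key:
        return None
    try:
        genai.configure(api_key=api_key)            # register the key with the SDK
        return genai.GenerativeModel("gemini-pro")  # model name assumed for illustration
    except Exception:
        return None
```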
@@ -29,66 +35,93 @@ def authenticate_gemini():
         st.success("Gemini API successfully configured.")
         return model
     except Exception as e:
-        …
+        logger.error(f"Error configuring Gemini API: {e}")
+        st.error("Error configuring Gemini API. Please check your API key and try again.")
         return None
 
-
-def refine_org_entities(entities):
+def refine_org_entities(entities: List[str]) -> List[str]:
     refined_entities = set()
-    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']
-
+    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.', 'Company', 'Group']
+
     for entity in entities:
+        # Strip leading articles that might otherwise hide a company name
+        entity = re.sub(r'^(The|A|An)\s+', '', entity).strip()
+
         if any(entity.endswith(suffix) for suffix in company_suffixes):
             refined_entities.add(entity)
-        elif re.match(r'([A-Z][a-z]+\s?)+', entity):
+        elif re.match(r'([A-Z][a-z]+\s?)+', entity):  # sequences of capitalized words
            refined_entities.add(entity)
+
     return list(refined_entities)
 
-
-def extract_orgs(text):
+def extract_orgs(text: str) -> List[str]:
     ner_results = nlp_ner(text)
     orgs = set()
     for entity in ner_results:
         if entity['entity_group'] == 'ORG':
             orgs.add(entity['word'])
-
     return refine_org_entities(orgs)
 
-
-def extract_text_from_pdf(pdf_file):
-    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-    text = ""
-    for page_num in range(doc.page_count):
-        page = doc.load_page(page_num)
-        text += page.get_text()
-    return text
-
-# Extract text from DOCX
-def extract_text_from_doc(doc_file):
-    doc = Document(doc_file)
-    text = '\n'.join([para.text for para in doc.paragraphs])
-    return text
-
-# Summary generation function
-def generate_summary(text, model):
-    prompt = f"Can you summarize the following document in 100 words?\n\n{text}"
+def extract_text_from_pdf(pdf_file) -> str:
     try:
-        …
+        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+        text = ""
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+            text += page.get_text()
+        return text
     except Exception as e:
-        …
+        logger.error(f"Error extracting text from PDF: {e}")
+        return ""
 
-        …
-    return experience
-
-def extract_phone(text):
+def extract_text_from_doc(doc_file) -> str:
+    try:
+        doc = Document(doc_file)
+        text = '\n'.join([para.text for para in doc.paragraphs])
+        return text
+    except Exception as e:
+        logger.error(f"Error extracting text from DOCX: {e}")
+        return ""
+
+def generate_summary(text: str, model: genai.GenerativeModel) -> str:
+    prompt = f"Summarize the following resume in 100 words, highlighting key skills and experiences:\n\n{text}"
+    try:
+        response = model.generate_content(prompt)
+        return response.text
+    except Exception as e:
+        logger.error(f"Error generating summary: {e}")
+        return "Error generating summary. Please try again."
+
+def extract_experience(text: str) -> str:
+    # Patterns to match experience in years and months
+    experience_patterns = [
+        r'(\d+)\s*(?:years?|yrs?)',    # e.g., 5 years, 2 yrs
+        r'(\d+)\s*(?:months?|mos?)',   # e.g., 6 months
+        r'(\d+)\s*(?:years?|yrs?)\s*(?:and)?\s*(\d+)\s*(?:months?|mos?)'  # e.g., 2 years and 6 months
+    ]
+
+    # Extract and prioritize years of experience (months are ignored)
+    total_years = 0
+    for pattern in experience_patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        for match in matches:
+            # re.findall yields strings for single-group patterns, tuples for multi-group ones
+            if isinstance(match, str):
+                if 'year' in pattern:
+                    total_years += int(match)
+            else:
+                years, _months = int(match[0]), int(match[1])
+                total_years += years
+
+    if total_years > 0:
+        return f"{total_years} years"
+    else:
+        return "Experience not found"
+
+def extract_phone(text: str) -> str:
     phone_patterns = [
         r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
         r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
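The `isinstance` check in `extract_experience` above replaces the original `len(match) == 1` test, which does not do what it looks like: `re.findall` returns plain strings when the pattern has a single capture group (so `len(match)` is the string length), and tuples only for multi-group patterns. A quick check:

```python
import re

# Single capture group: findall returns strings, not 1-tuples
print(re.findall(r'(\d+)\s*(?:years?|yrs?)', "7 years at Acme, 15 yrs total"))
# -> ['7', '15']

# Two capture groups: findall returns tuples
print(re.findall(r'(\d+)\s*(?:years?|yrs?)\s*(?:and)?\s*(\d+)\s*(?:months?|mos?)',
                 "2 years and 6 months"))
# -> [('2', '6')]
```

Note also that the single-group year pattern matches inside "2 years and 6 months" as well, so such phrases are counted twice by the accumulator; ordering the patterns from most to least specific and stopping at the first hit would avoid that.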
@@ -99,12 +132,12 @@ def extract_phone(text):
            return match.group()
     return "Not found"
 
-def extract_email(text):
+def extract_email(text: str) -> str:
     email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
     match = re.search(email_pattern, text)
     return match.group() if match else "Not found"
 
-def extract_colleges(doc):
+def extract_colleges(doc) -> List[str]:
     colleges = set()
     edu_keywords = ["university", "college", "institute", "school"]
     for ent in doc.ents:
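A quick sanity check on the phone patterns. One caveat: `\b` only matches between a word character and a non-word character, so it can never match between a space and `(`; a parenthesized area code preceded by whitespace, e.g. `(555) 123-4567`, slips through both patterns as written (dropping the leading `\b`, or replacing it with a lookahead such as `(?=[\d(])`, would catch it):

```python
import re

pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'

print(re.search(pattern, "Call 555-123-4567 today"))    # matches '555-123-4567'
print(re.search(pattern, "Call (555) 123-4567 today"))  # None: \b cannot match before '('
```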
@@ -112,27 +145,42 @@ def extract_colleges(doc):
            colleges.add(ent.text)
     return list(colleges)
 
-def extract_linkedin(text):
-    …
+def extract_linkedin(text: str) -> str:
+    linkedin_patterns = [
+        r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?',
+        r'linkedin\.com\/in\/[A-Za-z0-9_-]+',
+        r'@[A-Za-z0-9_-]+\s+\(LinkedIn\)'
+    ]
+    for pattern in linkedin_patterns:
+        match = re.search(pattern, text, re.IGNORECASE)
+        if match:
+            return match.group()
+    return "Not found"
+
+def analyze_resume(text: str, model: genai.GenerativeModel) -> Dict:
+    doc = nlp_spacy(text)
+    return {
+        "companies": extract_orgs(text),
+        "summary": generate_summary(text, model),
+        "experience": extract_experience(text),
+        "phone": extract_phone(text),
+        "email": extract_email(text),
+        "colleges": extract_colleges(doc),
+        "linkedin": extract_linkedin(text)
+    }
 
-# Main function to process the resume and return the analysis
 def main():
-    st.title("…")
+    st.title("Enhanced Resume Analyzer")
     st.write("Upload a resume to extract information, generate a summary, and analyze details.")
 
-    # Authenticate with Gemini API
     model = authenticate_gemini()
     if model is None:
         return
 
-    # File uploader for resume input
     uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])
 
     if uploaded_file is not None:
         try:
-            # Extract text from the uploaded resume
             file_ext = uploaded_file.name.split('.')[-1].lower()
             if file_ext == 'pdf':
                 resume_text = extract_text_from_pdf(uploaded_file)
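The new `analyze_resume` helper centralizes all the per-field extractors behind a single dict, which is what `main` consumes below. Separately, the LinkedIn patterns were tightened from `[A-z0-9_-]` to `[A-Za-z0-9_-]`: the `[A-z]` range covers all of ASCII 0x41 through 0x7A, which quietly includes `[`, `\`, `]`, `^`, and the backtick. A two-line demonstration:

```python
import re

print(re.fullmatch(r'[A-z]+', 'jane^doe'))     # matches, because ^ (0x5E) falls in A..z
print(re.fullmatch(r'[A-Za-z]+', 'jane^doe'))  # None
```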
@@ -143,37 +191,28 @@ def main():
                 return
 
             if not resume_text.strip():
-                st.error("The resume appears to be empty.")
+                st.error("The resume appears to be empty or couldn't be read.")
                 return
 
-            …
-            # Extract information
-            companies = extract_orgs(resume_text)
-            summary = generate_summary(resume_text, model)
-            experience = extract_experience(doc)
-            phone = extract_phone(resume_text)
-            email = extract_email(resume_text)
-            colleges = extract_colleges(doc)
-            linkedin = extract_linkedin(resume_text)
+            with st.spinner("Analyzing resume..."):
+                results = analyze_resume(resume_text, model)
 
-            # Display results
             st.subheader("Extracted Information")
-            st.write(f"Experience: {experience}")
-            st.write("Companies Worked For:")
-            st.write(", ".join(companies))
-            st.write(f"Phone Number: {phone}")
-            st.write(f"Email ID: {email}")
-            st.write("Colleges Attended:")
-            st.write(", ".join(colleges))
-            st.write(f"LinkedIn: {linkedin}")
+            st.write(f"Experience: {results['experience']}")
+            st.write("Companies Worked For:")
+            st.write(", ".join(results['companies']))
+            st.write(f"Phone Number: {results['phone']}")
+            st.write(f"Email ID: {results['email']}")
+            st.write("Colleges Attended:")
+            st.write(", ".join(results['colleges']))
+            st.write(f"LinkedIn: {results['linkedin']}")
 
             st.subheader("Generated Summary")
-            st.write(summary)
+            st.write(results['summary'])
 
         except Exception as e:
-            …
+            logger.error(f"Error during resume analysis: {e}")
+            st.error("An error occurred during resume analysis. Please try again or contact support if the issue persists.")
 
 if __name__ == "__main__":
     main()
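Taken together, the pure-text helpers can be smoke-tested without Streamlit or a Gemini key; a minimal sketch against the functions defined above (the sample resume line is made up):

```python
sample = ("Jane Doe, 5 years at Acme Corp. "
          "jane.doe@example.com, 555-123-4567, linkedin.com/in/janedoe")

print(extract_email(sample))       # jane.doe@example.com
print(extract_phone(sample))       # 555-123-4567
print(extract_experience(sample))  # 5 years
print(extract_linkedin(sample))    # linkedin.com/in/janedoe
```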