bangaboy committed on
Commit
1ec17ce
·
verified ·
1 Parent(s): cd99cf5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -79
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import google.generativeai as genai
2
  import fitz # PyMuPDF for PDF text extraction
3
  import streamlit as st
@@ -8,8 +9,14 @@ import re
8
  import dateparser
9
  from datetime import datetime
10
  import os
 
 
 
 
 
 
11
 
12
- # Load SpaCy model for dependency parsing
13
  nlp_spacy = spacy.load('en_core_web_sm')
14
 
15
  # Load the NER model
@@ -17,9 +24,8 @@ tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ne
17
  model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
18
  nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
19
 
20
- # Function to authenticate with Gemini API
21
- def authenticate_gemini():
22
- api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
23
  if not api_key:
24
  st.error("Google Gemini API key not found. Please set it in the Hugging Face Spaces secrets.")
25
  return None
@@ -29,66 +35,93 @@ def authenticate_gemini():
29
  st.success("Gemini API successfully configured.")
30
  return model
31
  except Exception as e:
32
- st.error(f"Error configuring Gemini API: {e}")
 
33
  return None
34
 
35
- # Function to filter and refine extracted ORG entities
36
- def refine_org_entities(entities):
37
  refined_entities = set()
38
- company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']
39
-
40
  for entity in entities:
 
 
 
41
  if any(entity.endswith(suffix) for suffix in company_suffixes):
42
  refined_entities.add(entity)
43
- elif re.match(r'([A-Z][a-z]+)\s([A-Z][a-z]+)', entity):
44
  refined_entities.add(entity)
 
45
  return list(refined_entities)
46
 
47
- # Function to extract ORG entities using NER
48
- def extract_orgs(text):
49
  ner_results = nlp_ner(text)
50
  orgs = set()
51
  for entity in ner_results:
52
  if entity['entity_group'] == 'ORG':
53
  orgs.add(entity['word'])
54
-
55
  return refine_org_entities(orgs)
56
 
57
- # Extract text from PDF
58
- def extract_text_from_pdf(pdf_file):
59
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
60
- text = ""
61
- for page_num in range(doc.page_count):
62
- page = doc.load_page(page_num)
63
- text += page.get_text()
64
- return text
65
-
66
- # Extract text from DOCX
67
- def extract_text_from_doc(doc_file):
68
- doc = Document(doc_file)
69
- text = '\n'.join([para.text for para in doc.paragraphs])
70
- return text
71
-
72
- # Summary generation function
73
- def generate_summary(text, model):
74
- prompt = f"Can you summarize the following document in 100 words?\n\n{text}"
75
  try:
76
- response = model.generate_content(prompt)
77
- return response.text
 
 
 
 
78
  except Exception as e:
79
- return f"Error generating summary: {str(e)}"
 
80
 
81
- # Additional resume parsing functions
82
- def extract_experience(doc):
83
- experience = 0
84
- for ent in doc.ents:
85
- if ent.label_ == "DATE":
86
- date = dateparser.parse(ent.text)
87
- if date:
88
- experience = max(experience, datetime.now().year - date.year)
89
- return experience
90
 
91
- def extract_phone(text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  phone_patterns = [
93
  r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
94
  r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
@@ -99,12 +132,12 @@ def extract_phone(text):
99
  return match.group()
100
  return "Not found"
101
 
102
- def extract_email(text):
103
  email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
104
  match = re.search(email_pattern, text)
105
  return match.group() if match else "Not found"
106
 
107
- def extract_colleges(doc):
108
  colleges = set()
109
  edu_keywords = ["university", "college", "institute", "school"]
110
  for ent in doc.ents:
@@ -112,27 +145,42 @@ def extract_colleges(doc):
112
  colleges.add(ent.text)
113
  return list(colleges)
114
 
115
- def extract_linkedin(text):
116
- linkedin_pattern = r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-z0-9_-]+\/?'
117
- match = re.search(linkedin_pattern, text)
118
- return match.group() if match else "Not found"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
- # Main function to process the resume and return the analysis
121
  def main():
122
- st.title("Comprehensive Resume Analyzer")
123
  st.write("Upload a resume to extract information, generate a summary, and analyze details.")
124
 
125
- # Authenticate with Gemini API
126
  model = authenticate_gemini()
127
  if model is None:
128
  return
129
 
130
- # File uploader for resume input
131
  uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])
132
 
133
  if uploaded_file is not None:
134
  try:
135
- # Extract text from the uploaded resume
136
  file_ext = uploaded_file.name.split('.')[-1].lower()
137
  if file_ext == 'pdf':
138
  resume_text = extract_text_from_pdf(uploaded_file)
@@ -143,37 +191,28 @@ def main():
143
  return
144
 
145
  if not resume_text.strip():
146
- st.error("The resume appears to be empty.")
147
  return
148
 
149
- # Process the resume
150
- doc = nlp_spacy(resume_text)
151
-
152
- # Extract information
153
- companies = extract_orgs(resume_text)
154
- summary = generate_summary(resume_text, model)
155
- experience = extract_experience(doc)
156
- phone = extract_phone(resume_text)
157
- email = extract_email(resume_text)
158
- colleges = extract_colleges(doc)
159
- linkedin = extract_linkedin(resume_text)
160
 
161
- # Display results
162
  st.subheader("Extracted Information")
163
- st.write(f"*Years of Experience:* {experience}")
164
- st.write("*Companies Worked For:*")
165
- st.write(", ".join(companies))
166
- st.write(f"*Phone Number:* {phone}")
167
- st.write(f"*Email ID:* {email}")
168
- st.write("*Colleges Attended:*")
169
- st.write(", ".join(colleges))
170
- st.write(f"*LinkedIn ID:* {linkedin}")
171
 
172
  st.subheader("Generated Summary")
173
- st.write(summary)
174
 
175
  except Exception as e:
176
- st.error(f"Error during processing: {e}")
 
177
 
178
- if __name__ == "__main__":
179
  main()
 
1
+
2
  import google.generativeai as genai
3
  import fitz # PyMuPDF for PDF text extraction
4
  import streamlit as st
 
9
  import dateparser
10
  from datetime import datetime
11
  import os
12
+ from typing import List, Dict
13
+ import logging
14
+
15
# Configure module-level logging (the doubled underscores in __name__ were
# stripped by the diff renderer; `_name_` is undefined and raises NameError).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
18
 
19
+ # Load SpaCy model for dependency parsing and NER
20
  nlp_spacy = spacy.load('en_core_web_sm')
21
 
22
  # Load the NER model
 
24
  model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
25
  nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
26
 
27
def authenticate_gemini() -> genai.GenerativeModel:
    """Configure the Gemini client and return a generative model handle.

    Returns:
        A configured ``genai.GenerativeModel``, or ``None`` (after surfacing
        a Streamlit error) when the key is missing or configuration fails.
    """
    # SECURITY: never hard-code the API key in source — the committed literal
    # key must be revoked. Read it from the environment instead (set it in
    # the Hugging Face Spaces secrets), as the previous revision did.
    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
    if not api_key:
        st.error("Google Gemini API key not found. Please set it in the Hugging Face Spaces secrets.")
        return None
    try:
        genai.configure(api_key=api_key)
        # NOTE(review): the model-name line was cut from this diff hunk —
        # confirm the exact model identifier against the full file.
        model = genai.GenerativeModel('gemini-pro')
        st.success("Gemini API successfully configured.")
        return model
    except Exception as e:
        logger.error(f"Error configuring Gemini API: {e}")
        st.error(f"Error configuring Gemini API. Please check your API key and try again.")
        return None
41
 
42
def refine_org_entities(entities: List[str]) -> List[str]:
    """Filter raw ORG entities down to strings that look like company names.

    An entity is kept when, after stripping a leading article ("The"/"A"/"An"),
    it either ends with a known corporate suffix or begins with a run of
    capitalized words.
    """
    suffixes = ('Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.', 'Company', 'Group')
    capitalized_run = re.compile(r'([A-Z][a-z]+\s?)+')
    kept = set()
    for raw in entities:
        # Drop a leading article that would otherwise hide the company name.
        name = re.sub(r'^(The|A|An)\s+', '', raw).strip()
        if name.endswith(suffixes) or capitalized_run.match(name):
            kept.add(name)
    return list(kept)
56
 
57
def extract_orgs(text: str) -> List[str]:
    """Run the NER pipeline over *text* and return the refined ORG entities."""
    raw_orgs = {
        hit['word']
        for hit in nlp_ner(text)
        if hit['entity_group'] == 'ORG'
    }
    return refine_org_entities(raw_orgs)
64
 
65
def extract_text_from_pdf(pdf_file) -> str:
    """Extract all page text from an uploaded PDF.

    Args:
        pdf_file: a file-like object with a ``read()`` method (Streamlit upload).

    Returns:
        The concatenated text of every page, or "" on any extraction error.
    """
    try:
        # Use the context manager so the fitz Document is always closed —
        # the original left it open (resource leak).
        with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
            return "".join(doc.load_page(page_num).get_text()
                           for page_num in range(doc.page_count))
    except Exception as e:
        # Best-effort: log and return an empty string so the caller can
        # report "empty resume" instead of crashing.
        logger.error(f"Error extracting text from PDF: {e}")
        return ""
76
 
77
def extract_text_from_doc(doc_file) -> str:
    """Extract paragraph text from a DOCX file, newline-joined.

    Returns "" (after logging) if python-docx fails to parse the file.
    """
    try:
        parsed = Document(doc_file)
        paragraphs = (para.text for para in parsed.paragraphs)
        return '\n'.join(paragraphs)
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return ""
 
85
 
86
def generate_summary(text: str, model: genai.GenerativeModel) -> str:
    """Ask the Gemini model for a 100-word resume summary.

    Returns a user-facing error string (and logs the exception) on failure.
    """
    request = f"Summarize the following resume in 100 words, highlighting key skills and experiences:\n\n{text}"
    try:
        reply = model.generate_content(request)
        return reply.text
    except Exception as e:
        logger.error(f"Error generating summary: {e}")
        return "Error generating summary. Please try again."
94
+
95
def extract_experience(text: str) -> str:
    """Sum all "N years" mentions in *text* and report the total.

    Months are deliberately ignored (only whole years are reported).

    Returns:
        e.g. "7 years", or "Experience not found" when no year count appears.

    Bug fixed: the original iterated several patterns with ``re.findall``;
    a single-group pattern yields *strings*, so ``len(match)`` counted the
    number of digits ("10 years" was misread as 1 year), and the separate
    "N years and M months" pattern double-counted years already matched by
    the plain "N years" pattern. One combined pattern avoids both problems.
    """
    # "N years" with an optional trailing "and M months" consumed in the
    # same match so the months text can never be re-matched as years.
    pattern = re.compile(
        r'(\d+)\s*(?:years?|yrs?)(?:\s*(?:and)?\s*\d+\s*(?:months?|mos?))?',
        re.IGNORECASE,
    )
    total_years = sum(int(m.group(1)) for m in pattern.finditer(text))
    if total_years > 0:
        return f"{total_years} years"
    return "Experience not found"
122
+
123
+
124
+ def extract_phone(text: str) -> str:
125
  phone_patterns = [
126
  r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
127
  r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
 
132
  return match.group()
133
  return "Not found"
134
 
135
def extract_email(text: str) -> str:
    """Return the first email-like substring in *text*, or "Not found"."""
    found = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)
    return "Not found" if found is None else found.group()
139
 
140
+ def extract_colleges(doc) -> List[str]:
141
  colleges = set()
142
  edu_keywords = ["university", "college", "institute", "school"]
143
  for ent in doc.ents:
 
145
  colleges.add(ent.text)
146
  return list(colleges)
147
 
148
def extract_linkedin(text: str) -> str:
    """Find a LinkedIn profile reference in *text*.

    Tries a full URL first, then a bare "linkedin.com/in/..." slug, then an
    "@handle (LinkedIn)" mention. Returns "Not found" when nothing matches.

    Bug fixed: the original used the ``[A-z...]`` character class, which in
    regex also matches the ASCII characters between 'Z' and 'a'
    (``[ \\ ] ^ _ `` backtick) — replaced with the intended ``[A-Za-z...]``.
    """
    linkedin_patterns = [
        r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?',
        r'linkedin\.com\/in\/[A-Za-z0-9_-]+',
        r'@[A-Za-z0-9_-]+\s+\(LinkedIn\)',
    ]
    for pattern in linkedin_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group()
    return "Not found"
159
+
160
def analyze_resume(text: str, model: genai.GenerativeModel) -> Dict:
    """Run every extractor over the resume text and collect results in a dict.

    Keys: companies, summary, experience, phone, email, colleges, linkedin.
    """
    parsed = nlp_spacy(text)
    results: Dict = {}
    results["companies"] = extract_orgs(text)
    results["summary"] = generate_summary(text, model)
    results["experience"] = extract_experience(text)
    results["phone"] = extract_phone(text)
    results["email"] = extract_email(text)
    results["colleges"] = extract_colleges(parsed)
    results["linkedin"] = extract_linkedin(text)
    return results
171
 
 
172
def main():
    """Streamlit entry point: upload a resume, extract fields, show a summary."""
    st.title("Enhanced Resume Analyzer")
    st.write("Upload a resume to extract information, generate a summary, and analyze details.")

    model = authenticate_gemini()
    if model is None:
        return

    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])

    if uploaded_file is not None:
        try:
            file_ext = uploaded_file.name.split('.')[-1].lower()
            if file_ext == 'pdf':
                resume_text = extract_text_from_pdf(uploaded_file)
            elif file_ext in ('docx', 'doc'):
                # NOTE(review): this branch was cut from the diff hunk in both
                # revisions; reconstructed from the uploader's accepted types —
                # confirm against the full file.
                resume_text = extract_text_from_doc(uploaded_file)
            else:
                st.error("Unsupported file format.")
                return

            if not resume_text.strip():
                st.error("The resume appears to be empty or couldn't be read.")
                return

            with st.spinner("Analyzing resume..."):
                results = analyze_resume(resume_text, model)

            st.subheader("Extracted Information")
            st.write(f"Experience: {results['experience']}")
            st.write("Companies Worked For:")
            st.write(", ".join(results['companies']))
            st.write(f"Phone Number: {results['phone']}")
            st.write(f"Email ID: {results['email']}")
            st.write("Colleges Attended:")
            st.write(", ".join(results['colleges']))
            st.write(f"LinkedIn: {results['linkedin']}")

            st.subheader("Generated Summary")
            st.write(results['summary'])

        except Exception as e:
            # Log the real exception; show the user a generic message.
            logger.error(f"Error during resume analysis: {e}")
            st.error("An error occurred during resume analysis. Please try again or contact support if the issue persists.")
216
 
217
# Script entry point (the dunder underscores were stripped by the diff
# renderer; `_name_`/`_main_` would never be true and main() would not run).
if __name__ == "__main__":
    main()