Update app.py

app.py CHANGED
@@ -1,3 +1,4 @@
+
 import google.generativeai as genai
 import fitz  # PyMuPDF for PDF text extraction
 import streamlit as st
@@ -8,8 +9,14 @@ import re
 import dateparser
 from datetime import datetime
 import os
+from typing import List, Dict
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
-# Load SpaCy model for dependency parsing
+# Load SpaCy model for dependency parsing and NER
 nlp_spacy = spacy.load('en_core_web_sm')
 
 # Load the NER model
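Side note on the two models loaded here: `en_core_web_sm` supplies the dependency parses and entities used for college extraction, while the `wikineural` pipeline with `aggregation_strategy="simple"` merges word-piece tokens into whole entity spans, returning one dict per span with `entity_group`, `word`, `score`, `start`, and `end` keys; `extract_orgs` below depends on that shape. A minimal sketch of the expected output (the sample sentence is illustrative):

```python
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Each item is a dict such as {'entity_group': 'ORG', 'word': 'Acme Corp', 'score': ...}
for ent in ner("Jane Doe worked at Acme Corp in Berlin."):
    print(ent['entity_group'], ent['word'])
```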
@@ -17,9 +24,8 @@ tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
 model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
 nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
-
-def authenticate_gemini():
-    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
+def authenticate_gemini() -> genai.GenerativeModel:
+    # Read the key from the environment (Hugging Face Spaces secrets); never hard-code credentials
+    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
     if not api_key:
         st.error("Google Gemini API key not found. Please set it in the Hugging Face Spaces secrets.")
         return None
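The lines elided between this hunk and the next presumably perform the actual `genai.configure(...)` and `GenerativeModel(...)` calls. For orientation, a sketch of what that sequence typically looks like with this SDK (the `gemini-pro` model name is an assumption, not taken from this diff):

```python
import os
import google.generativeai as genai

def authenticate_gemini():
    api_key = os.environ.get("GOOGLE_GEMINI_API_KEY")
    if not api_key:
        return None
    try:
        genai.configure(api_key=api_key)            # register the key with the SDK
        return genai.GenerativeModel("gemini-pro")  # model name assumed for illustration
    except Exception:
        return None
```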
@@ -29,66 +35,93 @@ def authenticate_gemini():
         st.success("Gemini API successfully configured.")
         return model
     except Exception as e:
-        …
+        logger.error(f"Error configuring Gemini API: {e}")
+        st.error("Error configuring Gemini API. Please check your API key and try again.")
         return None
 
-
-def refine_org_entities(entities):
+def refine_org_entities(entities: List[str]) -> List[str]:
     refined_entities = set()
-    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.']
-
+    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.', 'Company', 'Group']
+
     for entity in entities:
+        # Strip leading articles that might otherwise hide a company name
+        entity = re.sub(r'^(The|A|An)\s+', '', entity).strip()
+
         if any(entity.endswith(suffix) for suffix in company_suffixes):
             refined_entities.add(entity)
-        elif re.match(r'([A-Z][a-z]+\s?)+', entity):
+        elif re.match(r'([A-Z][a-z]+\s?)+', entity):  # sequences of capitalized words
            refined_entities.add(entity)
+
     return list(refined_entities)
 
-
-def extract_orgs(text):
+def extract_orgs(text: str) -> List[str]:
     ner_results = nlp_ner(text)
     orgs = set()
     for entity in ner_results:
         if entity['entity_group'] == 'ORG':
             orgs.add(entity['word'])
-
     return refine_org_entities(orgs)
 
-
-def extract_text_from_pdf(pdf_file):
-    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
-    text = ""
-    for page_num in range(doc.page_count):
-        page = doc.load_page(page_num)
-        text += page.get_text()
-    return text
-
-# Extract text from DOCX
-def extract_text_from_doc(doc_file):
-    doc = Document(doc_file)
-    text = '\n'.join([para.text for para in doc.paragraphs])
-    return text
-
-# Summary generation function
-def generate_summary(text, model):
-    prompt = f"Can you summarize the following document in 100 words?\n\n{text}"
+def extract_text_from_pdf(pdf_file) -> str:
     try:
-        …
+        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+        text = ""
+        for page_num in range(doc.page_count):
+            page = doc.load_page(page_num)
+            text += page.get_text()
+        return text
     except Exception as e:
-        …
+        logger.error(f"Error extracting text from PDF: {e}")
+        return ""
 
-        …
-    return experience
-
-def extract_phone(text):
+def extract_text_from_doc(doc_file) -> str:
+    try:
+        doc = Document(doc_file)
+        text = '\n'.join([para.text for para in doc.paragraphs])
+        return text
+    except Exception as e:
+        logger.error(f"Error extracting text from DOCX: {e}")
+        return ""
+
+def generate_summary(text: str, model: genai.GenerativeModel) -> str:
+    prompt = f"Summarize the following resume in 100 words, highlighting key skills and experiences:\n\n{text}"
+    try:
+        response = model.generate_content(prompt)
+        return response.text
+    except Exception as e:
+        logger.error(f"Error generating summary: {e}")
+        return "Error generating summary. Please try again."
+
+def extract_experience(text: str) -> str:
+    # Patterns to match experience in years and months
+    experience_patterns = [
+        r'(\d+)\s*(?:years?|yrs?)',    # e.g., 5 years, 2 yrs
+        r'(\d+)\s*(?:months?|mos?)',   # e.g., 6 months
+        r'(\d+)\s*(?:years?|yrs?)\s*(?:and)?\s*(\d+)\s*(?:months?|mos?)'  # e.g., 2 years and 6 months
+    ]
+
+    # Extract and prioritize years of experience (months are ignored)
+    total_years = 0
+    for pattern in experience_patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        for match in matches:
+            # re.findall yields strings for single-group patterns, tuples for multi-group ones
+            if isinstance(match, str):
+                if 'year' in pattern:
+                    total_years += int(match)
+            else:
+                years, _months = int(match[0]), int(match[1])
+                total_years += years
+
+    if total_years > 0:
+        return f"{total_years} years"
+    else:
+        return "Experience not found"
+
+def extract_phone(text: str) -> str:
     phone_patterns = [
         r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
         r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
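The `isinstance` check in `extract_experience` above replaces the original `len(match) == 1` test, which does not do what it looks like: `re.findall` returns plain strings when the pattern has a single capture group (so `len(match)` is the string length), and tuples only for multi-group patterns. A quick check:

```python
import re

# Single capture group: findall returns strings, not 1-tuples
print(re.findall(r'(\d+)\s*(?:years?|yrs?)', "7 years at Acme, 15 yrs total"))
# -> ['7', '15']

# Two capture groups: findall returns tuples
print(re.findall(r'(\d+)\s*(?:years?|yrs?)\s*(?:and)?\s*(\d+)\s*(?:months?|mos?)',
                 "2 years and 6 months"))
# -> [('2', '6')]
```

Note also that the single-group year pattern matches inside "2 years and 6 months" as well, so such phrases are counted twice by the accumulator; ordering the patterns from most to least specific and stopping at the first hit would avoid that.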
@@ -99,12 +132,12 @@ def extract_phone(text):
            return match.group()
     return "Not found"
 
-def extract_email(text):
+def extract_email(text: str) -> str:
     email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
     match = re.search(email_pattern, text)
     return match.group() if match else "Not found"
 
-def extract_colleges(doc):
+def extract_colleges(doc) -> List[str]:
     colleges = set()
     edu_keywords = ["university", "college", "institute", "school"]
     for ent in doc.ents:
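A quick sanity check on the phone patterns. One caveat: `\b` only matches between a word character and a non-word character, so it can never match between a space and `(`; a parenthesized area code preceded by whitespace, e.g. `(555) 123-4567`, slips through both patterns as written (dropping the leading `\b`, or replacing it with a lookahead such as `(?=[\d(])`, would catch it):

```python
import re

pattern = r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b'

print(re.search(pattern, "Call 555-123-4567 today"))    # matches '555-123-4567'
print(re.search(pattern, "Call (555) 123-4567 today"))  # None: \b cannot match before '('
```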
@@ -112,27 +145,42 @@ def extract_colleges(doc):
            colleges.add(ent.text)
     return list(colleges)
 
-def extract_linkedin(text):
-    …
+def extract_linkedin(text: str) -> str:
+    linkedin_patterns = [
+        r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?',
+        r'linkedin\.com\/in\/[A-Za-z0-9_-]+',
+        r'@[A-Za-z0-9_-]+\s+\(LinkedIn\)'
+    ]
+    for pattern in linkedin_patterns:
+        match = re.search(pattern, text, re.IGNORECASE)
+        if match:
+            return match.group()
+    return "Not found"
+
+def analyze_resume(text: str, model: genai.GenerativeModel) -> Dict:
+    doc = nlp_spacy(text)
+    return {
+        "companies": extract_orgs(text),
+        "summary": generate_summary(text, model),
+        "experience": extract_experience(text),
+        "phone": extract_phone(text),
+        "email": extract_email(text),
+        "colleges": extract_colleges(doc),
+        "linkedin": extract_linkedin(text)
+    }
 
-# Main function to process the resume and return the analysis
 def main():
-    st.title("…")
+    st.title("Enhanced Resume Analyzer")
     st.write("Upload a resume to extract information, generate a summary, and analyze details.")
 
-    # Authenticate with Gemini API
     model = authenticate_gemini()
     if model is None:
         return
 
-    # File uploader for resume input
     uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])
 
     if uploaded_file is not None:
         try:
-            # Extract text from the uploaded resume
             file_ext = uploaded_file.name.split('.')[-1].lower()
             if file_ext == 'pdf':
                 resume_text = extract_text_from_pdf(uploaded_file)
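The new `analyze_resume` helper centralizes all the per-field extractors behind a single dict, which is what `main` consumes below. Separately, the LinkedIn patterns were tightened from `[A-z0-9_-]` to `[A-Za-z0-9_-]`: the `[A-z]` range covers all of ASCII 0x41 through 0x7A, which quietly includes `[`, `\`, `]`, `^`, and the backtick. A two-line demonstration:

```python
import re

print(re.fullmatch(r'[A-z]+', 'jane^doe'))     # matches, because ^ (0x5E) falls in A..z
print(re.fullmatch(r'[A-Za-z]+', 'jane^doe'))  # None
```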
@@ -143,37 +191,28 @@ def main():
                 return
 
             if not resume_text.strip():
-                st.error("The resume appears to be empty.")
+                st.error("The resume appears to be empty or couldn't be read.")
                 return
 
-            …
-            # Extract information
-            companies = extract_orgs(resume_text)
-            summary = generate_summary(resume_text, model)
-            experience = extract_experience(doc)
-            phone = extract_phone(resume_text)
-            email = extract_email(resume_text)
-            colleges = extract_colleges(doc)
-            linkedin = extract_linkedin(resume_text)
+            with st.spinner("Analyzing resume..."):
+                results = analyze_resume(resume_text, model)
 
-            # Display results
             st.subheader("Extracted Information")
-            st.write(f"Experience: {experience}")
-            st.write("Companies Worked For:")
-            st.write(", ".join(companies))
-            st.write(f"Phone Number: {phone}")
-            st.write(f"Email ID: {email}")
-            st.write("Colleges Attended:")
-            st.write(", ".join(colleges))
-            st.write(f"LinkedIn: {linkedin}")
+            st.write(f"Experience: {results['experience']}")
+            st.write("Companies Worked For:")
+            st.write(", ".join(results['companies']))
+            st.write(f"Phone Number: {results['phone']}")
+            st.write(f"Email ID: {results['email']}")
+            st.write("Colleges Attended:")
+            st.write(", ".join(results['colleges']))
+            st.write(f"LinkedIn: {results['linkedin']}")
 
             st.subheader("Generated Summary")
-            st.write(summary)
+            st.write(results['summary'])
 
         except Exception as e:
-            …
+            logger.error(f"Error during resume analysis: {e}")
+            st.error("An error occurred during resume analysis. Please try again or contact support if the issue persists.")
 
 if __name__ == "__main__":
     main()
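Taken together, the pure-text helpers can be smoke-tested without Streamlit or a Gemini key; a minimal sketch against the functions defined above (the sample resume line is made up):

```python
sample = ("Jane Doe, 5 years at Acme Corp. "
          "jane.doe@example.com, 555-123-4567, linkedin.com/in/janedoe")

print(extract_email(sample))       # jane.doe@example.com
print(extract_phone(sample))       # 555-123-4567
print(extract_experience(sample))  # 5 years
print(extract_linkedin(sample))    # linkedin.com/in/janedoe
```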