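"""Resume Ranking with TF-IDF.

A small Gradio app that extracts text from uploaded resumes (PDF, DOCX, or TXT),
preprocesses it with NLTK, and ranks candidates against a job description by
TF-IDF cosine similarity.
"""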
import os
import gradio as gr
import PyPDF2
import docx2txt
import logging
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# ----------------------------------------------------------------------------
# 1) Utility Functions: Parsing & Preprocessing
# ----------------------------------------------------------------------------
def extract_text_from_pdf(file_obj):
"""Extract all text from a PDF file object."""
text_content = []
try:
logging.info("Loading PDF file.")
pdf_reader = PyPDF2.PdfReader(file_obj)
for page in pdf_reader.pages:
page_text = page.extract_text()
if page_text:
text_content.append(page_text)
extracted_text = "\n".join(text_content)
logging.info(f"Extracted PDF content: {extracted_text[:500]}...")
print(extracted_text) # Print the extracted text
return extracted_text
except Exception as e:
logging.error(f"Error reading PDF: {e}")
return f"Error reading PDF: {e}"
def extract_text_from_docx(file_path):
"""Extract all text from a DOCX file on disk."""
try:
logging.info("Loading DOCX file.")
extracted_text = docx2txt.process(file_path)
logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")
print(extracted_text) # Print the extracted text
return extracted_text
except Exception as e:
logging.error(f"Error reading DOCX: {e}")
return f"Error reading DOCX: {e}"
def extract_text_from_txt(file_obj):
"""Extract all text from a TXT file object."""
try:
logging.info("Loading TXT file.")
extracted_text = file_obj.read().decode("utf-8", errors="ignore")
logging.info(f"Extracted TXT content: {extracted_text[:500]}...")
print(extracted_text) # Print the extracted text
return extracted_text
except Exception as e:
logging.error(f"Error reading TXT: {e}")
return f"Error reading TXT: {e}"
def preprocess_text(text):
"""
Lowercase, tokenize, remove stopwords and non-alphabetic tokens,
and then rejoin into a clean string.
"""
logging.info("Preprocessing text.")
text = str(text).lower()
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
processed_text = " ".join(filtered_tokens)
logging.info(f"Preprocessed text: {processed_text[:500]}...")
return processed_text
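# Illustrative example (not executed): preprocess_text("The engineers are building APIs!")
# lowercases and tokenizes the input, drops stopwords ("the", "are") and
# non-alphabetic tokens ("!"), and returns something like "engineers building apis".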
# ----------------------------------------------------------------------------
# 2) Core Ranking Logic with TF-IDF & Cosine Similarity
# ----------------------------------------------------------------------------
def rank_resumes_with_tfidf(job_description: str, resumes: dict):
    """Rank resumes against a job description by TF-IDF cosine similarity."""
    logging.info("Ranking resumes using TF-IDF.")
    if not resumes:
        return []
    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
    corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
    filenames = list(preprocessed_resumes.keys())
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    jd_vector = tfidf_matrix[0:1]      # first row: the job description
    resume_vectors = tfidf_matrix[1:]  # remaining rows: the resumes
    similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
    results = list(zip(filenames, similarities))
    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
    logging.info(f"Ranking completed: {results_sorted}")
    return results_sorted
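# A quick sketch of the expected output shape (filenames and scores hypothetical):
#     rank_resumes_with_tfidf("python developer", {"a.pdf": "...", "b.docx": "..."})
#     -> [("a.pdf", 0.42), ("b.docx", 0.17)]
# Cosine similarity scores each resume vector r against the job-description
# vector j as (j . r) / (|j| * |r|); since TF-IDF vectors are non-negative,
# every score falls in [0, 1], with 1 meaning identical term weighting.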
# ----------------------------------------------------------------------------
# 3) Gradio Callback Function
# ----------------------------------------------------------------------------
def analyze_cvs(job_description, cv_files):
    """Gradio callback: extract text from each uploaded CV and rank the candidates."""
    logging.info("Starting CV analysis.")
    if not cv_files:
        return []
    resumes_data = {}
    for uploaded_file in cv_files:
        # With type="filepath", Gradio passes a path string; older Gradio
        # versions pass a tempfile wrapper with a .name attribute. Handle both.
        file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
        filename = os.path.basename(file_path)
        file_ext = os.path.splitext(filename)[1].lower()
        try:
            logging.info(f"Processing file: {filename}")
            if file_ext == ".pdf":
                with open(file_path, "rb") as f:  # open the temporary file created by Gradio
                    file_content = extract_text_from_pdf(f)
            elif file_ext == ".txt":
                with open(file_path, "rb") as f:
                    file_content = extract_text_from_txt(f)
            elif file_ext == ".docx":
                file_content = extract_text_from_docx(file_path)  # docx2txt accepts a filepath
            else:
                file_content = "Unsupported file type."
        except Exception as e:
            logging.error(f"Error processing file: {e}")
            file_content = f"Error processing file: {e}"
        logging.info(f"Extracted CV Content ({filename}): {file_content[:500]}...")
        resumes_data[filename] = file_content
    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[filename, round(float(score), 3)] for filename, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data
# ----------------------------------------------------------------------------
# 4) Gradio Interface
# ----------------------------------------------------------------------------
def create_gradio_interface():
    """Build the Gradio UI: a job-description box, a multi-file uploader, and a results table."""
    job_description_input = gr.Textbox(label="Job Description", placeholder="Describe the role here...", lines=4)
    cv_input = gr.File(label="Upload resumes (PDF/DOCX/TXT)", file_count="multiple", type="filepath")
    results_output = gr.Dataframe(headers=["Candidate CV", "Similarity Score"], label="Ranked Candidates")
    demo = gr.Interface(
        fn=analyze_cvs,
        inputs=[job_description_input, cv_input],
        outputs=[results_output],
        title="Resume Ranking with TF-IDF",
    )
    return demo
# ----------------------------------------------------------------------------
# 5) Main Script
# ----------------------------------------------------------------------------
if __name__ == "__main__":
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
app = create_gradio_interface()
app.launch(server_name="0.0.0.0", server_port=7860, debug=True) |