import os
import logging

import gradio as gr
import PyPDF2
import docx2txt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK data that word_tokenize and stopwords depend on.
# Recent NLTK releases load the tokenizer from 'punkt_tab'; 'punkt' is kept
# for older releases.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
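# With this configuration, a log line looks like (timestamp illustrative):
#   2024-01-01 12:00:00,000 - INFO - Loading PDF file.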
# ----------------------------------------------------------------------------
# 1) Utility Functions: Parsing & Preprocessing
# ----------------------------------------------------------------------------
def extract_text_from_pdf(file_obj):
    """Extract all text from a PDF file object."""
    text_content = []
    try:
        logging.info("Loading PDF file.")
        pdf_reader = PyPDF2.PdfReader(file_obj)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_content.append(page_text)
        extracted_text = "\n".join(text_content)
        logging.info(f"Extracted PDF content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return f"Error reading PDF: {e}"

def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file on disk."""
    try:
        logging.info("Loading DOCX file.")
        extracted_text = docx2txt.process(file_path)
        logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading DOCX: {e}")
        return f"Error reading DOCX: {e}"

def extract_text_from_txt(file_obj):
    """Extract all text from a TXT file object opened in binary mode."""
    try:
        logging.info("Loading TXT file.")
        extracted_text = file_obj.read().decode("utf-8", errors="ignore")
        logging.info(f"Extracted TXT content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading TXT: {e}")
        return f"Error reading TXT: {e}"

def preprocess_text(text):
    """
    Lowercase, tokenize, remove stopwords and non-alphabetic tokens,
    and then rejoin into a clean string.
    """
    logging.info("Preprocessing text.")
    text = str(text).lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    processed_text = " ".join(filtered_tokens)
    logging.info(f"Preprocessed text: {processed_text[:500]}...")
    return processed_text

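# Example (illustrative): preprocess_text("The Senior Engineer, 5+ years!")
# returns "senior engineer years" -- the stopword "the", the punctuation, and
# the digit token are all dropped by the isalpha()/stopword filter above.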
# ----------------------------------------------------------------------------
# 2) Core Ranking Logic with TF-IDF & Cosine Similarity
# ----------------------------------------------------------------------------
def rank_resumes_with_tfidf(job_description: str, resumes: dict):
    """Rank resumes by TF-IDF cosine similarity to the job description."""
    logging.info("Ranking resumes using TF-IDF.")
    if not resumes:
        return []
    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
    # Fit one vectorizer on the job description plus all resumes so every
    # document is embedded in the same vocabulary space.
    corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
    filenames = list(preprocessed_resumes.keys())
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(corpus)
    jd_vector = tfidf_matrix[0:1]       # first row: the job description
    resume_vectors = tfidf_matrix[1:]   # remaining rows: the resumes
    similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
    results = list(zip(filenames, similarities))
    results_sorted = sorted(results, key=lambda x: x[1], reverse=True)
    logging.info(f"Ranking completed: {results_sorted}")
    return results_sorted

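# Example (illustrative): with jd = "python backend developer" and
# resumes = {"a.pdf": "senior python backend engineer", "b.pdf": "graphic designer"},
# rank_resumes_with_tfidf(jd, resumes) ranks "a.pdf" above "b.pdf".
# Cosine similarity here is (A . B) / (||A|| * ||B||) over the TF-IDF vectors,
# so resumes sharing more high-weight terms with the job description score higher.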
# ----------------------------------------------------------------------------
# 3) Gradio Callback Function
# ----------------------------------------------------------------------------
def analyze_cvs(job_description, cv_files):
    """Extract text from each uploaded CV and rank it against the job description."""
    logging.info("Starting CV analysis.")
    if not cv_files:
        return []
    resumes_data = {}
    for filepath in cv_files:
        # With type="filepath", Gradio passes each upload as the path string of
        # a temporary copy of the file.
        filename = os.path.basename(filepath)
        file_ext = os.path.splitext(filename)[1].lower()
        try:
            logging.info(f"Processing file: {filename}")
            if file_ext == ".pdf":
                with open(filepath, "rb") as f:
                    file_content = extract_text_from_pdf(f)
            elif file_ext == ".txt":
                with open(filepath, "rb") as f:
                    file_content = extract_text_from_txt(f)
            elif file_ext == ".docx":
                file_content = extract_text_from_docx(filepath)  # docx2txt accepts a filepath
            else:
                file_content = "Unsupported file type."
        except Exception as e:
            logging.error(f"Error processing file: {e}")
            file_content = f"Error processing file: {e}"
        logging.info(f"Extracted CV Content ({filename}): {file_content[:500]}...")
        resumes_data[filename] = file_content
    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[filename, round(float(score), 3)] for filename, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data

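# The returned list of [filename, score] rows is rendered directly by the
# gr.Dataframe output below, e.g. [["cv_a.pdf", 0.412], ["cv_b.docx", 0.173]]
# (filenames and scores here are illustrative).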
# ----------------------------------------------------------------------------
# 4) Gradio Interface
# ----------------------------------------------------------------------------
def create_gradio_interface():
    """Build the Gradio UI: a job-description box, a multi-file upload, and a results table."""
    job_description_input = gr.Textbox(label="Job Description", placeholder="Describe the role here...", lines=4)
    cv_input = gr.File(label="Upload resumes (PDF/DOCX/TXT)", file_count="multiple", type="filepath")
    results_output = gr.Dataframe(headers=["Candidate CV", "Similarity Score"], label="Ranked Candidates")
    demo = gr.Interface(
        fn=analyze_cvs,
        inputs=[job_description_input, cv_input],
        outputs=[results_output],
        title="Resume Ranking with TF-IDF",
    )
    return demo

# ----------------------------------------------------------------------------
# 5) Main Script
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)
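# To run locally (assuming this file is saved as app.py, the usual Spaces
# entry point, and the imports above are installed):
#   pip install gradio PyPDF2 docx2txt nltk scikit-learn
#   python app.py
# then open http://localhost:7860 in a browser.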