# CV_Ranking / app.py — Gradio demo that ranks uploaded resumes against a job description.
import os
import gradio as gr
import PyPDF2
import docx2txt
import logging
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt_tab')
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
# ----------------------------------------------------------------------------
# 1) Utility Functions: Parsing & Preprocessing
# ----------------------------------------------------------------------------
def extract_text_from_pdf(file_obj):
    """Extract all text from a PDF file object.

    Args:
        file_obj: Binary file-like object containing PDF data.

    Returns:
        str: Text of every page joined with newlines, or an error message
        string starting with "Error reading PDF:" on failure (callers
        display this string instead of handling an exception).
    """
    text_content = []
    try:
        logging.info("Loading PDF file.")
        pdf_reader = PyPDF2.PdfReader(file_obj)
        for page in pdf_reader.pages:
            # extract_text() can return None/"" for image-only pages; skip those.
            page_text = page.extract_text()
            if page_text:
                text_content.append(page_text)
        extracted_text = "\n".join(text_content)
        # Lazy %-formatting: the 500-char preview is only built if INFO is enabled.
        logging.info("Extracted PDF content: %.500s...", extracted_text)
        return extracted_text
    except Exception as e:
        logging.error("Error reading PDF: %s", e)
        return f"Error reading PDF: {e}"
def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file on disk.

    Args:
        file_path: Filesystem path to a .docx file.

    Returns:
        str: The extracted text, or an error message string starting with
        "Error reading DOCX:" on failure (callers display this string).
    """
    try:
        logging.info("Loading DOCX file.")
        extracted_text = docx2txt.process(file_path)
        # Lazy %-formatting keeps the preview cheap when INFO is disabled.
        logging.info("Extracted DOCX content: %.500s...", extracted_text)
        return extracted_text
    except Exception as e:
        logging.error("Error reading DOCX: %s", e)
        return f"Error reading DOCX: {e}"
def extract_text_from_txt(file_obj):
    """Extract all text from a TXT file object opened in binary mode.

    Args:
        file_obj: Binary file-like object; bytes are decoded as UTF-8,
            silently dropping undecodable sequences.

    Returns:
        str: The decoded text, or an error message string starting with
        "Error reading TXT:" on failure (callers display this string).
    """
    try:
        logging.info("Loading TXT file.")
        extracted_text = file_obj.read().decode("utf-8", errors="ignore")
        # Lazy %-formatting: preview only materialized when INFO is enabled.
        logging.info("Extracted TXT content: %.500s...", extracted_text)
        return extracted_text
    except Exception as e:
        logging.error("Error reading TXT: %s", e)
        return f"Error reading TXT: {e}"
def preprocess_text(text):
    """Normalize text for TF-IDF comparison.

    Lowercases the input, tokenizes it, removes English stopwords and
    non-alphabetic tokens, and rejoins the survivors into one string.

    Args:
        text: Arbitrary input; coerced to str before processing.

    Returns:
        str: Space-joined cleaned tokens (may be empty if nothing survives).
    """
    logging.info("Preprocessing text.")
    text = str(text).lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    processed_text = " ".join(filtered_tokens)
    # Lazy %-formatting: this runs once per document, so avoid eager f-string slicing.
    logging.info("Preprocessed text: %.500s...", processed_text)
    return processed_text
# ----------------------------------------------------------------------------
# 2) Core Ranking Logic with TF-IDF & Cosine Similarity
# ----------------------------------------------------------------------------
def rank_resumes_with_tfidf(job_description: str, resumes: dict):
    """Rank resumes by TF-IDF cosine similarity to the job description.

    Args:
        job_description: Raw job description text.
        resumes: Mapping of filename -> raw resume text.

    Returns:
        list[tuple[str, float]]: (filename, similarity) pairs sorted by
        similarity, highest first. Empty list when there are no resumes or
        when no usable vocabulary remains after preprocessing.
    """
    logging.info("Ranking resumes using TF-IDF.")
    if not resumes:
        # Nothing to rank; also avoids fitting the vectorizer on a 1-doc corpus.
        return []
    preprocessed_jd = preprocess_text(job_description)
    preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()}
    corpus = [preprocessed_jd] + list(preprocessed_resumes.values())
    filenames = list(preprocessed_resumes.keys())
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(corpus)
    except ValueError as e:
        # sklearn raises "empty vocabulary" when every document is empty or
        # stopwords-only; degrade gracefully instead of crashing the UI callback.
        logging.error("TF-IDF vectorization failed: %s", e)
        return []
    jd_vector = tfidf_matrix[0:1]
    resume_vectors = tfidf_matrix[1:]
    similarities = cosine_similarity(jd_vector, resume_vectors).flatten()
    results_sorted = sorted(zip(filenames, similarities), key=lambda x: x[1], reverse=True)
    logging.info("Ranking completed: %s", results_sorted)
    return results_sorted
# ----------------------------------------------------------------------------
# 3) Gradio Callback Function
# ----------------------------------------------------------------------------
def analyze_cvs(job_description, cv_files):
    """Gradio callback: extract text from each uploaded CV and rank them.

    Args:
        job_description: Job description text from the textbox input.
        cv_files: List of Gradio upload objects; each exposes a `.name`
            attribute holding the temporary filesystem path.

    Returns:
        list[list]: [filename, rounded similarity score] rows for the
        results Dataframe, best match first.
    """
    logging.info("Starting CV analysis.")
    if not cv_files:
        # No uploads yet; return an empty table instead of failing on iteration.
        return []
    resumes_data = {}
    for uploaded_file in cv_files:
        # Use the base name; Gradio stores uploads under a temp path.
        filename = os.path.basename(uploaded_file.name)
        file_ext = os.path.splitext(filename)[1].lower()
        try:
            logging.info("Processing file: %s", filename)
            if file_ext == ".pdf":
                # Open the temporary file Gradio created for the upload.
                with open(uploaded_file.name, "rb") as f:
                    file_content = extract_text_from_pdf(f)
            elif file_ext == ".txt":
                with open(uploaded_file.name, "rb") as f:
                    file_content = extract_text_from_txt(f)
            elif file_ext == ".docx":
                # docx2txt accepts a filesystem path directly.
                file_content = extract_text_from_docx(uploaded_file.name)
            else:
                file_content = "Unsupported file type."
        except Exception as e:
            logging.error("Error processing file %s: %s", filename, e)
            file_content = f"Error processing file: {e}"
        logging.info("Extracted CV Content (%s): %.500s...", filename, file_content)
        resumes_data[filename] = file_content
    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[fname, round(float(score), 3)] for fname, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data
# ----------------------------------------------------------------------------
# 4) Gradio Interface
# ----------------------------------------------------------------------------
def create_gradio_interface():
    """Assemble and return the Gradio Interface for resume ranking."""
    demo = gr.Interface(
        fn=analyze_cvs,
        inputs=[
            gr.Textbox(
                label="Job Description",
                placeholder="Describe the role here...",
                lines=4,
            ),
            gr.File(
                label="Upload resumes (PDF/DOCX/TXT)",
                file_count="multiple",
                type="filepath",
            ),
        ],
        outputs=[
            gr.Dataframe(
                headers=["Candidate CV", "Similarity Score"],
                label="Ranked Candidates",
            ),
        ],
        title="Resume Ranking with TF-IDF",
    )
    return demo
# ----------------------------------------------------------------------------
# 5) Main Script
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    # Fetch NLTK tokenizer and stopword corpora before serving any requests;
    # quiet=True suppresses the download progress output.
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    app = create_gradio_interface()
    # Bind to all interfaces on port 7860 so the app is reachable from outside
    # a container (e.g. Hugging Face Spaces); debug=True enables verbose errors.
    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)