import os
import logging

import gradio as gr
import PyPDF2
import docx2txt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# ----------------------------------------------------------------------------
# 1) Utility Functions: Parsing & Preprocessing
# ----------------------------------------------------------------------------

def extract_text_from_pdf(file_obj):
    """Extract all text from a PDF file object."""
    text_content = []
    try:
        logging.info("Loading PDF file.")
        pdf_reader = PyPDF2.PdfReader(file_obj)
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                text_content.append(page_text)
        extracted_text = "\n".join(text_content)
        logging.info(f"Extracted PDF content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading PDF: {e}")
        return f"Error reading PDF: {e}"

def extract_text_from_docx(file_path):
    """Extract all text from a DOCX file on disk."""
    try:
        logging.info("Loading DOCX file.")
        extracted_text = docx2txt.process(file_path)
        logging.info(f"Extracted DOCX content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading DOCX: {e}")
        return f"Error reading DOCX: {e}"

def extract_text_from_txt(file_obj):
    """Extract all text from a TXT file object."""
    try:
        logging.info("Loading TXT file.")
        extracted_text = file_obj.read().decode("utf-8", errors="ignore")
        logging.info(f"Extracted TXT content: {extracted_text[:500]}...")
        return extracted_text
    except Exception as e:
        logging.error(f"Error reading TXT: {e}")
        return f"Error reading TXT: {e}"

def preprocess_text(text):
    """
    Lowercase, tokenize, remove stopwords and non-alphabetic tokens,
    and then rejoin into a clean string.
""" logging.info("Preprocessing text.") text = str(text).lower() tokens = word_tokenize(text) stop_words = set(stopwords.words('english')) filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words] processed_text = " ".join(filtered_tokens) logging.info(f"Preprocessed text: {processed_text[:500]}...") return processed_text # ---------------------------------------------------------------------------- # 2) Core Ranking Logic with TF-IDF & Cosine Similarity # ---------------------------------------------------------------------------- def rank_resumes_with_tfidf(job_description: str, resumes: dict): logging.info("Ranking resumes using TF-IDF.") preprocessed_jd = preprocess_text(job_description) preprocessed_resumes = {fname: preprocess_text(txt) for fname, txt in resumes.items()} corpus = [preprocessed_jd] + list(preprocessed_resumes.values()) filenames = list(preprocessed_resumes.keys()) vectorizer = TfidfVectorizer() tfidf_matrix = vectorizer.fit_transform(corpus) jd_vector = tfidf_matrix[0:1] resume_vectors = tfidf_matrix[1:] similarities = cosine_similarity(jd_vector, resume_vectors).flatten() results = list(zip(filenames, similarities)) results_sorted = sorted(results, key=lambda x: x[1], reverse=True) logging.info(f"Ranking completed: {results_sorted}") return results_sorted # ---------------------------------------------------------------------------- # 3) Gradio Callback Function # ---------------------------------------------------------------------------- def analyze_cvs(job_description, cv_files): logging.info("Starting CV analysis.") resumes_data = {} for uploaded_file in cv_files: filename = os.path.basename(uploaded_file.name) #Get the base name, handling potential Gradio changes file_ext = os.path.splitext(filename)[1].lower() temp_filepath = None try: logging.info(f"Processing file: {filename}") if file_ext == ".pdf": with open(uploaded_file.name, "rb") as f: # Open the temporary file created by gradio file_content = extract_text_from_pdf(f) elif file_ext == ".txt": with open(uploaded_file.name, "rb") as f: # Open the temporary file created by gradio file_content = extract_text_from_txt(f) elif file_ext == ".docx": file_content = extract_text_from_docx(uploaded_file.name) #docx2txt can handle the temporary filepath else: file_content = "Unsupported file type." 
        except Exception as e:
            logging.error(f"Error processing file: {e}")
            file_content = f"Error processing file: {e}"

        logging.info(f"Extracted CV Content ({filename}): {file_content[:500]}...")
        resumes_data[filename] = file_content

    ranked_results = rank_resumes_with_tfidf(job_description, resumes_data)
    display_data = [[filename, round(float(score), 3)]
                    for filename, score in ranked_results]
    logging.info("Analysis completed successfully.")
    return display_data

# ----------------------------------------------------------------------------
# 4) Gradio Interface
# ----------------------------------------------------------------------------

def create_gradio_interface():
    job_description_input = gr.Textbox(label="Job Description",
                                       placeholder="Describe the role here...",
                                       lines=4)
    cv_input = gr.File(label="Upload resumes (PDF/DOCX/TXT)",
                       file_count="multiple",
                       type="filepath")
    results_output = gr.Dataframe(headers=["Candidate CV", "Similarity Score"],
                                  label="Ranked Candidates")
    demo = gr.Interface(fn=analyze_cvs,
                        inputs=[job_description_input, cv_input],
                        outputs=[results_output],
                        title="Resume Ranking with TF-IDF")
    return demo

# ----------------------------------------------------------------------------
# 5) Main Script
# ----------------------------------------------------------------------------

if __name__ == "__main__":
    # word_tokenize needs 'punkt' (plus 'punkt_tab' on newer NLTK releases);
    # the stopword filter needs 'stopwords'.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    app = create_gradio_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, debug=True)
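
# A quick sanity check for the ranking logic, independent of the Gradio UI.
# The filenames and texts below are hypothetical inputs, not real data; run
# this in a REPL after the NLTK resources have been downloaded:
#
#   >>> rank_resumes_with_tfidf(
#   ...     "python developer with machine learning experience",
#   ...     {"ml_candidate.txt": "experienced python machine learning engineer",
#   ...      "accountant.txt": "accountant skilled in bookkeeping and audits"},
#   ... )
#
# The overlapping vocabulary should place ml_candidate.txt first with the
# higher cosine similarity score.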