"""Streamlit app: upload resume PDFs and compare them with TF-IDF.

Each uploaded PDF is converted to plain text. All resumes are then
vectorised together with TF-IDF, and the pairwise cosine similarity between
them is displayed.
"""

import pandas as pd
import streamlit as st
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def _extract_pdf_text(pdf_file):
    """Return the text of every page in *pdf_file*, one page per line group.

    *pdf_file* is any binary file-like object accepted by ``PdfReader``
    (here, the objects returned by ``st.file_uploader``).
    """
    reader = PdfReader(pdf_file)
    # ``or ""`` guards against a page yielding no text at all; joining with a
    # newline keeps the last word of one page from fusing with the first word
    # of the next, which would corrupt tokenisation.
    return "\n".join(page.extract_text() or "" for page in reader.pages)


# The files are parsed as PDFs, so ask for (and only accept) PDFs.
uploaded_files = st.file_uploader(
    "Choose PDF files", type=["pdf"], accept_multiple_files=True
)

# Nothing to show until at least one file has been uploaded.
if uploaded_files:
    # Prefix each name with its position so labels stay unique even when two
    # uploads share a file name.
    labels = [
        f"{position}: {uploaded_file.name}"
        for position, uploaded_file in enumerate(uploaded_files, start=1)
    ]
    texts = [_extract_pdf_text(uploaded_file) for uploaded_file in uploaded_files]

    # One row per resume, so TF-IDF weights are computed across the whole
    # set of uploads rather than one document at a time.
    data = pd.Series(texts, index=labels, name="Resume")
    st.dataframe(data)

    vec = TfidfVectorizer()
    try:
        tf_idf = vec.fit_transform(data)
    except ValueError:
        # Raised when no tokens could be extracted from any document
        # (e.g. image-only / scanned PDFs).
        st.warning(
            "No extractable words were found in the uploaded files, "
            "so similarity cannot be computed."
        )
    else:
        st.dataframe(
            pd.DataFrame(
                tf_idf.toarray(),
                index=labels,
                columns=vec.get_feature_names_out(),
            )
        )

        # Square matrix: entry (i, j) is the similarity of resume i to resume j.
        cosine_sim = cosine_similarity(tf_idf, tf_idf)
        st.write(pd.DataFrame(cosine_sim, index=labels, columns=labels))