jonanfu committed on
Commit
1e2c986
·
1 Parent(s): a3d4cc2

primera version de app

Browse files
Files changed (3) hide show
  1. .gitignore +176 -0
  2. app.py +102 -0
  3. requirements.txt +0 -0
.gitignore ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python
3
+
4
+ ### Python ###
5
+ # Byte-compiled / optimized / DLL files
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+
10
+ # C extensions
11
+ *.so
12
+
13
+ # Distribution / packaging
14
+ .Python
15
+ build/
16
+ develop-eggs/
17
+ dist/
18
+ downloads/
19
+ eggs/
20
+ .eggs/
21
+ lib/
22
+ lib64/
23
+ parts/
24
+ sdist/
25
+ var/
26
+ wheels/
27
+ share/python-wheels/
28
+ *.egg-info/
29
+ .installed.cfg
30
+ *.egg
31
+ MANIFEST
32
+
33
+ # PyInstaller
34
+ # Usually these files are written by a python script from a template
35
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
36
+ *.manifest
37
+ *.spec
38
+
39
+ # Installer logs
40
+ pip-log.txt
41
+ pip-delete-this-directory.txt
42
+
43
+ # Unit test / coverage reports
44
+ htmlcov/
45
+ .tox/
46
+ .nox/
47
+ .coverage
48
+ .coverage.*
49
+ .cache
50
+ nosetests.xml
51
+ coverage.xml
52
+ *.cover
53
+ *.py,cover
54
+ .hypothesis/
55
+ .pytest_cache/
56
+ cover/
57
+
58
+ # Translations
59
+ *.mo
60
+ *.pot
61
+
62
+ # Django stuff:
63
+ *.log
64
+ local_settings.py
65
+ db.sqlite3
66
+ db.sqlite3-journal
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ .pybuilder/
80
+ target/
81
+
82
+ # Jupyter Notebook
83
+ .ipynb_checkpoints
84
+
85
+ # IPython
86
+ profile_default/
87
+ ipython_config.py
88
+
89
+ # pyenv
90
+ # For a library or package, you might want to ignore these files since the code is
91
+ # intended to run in multiple environments; otherwise, check them in:
92
+ # .python-version
93
+
94
+ # pipenv
95
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
97
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
98
+ # install all needed dependencies.
99
+ #Pipfile.lock
100
+
101
+ # poetry
102
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
104
+ # commonly ignored for libraries.
105
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106
+ #poetry.lock
107
+
108
+ # pdm
109
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110
+ #pdm.lock
111
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112
+ # in version control.
113
+ # https://pdm.fming.dev/#use-with-ide
114
+ .pdm.toml
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+
166
+ ### Python Patch ###
167
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168
+ poetry.toml
169
+
170
+ # ruff
171
+ .ruff_cache/
172
+
173
+ # LSP config files
174
+ pyrightconfig.json
175
+
176
+ # End of https://www.toptal.com/developers/gitignore/api/python
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import base64
import os
import uuid
from io import BytesIO
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone as PineconeClient, ServerlessSpec

# Streamlit app: rank uploaded CVs (PDF) against a job description.
#
# Each run embeds the CV texts, stores them temporarily in a Pinecone index,
# runs a similarity search with the job description as the query, shows the
# ranked CVs, and finally removes the vectors it uploaded.

# Credentials come from environment variables.
# NOTE(review): hf_token is read but not used by any code visible here.
hf_token = os.getenv("HUGGINGFACE_TOKEN")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

INDEX_NAME = "cv-index"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# The index dimension has to equal the embedding size of EMBEDDING_MODEL;
# change both together.
EMBEDDING_DIMENSION = 384


def _load_uploads(files):
    """Copy every uploaded file into memory, keyed by a unique display name.

    Two uploads may share the same filename; a plain ``{file.name: ...}``
    mapping would silently keep only the last one, so repeated names get a
    numeric suffix instead.

    Args:
        files: Uploaded file objects exposing ``name`` and ``read()``.

    Returns:
        dict mapping a unique name to a ``BytesIO`` holding the file content.
    """
    buffers = {}
    for file in files:
        name = file.name
        suffix = 2
        while name in buffers:
            name = f"{file.name} ({suffix})"
            suffix += 1
        buffers[name] = BytesIO(file.read())
    return buffers


def _extract_pdf_text(buffer):
    """Return the text of all pages of a PDF, one page per line group.

    Pages for which no text could be extracted are skipped.

    Args:
        buffer: Seekable binary stream containing the PDF.

    Returns:
        The concatenated page texts, stripped; empty when nothing was found.

    Raises:
        PdfReadError: If pypdf cannot parse the stream.
    """
    buffer.seek(0)
    reader = PdfReader(buffer)
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text).strip()


def _build_documents(buffers, job_title):
    """Turn in-memory PDFs into LangChain documents, skipping unusable files.

    A file that cannot be parsed, or that yields no text, is reported with a
    warning and left out so that one bad CV does not abort the whole batch.

    Args:
        buffers: Mapping of display name to ``BytesIO`` with the PDF bytes.
        job_title: Job title stored in each document's metadata.

    Returns:
        list of ``Document`` objects, one per readable CV.
    """
    documents = []
    for filename, buffer in buffers.items():
        try:
            text = _extract_pdf_text(buffer)
        except PdfReadError as exc:
            st.warning(f"No se pudo leer `{filename}`: {exc}")
            continue
        if not text:
            st.warning(f"`{filename}` no contiene texto extraíble y fue omitido.")
            continue
        documents.append(
            Document(
                page_content=text,
                metadata={"filename": filename, "titulo_puesto": job_title},
            )
        )
    return documents


def _get_index(client):
    """Return the Pinecone index used by the app, creating it if missing."""
    if INDEX_NAME not in client.list_indexes().names():
        client.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIMENSION,
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1'),
        )
    return client.Index(INDEX_NAME)


def _render_result(doc, score, buffers):
    """Show one search hit: PDF preview on the left, file info on the right."""
    st.markdown("---")
    preview_col, info_col = st.columns([2, 1])

    filename = doc.metadata.get("filename")
    file_buffer = buffers.get(filename)

    with preview_col:
        if file_buffer is not None:
            base64_pdf = base64.b64encode(file_buffer.getvalue()).decode("utf-8")
            pdf_display = (
                f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
                'width="100%" height="500" type="application/pdf"></iframe>'
            )
            st.markdown("#### 👀 Visualizador del PDF", unsafe_allow_html=True)
            st.markdown(pdf_display, unsafe_allow_html=True)
        else:
            st.warning(f"No se encontró el archivo `{filename}` en memoria.")

    with info_col:
        st.markdown("#### 📄 Información del CV")
        st.write(f"**Nombre del archivo:** `{filename}`")
        st.write(f"**Score de similitud:** `{score * 100:.2f}%`")


def _process_cvs(files, job_title, job_description):
    """Index the uploaded CVs, rank them against the job description, clean up.

    The vectors are written to a namespace that is unique to this run, so the
    search only sees this run's CVs and the cleanup cannot remove vectors that
    belong to another session using the same index.
    """
    buffers = _load_uploads(files)
    documents = _build_documents(buffers, job_title)
    if not documents:
        st.error("Ninguno de los CVs subidos contiene texto utilizable.")
        return

    index = _get_index(PineconeClient(api_key=pinecone_api_key))
    embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    namespace = f"run-{uuid.uuid4().hex}"
    vector_store = PineconeVectorStore(
        index=index, embedding=embedding, namespace=namespace
    )

    try:
        vector_store.add_documents(documents)

        # NOTE(review): the query runs right after the upload; if the index
        # has not yet made the new vectors searchable, fewer than
        # len(documents) results may come back — confirm against Pinecone's
        # consistency guarantees.
        results = vector_store.similarity_search_with_score(
            job_description, k=len(documents)
        )
        st.success(f"{len(results)} CV(s) procesado(s).")

        for doc, score in results:
            _render_result(doc, score, buffers)
    finally:
        # Always remove this run's vectors, even when upload or search failed,
        # so they do not leak into the index. Cleanup is best-effort: a failure
        # here is reported but must not hide the original error.
        try:
            index.delete(delete_all=True, namespace=namespace)
        except Exception as exc:
            st.warning(f"No se pudieron eliminar los vectores temporales: {exc}")
        else:
            st.success(
                "🧹 Los vectores de esta ejecución han sido eliminados del índice de Pinecone."
            )


st.set_page_config(page_title="Clasificador de CVs", layout="wide")
st.title("🎯 Clasificador de CVs por Puesto de Trabajo")

# Inputs
titulo_puesto = st.text_input("🧑‍💼 Título del puesto", placeholder="Ej: Desarrollador Backend Senior")
descripcion_puesto = st.text_area("📝 Descripción del puesto", height=200)
uploaded_files = st.file_uploader("📎 Subir CVs (PDF)", type="pdf", accept_multiple_files=True)

if st.button("📊 Procesar CVs"):
    if not uploaded_files:
        st.warning("Primero sube al menos un CV.")
    elif not descripcion_puesto.strip():
        st.warning("Debes escribir una descripción del puesto.")
    elif not pinecone_api_key:
        # Fail with a clear message instead of an opaque client error.
        st.error("Falta la variable de entorno PINECONE_API_KEY.")
    else:
        _process_cvs(uploaded_files, titulo_puesto, descripcion_puesto)
requirements.txt ADDED
Binary file (4.04 kB). View file