Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -44,7 +44,33 @@ def get_pdf_text(pdf_docs : list) -> str:
|
|
| 44 |
text += page.extract_text()
|
| 45 |
return text
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
|
|
|
| 48 |
#def get_text_chunks(text:str) ->list:
|
| 49 |
# text_splitter = CharacterTextSplitter(
|
| 50 |
# separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
|
|
@@ -167,10 +193,11 @@ def main():
|
|
| 167 |
with st.spinner("Procesando"):
|
| 168 |
# get pdf text
|
| 169 |
raw_text = get_pdf_text(pdf_docs)
|
| 170 |
-
|
|
|
|
| 171 |
# get the text chunks
|
| 172 |
-
text_chunks = get_text_chunks(raw_text)
|
| 173 |
-
|
| 174 |
# create vector store
|
| 175 |
vectorstore = get_vectorstore(text_chunks)
|
| 176 |
|
|
|
|
| 44 |
text += page.extract_text()
|
| 45 |
return text
|
| 46 |
|
| 47 |
+
def get_pdf_pages(pdf_docs):
    """
    Load a list of PDF documents and return their split pages.

    Each document is first written into a temporary directory so that it
    can be opened by path with ``UnstructuredPDFLoader``. The temporary
    directory (and the copies inside it) is deleted before returning.

    Parameters
    ----------
    pdf_docs : list
        In-memory PDF files. Each item must expose a ``name`` attribute and
        a ``getbuffer()`` method returning the file contents.
        NOTE(review): presumably Streamlit uploaded files -- confirm
        against the caller.

    Returns
    -------
    list
        The results of ``load_and_split()`` for every document,
        concatenated in input order. Empty when ``pdf_docs`` is empty.
    """
    # Local import kept from the original: the file's top-level imports are
    # not guaranteed to include tempfile.
    import tempfile

    pages = []
    with tempfile.TemporaryDirectory() as tmpdirname:
        for pdf in pdf_docs:
            # Use only the final path component of the supplied name so a
            # name containing directory parts cannot point outside (or into
            # a non-existent subdirectory of) the temporary directory.
            pdf_path = os.path.join(tmpdirname, os.path.basename(pdf.name))
            with open(pdf_path, "wb") as f:
                f.write(pdf.getbuffer())

            # The loader must run while the temporary file still exists,
            # i.e. inside the TemporaryDirectory context.
            pdf_loader = UnstructuredPDFLoader(pdf_path)
            pages.extend(pdf_loader.load_and_split())
    return pages
|
| 72 |
|
| 73 |
+
|
| 74 |
#def get_text_chunks(text:str) ->list:
|
| 75 |
# text_splitter = CharacterTextSplitter(
|
| 76 |
# separator="\n", chunk_size=1500, chunk_overlap=300, length_function=len
|
|
|
|
| 193 |
with st.spinner("Procesando"):
|
| 194 |
# get pdf text
|
| 195 |
raw_text = get_pdf_text(pdf_docs)
|
| 196 |
+
pages = get_pdf_pages(pdf_docs)
|
| 197 |
+
|
| 198 |
# get the text chunks
|
| 199 |
+
#text_chunks = get_text_chunks(raw_text)
|
| 200 |
+
text_chunks = get_text_chunks(pages)
|
| 201 |
# create vector store
|
| 202 |
vectorstore = get_vectorstore(text_chunks)
|
| 203 |
|