Spaces:
Sleeping
Sleeping
Commit
·
ecbae88
1
Parent(s):
755d925
Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,8 @@ from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVL
|
|
11 |
import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
|
12 |
import os
|
13 |
from huggingface_hub import hf_hub_download # Hugging Face Hub์์ ๋ชจ๋ธ์ ๋ค์ด๋ก๋ํ๊ธฐ ์ํ ํจ์์
๋๋ค.
|
|
|
|
|
14 |
|
15 |
|
16 |
# PDF 문서로부터 텍스트를 추출하는 함수입니다.
|
@@ -28,12 +30,17 @@ def get_pdf_text(pdf_docs):
|
|
28 |
def get_text_file(docs):
|
29 |
text_list = []
|
30 |
for doc in docs:
|
31 |
-
if
|
32 |
-
#
|
33 |
-
|
34 |
-
text_list.append(text)
|
35 |
else:
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
return text_list
|
38 |
|
39 |
|
|
|
11 |
import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
|
12 |
import os
|
13 |
from huggingface_hub import hf_hub_download # Hugging Face Hub์์ ๋ชจ๋ธ์ ๋ค์ด๋ก๋ํ๊ธฐ ์ํ ํจ์์
๋๋ค.
|
14 |
+
#추가로 필요한 것들
|
15 |
+
from PyPDF2 import PdfReader
|
16 |
|
17 |
|
18 |
# PDF 문서로부터 텍스트를 추출하는 함수입니다.
|
|
|
30 |
def get_text_file(docs):
|
31 |
text_list = []
|
32 |
for doc in docs:
|
33 |
+
if isinstance(doc, bytes):
|
34 |
+
# Bytes ๊ฐ์ฒด๋ฅผ ์ฌ์ฉํ์ฌ PdfReader๋ฅผ ์ด๊ธฐํํฉ๋๋ค.
|
35 |
+
pdf_reader = PdfReader(io.BytesIO(doc))
|
|
|
36 |
else:
|
37 |
+
# ํ์ผ ๊ฐ์ฒด๋ฅผ ์ฌ์ฉํ์ฌ PdfReader๋ฅผ ์ด๊ธฐํํฉ๋๋ค.
|
38 |
+
pdf_reader = PdfReader(doc)
|
39 |
+
|
40 |
+
# ๊ฐ ํ์ด์ง์ ํ
์คํธ๋ฅผ ์ถ์ถํ์ฌ text_list์ ์ถ๊ฐํฉ๋๋ค.
|
41 |
+
for page in pdf_reader.pages:
|
42 |
+
text = page.extract_text()
|
43 |
+
text_list.append(text)
|
44 |
return text_list
|
45 |
|
46 |
|