DDingcheol committed on
Commit
ecbae88
·
1 Parent(s): 755d925

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -5
app.py CHANGED
@@ -11,6 +11,8 @@ from langchain.document_loaders import PyPDFLoader, TextLoader, JSONLoader, CSVL
11
  import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
12
  import os
13
  from huggingface_hub import hf_hub_download # Hugging Face Hub์—์„œ ๋ชจ๋ธ์„ ๋‹ค์šด๋กœ๋“œํ•˜๊ธฐ ์œ„ํ•œ ํ•จ์ˆ˜์ž…๋‹ˆ๋‹ค.
 
 
14
 
15
 
16
  # PDF 문서로부터 텍스트를 추출하는 함수입니다.
@@ -28,12 +30,17 @@ def get_pdf_text(pdf_docs):
28
  def get_text_file(docs):
29
  text_list = []
30
  for doc in docs:
31
- if hasattr(doc, 'read') and callable(getattr(doc, 'read')):
32
- # 파일 객체인지 확인하고 'read' 메서드가 있는지 검사합니다.
33
- text = doc.read().decode("utf-8") # 파일을 읽고 UTF-8로 디코딩하여 문자열로 변환합니다.
34
- text_list.append(text)
35
  else:
36
- text_list.append(str(doc)) # 파일 객체가 아닌 경우 문자열로 변환하여 추가합니다.
 
 
 
 
 
 
37
  return text_list
38
 
39
 
 
11
  import tempfile # 임시 파일을 생성하기 위한 라이브러리입니다.
12
  import os
13
  from huggingface_hub import hf_hub_download # Hugging Face Hub์—์„œ ๋ชจ๋ธ์„ ๋‹ค์šด๋กœ๋“œํ•˜๊ธฐ ์œ„ํ•œ ํ•จ์ˆ˜์ž…๋‹ˆ๋‹ค.
14
+ #추가로 필요한 것들
15
+ from PyPDF2 import PdfReader
16
 
17
 
18
  # PDF 문서로부터 텍스트를 추출하는 함수입니다.
 
30
  def get_text_file(docs):
31
  text_list = []
32
  for doc in docs:
33
+ if isinstance(doc, bytes):
34
+ # Bytes ๊ฐ์ฒด๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ PdfReader๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค.
35
+ pdf_reader = PdfReader(io.BytesIO(doc))
 
36
  else:
37
+ # ํŒŒ์ผ ๊ฐ์ฒด๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ PdfReader๋ฅผ ์ดˆ๊ธฐํ™”ํ•ฉ๋‹ˆ๋‹ค.
38
+ pdf_reader = PdfReader(doc)
39
+
40
+ # ๊ฐ ํŽ˜์ด์ง€์˜ ํ…์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•˜์—ฌ text_list์— ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค.
41
+ for page in pdf_reader.pages:
42
+ text = page.extract_text()
43
+ text_list.append(text)
44
  return text_list
45
 
46