notabaka committed on
Commit
9bbbf26
·
1 Parent(s): 0ee4a85
Files changed (1) hide show
  1. app.py +6 -3
app.py CHANGED
@@ -36,10 +36,13 @@ def extract_text(doc):
36
  if doc.type == 'text/plain':
37
  return doc.read().decode('utf-8')
38
 
39
- if doc.name.endswith('.pdf'):
40
- with pdfplumber.open(doc) as pdf:
 
 
 
41
  pages = [page.extract_text() for page in pdf.pages]
42
- return '\n'.join(pages)
43
 
44
  if doc.name.endswith('.docx'):
45
  raw_text = doc.read()
 
36
  if doc.type == 'text/plain':
37
  return doc.read().decode('utf-8')
38
 
39
+ if doc.name.endswith(".pdf"):
40
+ raw = doc.read()
41
+
42
+ with pdfplumber.open(raw) as pdf:
43
+ pdf.set_doc(raw)
44
  pages = [page.extract_text() for page in pdf.pages]
45
+ return "\n".join(pages)
46
 
47
  if doc.name.endswith('.docx'):
48
  raw_text = doc.read()