notabaka committed on
Commit
faa2e50
·
1 Parent(s): 14bc55d
Files changed (1) hide show
  1. app.py +7 -6
app.py CHANGED
@@ -38,14 +38,15 @@ def extract_text(doc):
38
  return doc.read().decode('utf-8')
39
 
40
  if doc.name.endswith(".pdf"):
41
- raw = doc.read()
 
 
 
42
 
43
- # Remove null bytes without decoding
44
- raw = raw.replace(b'\x00', b'')
45
 
46
- pdf = pdfplumber.open(io.BytesIO(raw))
47
- pages = [page.extract_text() for page in pdf.pages]
48
- return "\n".join(pages)
49
 
50
 
51
  if doc.name.endswith('.docx'):
 
38
  return doc.read().decode('utf-8')
39
 
40
  if doc.name.endswith(".pdf"):
41
+ raw = doc.read()
42
+
43
+ # Handle null bytes
44
+ raw = raw.replace(b'\x00', b'')
45
 
46
+ with pdfplumber.open(raw) as pdf:
47
+ pages = [page.extract_text() for page in pdf.pages]
48
 
49
+ return "\n".join(pages)
 
 
50
 
51
 
52
  if doc.name.endswith('.docx'):