notabaka committed on
Commit
d0e6bd5
·
1 Parent(s): 60eae40
Files changed (1) hide show
  1. app.py +20 -15
app.py CHANGED
@@ -29,23 +29,28 @@ query = st.text_input("Enter search query")
29
  click = st.button("Search")
30
 
31
  def extract_text(doc):
32
- if doc.type == 'text/plain':
33
- return doc.getvalue().decode("utf-8")
 
34
 
35
- if doc.type == "application/pdf":
36
- with pdfplumber.open(doc) as pdf:
37
- pages = [page.extract_text() for page in pdf.pages]
38
- return "\n".join(pages)
39
-
40
- if doc.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
41
- return docx2txt.process(doc)
42
-
43
- if doc.name.endswith(".xlsx"):
44
- text = textract.process(doc)
45
- return text.decode("utf-8")
46
 
47
- return None
48
-
 
 
 
 
 
49
 
50
  if click and query:
51
  doc_contents = []
 
29
  click = st.button("Search")
30
 
31
  def extract_text(doc):
32
+ # Write temp file
33
+ with tempfile.TemporaryFile() as fp:
34
+ fp.write(doc.read())
35
 
36
+ if doc.type == 'text/plain':
37
+ fp.seek(0)
38
+ return fp.read().decode("utf-8")
39
+
40
+ # Rest of logic
41
+ if doc.name.endswith(".pdf"):
42
+ fp.seek(0)
43
+ with pdfplumber.open(fp) as pdf:
44
+ pages = [page.extract_text() for page in pdf.pages]
45
+ return "\n".join(pages)
 
46
 
47
+ if doc.name.endswith(".docx"):
48
+ fp.seek(0)
49
+ return docx2txt.process(fp)
50
+
51
+ # other cases
52
+
53
+ return None
54
 
55
  if click and query:
56
  doc_contents = []