Schmitz005 committed on
Commit
5a64d4d
·
verified ·
1 Parent(s): 6d6f152

Update whale_core/parser.py

Browse files
Files changed (1) hide show
  1. whale_core/parser.py +14 -9
whale_core/parser.py CHANGED
@@ -32,16 +32,21 @@ def parse_text(filepath):
32
  with open(filepath, 'r') as f:
33
  return f.read()
34
 
35
- def parse_file(filepath):
36
- if filepath.endswith('.pdf'):
37
- return parse_pdf(filepath)
38
- elif filepath.endswith('.txt'):
39
- return parse_text(filepath)
40
- # Comment this out if you don’t want audio at all
41
- # elif filepath.endswith(('.mp3', '.wav', '.m4a')):
42
- # return parse_audio(filepath)
 
 
 
 
 
43
  else:
44
- raise ValueError(f"Unsupported file type: {filepath}")
45
 
46
  def chunk_text(text, chunk_size=300):
47
  words = text.split()
 
32
  with open(filepath, 'r') as f:
33
  return f.read()
34
 
35
def parse_file(file_obj):
    """Extract the text content of a ``.pdf`` or ``.txt`` file object.

    Args:
        file_obj: A file-like object exposing a ``name`` attribute. The
            name is used only to detect the file type, and the suffix
            check is case-insensitive. PDF input is handed to
            ``PyPDF2.PdfReader``; text input must provide ``read()``.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the name ends in neither ``.pdf`` nor ``.txt``.
    """
    filename = file_obj.name.lower()

    if filename.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_obj)
        # NOTE(review): extract_text() may yield None or an empty value for
        # pages without a text layer in some PyPDF2 versions -- guard
        # defensively so the join cannot raise TypeError.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if filename.endswith(".txt"):
        data = file_obj.read()
        # Binary handles return bytes and need decoding; a text-mode handle
        # already returns str and is passed through unchanged.
        if isinstance(data, (bytes, bytearray)):
            return data.decode("utf-8")
        return data

    raise ValueError("Unsupported file type.")
50
 
51
  def chunk_text(text, chunk_size=300):
52
  words = text.split()