Spaces:
Sleeping
Sleeping
Update whale_core/parser.py
Browse files- whale_core/parser.py +14 -9
whale_core/parser.py
CHANGED
@@ -32,16 +32,21 @@ def parse_text(filepath):
|
|
32 |
with open(filepath, 'r') as f:
|
33 |
return f.read()
|
34 |
|
35 |
-
def parse_file(
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
43 |
else:
|
44 |
-
raise ValueError(
|
45 |
|
46 |
def chunk_text(text, chunk_size=300):
|
47 |
words = text.split()
|
|
|
32 |
with open(filepath, 'r') as f:
|
33 |
return f.read()
|
34 |
|
35 |
+
def parse_file(file_obj):
    """Extract the text content of a ``.pdf`` or ``.txt`` file object.

    Args:
        file_obj: An open file-like object exposing a ``name`` attribute.
            The extension of that name (compared case-insensitively)
            selects how the content is read.

    Returns:
        The extracted text as a ``str``.

    Raises:
        ValueError: If the name ends with neither ``.pdf`` nor ``.txt``.
    """
    filename = file_obj.name.lower()

    if filename.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_obj)
        # Build the result with a single join instead of repeated ``+=``;
        # ``or ""`` keeps the join valid if a page yields no text.
        return "".join(page.extract_text() or "" for page in reader.pages)

    if filename.endswith(".txt"):
        data = file_obj.read()
        # Binary handles return bytes that need decoding; a text-mode handle
        # already returns str, which has no ``decode`` method.
        if isinstance(data, (bytes, bytearray)):
            return data.decode("utf-8")
        return data

    raise ValueError("Unsupported file type.")
|
50 |
|
51 |
def chunk_text(text, chunk_size=300):
|
52 |
words = text.split()
|