Spaces:

Schmitz005
/

huggingwhale.ai

Sleeping

Schmitz005 committed 29 days ago

Commit

5a64d4d

verified ·

1 Parent(s): 6d6f152

Update whale_core/parser.py

Files changed (1) hide show

whale_core/parser.py CHANGED Viewed

@@ -32,16 +32,21 @@ def parse_text(filepath):
     with open(filepath, 'r') as f:
         return f.read()
-def parse_file(filepath):
-    if filepath.endswith('.pdf'):
-        return parse_pdf(filepath)
-    elif filepath.endswith('.txt'):
-        return parse_text(filepath)
-    # Comment this out if you don’t want audio at all
-    # elif filepath.endswith(('.mp3', '.wav', '.m4a')):
-    #     return parse_audio(filepath)
     else:
-        raise ValueError(f"Unsupported file type: {filepath}")
 def chunk_text(text, chunk_size=300):
     words = text.split()

     with open(filepath, 'r') as f:
         return f.read()
+def parse_file(file_obj):
+    filename = file_obj.name.lower()
+    if filename.endswith(".pdf"):
+        reader = PyPDF2.PdfReader(file_obj)
+        text = ""
+        for page in reader.pages:
+            text += page.extract_text()
+        return text
+    elif filename.endswith(".txt"):
+        return file_obj.read().decode("utf-8")
     else:
+        raise ValueError("Unsupported file type.")
 def chunk_text(text, chunk_size=300):
     words = text.split()