Spaces:

Schmitz005
/

huggingwhale.ai

Sleeping

App Files Files Community

Schmitz005 committed about 1 month ago

Commit

8baa906

verified ·

1 Parent(s): 897e33d

Create parser.py

Browse files

Files changed (1) hide show

whale_core/parser.py +54 -0

whale_core/parser.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import os
+import PyPDF2
+from sentence_transformers import SentenceTransformer
+import warnings
+# Suppress the FutureWarning whose text matches
+# "`clean_up_tokenization_spaces` was not set" (the `message` argument is a
+# regular expression). NOTE(review): presumably emitted by the tokenizer
+# stack while the embedding model loads - confirm.
+warnings.filterwarnings(
+    "ignore",
+    category=FutureWarning,
+    message="`clean_up_tokenization_spaces` was not set.*"
+)
+# Shared sentence-embedding model, created once at import time and used by
+# chunk_and_embed().
+model = SentenceTransformer('all-MiniLM-L6-v2')
+def parse_pdf(filepath):
+    """Extract the text of every page of a PDF file.
+
+    Args:
+        filepath: Path of the PDF file to read.
+
+    Returns:
+        The concatenated page texts, each page followed by a newline.
+    """
+    with open(filepath, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        # Pages without a text layer (e.g. scanned images) may yield an empty
+        # or None result; treat them as empty text instead of failing on
+        # `None + "\n"`. The pages must be read while the file is still open.
+        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
+    return "".join(page_texts)
+def parse_audio(filepath):
+    """Transcribe an audio file to text with the Whisper "base" model.
+
+    Args:
+        filepath: Path of the audio file to transcribe.
+
+    Returns:
+        The transcribed text (the 'text' entry of the transcription result).
+
+    Raises:
+        RuntimeError: If whisper cannot be imported, or if loading the model
+            or transcribing fails. The original exception is chained as the
+            cause.
+    """
+    try:
+        # Imported lazily so the module stays usable without audio support.
+        import whisper
+        # Named asr_model so it does not shadow the module-level embedding model.
+        asr_model = whisper.load_model("base")
+        result = asr_model.transcribe(filepath)
+        return result['text']
+    except ImportError as e:
+        # A missing package is a different problem from a missing ffmpeg
+        # binary; report it accurately.
+        raise RuntimeError(
+            f"Audio parsing failed — the 'whisper' package could not be imported. Error: {e}"
+        ) from e
+    except Exception as e:
+        raise RuntimeError(
+            f"Audio parsing failed — likely missing ffmpeg. Error: {e}"
+        ) from e
+def parse_text(filepath):
+    """Read a plain-text file and return its full contents.
+
+    Args:
+        filepath: Path of the text file to read.
+
+    Returns:
+        The file contents as a string.
+    """
+    # Decode explicitly as UTF-8 so the result does not depend on the
+    # platform's locale encoding.
+    with open(filepath, 'r', encoding='utf-8') as f:
+        return f.read()
+def parse_file(filepath):
+    """Dispatch a file to the parser that matches its extension.
+
+    Args:
+        filepath: Path of the file to parse, as a string.
+
+    Returns:
+        The text extracted by parse_pdf (".pdf") or parse_text (".txt").
+
+    Raises:
+        ValueError: If the file extension is not supported.
+    """
+    # Match the extension case-insensitively so "REPORT.PDF" is accepted too.
+    lowered = filepath.lower()
+    if lowered.endswith('.pdf'):
+        return parse_pdf(filepath)
+    if lowered.endswith('.txt'):
+        return parse_text(filepath)
+    # Audio support is currently disabled; uncomment the two lines below to
+    # route audio files to parse_audio.
+    # if lowered.endswith(('.mp3', '.wav', '.m4a')):
+    #     return parse_audio(filepath)
+    raise ValueError(f"Unsupported file type: {filepath}")
+def chunk_text(text, chunk_size=300):
+    """Split text into chunks of at most chunk_size whitespace-separated words.
+
+    Args:
+        text: The text to split. Runs of whitespace count as one separator.
+        chunk_size: Maximum number of words per chunk; must be positive.
+
+    Returns:
+        A list of chunks, each the words joined by single spaces. Empty or
+        whitespace-only text yields an empty list.
+
+    Raises:
+        ValueError: If chunk_size is not positive.
+    """
+    # Zero would fail inside range() with an unrelated message, and a negative
+    # size would silently return no chunks; reject both explicitly.
+    if chunk_size <= 0:
+        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
+    words = text.split()
+    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
+def chunk_and_embed(text):
+    """Split text into chunks and pair each chunk with its embedding.
+
+    Args:
+        text: The text to chunk (with chunk_text's default size) and embed.
+
+    Returns:
+        A list of (chunk, embedding) tuples, where each embedding is the
+        corresponding entry of model.encode(chunks).tolist(). Empty or
+        whitespace-only text yields an empty list.
+    """
+    chunks = chunk_text(text)
+    if not chunks:
+        # Nothing to embed; skip the encoder call rather than rely on how it
+        # handles an empty batch.
+        return []
+    embeddings = model.encode(chunks).tolist()
+    return list(zip(chunks, embeddings))