Spaces:

Schmitz005
/

huggingwhale.ai

Sleeping

Schmitz005 commited on Apr 22

Commit

f2a821b

verified ·

1 Parent(s): 4bcfc5e

Create whalecore/parser.py

Files changed (1) hide show

whalecore/parser.py ADDED Viewed

+import os
+import PyPDF2
+import whisper
+from pydub import AudioSegment
+from sentence_transformers import SentenceTransformer
+import warnings
+warnings.filterwarnings(
+    "ignore",
+    category=FutureWarning,
+    message="`clean_up_tokenization_spaces` was not set.*"
+)
+model = SentenceTransformer('all-MiniLM-L6-v2')
+def parse_pdf(filepath):
+    text = ""
+    with open(filepath, 'rb') as f:
+        reader = PyPDF2.PdfReader(f)
+        for page in reader.pages:
+            text += page.extract_text() + "\n"
+    return text
+def parse_audio(filepath):
+    model = whisper.load_model("base")
+    result = model.transcribe(filepath)
+    return result['text']
+def parse_text(filepath):
+    with open(filepath, 'r') as f:
+        return f.read()
+def parse_file(filepath):
+    if filepath.endswith('.pdf'):
+        return parse_pdf(filepath)
+    elif filepath.endswith(('.mp3', '.wav', '.m4a')):
+        return parse_audio(filepath)
+    elif filepath.endswith('.txt'):
+        return parse_text(filepath)
+    else:
+        raise ValueError(f"Unsupported file type: {filepath}")
+def chunk_text(text, chunk_size=300):
+    words = text.split()
+    return [' '.join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
+def chunk_and_embed(text):
+    chunks = chunk_text(text)
+    embeddings = model.encode(chunks).tolist()
+    return list(zip(chunks, embeddings))