Schmitz005 committed on
Commit
f2a821b
·
verified ·
1 Parent(s): 4bcfc5e

Create whalecore/parser.py

Browse files
Files changed (1) hide show
  1. whalecore/parser.py +49 -0
whalecore/parser.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import PyPDF2
3
+ import whisper
4
+ from pydub import AudioSegment
5
+ from sentence_transformers import SentenceTransformer
6
+
7
import warnings

# Suppress the FutureWarning whose message begins with
# "`clean_up_tokenization_spaces` was not set" so it does not clutter output.
warnings.filterwarnings(
    action="ignore",
    message="`clean_up_tokenization_spaces` was not set.*",
    category=FutureWarning,
)

# Sentence-embedding model shared by this module; it is constructed once,
# at import time.
model = SentenceTransformer('all-MiniLM-L6-v2')
14
+
15
def parse_pdf(filepath):
    """Extract the text of every page of a PDF file.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The extracted text of all pages in document order, with a newline
        appended after each page. A page whose extraction yields ``None``
        or an empty string contributes only that newline.
    """
    with open(filepath, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # `or ''` keeps a page without extractable text from raising a
        # TypeError on concatenation; a single join avoids rebuilding the
        # accumulated string once per page.
        return "".join(
            f"{page.extract_text() or ''}\n" for page in reader.pages
        )
22
+
23
# Whisper models already loaded by parse_audio, keyed by model name, so the
# weights are loaded once per process instead of once per transcription.
_WHISPER_MODELS = {}


def parse_audio(filepath):
    """Transcribe an audio file with Whisper's "base" model.

    The model is loaded on the first call and reused afterwards.

    Args:
        filepath: Path of the audio file to transcribe.

    Returns:
        The value stored under the ``'text'`` key of the transcription
        result.
    """
    # Named `speech_model` so it does not shadow the module-level embedding
    # `model`.
    speech_model = _WHISPER_MODELS.get("base")
    if speech_model is None:
        speech_model = _WHISPER_MODELS["base"] = whisper.load_model("base")
    result = speech_model.transcribe(filepath)
    return result['text']
27
+
28
def parse_text(filepath):
    """Return the full contents of a text file, decoded as UTF-8.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The file's contents as a single string.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # An explicit encoding makes the result independent of the platform's
    # locale default, which `open` would otherwise fall back to.
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()
31
+
32
def parse_file(filepath):
    """Parse a file into text, choosing the parser from its extension.

    Extensions are matched case-insensitively: ``.pdf`` goes to
    ``parse_pdf``; ``.mp3``, ``.wav`` and ``.m4a`` go to ``parse_audio``;
    ``.txt`` goes to ``parse_text``.

    Args:
        filepath: Path of the file to parse, as a string.

    Returns:
        Whatever the selected parser returns for ``filepath``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Match on a lowercased copy so "REPORT.PDF" dispatches like
    # "report.pdf"; the parsers still receive the path exactly as given.
    lowered = filepath.lower()
    if lowered.endswith('.pdf'):
        return parse_pdf(filepath)
    if lowered.endswith(('.mp3', '.wav', '.m4a')):
        return parse_audio(filepath)
    if lowered.endswith('.txt'):
        return parse_text(filepath)
    raise ValueError(f"Unsupported file type: {filepath}")
41
+
42
def chunk_text(text, chunk_size=300):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; within each chunk
    they are re-joined with single spaces, so original spacing and line
    breaks are not preserved.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in order. The last chunk may hold fewer than
        ``chunk_size`` words. Text without any words gives an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A zero step makes range() fail with an unrelated message, and a
    # negative step yields no chunks at all, silently dropping the text.
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}"
        )
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
45
+
46
def chunk_and_embed(text, chunk_size=300):
    """Split text into word chunks and pair each chunk with its embedding.

    Args:
        text: The text to chunk and embed.
        chunk_size: Maximum number of words per chunk, forwarded to
            ``chunk_text``. Defaults to 300, the value ``chunk_text`` itself
            defaults to.

    Returns:
        A list of ``(chunk, embedding)`` tuples in chunk order, where each
        embedding is the ``.tolist()`` form of the encoding produced by the
        module-level ``model``. Text without any words gives an empty list.
    """
    chunks = chunk_text(text, chunk_size=chunk_size)
    if not chunks:
        # Nothing to embed; skip the encoder call entirely.
        return []
    embeddings = model.encode(chunks).tolist()
    return list(zip(chunks, embeddings))