Schmitz005 committed on
Commit
8baa906
·
verified ·
1 Parent(s): 897e33d

Create parser.py

Browse files
Files changed (1) hide show
  1. whale_core/parser.py +54 -0
whale_core/parser.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import PyPDF2
3
+ from sentence_transformers import SentenceTransformer
4
+ import warnings
5
+
6
# Silence the FutureWarning whose message starts with the
# `clean_up_tokenization_spaces` notice; other warnings are left untouched.
warnings.filterwarnings(
    action="ignore",
    message="`clean_up_tokenization_spaces` was not set.*",
    category=FutureWarning,
)

# Shared sentence-embedding model, instantiated once when the module is
# imported and reused by chunk_and_embed().
model = SentenceTransformer("all-MiniLM-L6-v2")
13
+
14
def parse_pdf(filepath):
    """Extract the text of every page of a PDF file.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The text of all pages in document order, each page's text followed
        by a newline. An empty string when the document has no pages.
    """
    with open(filepath, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        # A page that yields no text (None) is treated as empty so the
        # concatenation cannot raise TypeError; joining once also avoids
        # growing a string page by page.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
21
+
22
def parse_audio(filepath):
    """Transcribe an audio file to text with Whisper's "base" model.

    ``whisper`` is imported inside the function, so it is only needed when
    audio is actually parsed. The model is loaded on every call.

    Args:
        filepath: Path of the audio file to transcribe.

    Returns:
        The value stored under the ``'text'`` key of the transcription result.

    Raises:
        RuntimeError: If importing whisper, loading the model, or the
            transcription itself fails. The underlying exception is chained
            as ``__cause__`` so the real failure stays visible.
    """
    try:
        import whisper
        # Local name chosen so it does not shadow the module-level `model`.
        transcriber = whisper.load_model("base")
        result = transcriber.transcribe(filepath)
        return result['text']
    except Exception as e:
        raise RuntimeError(f"Audio parsing failed — likely missing ffmpeg. Error: {e}") from e
30
+
31
def parse_text(filepath):
    """Read a plain-text file and return its full contents.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The entire file contents as a single string.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        return f.read()
34
+
35
def parse_file(filepath):
    """Dispatch a file to the parser that matches its extension.

    The extension is matched case-insensitively, so ``report.PDF`` and
    ``notes.TXT`` are handled like their lowercase counterparts.

    Args:
        filepath: Path of the file to parse, as a string.

    Returns:
        The text produced by ``parse_pdf`` for ``.pdf`` files or by
        ``parse_text`` for ``.txt`` files.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    lowered = filepath.lower()
    if lowered.endswith('.pdf'):
        return parse_pdf(filepath)
    elif lowered.endswith('.txt'):
        return parse_text(filepath)
    # Audio parsing is disabled; uncomment the branch below to enable it.
    # elif lowered.endswith(('.mp3', '.wav', '.m4a')):
    #     return parse_audio(filepath)
    else:
        raise ValueError(f"Unsupported file type: {filepath}")
45
+
46
def chunk_text(text, chunk_size=300):
    """Split text into chunks of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens of ``text``; within a chunk
    they are re-joined with single spaces, so the original spacing and line
    breaks are not preserved.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in order. Empty when ``text`` contains no
        words.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A non-positive size would either make range() fail with an unrelated
    # message (0) or silently drop all the text (negative).
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
49
+
50
def chunk_and_embed(text):
    """Chunk ``text`` and pair every chunk with its embedding.

    The text is split with ``chunk_text`` using its default chunk size, and
    all chunks are encoded in one call to the module-level ``model``.

    Args:
        text: The text to split and embed.

    Returns:
        A list of ``(chunk, embedding)`` tuples in chunk order, where each
        embedding is the corresponding entry of ``model.encode(...).tolist()``.
    """
    pieces = chunk_text(text)
    vectors = model.encode(pieces).tolist()
    return [(piece, vector) for piece, vector in zip(pieces, vectors)]
54
+