Spaces:
Sleeping
Sleeping
File size: 1,561 Bytes
8baa906 5a64d4d 8baa906 5a64d4d 8baa906 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import os
import PyPDF2
from sentence_transformers import SentenceTransformer
import warnings
# Silence one specific FutureWarning: any whose message starts with
# "`clean_up_tokenization_spaces` was not set". `message` is a regular
# expression matched against the beginning of the warning text.
# NOTE(review): presumably raised by the tokenizer stack underneath
# sentence-transformers -- confirm the source before removing this filter.
warnings.filterwarnings(
"ignore",
category=FutureWarning,
message="`clean_up_tokenization_spaces` was not set.*"
)
# Embedding model used by chunk_and_embed(). It is constructed here, at
# module level, so it is created once when this module is imported.
model = SentenceTransformer('all-MiniLM-L6-v2')
def parse_pdf(filepath):
    """Extract the text of every page of the PDF stored at *filepath*.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The extracted text of all pages in order, with a newline appended
        after each page (including the last one).
    """
    with open(filepath, 'rb') as pdf_handle:
        pdf = PyPDF2.PdfReader(pdf_handle)
        # Extraction happens while the handle is still open, because the
        # reader pulls page data from it.
        page_texts = [page.extract_text() + "\n" for page in pdf.pages]
    return "".join(page_texts)
def parse_audio(filepath):
    """Transcribe the audio file at *filepath* with Whisper's "base" model.

    ``whisper`` is imported inside the function, so the package is only
    required when audio is actually parsed.

    Args:
        filepath: Path of the audio file to transcribe.

    Returns:
        The ``'text'`` entry of the transcription result.

    Raises:
        RuntimeError: If importing whisper, loading the model, or
            transcribing fails. The underlying exception is attached as
            ``__cause__`` so the real failure stays visible in tracebacks.
    """
    try:
        import whisper

        # Named distinctly from the module-level embedding ``model`` so the
        # two are never confused when reading this function.
        # NOTE: the Whisper model is loaded again on every call.
        speech_model = whisper.load_model("base")
        result = speech_model.transcribe(filepath)
        return result['text']
    except Exception as e:
        # Deliberately broad: every failure mode is reported to callers as a
        # single RuntimeError, chained to the original error.
        raise RuntimeError(f"Audio parsing failed — likely missing ffmpeg. Error: {e}") from e
def parse_text(filepath):
    """Read the text file at *filepath* and return its entire content.

    The file is decoded explicitly as UTF-8 instead of with the
    platform-dependent default encoding, which matches how ``parse_file``
    decodes ``.txt`` uploads.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The full file content as a ``str``.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding="utf-8") as f:
        return f.read()
def parse_file(file_obj):
    """Return the text content of an uploaded ``.pdf`` or ``.txt`` file.

    Args:
        file_obj: File-like object with a ``name`` attribute (used only to
            choose the parser by extension, case-insensitively). For
            ``.txt`` input it must also provide ``read()``; for ``.pdf``
            input it is handed directly to ``PyPDF2.PdfReader``.

    Returns:
        For PDFs, the text of every page in order with a newline after each
        page. For text files, the full content as ``str``.

    Raises:
        ValueError: If the name ends in neither ``.pdf`` nor ``.txt``.
    """
    filename = file_obj.name.lower()

    if filename.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_obj)
        # Terminate every page with a newline (the same layout parse_pdf
        # produces). Without a separator the last word of one page fuses
        # with the first word of the next.
        return "".join(page.extract_text() + "\n" for page in reader.pages)

    if filename.endswith(".txt"):
        payload = file_obj.read()
        # Binary handles yield bytes, which are decoded as UTF-8. A handle
        # opened in text mode already yields str, which has no ``decode``.
        if isinstance(payload, str):
            return payload
        return payload.decode("utf-8")

    raise ValueError("Unsupported file type.")
def chunk_text(text, chunk_size=300):
    """Split *text* into consecutive chunks of at most *chunk_size* words.

    Words are the whitespace-separated tokens of *text*. Each chunk is its
    words re-joined with single spaces, so original spacing and line breaks
    are not preserved.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or
        whitespace-only text gives an empty list.

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    # A negative step would make range() empty and silently drop all text,
    # and a zero step raises an opaque range() error; reject both up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def chunk_and_embed(text):
    """Chunk *text* and pair every chunk with its embedding.

    The text is split by ``chunk_text`` with its default chunk size, all
    chunks are encoded in one call to the module-level ``model``, and the
    encoded result is converted with ``.tolist()``.

    Args:
        text: The text to chunk and embed.

    Returns:
        A list of ``(chunk, embedding)`` tuples in chunk order.
    """
    pieces = chunk_text(text)
    vectors = model.encode(pieces).tolist()
    return [(piece, vector) for piece, vector in zip(pieces, vectors)]
|