Spaces:
Sleeping
Sleeping
import os | |
import PyPDF2 | |
from sentence_transformers import SentenceTransformer | |
import warnings | |
# Silence only the FutureWarning whose message starts with
# "`clean_up_tokenization_spaces` was not set"; other warnings still show.
warnings.filterwarnings(
    action="ignore",
    message="`clean_up_tokenization_spaces` was not set.*",
    category=FutureWarning,
)

# Module-level sentence-embedding model, created once when this module is
# imported and used by chunk_and_embed().
model = SentenceTransformer("all-MiniLM-L6-v2")
def parse_pdf(filepath):
    """Extract the text of every page of the PDF stored at *filepath*.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Each page's extracted text is followed by a newline and the pieces are
    concatenated in page order.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The concatenated page texts as a single string (empty when the
        document has no pages).
    """
    with open(filepath, "rb") as pdf_file:
        pdf_pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction stays inside the ``with`` block because the reader is
        # backed by the still-open file handle.
        return "".join(page.extract_text() + "\n" for page in pdf_pages)
def parse_audio(filepath):
    """Transcribe the audio file at *filepath* with Whisper's "base" model.

    ``whisper`` is imported inside the function, so importing this module
    does not require the package to be installed.

    Args:
        filepath: Path of the audio file; passed unchanged to ``transcribe``.

    Returns:
        The value stored under the ``"text"`` key of the transcription result.

    Raises:
        RuntimeError: If importing whisper, loading the model, or
            transcribing fails. The underlying exception is attached as
            ``__cause__`` so the real failure stays visible in tracebacks.
    """
    try:
        import whisper

        # Deliberately not called ``model``: that name is the module-level
        # sentence-embedding model and shadowing it here is confusing.
        transcriber = whisper.load_model("base")
        transcription = transcriber.transcribe(filepath)
        return transcription["text"]
    except Exception as e:
        # Chain explicitly so callers can inspect the original error.
        raise RuntimeError(
            f"Audio parsing failed — likely missing ffmpeg. Error: {e}"
        ) from e
def parse_text(filepath):
    """Return the full contents of the text file at *filepath*.

    The file is decoded as UTF-8 explicitly. Without an ``encoding``
    argument ``open`` falls back to the platform's locale encoding, which
    makes the result machine-dependent and disagrees with ``parse_file``,
    which decodes ``.txt`` uploads as UTF-8.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        return f.read()
def parse_file(file_obj):
    """Return the text content of a ``.pdf`` or ``.txt`` file object.

    The file type is chosen from the suffix of the lower-cased
    ``file_obj.name``.

    Args:
        file_obj: File-like object exposing a ``name`` attribute. For PDFs
            it is handed to ``PyPDF2.PdfReader``; for text files its
            ``read()`` result may be bytes (decoded as UTF-8) or str
            (returned unchanged).

    Returns:
        The extracted text. For PDFs, each page's text is followed by a
        newline, the same layout ``parse_pdf`` produces.

    Raises:
        ValueError: If the name ends with neither ``.pdf`` nor ``.txt``.
    """
    filename = file_obj.name.lower()
    if filename.endswith(".pdf"):
        reader = PyPDF2.PdfReader(file_obj)
        # Terminate every page with a newline; otherwise the last word of one
        # page is glued to the first word of the next and later word-based
        # chunking sees them as a single token.
        return "".join(page.extract_text() + "\n" for page in reader.pages)
    if filename.endswith(".txt"):
        content = file_obj.read()
        # Binary handles yield bytes; text-mode handles already yield str.
        if isinstance(content, (bytes, bytearray)):
            return content.decode("utf-8")
        return content
    raise ValueError("Unsupported file type.")
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    within each chunk they are re-joined with single spaces, so the original
    whitespace (including newlines) is not preserved.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk. Must be positive.

    Returns:
        A list of chunk strings in order; empty when *text* has no words.

    Raises:
        ValueError: If *chunk_size* is zero or negative. A negative step
            would otherwise make ``range`` empty and silently drop all text.
    """
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
def chunk_and_embed(text):
    """Chunk *text* and pair every chunk with its embedding.

    The text is split with ``chunk_text`` using its default chunk size. All
    chunks are then encoded in one call to the module-level ``model`` and
    the result is converted with ``.tolist()``.

    Args:
        text: The string to chunk and embed.

    Returns:
        A list of ``(chunk, embedding)`` tuples in chunk order.
    """
    pieces = chunk_text(text)
    vectors = model.encode(pieces).tolist()
    return [(piece, vector) for piece, vector in zip(pieces, vectors)]