"""Helpers for turning uploaded documents into embedded text chunks.

Provides parsers for PDF, audio and plain-text inputs, a word-based chunker,
and a function that embeds each chunk with a shared SentenceTransformer model.
"""

import os
import warnings

import PyPDF2
from sentence_transformers import SentenceTransformer

# Suppress the FutureWarning whose message starts with
# "`clean_up_tokenization_spaces` was not set" (presumably raised by the
# tokenizer stack while the model loads/encodes -- confirm if it resurfaces).
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="`clean_up_tokenization_spaces` was not set.*",
)

# Shared embedding model, created once at import time and used by
# chunk_and_embed().
model = SentenceTransformer('all-MiniLM-L6-v2')


def _extract_pdf_text(reader):
    """Return the text of every page in *reader*, each followed by a newline.

    The newline keeps the last word of one page from fusing with the first
    word of the next, which matters because chunk_text() splits on whitespace.
    """
    # `or ""` is defensive: tolerate a page whose extract_text() yields None
    # instead of a string (e.g. no extractable text layer).
    return "".join((page.extract_text() or "") + "\n" for page in reader.pages)


def parse_pdf(filepath):
    """Read the PDF at *filepath* and return its text, one page per line group.

    Args:
        filepath: Path to a PDF file on disk.

    Returns:
        The concatenated page texts, each terminated by a newline.
    """
    with open(filepath, 'rb') as f:
        return _extract_pdf_text(PyPDF2.PdfReader(f))


def parse_audio(filepath):
    """Transcribe the audio file at *filepath* with Whisper's "base" model.

    Args:
        filepath: Path to an audio file.

    Returns:
        The ``'text'`` entry of the transcription result.

    Raises:
        RuntimeError: If importing whisper, loading the model, or transcribing
            fails for any reason. The original exception is chained as the
            cause so the real failure is not hidden behind the ffmpeg hint.
    """
    try:
        # Imported lazily so the rest of the module works without whisper.
        import whisper

        # Named distinctly so it does not shadow the module-level embedding
        # model.
        whisper_model = whisper.load_model("base")
        result = whisper_model.transcribe(filepath)
        return result['text']
    except Exception as e:
        raise RuntimeError(
            f"Audio parsing failed — likely missing ffmpeg. Error: {e}"
        ) from e


def parse_text(filepath):
    """Return the contents of the text file at *filepath*.

    The file is decoded as UTF-8, matching how parse_file() decodes ``.txt``
    uploads, instead of relying on the platform's default encoding.
    """
    with open(filepath, 'r', encoding="utf-8") as f:
        return f.read()


def parse_file(file_obj):
    """Extract text from an open file-like object based on its name.

    Args:
        file_obj: Object with a ``name`` attribute and, for ``.txt`` input, a
            ``read()`` method returning bytes; for ``.pdf`` input it is handed
            directly to ``PyPDF2.PdfReader``.

    Returns:
        The extracted text. PDF pages are each terminated by a newline, the
        same as parse_pdf().

    Raises:
        ValueError: If the name ends in neither ``.pdf`` nor ``.txt``.
    """
    filename = file_obj.name.lower()
    if filename.endswith(".pdf"):
        return _extract_pdf_text(PyPDF2.PdfReader(file_obj))
    if filename.endswith(".txt"):
        return file_obj.read().decode("utf-8")
    raise ValueError("Unsupported file type.")


def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of strings, each the words of one chunk joined by single
        spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If *chunk_size* is less than 1.
    """
    # A negative step would make range() empty and silently discard the text;
    # zero would raise an unrelated-looking range() error. Fail clearly.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]


def chunk_and_embed(text):
    """Chunk *text* and embed every chunk with the module-level model.

    Args:
        text: The text to chunk and embed.

    Returns:
        A list of ``(chunk, embedding)`` pairs, where ``embedding`` is the
        ``.tolist()`` form of the model's encoding for that chunk. Returns an
        empty list when the text contains no words.
    """
    chunks = chunk_text(text)
    if not chunks:
        # Nothing to embed; skip the model call entirely.
        return []
    embeddings = model.encode(chunks).tolist()
    return list(zip(chunks, embeddings))