Spaces:
Sleeping
Sleeping
import re | |
from typing import Any, AsyncGenerator | |
import aiofiles | |
from transformers import AutoModel | |
# Default size passed to each read() call in load(): 50 * 2**20 = 52,428,800.
# load() opens the file in text mode, where the read size counts characters
# rather than bytes, so a chunk may occupy more than 50 MiB once encoded.
DEFAULT_CHUNK_SIZE = 1024 * 1024 * 50
# Shared embedding model used by embed(). It is instantiated at import time,
# so importing this module triggers the model load.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally -- confirm this is acceptable, and consider
# pinning a revision.
embedder = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en", trust_remote_code=True
)
async def load(
    filepath: str, *, chunk_size: int = DEFAULT_CHUNK_SIZE
) -> AsyncGenerator[str, Any]:
    """Stream the UTF-8 text file at *filepath* as successive string chunks.

    The file is opened asynchronously in text mode and read repeatedly until
    a read returns an empty string (end of file). Each non-empty read result
    is yielded as-is, so chunk boundaries may fall in the middle of a word or
    line.

    Args:
        filepath: Path of the file to read.
        chunk_size: Size passed to every ``read`` call. Because the file is
            opened in text mode this is a character count, not a byte count.
            Defaults to ``DEFAULT_CHUNK_SIZE``.

    Yields:
        Consecutive pieces of the file's decoded text.

    Raises:
        ValueError: If *chunk_size* is not positive. As this is an async
            generator, the error surfaces on the first iteration rather than
            at call time.
    """
    # A negative size would make read() return the whole file in one piece
    # and zero would end the loop immediately; reject both explicitly.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    async with aiofiles.open(filepath, "r", encoding="utf-8") as f:
        while chunk := await f.read(chunk_size):
            yield chunk
def clean(text: str) -> str:
    """Normalize whitespace and tidy stray punctuation in *text*.

    Steps, applied in order:

    1. Every run of whitespace (including newlines) becomes a single space.
    2. Each occurrence of the sequence ``". ,"`` is removed entirely.
    3. ``".."`` is replaced by ``"."`` in one left-to-right pass.
    4. ``". ."`` is replaced by ``"."`` in one left-to-right pass.
    5. Leading and trailing whitespace is stripped.

    Steps 3 and 4 are single, non-overlapping passes, so longer runs are only
    shortened, not fully collapsed (``"..."`` becomes ``".."``). Step 2 runs
    after the whitespace collapse, so removing ``". ,"`` can leave two
    adjacent spaces behind.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string.

    >>> clean("Hello\\n  world..  Bye. . ")
    'Hello world. Bye.'
    """
    # "\s+" already matches newlines, so no separate newline replacement is
    # needed before or after the remaining steps (none of them add newlines).
    collapsed = re.sub(r"\s+", " ", text)
    without_stray = re.sub(r"\. ,", "", collapsed)
    merged = without_stray.replace("..", ".").replace(". .", ".")
    return merged.strip()
def embed(text: str) -> list[float]:
    """Return the embedding of *text* as a plain Python list.

    The text is passed to the module-level ``embedder`` via its ``encode``
    method and the result is converted with ``.tolist()``.

    Args:
        text: The string to embed.

    Returns:
        The embedding values as a list.
        NOTE(review): presumably a flat list of floats for a single input
        string -- confirm against the model's ``encode`` return shape.
    """
    vector = embedder.encode(text)
    return vector.tolist()