Spaces:
Sleeping
Sleeping
File size: 810 Bytes
43e97e3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
import re
from typing import Any, AsyncGenerator, Optional

import aiofiles
from transformers import AutoModel
# Amount requested per read in `load`. The file is opened in text mode there,
# so this is a count of characters (50 * 2**20 of them), not of bytes.
DEFAULT_CHUNK_SIZE = 50 * 1024 * 1024

# Embedding model used by `embed`; it is instantiated once, when this module
# is imported.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run while the model is loaded -- confirm the source is trusted.
embedder = AutoModel.from_pretrained(
    "jinaai/jina-embeddings-v2-base-en",
    trust_remote_code=True,
)
async def load(
    filepath: str, *, chunk_size: Optional[int] = None
) -> AsyncGenerator[str, Any]:
    """Stream the UTF-8 text file at *filepath* piece by piece.

    Args:
        filepath: Path of the file to read.
        chunk_size: Maximum number of characters to request per read.
            ``None`` (the default) uses the module-level
            ``DEFAULT_CHUNK_SIZE``, looked up when the generator starts.

    Yields:
        Successive non-empty strings read from the file, in file order.

    Raises:
        ValueError: If the effective chunk size is zero or negative. Because
            this is an async generator, the error surfaces on the first
            iteration rather than at call time.
    """
    size = DEFAULT_CHUNK_SIZE if chunk_size is None else chunk_size
    if size <= 0:
        # read(0) always returns "" (the file would look empty) and a negative
        # size reads everything in one go, which defeats the chunking.
        raise ValueError(f"chunk_size must be a positive integer, got {size!r}")

    async with aiofiles.open(filepath, "r", encoding="utf-8") as f:
        # An empty string from read() signals end of file and ends the loop.
        while chunk := await f.read(size):
            yield chunk
# Compiled once at import time instead of on every call to `clean`.
_WHITESPACE_RUN = re.compile(r"\s+")
_STRAY_PERIOD_COMMA = re.compile(r"\. ,")
# Two or more periods, optionally separated by single spaces: "..", ". .", "...".
_PERIOD_RUN = re.compile(r"\.(?: ?\.)+")


def clean(text: str) -> str:
    """Normalize whitespace and stray punctuation in *text*.

    Steps, in order:

    1. Collapse every run of whitespace (newlines and tabs included) into a
       single space.
    2. Delete each ``". ,"`` sequence.
    3. Collapse whitespace again, because step 2 can leave two spaces side
       by side.
    4. Reduce any run of periods (``".."``, ``". ."``, ``"..."`` and longer)
       to a single period.
    5. Strip leading and trailing whitespace.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text; an empty string if *text* is empty or contains only
        whitespace.
    """
    # The period/comma patterns assume single spaces, so whitespace has to be
    # normalized before they are applied.
    collapsed = _WHITESPACE_RUN.sub(" ", text)
    without_artifacts = _STRAY_PERIOD_COMMA.sub("", collapsed)
    respaced = _WHITESPACE_RUN.sub(" ", without_artifacts)
    single_periods = _PERIOD_RUN.sub(".", respaced)
    return single_periods.strip()
def embed(text: str) -> list[float]:
    """Return the embedding of *text* as a plain Python list.

    The text is passed to ``encode`` on the module-level ``embedder`` and the
    result is converted with ``tolist()``.

    Args:
        text: The string to embed.

    Returns:
        The output of ``embedder.encode(text).tolist()``; presumably a flat
        list of floats for a single input string -- confirm against the model.
    """
    vector = embedder.encode(text)
    return vector.tolist()