Spaces:
Sleeping
Sleeping
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| import faiss | |
# Hugging Face model id of the sentence-embedding checkpoint used by this module.
check_point = 'nomic-ai/nomic-embed-text-v1'

# Loaded once at import time; the functions in this module read this global.
# trust_remote_code=True allows the checkpoint's own Python code to be executed
# while loading, so only point `check_point` at a repository you trust.
embedding_model = SentenceTransformer(
    check_point,
    trust_remote_code=True,
)
def parese_doc(doc, first_section, ignore_after, *, group_size=20, overlap=5):
    """Extract a document's text and split it into overlapping sentence chunks.

    The text of every page is concatenated, newlines are replaced by spaces and
    the result is lower-cased.  If both markers are found (and the end marker
    lies after the start marker) only the text between them is kept; otherwise
    the whole text is used.  The text is then split on ``'. '`` and regrouped
    into windows of ``group_size`` sentences that share ``overlap`` sentences
    with the previous window.

    Args:
        doc: Object exposing ``pages``, an iterable of page objects that each
            provide ``extract_text()``.
        first_section: Marker text where the useful content starts (the first
            occurrence is used).  Matched case-insensitively.
        ignore_after: Marker text after which content is dropped (the last
            occurrence is used).  Matched case-insensitively.
        group_size: Number of sentences per chunk.
        overlap: Number of sentences shared by consecutive chunks.

    Returns:
        A list of chunk strings, each made of sentences re-joined with ``'. '``.

    Raises:
        ValueError: If ``group_size`` is not positive or ``overlap`` is not in
            ``[0, group_size)`` (the window would never advance).
    """
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")
    if not 0 <= overlap < group_size:
        raise ValueError("overlap must satisfy 0 <= overlap < group_size")

    # `or ''` guards against a page whose extract_text() returns None;
    # join avoids repeated string concatenation.
    raw_text = ''.join(page.extract_text() or '' for page in doc.pages)
    cleaned_string = raw_text.replace('\n', ' ').lower()

    # The text was lower-cased above, so the markers must be lower-cased too
    # or a mixed-case marker could never match.
    start_index = cleaned_string.find(first_section.lower())
    end_index = cleaned_string.rfind(ignore_after.lower())

    # Trim only when both markers exist in a sensible order; an end marker at
    # or before the start marker would otherwise produce an empty document.
    if start_index != -1 and start_index < end_index:
        cleaned_string = cleaned_string[start_index:end_index]

    sentence_list = cleaned_string.split('. ')
    step = group_size - overlap
    return [
        '. '.join(sentence_list[i:i + group_size])
        for i in range(0, len(sentence_list), step)
    ]
def get_embeddings(doc):
    """Encode ``doc`` with the module-level ``embedding_model``.

    Args:
        doc: Input forwarded unchanged to ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``doc``.
    """
    return embedding_model.encode(doc)
def create_embedding(context_list):
    """Build a FAISS L2 index over the embeddings of the given text chunks.

    Args:
        context_list: Iterable of text chunks to embed.

    Returns:
        A ``faiss.IndexFlatL2`` holding one vector per chunk, in input order.
        The index is empty when ``context_list`` is empty.
    """
    embedding_dimension = embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(embedding_dimension)

    texts = list(context_list)
    if not texts:
        # Nothing to embed: an empty array would be 1-D and rejected by
        # index.add, so hand back the empty index instead.
        return index

    # One batched encode call instead of one model call per chunk.
    embeddings = embedding_model.encode(texts, convert_to_numpy=True)

    # FAISS expects a C-contiguous float32 matrix of shape (n, dimension).
    embeddings_array = np.ascontiguousarray(embeddings, dtype=np.float32)
    index.add(embeddings_array)
    return index