# routers/embedding/__init__.py
import os
import re
import sys
import threading
import torch
import platform
import textract
import subprocess
from sentence_transformers import SentenceTransformer, util
from typing import Dict, List, Tuple, Set, LiteralString, Iterator


if platform.system() == "Windows":
    import win32com.client  # Windows only


def read_doc_windows(file_path):
    """Reads .doc files using pywin32 on Windows."""
    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False
    doc = word.Documents.Open(file_path)
    text = doc.Content.Text
    doc.Close(False)
    word.Quit()
    return text


def convert_doc_to_docx_and_read(doc_path):
    """Converts .doc to .docx using LibreOffice (Linux/macOS) and reads it."""
    docx_path = doc_path + "x"
    command = ["soffice", "--headless", "--convert-to", "docx",
               "--outdir", os.path.dirname(doc_path), doc_path]
    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return textract.process(docx_path).decode("utf-8")


def read_doc(file_path):
    """Reads .doc or .docx files based on the OS."""
    # For .docx, textract reads the file directly.
    if file_path.endswith(".docx"):
        return textract.process(file_path).decode("utf-8")
    # Windows: use pywin32.
    if platform.system() == "Windows":
        return read_doc_windows(file_path)
    # Linux/macOS: convert with LibreOffice, then read the resulting .docx.
    return convert_doc_to_docx_and_read(file_path)
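
# Illustrative usage (the path below is hypothetical):
#   text = read_doc("/tmp/report.doc")  # -> plain text of the document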


class EmbeddingContext:
    # These don't change
    TOKEN_LEN_MAX_FOR_EMBEDDING = 512

    # Set when creating the object
    lock = None
    model = None
    openai_client = None
    model_name = ''
    config_type = ''
    embedding_shape = None
    embedding_dtype = None
    embedding_device = None

    # Updates constantly
    data = {}
    def __init__(self):
        try:
            from config import settings
        except ImportError:
            sys.path.append(os.path.abspath(
                os.path.join(os.path.dirname(__file__), '../..')))
            from config import settings

        self.lock = threading.Lock()
        config_type = settings.embedding_api
        model_name = settings.embedding_model

        if config_type == 'sbert':
            self.model = SentenceTransformer(model_name, use_auth_token=False)
            self.model.max_seq_length = self.TOKEN_LEN_MAX_FOR_EMBEDDING
            print("Max Sequence Length:", self.model.max_seq_length)

            self.encode = self.encode_sbert
            if torch.cuda.is_available():
                self.model = self.model.to('cuda')

        elif config_type == 'openai':
            from openai import OpenAI
            self.openai_client = OpenAI(
                # base_url = settings.openai_api_base
                api_key=settings.OPENAI_API_KEY,
            )
            self.encode = self.encode_openai

        self.model_name = model_name
        self.config_type = config_type

        # Encode a dummy input once to record the embedding shape, dtype and device.
        tmp = self.encode(['tmp'])
        self.embedding_shape = tmp.shape[1:]
        self.embedding_dtype = tmp.dtype
        self.embedding_device = tmp.device

    def encode(self, texts_to_embed):
        # Placeholder; __init__ rebinds this to encode_sbert or encode_openai.
        pass

    def encode_sbert(self, texts_to_embed):
        return self.model.encode(texts_to_embed,
                                 show_progress_bar=True,
                                 convert_to_tensor=True,
                                 normalize_embeddings=True)

    def encode_openai(self, texts_to_embed):
        import math
        import time

        # Count tokens so the requests can be split into chunks of roughly 500k tokens.
        tokens_count = 0
        for text in texts_to_embed:
            tokens_count += len(self.get_tokens(text))

        chunks_num = math.ceil(tokens_count / 500000)
        chunk_size = math.ceil(len(texts_to_embed) / chunks_num)

        embeddings = []
        for i in range(chunks_num):
            start = i * chunk_size
            end = start + chunk_size
            chunk = texts_to_embed[start:end]

            embeddings_tmp = self.openai_client.embeddings.create(
                model=self.model_name,
                input=chunk,
            ).data

            if embeddings_tmp is None:
                break

            embeddings.extend(embeddings_tmp)

            if i < chunks_num - 1:
                time.sleep(60)  # Wait 1 minute before the next call

        return torch.stack([torch.tensor(embedding.embedding, dtype=torch.float32)
                            for embedding in embeddings])

    def get_tokens(self, text):
        if self.model:
            return self.model.tokenizer.tokenize(text)

        # Fallback: crude regex tokenization when no SBERT tokenizer is loaded.
        tokens = []
        for token in re.split(r'(\W|\b)', text):
            if token.strip():
                tokens.append(token)

        return tokens
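

# Note (illustrative): with the 'sbert' backend, EMBEDDING_CTX.encode(["hello"])
# returns a normalized tensor of shape (1, embedding_dim) on embedding_device.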


class SplitDocs:
    def split_in_topics(self,
                        filedir: LiteralString = None,
                        *,
                        pattern_filename=r'(?<!navigation)\.(md|rst|pdf)',
                        pattern_content_sub=r'---\nhide:[\s\S]+?---\s*',
                        patterns_titles=(
                            r'^# (.+)', r'^## (.+)', r'^### (.+)'),
                        ) -> Iterator[Tuple[str, List[str], str]]:
        def matches_pattern(filename):
            return re.search(pattern_filename, filename) is not None

        def split_patterns_recursive(patterns, text, index=-1):
            sections = re.split(patterns[0], text, flags=re.MULTILINE)
            for i, section in enumerate(sections):
                if not section.strip():
                    continue
                # re.split with a capturing group alternates plain text and captured titles.
                is_match = bool(i & 1)
                if is_match:
                    yield (index, section)
                elif len(patterns) > 1:
                    for j, section_j in split_patterns_recursive(
                            patterns[1:], section, index + 1):
                        yield (j, section_j)
                else:
                    yield (-1, section)

        if not os.path.isdir(filedir):
            # A single file was given; mimic the (root, dirs, files) tuples of os.walk.
            root, name = os.path.split(filedir)
            os_walk = [[root, None, [name]]]
        else:
            os_walk = os.walk(filedir)

        for root, _, files in os_walk:
            for name in files:
                if not matches_pattern(name):
                    continue

                full_path = os.path.join(root, name)
                content = ""
                if name.lower().endswith(".pdf"):
                    import pypdf
                    with open(full_path, 'rb') as file:
                        reader = pypdf.PdfReader(file)
                        for page in reader.pages:
                            content += page.extract_text(
                                extraction_mode="layout")
                elif name.lower().endswith(".doc"):
                    content = read_doc(full_path)
                else:
                    with open(full_path, 'r', encoding='utf-8') as file:
                        content = file.read()

                if pattern_content_sub:
                    content = re.sub(pattern_content_sub, '', content)

                rel_path = full_path.replace(filedir, '').replace('\\', '/')

                # Protect code parts
                patterns = (r'(```[\s\S]+?```)', *patterns_titles)

                last_titles = []
                last_titles_index = []
                content_accum = ''
                i = -1  # Guard against files that produce no sections at all.
                for i, section in split_patterns_recursive(patterns, content):
                    if i < 0:
                        content_accum += section
                        continue
                    if content_accum:
                        yield rel_path, last_titles, content_accum
                        content_accum = ''
                    if not last_titles_index or i > last_titles_index[-1]:
                        last_titles_index.append(i)
                        last_titles.append(section)
                        continue
                    while len(last_titles_index) > 1 and i < last_titles_index[-1]:
                        last_titles_index.pop()
                        last_titles.pop()
                    # Replace the title at the same level.
                    last_titles_index[-1] = i
                    last_titles[-1] = section
                if content_accum or i != -1:
                    yield rel_path, last_titles, content_accum

    def reduce_text(_self, text):
        text = re.sub(r'^\n+', '', text)     # Strip leading newlines
        text = re.sub(r'<.*?>', '', text)    # Remove HTML tags
        text = re.sub(r':\S*: ', '', text)   # Remove :...: patterns
        text = re.sub(r'\s*\n+', '\n', text)  # Collapse blank lines
        return text

    def embedding_header(_self, rel_path, titles):
        return f"{rel_path}\n# {' | '.join(titles)}\n\n"

    def split_for_embedding(self,
                            filedir: LiteralString = None,
                            *,
                            pattern_filename=r'(?<!navigation)\.(md|rst|doc|pdf)',
                            pattern_content_sub=r'---\nhide:[\s\S]+?---\s*',
                            patterns_titles=(
                                r'^# (.+)', r'^## (.+)', r'^### (.+)'),
                            ):
        tokenizer = EMBEDDING_CTX.model.tokenizer
        max_tokens = EMBEDDING_CTX.model.max_seq_length
        texts = []

        for rel_path, titles, content in self.split_in_topics(
                filedir,
                pattern_filename=pattern_filename,
                pattern_content_sub=pattern_content_sub,
                patterns_titles=patterns_titles):
            header = self.embedding_header(rel_path, titles)

            # Every chunk starts with the header; track its token cost separately.
            tokens_pre_len = len(tokenizer.tokenize(header))
            tokens_so_far = tokens_pre_len
            text_so_far = header

            for part in self.reduce_text(content).splitlines():
                part += '\n'
                part_tokens_len = len(tokenizer.tokenize(part))
                if tokens_so_far + part_tokens_len > max_tokens:
                    # Flush the current chunk and start a new one under the same header.
                    texts.append(text_so_far)
                    text_so_far = header
                    tokens_so_far = tokens_pre_len
                text_so_far += part
                tokens_so_far += part_tokens_len

            # Skip topics that produced no content beyond the header.
            if tokens_so_far != tokens_pre_len:
                texts.append(text_so_far)

        return texts


EMBEDDING_CTX = EmbeddingContext()
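

# Minimal usage sketch (an illustrative assumption, not part of the original module).
# It assumes the 'sbert' backend is configured, since split_for_embedding() relies on
# EMBEDDING_CTX.model.tokenizer, and that a "docs" directory exists; both the
# __main__ guard and the path are hypothetical.
if __name__ == "__main__":
    chunks = SplitDocs().split_for_embedding("docs")
    if chunks:
        embeddings = EMBEDDING_CTX.encode(chunks)
        print("Embedded", len(chunks), "chunks ->", tuple(embeddings.shape))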