import os
import hashlib
import io
import pandas as pd
from PyPDF2 import PdfReader
from docx import Document
from langchain_huggingface import HuggingFaceEmbeddings
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
import json

class FileHandler:
    def __init__(self, api_token, logger):
        self.logger = logger
        self.logger.info("Initializing FileHandler...")
        # Initialize the embedding model using Hugging Face
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={"token": api_token},
        )
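
        # NOTE (added sketch): the collection checks and inserts below assume an
        # open Milvus connection, which the original file never establishes.
        # Host/port are illustrative defaults for a local Milvus instance, not
        # part of the original code; adjust them for your deployment.
        connections.connect(alias="default", host="localhost", port="19530")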

    def handle_file_upload(self, file, document_name, document_description):
        try:
            content = file.read()
            file_hash = hashlib.md5(content).hexdigest()
            collection_name = f"collection_{file_hash}"

            # Check if the collection exists
            if utility.has_collection(collection_name):
                self.logger.info(f"Collection '{collection_name}' already exists.")
                return {"message": "File already processed."}

            # Process file based on type; wrap the raw bytes in a fresh seekable
            # stream, since file.read() above has already consumed the upload
            if file.name.endswith(".pdf"):
                texts, metadatas = self.load_and_split_pdf(io.BytesIO(content))
            elif file.name.endswith(".docx"):
                texts, metadatas = self.load_and_split_docx(io.BytesIO(content))
            elif file.name.endswith(".txt"):
                texts, metadatas = self.load_and_split_txt(content)
            elif file.name.endswith(".xlsx"):
                texts, metadatas = self.load_and_split_table(content)
            elif file.name.endswith(".csv"):
                texts, metadatas = self.load_and_split_csv(content)
            else:
                self.logger.info("Unsupported file format.")
                raise ValueError("Unsupported file format.")

            if not texts:
                return {"message": "No text extracted from the file. Check the file content."}

            filename = file.name
            filelen = len(content)
            self._store_vectors(
                collection_name, texts, metadatas,
                document_name, document_description, filename, filelen,
            )
            self.logger.info(f"File processed successfully. Collection name: {collection_name}")

            return {"message": "File processed successfully."}
        except Exception as e:
            self.logger.error(f"Error processing file: {str(e)}")
            return {"message": f"Error processing file: {str(e)}"}

    def _store_vectors(self, collection_name, texts, metadatas, document_name, document_description, file_name, file_len):
        fields = [
            FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
            FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=384),
            FieldSchema(name="file_name_hash", dtype=DataType.INT64),  # Hash of file name
            FieldSchema(name="document_name_hash", dtype=DataType.INT64),  # Hash of document name
            FieldSchema(name="document_description_hash", dtype=DataType.INT64),  # Hash of document description
            FieldSchema(name="file_meta_hash", dtype=DataType.INT64),
            FieldSchema(name="file_size", dtype=DataType.INT64),
        ]
        schema = CollectionSchema(fields, description="Document embeddings with metadata")
        collection = Collection(name=collection_name, schema=schema)
        # Generate embeddings
        embeddings = [self.embeddings.embed_query(text) for text in texts]

        # Milvus INT64 fields cannot store full MD5 digests, so fold each
        # metadata value into a 12-digit integer hash
        def _hash12(value):
            return int(hashlib.md5(value.encode("utf-8")).hexdigest(), 16) % (10 ** 12)

        file_name_hash = _hash12(file_name)
        document_name_hash = _hash12(document_name or "Unknown Document")
        document_description_hash = _hash12(document_description or "No Description Provided")
        # Serialize the per-chunk metadata list and hash it as a single value
        metadata_string = json.dumps(metadatas, ensure_ascii=False)
        file_meta_hash = _hash12(metadata_string)

        # Prepare data for insertion
        data = [
            embeddings,
            [file_name_hash] * len(embeddings),
            [document_name_hash] * len(embeddings),
            [document_description_hash] * len(embeddings),
            [file_meta_hash] * len(embeddings),
            [file_len or 0] * len(embeddings),
        ]

        # Insert data into the collection; flush makes the rows durable
        collection.insert(data)
        collection.flush()
        # Milvus requires a vector index on the embedding field before load()
        collection.create_index(
            field_name="embedding",
            index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}},
        )
        collection.load()

    def load_and_split_pdf(self, file):
        reader = PdfReader(file)
        texts = []
        metadatas = []
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            if text:
                texts.append(text)
                metadatas.append({"page_number": page_num + 1})
        return texts, metadatas

    def load_and_split_docx(self, file):
        doc = Document(file)
        texts = []
        metadatas = []
        for para_num, paragraph in enumerate(doc.paragraphs):
            if paragraph.text:
                texts.append(paragraph.text)
                metadatas.append({"paragraph_number": para_num + 1})
        return texts, metadatas

    def load_and_split_txt(self, content):
        text = content.decode("utf-8")
        lines = text.split('\n')
        texts = [line for line in lines if line.strip()]
        metadatas = [{}] * len(texts)
        return texts, metadatas

    def load_and_split_table(self, content):
        excel_data = pd.read_excel(io.BytesIO(content), sheet_name=None)
        texts = []
        metadatas = []
        for sheet_name, df in excel_data.items():
            df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)
            df = df.fillna('N/A')
            for _, row in df.iterrows():
                row_dict = row.to_dict()
                # Combine key-value pairs into a string
                row_text = ', '.join([f"{key}: {value}" for key, value in row_dict.items()])
                texts.append(row_text)
                metadatas.append({"sheet_name": sheet_name})
        return texts, metadatas

    def load_and_split_csv(self, content):
        csv_data = pd.read_csv(io.StringIO(content.decode('utf-8')))
        texts = []
        metadatas = []
        csv_data = csv_data.dropna(how='all', axis=0).dropna(how='all', axis=1)
        csv_data = csv_data.fillna('N/A')
        for row_index, row in csv_data.iterrows():
            row_dict = row.to_dict()
            row_text = ', '.join([f"{key}: {value}" for key, value in row_dict.items()])
            texts.append(row_text)
            metadatas.append({"row_index": row_index})
        return texts, metadatas
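
# --- Illustrative usage sketch, assuming a running Milvus instance ---
# This driver is not part of the original module: the HF_TOKEN environment
# variable, the logger name, and the sample file path are assumptions made
# purely for demonstration.
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("file_handler_demo")

    handler = FileHandler(api_token=os.environ.get("HF_TOKEN"), logger=logger)
    # An open binary file exposes .read() and .name, matching what
    # handle_file_upload expects from an uploaded file object.
    with open("example.pdf", "rb") as f:
        result = handler.handle_file_upload(
            f,
            document_name="Example Document",
            document_description="A sample upload for demonstration.",
        )
    logger.info(result["message"])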