OuroborosM committed on
Commit
591e68e
·
1 Parent(s): e85fc80

Add file upload func

Browse files
Files changed (1) hide show
  1. app.py +144 -4
app.py CHANGED
@@ -23,6 +23,139 @@ from pinecone.core.client.configuration import Configuration as OpenApiConfigura
23
  import gradio as gr
24
  import time
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  class DB_Search(BaseTool):
27
  name = "Vector Database Search"
28
  description = "This is the internal database to search information firstly. If information is found, it is trustful."
@@ -183,11 +316,18 @@ def chathmi2(message, history):
183
  # chatbot = gr.Chatbot().style(color_map =("blue", "pink"))
184
  # chatbot = gr.Chatbot(color_map =("blue", "pink"))
185
 
186
- demo = gr.ChatInterface(
187
- chathmi2,
188
- title="STLA BABY - YOUR FRIENDLY GUIDE ",
189
- description= "v0.2: Powered by MECH Core Team",
 
 
 
 
 
190
  )
 
 
191
 
192
  # demo = gr.Interface(
193
  # chathmi,
 
23
  import gradio as gr
24
  import time
25
 
26
+ import glob
27
+ from typing import List
28
+ from multiprocessing import Pool
29
+ from tqdm import tqdm
30
+
31
+ from langchain.document_loaders import (
32
+ CSVLoader,
33
+ EverNoteLoader,
34
+ PyMuPDFLoader,
35
+ TextLoader,
36
+ UnstructuredEmailLoader,
37
+ UnstructuredEPubLoader,
38
+ UnstructuredHTMLLoader,
39
+ UnstructuredMarkdownLoader,
40
+ UnstructuredODTLoader,
41
+ UnstructuredPowerPointLoader,
42
+ UnstructuredWordDocumentLoader,
43
+ )
44
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
45
+ from langchain.docstore.document import Document
46
+
47
+ # Custom document loaders
48
class MyElmLoader(UnstructuredEmailLoader):
    """Email loader that retries with the text/plain part when HTML is missing."""

    def load(self) -> List[Document]:
        """Load the email at ``self.file_path``.

        The parent loader is tried first. If it raises a ``ValueError`` whose
        message says the text/html content was not found, the loader is
        switched to ``content_source="text/plain"`` and tried once more.

        Returns:
            The documents produced by ``UnstructuredEmailLoader.load``.

        Raises:
            Exception: Any failure is re-raised with the file path prefixed to
                the message. The original exception type is kept whenever it
                can be built from a single message; otherwise ``RuntimeError``
                is used. The original exception is always chained as the cause.
        """
        try:
            try:
                docs = super().load()
            except ValueError as err:
                if "text/html content not found in email" not in str(err):
                    raise
                # No HTML part: retry using the plain-text part instead.
                self.unstructured_kwargs["content_source"] = "text/plain"
                docs = super().load()
        except Exception as err:
            message = f"{self.file_path}: {err}"
            try:
                wrapped = type(err)(message)
            except Exception:
                # Some exception classes (e.g. UnicodeDecodeError) cannot be
                # constructed from one string; without this fallback the
                # secondary TypeError would hide the real failure.
                wrapped = RuntimeError(message)
            raise wrapped from err

        return docs
68
+
69
# Maps a file extension (with leading dot) to a pair of
# (loader class, keyword arguments passed to the loader constructor).
# Iteration order is the order in which load_documents() globs for files.
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    # Word documents
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    # Emails go through the wrapper that falls back to text/plain
    ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    # PowerPoint presentations
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}
86
+
87
# Directory scanned by process_documents() / load_documents().
source_directory = "Upload Files"
# Module-level placeholder; the functions below take their own file_path argument.
file_path = ""
# Splitter settings handed to RecursiveCharacterTextSplitter.
chunk_size = 500
# NOTE(review): overlap is 60% of chunk_size — confirm this is intentional.
chunk_overlap = 300
91
+
92
def load_single_document(file_path: str) -> List[Document]:
    """Load one file with the loader registered for its extension.

    The extension is taken from the final path component and lower-cased, so
    ``report.PDF`` resolves to the ``.pdf`` loader and a dot inside a
    directory name is never mistaken for an extension.

    Args:
        file_path: Path of the file to load.

    Returns:
        The documents returned by the selected loader's ``load()``.

    Raises:
        ValueError: If the extension has no entry in ``LOADER_MAPPING``.
    """
    import os  # local import keeps this function self-contained

    ext = os.path.splitext(file_path)[1].lower()
    loader_entry = LOADER_MAPPING.get(ext)
    if loader_entry is None:
        raise ValueError(f"Unsupported file extension '{ext}'")

    loader_class, loader_args = loader_entry
    return loader_class(file_path, **loader_args).load()
100
+
101
+
102
def load_documents(source_dir: str, ignored_files: "List[str] | None" = None) -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files

    Args:
        source_dir: Directory searched recursively for every extension in
            ``LOADER_MAPPING``.
        ignored_files: Paths to skip, compared against the paths returned by
            ``glob``. ``None`` (the default) means nothing is skipped.

    Returns:
        All documents from all matching files. Files are loaded in a process
        pool via ``imap_unordered``, so the order follows completion, not
        discovery.
    """
    # A set gives O(1) membership tests; `or ()` also covers None.
    ignored = set(ignored_files or ())

    candidate_files = []
    for ext in LOADER_MAPPING:
        pattern = os.path.join(source_dir, f"**/*{ext}")
        candidate_files.extend(glob.glob(pattern, recursive=True))
    filtered_files = [path for path in candidate_files if path not in ignored]

    results = []
    with Pool(processes=os.cpu_count()) as pool:
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for docs in pool.imap_unordered(load_single_document, filtered_files):
                results.extend(docs)
                pbar.update()

    return results
121
+
122
def process_documents(ignored_files: "List[str] | None" = None) -> List[Document]:
    """
    Load documents and split in chunks

    Args:
        ignored_files: Paths that ``load_documents`` should skip. ``None``
            (the default) means nothing is skipped.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter`` using the
        module-level ``chunk_size`` and ``chunk_overlap``.

    Raises:
        SystemExit: With status 0 when no documents were found.
    """
    # Always hand a real list downstream, whatever default was used here.
    ignored = [] if ignored_files is None else ignored_files

    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory, ignored)
    if not documents:
        print("No new documents to load")
        # Same effect as exit(0), without relying on the `site`-injected
        # helper that is meant for interactive sessions.
        raise SystemExit(0)

    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts
136
+
137
def UpdateDb(file_path: str):
    """Load one file, split it into chunks and push the chunks to Pinecone.

    Rebinds the module-level ``vectordb_p`` to the store returned by
    ``Pinecone.from_documents`` for the ``"stla-baby"`` index, then prints the
    index statistics.

    Args:
        file_path: Path of the file to ingest; its extension must be
            supported by ``load_single_document``.

    Returns:
        None. When the file yields no documents or no chunks, the function
        returns early and leaves ``vectordb_p`` untouched.
    """
    global vectordb_p

    documents = load_single_document(file_path)
    if not documents:
        print("No new documents to load")
        # Return instead of exit(0): an empty upload must not shut down the app.
        return
    print(f"Loaded {len(documents)} new documents from {file_path}")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    split_docs = text_splitter.split_documents(documents)
    print(f"Split into {len(split_docs)} chunks of text (max. {chunk_size} tokens each)")
    if not split_docs:
        # Nothing to embed; indexing the last chunk below would raise IndexError.
        print("No text chunks to index")
        return

    # Show the last chunk as a quick sanity check of the split.
    print(split_docs[-1])
    print("Creating embeddings. May take some minutes...")
    # NOTE(review): the index is not cleared first, so this presumably adds to
    # whatever is already stored — confirm that is intended.
    vectordb_p = Pinecone.from_documents(split_docs, embeddings, index_name="stla-baby")
    print("Pinecone Updated Done")
    print(index.describe_index_stats())
157
+
158
+
159
  class DB_Search(BaseTool):
160
  name = "Vector Database Search"
161
  description = "This is the internal database to search information firstly. If information is found, it is trustful."
 
316
  # chatbot = gr.Chatbot().style(color_map =("blue", "pink"))
317
  # chatbot = gr.Chatbot(color_map =("blue", "pink"))
318
 
319
def upload_file(files):
    """Upload-button callback: print the received ``files`` value and return None."""
    print(files)
322
+
323
# Build the UI inside a Blocks container so the upload button can live
# alongside the chat interface.
with gr.Blocks() as demo:
    gr.ChatInterface(
        chathmi2,
        title="STLA BABY - YOUR FRIENDLY GUIDE ",
        description="v0.2: Powered by MECH Core Team",
    )
    upload_button = gr.UploadButton("Upload File", file_count="multiple")
    # The button is its own input: the uploaded files are passed to upload_file.
    upload_button.upload(fn=upload_file, inputs=upload_button)
331
 
332
  # demo = gr.Interface(
333
  # chathmi,