Aiswarya Sankar committed
Commit be672a0
1 Parent(s): 5c16594

Update app

Files changed (1): app.py (+52, -52)
app.py CHANGED
@@ -116,59 +116,51 @@ def index_repo(textbox: str, dropdown: str) -> Response:
     dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
     invalid_dataset_path = True
 
+    # try:
+    #     try:
+    #         db = DeepLake(dataset_path=dataset_path,
+    #             embedding_function=embeddings,
+    #             token=os.environ['ACTIVELOOP_TOKEN'],
+    #             read_only=True,
+    #             num_workers=12,
+    #             runtime = {"tensor_db": True}
+    #         )
+    #     except Exception as e:
+    #         print("Failed to read: " + str(e))
+    #         if "scheduled for deletion" in str(e):
+    #             dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
+    #             invalid_dataset_path = True
+
+    # if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
+    #     print("Dataset doesn't exist, fetching data")
     try:
-        try:
-            db = DeepLake(dataset_path=dataset_path,
-                embedding_function=embeddings,
-                token=os.environ['ACTIVELOOP_TOKEN'],
-                read_only=True,
-                num_workers=12,
-                runtime = {"tensor_db": True}
-            )
-        except Exception as e:
-            print("Failed to read: " + str(e))
-            if "scheduled for deletion" in str(e):
-                dataset_path = f"hub://{activeloop_username}/" + pathName + str(random.randint(1,100))
-                invalid_dataset_path = True
-
-        if invalid_dataset_path or db is None or len(db.vectorstore.dataset) == 0:
-            print("Dataset doesn't exist, fetching data")
-            try:
-                docs = []
-                for dirpath, dirnames, filenames in os.walk(root_dir):
-                    for file in filenames:
-                        print(file)
-                        try:
-                            loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
-                            docs.extend(loader.load_and_split())
-                        except Exception as e:
-                            print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
-                            pass
-
-                activeloop_username = "aiswaryas"
-                text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-                texts = text_splitter.split_documents(docs)
-
-                db = DeepLake(dataset_path=dataset_path,
-                    embedding_function=embeddings,
-                    token=os.environ['ACTIVELOOP_TOKEN'],
-                    read_only=False,
-                    num_workers=12,
-                    runtime = {"tensor_db": True}
-                )
-                # Do this in chunks to avoid hitting the ratelimit immediately
-                for i in range(0, len(texts), 500):
-                    print("Adding documents " + str(i))
-                    db.add_documents(texts[i:i+500])
-                    time.sleep(.5)
-
-            except Exception as e:
-                return Response(
-                    result= "Failed to index github repo",
-                    repo="",
-                    error=str(e),
-                    stdout="",
-                )
+        docs = []
+        for dirpath, dirnames, filenames in os.walk(root_dir):
+            for file in filenames:
+                print(file)
+                try:
+                    loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
+                    docs.extend(loader.load_and_split())
+                except Exception as e:
+                    print("Exception: " + str(e) + "| File: " + os.path.join(dirpath, file))
+                    pass
+
+        activeloop_username = "aiswaryas"
+        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+        texts = text_splitter.split_documents(docs)
+
+        db = DeepLake(dataset_path=dataset_path,
+            embedding_function=embeddings,
+            token=os.environ['ACTIVELOOP_TOKEN'],
+            read_only=False,
+            num_workers=12,
+            runtime = {"tensor_db": True}
+        )
+        # Do this in chunks to avoid hitting the ratelimit immediately
+        for i in range(0, len(texts), 500):
+            print("Adding documents " + str(i))
+            db.add_documents(texts[i:i+500])
+            time.sleep(.5)
 
     except Exception as e:
         return Response(
@@ -178,6 +170,14 @@ def index_repo(textbox: str, dropdown: str) -> Response:
             stdout="",
         )
 
+    # except Exception as e:
+    #     return Response(
+    #         result= "Failed to index github repo",
+    #         repo="",
+    #         error=str(e),
+    #         stdout="",
+    #     )
+
    vector_db_url.value = dataset_path

    return {
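
For reference, the batched upload loop that this commit keeps (adding 500 chunks at a time with a short pause, per the inline comment about the rate limit) can be written as a standalone helper. This is a minimal sketch, assuming a LangChain-style vector store such as DeepLake whose add_documents method embeds and stores a list of Document chunks; the name add_in_batches and its parameters are illustrative and not part of the commit.

    import time

    def add_in_batches(db, texts, batch_size=500, pause_s=0.5):
        # Add document chunks in fixed-size batches, sleeping between
        # batches so the embedding endpoint's rate limit is not hit
        # immediately (the same pattern as the loop kept by this commit).
        for i in range(0, len(texts), batch_size):
            print("Adding documents " + str(i))
            db.add_documents(texts[i:i + batch_size])
            time.sleep(pause_s)

With the values used in the commit, add_in_batches(db, texts) reproduces the kept loop; raising pause_s or lowering batch_size trades throughput for more headroom under the provider's rate limit.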