Update app.py
app.py
CHANGED
@@ -1,113 +1,8 @@
 import gradio as gr
 import os
 from groq import Groq
-
-############ TESTING ############
 import pandas as pd
 from datasets import Dataset
-
-# Define the dataset schema
-test_dataset_df = pd.DataFrame(columns=['id', 'title', 'content', 'prechunk_id', 'postchunk_id', 'arxiv_id', 'references'])
-
-# Populate the dataset with examples
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '1',
-    'title': 'Best restaurants in queens',
-    'content': 'I personally like to go to the J-Pan Chicken, they have fried chicken and amazing bubble tea.',
-    'prechunk_id': '',
-    'postchunk_id': '2',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:9012.3456', 'arXiv:7890.1234']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '2',
-    'title': 'Best restaurants in queens',
-    'content': 'if you like asian food, flushing is second to none.',
-    'prechunk_id': '1',
-    'postchunk_id': '3',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:6543.2109', 'arXiv:3210.9876']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '3',
-    'title': 'Best restaurants in queens',
-    'content': 'you have to try the ziti from ECC',
-    'prechunk_id': '2',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '6',
-    'title': 'Best restaurants in queens',
-    'content': 'theres a good halal cart on Wub Street, they give extra sticky creamy white sauce',
-    'prechunk_id': '',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '4',
-    'title': 'Spending a saturday in queens; what to do?',
-    'content': 'theres a hidden gem called The Lounge, you can play poker and blackjack and darts',
-    'prechunk_id': '',
-    'postchunk_id': '5',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '5',
-    'title': 'Spending a saturday in queens; what to do?',
-    'content': 'if its a nice day, basketball at Non-non-Fiction Park is always fun',
-    'prechunk_id': '',
-    'postchunk_id': '6',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '7',
-    'title': 'visiting queens for the weekend, how to get around?',
-    'content': 'nothing beats the subway, even with delays its the fastest option. you can transfer between the bus and subway with one swipe',
-    'prechunk_id': '',
-    'postchunk_id': '8',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '8',
-    'title': 'visiting queens for the weekend, how to get around?',
-    'content': 'if youre going to the bar, its honestly worth ubering there. MTA while drunk isnt something id recommend.',
-    'prechunk_id': '7',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-# Convert the DataFrame to a Hugging Face Dataset object
-test_dataset = Dataset.from_pandas(test_dataset_df)
-
-data = test_dataset
-
-data = data.map(lambda x: {
-    "id": x["id"],
-    "metadata": {
-        "title": x["title"],
-        "content": x["content"],
-    }
-})
-# drop uneeded columns
-data = data.remove_columns([
-    "title", "content", "prechunk_id",
-    "postchunk_id", "arxiv_id", "references"
-])
 from semantic_router.encoders import HuggingFaceEncoder
 
 encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")
@@ -159,22 +54,6 @@ time.sleep(1)
 # view index stats
 index.describe_index_stats()
 
-from tqdm.auto import tqdm
-
-batch_size = 2  # how many embeddings we create and insert at once
-
-for i in tqdm(range(0, len(data), batch_size)):
-    # find end of batch
-    i_end = min(len(data), i+batch_size)
-    # create batch
-    batch = data[i:i_end]
-    # create embeddings
-    chunks = [f'{x["title"]}: {x["content"]}' for x in batch["metadata"]]
-    embeds = encoder(chunks)
-    assert len(embeds) == (i_end-i)
-    to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
-    # upsert to Pinecone
-    index.upsert(vectors=to_upsert)
 
 def get_docs(query: str, top_k: int) -> list[str]:
     # encode query
@@ -182,7 +61,7 @@ def get_docs(query: str, top_k: int) -> list[str]:
     # search pinecone index
     res = index.query(vector=xq, top_k=top_k, include_metadata=True)
     # get doc text
-    docs = [x["metadata"]['
+    docs = [x["metadata"]['content_snippet'] for x in res["matches"]]
     return docs
 
 from groq import Groq
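For reference, here is a minimal sketch of how the retained retrieval path reads after this commit, assembled from the hunks above. The query-encoding line (old line 181 / new line 60) is hidden by the diff context, so the xq = encoder([query])[0] step is an assumption modelled on how encoder was called in the removed upsert loop; index is assumed to be the Pinecone index created earlier in app.py, and content_snippet is the metadata key introduced by this change.

# Sketch only, not the exact file contents.
# Assumptions: `index` is the Pinecone index built earlier in app.py, and the query
# is embedded the same way the removed upsert loop embedded chunks (encoder(list) -> list).
from semantic_router.encoders import HuggingFaceEncoder

encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")

def get_docs(query: str, top_k: int) -> list[str]:
    # encode query (assumed shape; this line is not shown in the diff)
    xq = encoder([query])[0]
    # search pinecone index
    res = index.query(vector=xq, top_k=top_k, include_metadata=True)
    # get doc text: this commit reads the 'content_snippet' metadata field
    docs = [x["metadata"]["content_snippet"] for x in res["matches"]]
    return docs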