mt3842ml committed · Commit c5874a9 · verified · 1 Parent(s): 04f4638

Update app.py

Files changed (1):
  1. app.py +1 -122

app.py CHANGED
@@ -1,113 +1,8 @@
 import gradio as gr
 import os
 from groq import Groq
-
-############ TESTING ############
 import pandas as pd
 from datasets import Dataset
-
-# Define the dataset schema
-test_dataset_df = pd.DataFrame(columns=['id', 'title', 'content', 'prechunk_id', 'postchunk_id', 'arxiv_id', 'references'])
-
-# Populate the dataset with examples
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '1',
-    'title': 'Best restaurants in queens',
-    'content': 'I personally like to go to the J-Pan Chicken, they have fried chicken and amazing bubble tea.',
-    'prechunk_id': '',
-    'postchunk_id': '2',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:9012.3456', 'arXiv:7890.1234']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '2',
-    'title': 'Best restaurants in queens',
-    'content': 'if you like asian food, flushing is second to none.',
-    'prechunk_id': '1',
-    'postchunk_id': '3',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:6543.2109', 'arXiv:3210.9876']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '3',
-    'title': 'Best restaurants in queens',
-    'content': 'you have to try the ziti from ECC',
-    'prechunk_id': '2',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '6',
-    'title': 'Best restaurants in queens',
-    'content': 'theres a good halal cart on Wub Street, they give extra sticky creamy white sauce',
-    'prechunk_id': '',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '4',
-    'title': 'Spending a saturday in queens; what to do?',
-    'content': 'theres a hidden gem called The Lounge, you can play poker and blackjack and darts',
-    'prechunk_id': '',
-    'postchunk_id': '5',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '5',
-    'title': 'Spending a saturday in queens; what to do?',
-    'content': 'if its a nice day, basketball at Non-non-Fiction Park is always fun',
-    'prechunk_id': '',
-    'postchunk_id': '6',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '7',
-    'title': 'visiting queens for the weekend, how to get around?',
-    'content': 'nothing beats the subway, even with delays its the fastest option. you can transfer between the bus and subway with one swipe',
-    'prechunk_id': '',
-    'postchunk_id': '8',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-test_dataset_df = pd.concat([test_dataset_df, pd.DataFrame([{
-    'id': '8',
-    'title': 'visiting queens for the weekend, how to get around?',
-    'content': 'if youre going to the bar, its honestly worth ubering there. MTA while drunk isnt something id recommend.',
-    'prechunk_id': '7',
-    'postchunk_id': '',
-    'arxiv_id': '2401.04088',
-    'references': ['arXiv:1234.5678', 'arXiv:9012.3456']
-}])], ignore_index=True)
-
-# Convert the DataFrame to a Hugging Face Dataset object
-test_dataset = Dataset.from_pandas(test_dataset_df)
-
-data = test_dataset
-
-data = data.map(lambda x: {
-    "id": x["id"],
-    "metadata": {
-        "title": x["title"],
-        "content": x["content"],
-    }
-})
-# drop unneeded columns
-data = data.remove_columns([
-    "title", "content", "prechunk_id",
-    "postchunk_id", "arxiv_id", "references"
-])
-
 from semantic_router.encoders import HuggingFaceEncoder
 
 encoder = HuggingFaceEncoder(name="dwzhu/e5-base-4k")
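
An editorial aside on the deleted scaffolding: growing a DataFrame through repeated pd.concat calls is a well-known anti-pattern, since each call copies the whole frame. The same seed data can be built in one pass; a minimal sketch under the same schema (rows abridged):

import pandas as pd
from datasets import Dataset

# Build every seed row up front instead of concatenating one-row frames.
rows = [
    {'id': '1', 'title': 'Best restaurants in queens',
     'content': 'I personally like to go to the J-Pan Chicken, they have fried chicken and amazing bubble tea.',
     'prechunk_id': '', 'postchunk_id': '2',
     'arxiv_id': '2401.04088',
     'references': ['arXiv:9012.3456', 'arXiv:7890.1234']},
    # ...the remaining seed rows follow the same shape...
]
test_dataset = Dataset.from_pandas(pd.DataFrame(rows))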
@@ -159,22 +54,6 @@ time.sleep(1)
 # view index stats
 index.describe_index_stats()
 
-from tqdm.auto import tqdm
-
-batch_size = 2  # how many embeddings we create and insert at once
-
-for i in tqdm(range(0, len(data), batch_size)):
-    # find end of batch
-    i_end = min(len(data), i+batch_size)
-    # create batch
-    batch = data[i:i_end]
-    # create embeddings
-    chunks = [f'{x["title"]}: {x["content"]}' for x in batch["metadata"]]
-    embeds = encoder(chunks)
-    assert len(embeds) == (i_end-i)
-    to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
-    # upsert to Pinecone
-    index.upsert(vectors=to_upsert)
 
 def get_docs(query: str, top_k: int) -> list[str]:
     # encode query
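
The deleted upsert loop leaned on a datasets behavior worth spelling out: slicing a datasets.Dataset returns a dict of columns rather than a list of rows. A small illustration, assuming the mapped data object from the deleted code above:

batch = data[0:2]
# batch is a dict of parallel column lists, e.g.:
#   batch["id"]        -> ['1', '2']
#   batch["metadata"]  -> [{'title': ..., 'content': ...}, {'title': ..., 'content': ...}]
# That is why the loop could zip ids, embeddings, and metadata directly:
#   index.upsert(vectors=list(zip(batch["id"], embeds, batch["metadata"])))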
@@ -182,7 +61,7 @@ def get_docs(query: str, top_k: int) -> list[str]:
     # search pinecone index
     res = index.query(vector=xq, top_k=top_k, include_metadata=True)
     # get doc text
-    docs = [x["metadata"]['content'] for x in res["matches"]]
+    docs = [x["metadata"]['content_snippet'] for x in res["matches"]]
     return docs
 
 from groq import Groq
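
For context on where get_docs feeds in: the retrieved snippets are typically stitched into a chat prompt. A sketch of that wiring with the Groq client; the model name and prompt wording here are illustrative assumptions, not part of this commit:

client = Groq(api_key=os.environ["GROQ_API_KEY"])

docs = get_docs("best restaurants in queens", top_k=3)
completion = client.chat.completions.create(
    model="llama3-8b-8192",  # placeholder model name, not from this commit
    messages=[
        {"role": "system",
         "content": "Answer using only this context:\n" + "\n".join(docs)},
        {"role": "user", "content": "Where should I eat in Queens?"},
    ],
)
print(completion.choices[0].message.content)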
 
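One caveat on the query-side switch to 'content_snippet': the lookup only resolves if the vectors were upserted with that key in their metadata; otherwise it raises KeyError. A defensive variant (a sketch, not code from this commit) that falls back to the old key:

docs = [
    x["metadata"].get("content_snippet", x["metadata"].get("content", ""))
    for x in res["matches"]
]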