CiccioQuinto committed on
Commit
72e442e
·
verified ·
1 Parent(s): 281b857

Upload explore_metadata.ipynb

Browse files
Files changed (1) hide show
  1. explore_metadata.ipynb +343 -0
explore_metadata.ipynb ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "a600d7fc",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json \n",
11
+ "with open('metadata.jsonl', 'r') as f: \n",
12
+ " json_list = list(f)\n",
13
+ "\n",
14
+ "json_QA = []\n",
15
+ "for json_str in json_list: \n",
16
+ " json_data = json.loads(json_str)\n",
17
+ " json_QA.append(json_data)"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": 2,
23
+ "id": "fa5d8eb8",
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "name": "stdout",
28
+ "output_type": "stream",
29
+ "text": [
30
+ "==================================================\n",
31
+ "Task ID: 853c8244-429e-46ca-89f2-addf40dfb2bd\n",
32
+ "Question: In the 2015 Metropolitan Museum of Art exhibition titled after the Chinese zodiac animal of 2015, how many of the \"twelve animals of the Chinese zodiac\" have a hand visible?\n",
33
+ "Level: 2\n",
34
+ "Final Answer: 11\n",
35
+ "Annotator Metadata: \n",
36
+ " ├── Steps: \n",
36
+ " │ ├── 1. Search \"2015 Chinese zodiac animal\" on Google search.\n",
37
+ " │ ├── 2. Note the animal (ram).\n",
38
+ " │ ├── 3. Search \"Metropolitan Museum of Art\" on Google search.\n",
39
+ " │ ├── 4. Open the Metropolitan Museum of Art website.\n",
40
+ " │ ├── 5. Click \"Exhibitions\" under \"Exhibitions and Events\" \n",
41
+ " │ ├── 6. Click \"Past\".\n",
42
+ " │ ├── 7. Set the year to 2015.\n",
43
+ " │ ├── 8. Scroll to find the exhibit mentioning rams and click \"Celebration of the Year of the Ram\".\n",
44
+ " │ ├── 9. Click \"View All Objects\".\n",
45
+ " │ ├── 10. Click \"Twelve animals of the Chinese zodiac\" to open the image.\n",
46
+ " │ ├── 11. Count how many have a visible hand.\n",
47
+ " ├── Number of steps: 11\n",
48
+ " ├── How long did this take?: 10 minutes\n",
49
+ " ├── Tools:\n",
50
+ " │ ├── 1. Web browser\n",
51
+ " │ ├── 2. Search engine\n",
52
+ " │ ├── 3. Image recognition tools\n",
54
+ " └── Number of tools: 3\n",
55
+ "==================================================\n"
56
+ ]
57
+ }
58
+ ],
59
+ "source": [
60
+ "import random\n",
61
+ "random_samples = random.sample(json_QA, 1)\n",
62
+ "for sample in random_samples:\n",
63
+ " print(\"=\" * 50)\n",
64
+ " print(f\"Task ID: {sample['task_id']}\")\n",
65
+ " print(f\"Question: {sample['Question']}\")\n",
66
+ " print(f\"Level: {sample['Level']}\")\n",
67
+ " print(f\"Final Answer: {sample['Final answer']}\")\n",
68
+ " print(f\"Annotator Metadata: \")\n",
69
+ " print(f\" ├── Steps: \")\n",
69
+ " for step in sample['Annotator Metadata']['Steps'].split('\\n'):\n",
70
+ " print(f\" │ ├── {step}\")\n",
71
+ " print(f\" ├── Number of steps: {sample['Annotator Metadata']['Number of steps']}\")\n",
72
+ " print(f\" ├── How long did this take?: {sample['Annotator Metadata']['How long did this take?']}\")\n",
73
+ " print(f\" ├── Tools:\")\n",
74
+ " for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
75
+ " print(f\" │ ├── {tool}\")\n",
77
+ " print(f\" └── Number of tools: {sample['Annotator Metadata']['Number of tools']}\")\n",
78
+ "print(\"=\" * 50)"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": 3,
84
+ "id": "05076516",
85
+ "metadata": {},
86
+ "outputs": [
87
+ {
88
+ "name": "stderr",
89
+ "output_type": "stream",
90
+ "text": [
91
+ "c:\\Users\\franc\\repos\\gaia-agent\\gaia-agent\\.venv\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
92
+ " from .autonotebook import tqdm as notebook_tqdm\n",
93
+ "c:\\Users\\franc\\repos\\gaia-agent\\gaia-agent\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:143: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in C:\\Users\\franc\\.cache\\huggingface\\hub\\models--sentence-transformers--all-mpnet-base-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
94
+ "To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
95
+ " warnings.warn(message)\n",
96
+ "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n"
97
+ ]
98
+ }
99
+ ],
100
+ "source": [
101
+ "import os\n",
102
+ "from dotenv import load_dotenv\n",
103
+ "from langchain_huggingface import HuggingFaceEmbeddings\n",
104
+ "from langchain_community.vectorstores import SupabaseVectorStore\n",
105
+ "from supabase.client import Client, create_client\n",
106
+ "\n",
107
+ "\n",
108
+ "load_dotenv()\n",
109
+ "embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\") # dim=768\n",
110
+ "\n",
111
+ "supabase_url = os.environ.get(\"SUPABASE_URL\")\n",
112
+ "supabase_key = os.environ.get(\"SUPABASE_SERVICE_ROLE_KEY\")\n",
113
+ "supabase: Client = create_client(supabase_url, supabase_key)"
114
+ ]
115
+ },
116
+ {
117
+ "cell_type": "code",
118
+ "execution_count": 5,
119
+ "id": "aa1402e3",
120
+ "metadata": {},
121
+ "outputs": [],
122
+ "source": [
123
+ "from langchain.schema import Document\n",
124
+ "docs = []\n",
125
+ "cnt = 0 \n",
126
+ "for sample in json_QA:\n",
127
+ " content = f\"Question : {sample['Question']}\\n\\nFinal answer : {sample['Final answer']}\"\n",
128
+ " doc = {\n",
129
+ " \"id\" : cnt,\n",
130
+ " \"content\" : content,\n",
131
+ " \"metadata\" : {\n",
132
+ " \"source\" : sample['task_id']\n",
133
+ " },\n",
134
+ " \"embedding\" : embeddings.embed_query(content),\n",
135
+ " }\n",
136
+ " docs.append(doc)\n",
137
+ " cnt += 1\n",
138
+ "\n",
139
+ "# upload the documents to the vector database\n",
140
+ "try:\n",
141
+ " response = (\n",
142
+ " supabase.table(\"documents\")\n",
143
+ " .insert(docs)\n",
144
+ " .execute()\n",
145
+ " )\n",
146
+ "except Exception as exception:\n",
147
+ " print(\"Error inserting data into Supabase:\", exception)\n",
148
+ "\n",
149
+ "# # Save the documents (a list of dict) into a csv file, and manually upload it to Supabase\n",
150
+ "# import pandas as pd\n",
151
+ "# df = pd.DataFrame(docs)\n",
152
+ "# df.to_csv('supabase_docs.csv',index=False)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": 7,
158
+ "id": "9aa7eb5e",
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": [
162
+ "# add items to vector database\n",
163
+ "vector_store = SupabaseVectorStore(\n",
164
+ " client=supabase,\n",
165
+ " embedding= embeddings,\n",
166
+ " table_name=\"documents\",\n",
167
+ " query_name=\"match_documents_langchain\",\n",
168
+ ")\n",
169
+ "retriever = vector_store.as_retriever()"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": 8,
175
+ "id": "9eecafd1",
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": [
179
+ "query = \"On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\"\n",
180
+ "# matched_docs = vector_store.similarity_search(query, k=2)\n",
181
+ "docs = retriever.invoke(query)"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 43,
187
+ "id": "ff917840",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "data": {
192
+ "text/plain": [
193
+ "Document(metadata={'source': '840bfca7-4f7b-481a-8794-c560c340185d'}, page_content='Question : On June 6, 2023, an article by Carolyn Collins Petersen was published in Universe Today. This article mentions a team that produced a paper about their observations, linked at the bottom of the article. Find this paper. Under what NASA award number was the work performed by R. G. Arendt supported by?\\n\\nFinal answer : 80GSFC21M0002')"
194
+ ]
195
+ },
196
+ "execution_count": 43,
197
+ "metadata": {},
198
+ "output_type": "execute_result"
199
+ }
200
+ ],
201
+ "source": [
202
+ "docs[0]"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 44,
208
+ "id": "01c8f337",
209
+ "metadata": {},
210
+ "outputs": [
211
+ {
212
+ "name": "stdout",
213
+ "output_type": "stream",
214
+ "text": [
215
+ "List of tools used in all samples:\n",
216
+ "Total number of tools used: 83\n",
217
+ " ├── web browser: 107\n",
217
+ " ├── image recognition tools (to identify and parse a figure with three axes): 1\n",
218
+ " ├── search engine: 101\n",
219
+ " ├── calculator: 34\n",
220
+ " ├── unlambda compiler (optional): 1\n",
221
+ " ├── a web browser.: 2\n",
222
+ " ├── a search engine.: 2\n",
223
+ " ├── a calculator.: 1\n",
224
+ " ├── microsoft excel: 5\n",
225
+ " ├── google search: 1\n",
226
+ " ├── ne: 9\n",
227
+ " ├── pdf access: 7\n",
228
+ " ├── file handling: 2\n",
229
+ " ├── python: 3\n",
230
+ " ├── image recognition tools: 12\n",
231
+ " ├── jsonld file access: 1\n",
232
+ " ├── video parsing: 1\n",
233
+ " ├── python compiler: 1\n",
234
+ " ├── video recognition tools: 3\n",
235
+ " ├── pdf viewer: 7\n",
236
+ " ├── microsoft excel / google sheets: 3\n",
238
+ " ├── word document access: 1\n",
238
+ " ├── tool to extract text from images: 1\n",
239
+ " ├── a word reversal tool / script: 1\n",
240
+ " ├── counter: 1\n",
241
+ " ├── excel: 3\n",
242
+ " ├── image recognition: 5\n",
243
+ " ├── color recognition: 3\n",
244
+ " ├── excel file access: 3\n",
245
+ " ├── xml file access: 1\n",
246
+ " ├── access to the internet archive, web.archive.org: 1\n",
247
+ " ├── text processing/diff tool: 1\n",
248
+ " ├── gif parsing tools: 1\n",
249
+ " ├── a web browser: 7\n",
250
+ " ├── a search engine: 7\n",
251
+ " ├── a speech-to-text tool: 2\n",
252
+ " ├── code/data analysis tools: 1\n",
253
+ " ├── audio capability: 2\n",
254
+ " ├── pdf reader: 1\n",
255
+ " ├── markdown: 1\n",
256
+ " ├── a calculator: 5\n",
257
+ " ├── access to wikipedia: 3\n",
258
+ " ├── image recognition/ocr: 3\n",
259
+ " ├── google translate access: 1\n",
260
+ " ├── ocr: 4\n",
261
+ " ├── bass note data: 1\n",
262
+ " ├── text editor: 1\n",
263
+ " ├── xlsx file access: 1\n",
264
+ " ├── powerpoint viewer: 1\n",
265
+ " ├── csv file access: 1\n",
266
+ " ├── calculator (or use excel): 1\n",
267
+ " ├── computer algebra system: 1\n",
269
+ " ├── video processing software: 1\n",
269
+ " ├── audio processing software: 1\n",
270
+ " ├── computer vision: 1\n",
271
+ " ├── google maps: 1\n",
272
+ " ├── access to excel files: 1\n",
273
+ " ├── calculator (or ability to count): 1\n",
274
+ " ├── a file interface: 3\n",
275
+ " ├── a python ide: 1\n",
276
+ " ├── spreadsheet editor: 1\n",
277
+ " ├── tools required: 1\n",
278
+ " ├── b browser: 1\n",
279
+ " ├── image recognition and processing tools: 1\n",
280
+ " ├── computer vision or ocr: 1\n",
281
+ " ├── c++ compiler: 1\n",
282
+ " ├── access to google maps: 1\n",
283
+ " ├── youtube player: 1\n",
284
+ " ├── natural language processor: 1\n",
285
+ " ├── graph interaction tools: 1\n",
286
+ " ├── bablyonian cuniform -> arabic legend: 1\n",
287
+ " ├── access to youtube: 1\n",
288
+ " ├── image search tools: 1\n",
289
+ " ├── calculator or counting function: 1\n",
290
+ " ├── a speech-to-text audio processing tool: 1\n",
291
+ " ├── access to academic journal websites: 1\n",
292
+ " ├── pdf reader/extracter: 1\n",
293
+ " ├── rubik's cube model: 1\n",
294
+ " ├── wikipedia: 1\n",
295
+ " ├── video capability: 1\n",
296
+ " ├── image processing tools: 1\n",
297
+ " ├── age recognition software: 1\n",
298
+ " ├── youtube: 1\n"
300
+ ]
301
+ }
302
+ ],
303
+ "source": [
304
+ "# list of the tools used in all the samples\n",
305
+ "from collections import Counter, OrderedDict\n",
306
+ "\n",
307
+ "tools = []\n",
308
+ "for sample in json_QA:\n",
309
+ " for tool in sample['Annotator Metadata']['Tools'].split('\\n'):\n",
310
+ " tool = tool[2:].strip().lower()\n",
311
+ " if tool.startswith(\"(\"):\n",
312
+ " tool = tool[11:].strip()\n",
313
+ " tools.append(tool)\n",
314
+ "tools_counter = OrderedDict(Counter(tools))\n",
315
+ "print(\"List of tools used in all samples:\")\n",
316
+ "print(\"Total number of tools used:\", len(tools_counter))\n",
317
+ "for tool, count in tools_counter.items():\n",
318
+ " print(f\" ├── {tool}: {count}\")"
319
+ ]
320
+ }
321
+ ],
322
+ "metadata": {
323
+ "kernelspec": {
324
+ "display_name": ".venv",
325
+ "language": "python",
326
+ "name": "python3"
327
+ },
328
+ "language_info": {
329
+ "codemirror_mode": {
330
+ "name": "ipython",
331
+ "version": 3
332
+ },
333
+ "file_extension": ".py",
334
+ "mimetype": "text/x-python",
335
+ "name": "python",
336
+ "nbconvert_exporter": "python",
337
+ "pygments_lexer": "ipython3",
338
+ "version": "3.13.1"
339
+ }
340
+ },
341
+ "nbformat": 4,
342
+ "nbformat_minor": 5
343
+ }