Commit
·
8822f57
1
Parent(s):
dcced35
Add data download
Browse files
Signed-off-by: Aivin V. Solatorio <[email protected]>
- services.py +18 -2
services.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import json
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
import torch
|
| 4 |
import httpx
|
|
@@ -8,6 +9,7 @@ from typing import Optional, Any
|
|
| 8 |
from sentence_transformers import SentenceTransformer
|
| 9 |
|
| 10 |
from pydantic import BaseModel, Field
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def get_best_torch_device():
|
|
@@ -26,9 +28,23 @@ device = get_best_torch_device()
|
|
| 26 |
|
| 27 |
|
| 28 |
# Load the basic WDI metadata and vectors.
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
df = pd.read_json(wdi_data_vec_fpath)
|
| 33 |
|
| 34 |
# Make it easy to index based on the idno
|
|
|
|
| 1 |
import json
|
| 2 |
+
import os
|
| 3 |
import pandas as pd
|
| 4 |
import torch
|
| 5 |
import httpx
|
|
|
|
| 9 |
from sentence_transformers import SentenceTransformer
|
| 10 |
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
+
from urllib.request import urlretrieve
|
| 13 |
|
| 14 |
|
| 15 |
def get_best_torch_device():
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
# Load the basic WDI metadata and vectors.
|
| 31 |
+
|
| 32 |
+
EMBEDDING_FNAME = "avsolatorio__GIST-small-Embedding-v0__005__indicator_embeddings.json"
|
| 33 |
+
EMBEDDING_SOURCE = (
|
| 34 |
+
f"https://raw.githubusercontent.com/"
|
| 35 |
+
f"avsolatorio/ai-for-data-blog/refs/heads/main/semantic-search/data/{EMBEDDING_FNAME}"
|
| 36 |
)
|
| 37 |
+
wdi_data_vec_fpath = os.path.join("data", EMBEDDING_FNAME)
|
| 38 |
+
|
| 39 |
+
os.makedirs(os.path.dirname(wdi_data_vec_fpath), exist_ok=True)
|
| 40 |
+
|
| 41 |
+
if not os.path.exists(wdi_data_vec_fpath):
|
| 42 |
+
print(f"Downloading {EMBEDDING_FNAME} to {wdi_data_vec_fpath}...")
|
| 43 |
+
urlretrieve(EMBEDDING_SOURCE, wdi_data_vec_fpath)
|
| 44 |
+
print("Download complete.")
|
| 45 |
+
else:
|
| 46 |
+
print(f"File already exists at {wdi_data_vec_fpath}.")
|
| 47 |
+
|
| 48 |
df = pd.read_json(wdi_data_vec_fpath)
|
| 49 |
|
| 50 |
# Make it easy to index based on the idno
|