Spaces:
Sleeping
Sleeping
traopia
committed on
Commit
·
00c29b3
1
Parent(s):
767fe1d
no ollama
Browse files- src/__pycache__/generate_queries_alternative.cpython-312.pyc +0 -0
- src/__pycache__/questions_queries.cpython-312.pyc +0 -0
- src/__pycache__/sparql_query_wikibase.cpython-312.pyc +0 -0
- src/__pycache__/use_llm.cpython-312.pyc +0 -0
- src/__pycache__/visual_qa.cpython-312.pyc +0 -0
- src/generate_queries_alternative.py +5 -53
- src/sparql_query_wikibase.py +1 -7
- src/use_llm.py +15 -1
- src/wikibase_helpers.py +86 -0
src/__pycache__/generate_queries_alternative.cpython-312.pyc
DELETED
Binary file (28.2 kB)
|
|
src/__pycache__/questions_queries.cpython-312.pyc
DELETED
Binary file (36.3 kB)
|
|
src/__pycache__/sparql_query_wikibase.cpython-312.pyc
DELETED
Binary file (8.35 kB)
|
|
src/__pycache__/use_llm.cpython-312.pyc
DELETED
Binary file (1.5 kB)
|
|
src/__pycache__/visual_qa.cpython-312.pyc
DELETED
Binary file (5.23 kB)
|
|
src/generate_queries_alternative.py
CHANGED
@@ -1,11 +1,9 @@
|
|
1 |
-
from src.use_llm import main_generate
|
2 |
|
3 |
from src.questions_queries import *
|
4 |
import time
|
5 |
-
import ollama
|
6 |
import uuid
|
7 |
import chromadb
|
8 |
-
import openai
|
9 |
import spacy
|
10 |
import numpy as np
|
11 |
nlp = spacy.load("en_core_web_sm")
|
@@ -174,52 +172,6 @@ def capitalize_sentences(sentences):
|
|
174 |
|
175 |
return capitalized_sentences
|
176 |
|
177 |
-
def similarity_question(question, questions_queries_dictionary, collection, n_results=5, threshold=0.15):
|
178 |
-
nlp = spacy.load("en_core_web_sm") # Load spaCy model for entity recognition
|
179 |
-
|
180 |
-
original_documents = [questions_queries_dictionary[i]["question"] for i in range(len(questions_queries_dictionary))]
|
181 |
-
#original_documents = capitalize_sentences(original_documents)
|
182 |
-
masked_documents = [mask_entities(q, nlp) for q in original_documents]
|
183 |
-
#masked_documents = list(set(masked_documents))
|
184 |
-
# Store each document in the vector embedding database
|
185 |
-
for i, d in enumerate(masked_documents):
|
186 |
-
response = ollama.embed(model="mxbai-embed-large", input=d)
|
187 |
-
embeddings = response["embeddings"]
|
188 |
-
collection.add(
|
189 |
-
ids=[str(i)],
|
190 |
-
embeddings=embeddings,
|
191 |
-
documents=[d]
|
192 |
-
)
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
# Compute the embedding for the input question
|
197 |
-
masked_question = mask_entities(question, nlp)
|
198 |
-
response = ollama.embed(model="mxbai-embed-large", input=masked_question)
|
199 |
-
results = collection.query(
|
200 |
-
query_embeddings=[response["embeddings"][0]],
|
201 |
-
n_results=n_results
|
202 |
-
)
|
203 |
-
triples = []
|
204 |
-
for i in range(len(results['documents'][0])):
|
205 |
-
masked_similar_question = results['documents'][0][i]
|
206 |
-
distance = results['distances'][0][i]
|
207 |
-
print(distance)
|
208 |
-
paraphrase = distance < threshold
|
209 |
-
|
210 |
-
# Find the corresponding original question
|
211 |
-
index_similar_query = masked_documents.index(masked_similar_question)
|
212 |
-
original_similar_question = original_documents[index_similar_query]
|
213 |
-
similar_query = questions_queries_dictionary[index_similar_query]["query"]
|
214 |
-
|
215 |
-
if paraphrase and "[ENTITY]" in masked_similar_question and "[ENTITY]" in masked_question:
|
216 |
-
to_do_query = replace_entity(original_similar_question, question, similar_query)
|
217 |
-
else:
|
218 |
-
to_do_query = None
|
219 |
-
|
220 |
-
triples.append((original_similar_question, similar_query, to_do_query))
|
221 |
-
|
222 |
-
return triples
|
223 |
|
224 |
|
225 |
|
@@ -238,7 +190,7 @@ def similarity_question(question, questions_queries_dictionary, collection, n_re
|
|
238 |
|
239 |
# Store each unique document in the vector embedding database
|
240 |
for i, d in enumerate(masked_documents):
|
241 |
-
response =
|
242 |
embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
|
243 |
|
244 |
# Check if embedding is unique
|
@@ -254,7 +206,7 @@ def similarity_question(question, questions_queries_dictionary, collection, n_re
|
|
254 |
|
255 |
# Compute the embedding for the input question
|
256 |
masked_question = mask_entities(question, nlp)
|
257 |
-
response =
|
258 |
query_embedding = response["embeddings"][0] # Extract embedding
|
259 |
|
260 |
results = collection.query(
|
@@ -297,7 +249,7 @@ def similarity_question_no_masking(question, questions_queries_dictionary, colle
|
|
297 |
|
298 |
# Store each unique document in the vector embedding database
|
299 |
for i, d in enumerate(original_documents):
|
300 |
-
response =
|
301 |
embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
|
302 |
|
303 |
# Check if embedding is unique
|
@@ -313,7 +265,7 @@ def similarity_question_no_masking(question, questions_queries_dictionary, colle
|
|
313 |
|
314 |
# Compute the embedding for the input question
|
315 |
|
316 |
-
response =
|
317 |
query_embedding = response["embeddings"][0] # Extract embedding
|
318 |
|
319 |
results = collection.query(
|
|
|
1 |
+
from src.use_llm import main_generate, get_embeddings
|
2 |
|
3 |
from src.questions_queries import *
|
4 |
import time
|
|
|
5 |
import uuid
|
6 |
import chromadb
|
|
|
7 |
import spacy
|
8 |
import numpy as np
|
9 |
nlp = spacy.load("en_core_web_sm")
|
|
|
172 |
|
173 |
return capitalized_sentences
|
174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
175 |
|
176 |
|
177 |
|
|
|
190 |
|
191 |
# Store each unique document in the vector embedding database
|
192 |
for i, d in enumerate(masked_documents):
|
193 |
+
response = get_embeddings(d)
|
194 |
embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
|
195 |
|
196 |
# Check if embedding is unique
|
|
|
206 |
|
207 |
# Compute the embedding for the input question
|
208 |
masked_question = mask_entities(question, nlp)
|
209 |
+
# Embed the masked input question itself; `d` is only the leftover loop
# variable from indexing the stored documents and would ignore the query.
response = get_embeddings(masked_question)
|
210 |
query_embedding = response["embeddings"][0] # Extract embedding
|
211 |
|
212 |
results = collection.query(
|
|
|
249 |
|
250 |
# Store each unique document in the vector embedding database
|
251 |
for i, d in enumerate(original_documents):
|
252 |
+
response = get_embeddings(d)
|
253 |
embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
|
254 |
|
255 |
# Check if embedding is unique
|
|
|
265 |
|
266 |
# Compute the embedding for the input question
|
267 |
|
268 |
+
response = get_embeddings(question)
|
269 |
query_embedding = response["embeddings"][0] # Extract embedding
|
270 |
|
271 |
results = collection.query(
|
src/sparql_query_wikibase.py
CHANGED
@@ -1,9 +1,3 @@
|
|
1 |
-
wikibase_api_url = 'https://fashionwiki.wikibase.cloud/w/api.php'
|
2 |
-
config = {
|
3 |
-
"SPARQL_ENDPOINT_URL": "https://fashionwiki.wikibase.cloud/query/sparql",
|
4 |
-
'USER_AGENT': 'YourBotName/1.0 (https://yourwebsite.org/bot-info)',
|
5 |
-
'WIKIBASE_URL': wikibase_api_url,
|
6 |
-
}
|
7 |
|
8 |
|
9 |
from urllib.parse import urlparse
|
@@ -16,6 +10,7 @@ from wikibaseintegrator.wbi_helpers import get_user_agent
|
|
16 |
import pandas as pd
|
17 |
from string import Template
|
18 |
queries = False
|
|
|
19 |
|
20 |
|
21 |
def execute_sparql_query(query: str, prefix: str | None = None, endpoint: str | None = None, user_agent: str | None = None, max_retries: int = 1000, retry_after: int = 60) -> dict:
|
@@ -71,7 +66,6 @@ def get_results_to_df( query):
|
|
71 |
return df
|
72 |
|
73 |
if queries:
|
74 |
-
from src.new_fct_add_entities import wikibase_properties_id, classes_wikibase
|
75 |
query_fashion_designers_template = Template("""
|
76 |
PREFIX wbt: <https://fashionwiki.wikibase.cloud/prop/direct/>
|
77 |
PREFIX wb: <https://fashionwiki.wikibase.cloud/entity/>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
|
3 |
from urllib.parse import urlparse
|
|
|
10 |
import pandas as pd
|
11 |
from string import Template
|
12 |
queries = False
|
13 |
+
from wikibase_helpers import wikibase_properties_id, classes_wikibase, config, wikibase_api_url
|
14 |
|
15 |
|
16 |
def execute_sparql_query(query: str, prefix: str | None = None, endpoint: str | None = None, user_agent: str | None = None, max_retries: int = 1000, retry_after: int = 60) -> dict:
|
|
|
66 |
return df
|
67 |
|
68 |
if queries:
|
|
|
69 |
query_fashion_designers_template = Template("""
|
70 |
PREFIX wbt: <https://fashionwiki.wikibase.cloud/prop/direct/>
|
71 |
PREFIX wb: <https://fashionwiki.wikibase.cloud/entity/>
|
src/use_llm.py
CHANGED
@@ -27,4 +27,18 @@ def send_chat_prompt(prompt: str, model: str, system_prompt: str) -> str:
|
|
27 |
def main_generate(prompt, model=DEFAULT_MODEL, system_prompt="You are a helpful assistant that generates SPARQL queries."):
|
28 |
response = send_chat_prompt(prompt, model, system_prompt)
|
29 |
response = response.replace('```', '').replace('json', '').strip()
|
30 |
-
return response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def main_generate(prompt, model=DEFAULT_MODEL, system_prompt="You are a helpful assistant that generates SPARQL queries."):
    """Send ``prompt`` to the chat model and return its reply without code fences.

    Args:
        prompt: User prompt forwarded to ``send_chat_prompt``.
        model: Model identifier forwarded to ``send_chat_prompt``.
        system_prompt: System message forwarded to ``send_chat_prompt``.

    Returns:
        str: The reply with Markdown fence markers removed and surrounding
        whitespace stripped.
    """
    # Local import: the module's import block is outside the visible hunk.
    import re

    response = send_chat_prompt(prompt, model, system_prompt)
    # Remove every fence marker, and the "json" language tag only when it is
    # attached to a fence. A blanket replace('json', '') would also delete
    # "json" from the actual content of the reply.
    return re.sub(r"```[ \t]*(?:json)?", "", response).strip()
|
31 |
+
|
32 |
+
|
33 |
+
|
34 |
+
# Use your own token securely via Space secrets or local env
|
35 |
+
HF_TOKEN = os.getenv("HF_TOKEN") # define this in Hugging Face Space Secrets
|
36 |
+
MODEL_ID = "thenlper/gte-large" # or another embedding model like BAAI/bge-base-en
|
37 |
+
|
38 |
+
client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
|
39 |
+
|
40 |
+
def get_embeddings(texts):
    """Embed one or more texts with the module-level inference ``client``.

    Args:
        texts: A single string or a list of strings.

    Returns:
        dict: ``{"embeddings": [...]}`` holding one vector per input text, in
        input order. The callers in generate_queries_alternative.py read the
        result as ``response["embeddings"][0]``, so a bare list would fail.
    """
    if isinstance(texts, str):
        texts = [texts]
    # NOTE(review): assumes `client` is a huggingface_hub InferenceClient,
    # whose embedding method is feature_extraction() (it has no embed()).
    # The result is converted with tolist() so downstream code receives plain
    # Python lists rather than numpy arrays.
    embeddings = [client.feature_extraction(text).tolist() for text in texts]
    return {"embeddings": embeddings}
|
src/wikibase_helpers.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import logging
|
3 |
+
from wikibaseintegrator import WikibaseIntegrator, datatypes, wbi_helpers
|
4 |
+
from wikibaseintegrator.wbi_config import config
|
5 |
+
from wikibaseintegrator.wbi_exceptions import MWApiError
|
6 |
+
|
7 |
+
wikibase_api_url = 'https://fashionwiki.wikibase.cloud/w/api.php'
|
8 |
+
config = {
|
9 |
+
"SPARQL_ENDPOINT_URL": "https://fashionwiki.wikibase.cloud/query/sparql",
|
10 |
+
'USER_AGENT': 'YourBotName/1.0 (https://yourwebsite.org/bot-info)',
|
11 |
+
'WIKIBASE_URL': wikibase_api_url,
|
12 |
+
}
|
13 |
+
|
14 |
+
|
15 |
+
|
16 |
+
|
17 |
+
# List of valid language codes (can be expanded)
|
18 |
+
VALID_LANGUAGE_CODES = ['en']
|
19 |
+
|
20 |
+
def get_property_id_by_label(property_label, api_url):
    """
    Resolve the property label to its corresponding property ID from Wikibase.

    Args:
        property_label (str): The label of the property to search.
        api_url (str): Either the site base URL or the full ``api.php``
            endpoint of the target Wikibase or Wikidata.

    Returns:
        str: The ID of the first search hit, or None when nothing matches or
        the server does not answer with HTTP 200.
    """
    # Callers pass a URL that already ends in /w/api.php; only append the
    # script path when it is missing so the path is never doubled.
    endpoint = api_url.rstrip('/')
    if not endpoint.endswith('api.php'):
        endpoint = f'{endpoint}/w/api.php'

    # Let requests build the query string so labels containing spaces, '&'
    # or other reserved characters are escaped correctly.
    params = {
        'action': 'wbsearchentities',
        'search': property_label,
        'language': 'en',
        'type': 'property',
        'format': 'json',
    }
    # A timeout keeps an unresponsive server from blocking the caller forever.
    response = requests.get(endpoint, params=params, timeout=30)

    if response.status_code != 200:
        logging.error(f"Failed to search for property by label in the target Wikibase. HTTP Status Code: {response.status_code}")
        return None

    matches = response.json().get('search')
    if matches:
        # Return the first matching property ID
        return matches[0]['id']

    logging.info(f"No property found for label: {property_label}")
    return None
|
45 |
+
|
46 |
+
|
47 |
+
# Map each property label to the ID returned by get_property_id_by_label for
# the configured Wikibase. One lookup is issued per label, in the order listed.
wikibase_properties_id = {
    label: get_property_id_by_label(label, wikibase_api_url)
    for label in (
        "instance of",
        "reference URL",
        "start time",
        "end time",
        "occupation title",
        "educated at",
        "employer",
        "work location",
        "award received",
        "point in time",
        "exact match",
        "date of birth",
        "place of birth",
        "date of death",
        "country of citizenship",
        "occupation",
        "sex or gender",
        "official website",
        "perfumes",
        "who wears it",
        "inception",
        "headquarters location",
        "parent organization",
        "founded by",
        "owned by",
        "industry",
        "country",
        "total revenue",
        "designer employed",
        "country of origin",
        "fashion collection",
        "fashion season",
        "fashion show location",
        "description of fashion collection",
        "image of fashion collection",
        "editor of fashion collection description",
        "date of fashion collection",
        "fashion show category",
        "fashion house X fashion collection",
        "designer of collection",
    )
}
|