traopia commited on
Commit
00c29b3
·
1 Parent(s): 767fe1d
src/__pycache__/generate_queries_alternative.cpython-312.pyc DELETED
Binary file (28.2 kB)
 
src/__pycache__/questions_queries.cpython-312.pyc DELETED
Binary file (36.3 kB)
 
src/__pycache__/sparql_query_wikibase.cpython-312.pyc DELETED
Binary file (8.35 kB)
 
src/__pycache__/use_llm.cpython-312.pyc DELETED
Binary file (1.5 kB)
 
src/__pycache__/visual_qa.cpython-312.pyc DELETED
Binary file (5.23 kB)
 
src/generate_queries_alternative.py CHANGED
@@ -1,11 +1,9 @@
1
- from src.use_llm import main_generate
2
 
3
  from src.questions_queries import *
4
  import time
5
- import ollama
6
  import uuid
7
  import chromadb
8
- import openai
9
  import spacy
10
  import numpy as np
11
  nlp = spacy.load("en_core_web_sm")
@@ -174,52 +172,6 @@ def capitalize_sentences(sentences):
174
 
175
  return capitalized_sentences
176
 
177
- def similarity_question(question, questions_queries_dictionary, collection, n_results=5, threshold=0.15):
178
- nlp = spacy.load("en_core_web_sm") # Load spaCy model for entity recognition
179
-
180
- original_documents = [questions_queries_dictionary[i]["question"] for i in range(len(questions_queries_dictionary))]
181
- #original_documents = capitalize_sentences(original_documents)
182
- masked_documents = [mask_entities(q, nlp) for q in original_documents]
183
- #masked_documents = list(set(masked_documents))
184
- # Store each document in the vector embedding database
185
- for i, d in enumerate(masked_documents):
186
- response = ollama.embed(model="mxbai-embed-large", input=d)
187
- embeddings = response["embeddings"]
188
- collection.add(
189
- ids=[str(i)],
190
- embeddings=embeddings,
191
- documents=[d]
192
- )
193
-
194
-
195
-
196
- # Compute the embedding for the input question
197
- masked_question = mask_entities(question, nlp)
198
- response = ollama.embed(model="mxbai-embed-large", input=masked_question)
199
- results = collection.query(
200
- query_embeddings=[response["embeddings"][0]],
201
- n_results=n_results
202
- )
203
- triples = []
204
- for i in range(len(results['documents'][0])):
205
- masked_similar_question = results['documents'][0][i]
206
- distance = results['distances'][0][i]
207
- print(distance)
208
- paraphrase = distance < threshold
209
-
210
- # Find the corresponding original question
211
- index_similar_query = masked_documents.index(masked_similar_question)
212
- original_similar_question = original_documents[index_similar_query]
213
- similar_query = questions_queries_dictionary[index_similar_query]["query"]
214
-
215
- if paraphrase and "[ENTITY]" in masked_similar_question and "[ENTITY]" in masked_question:
216
- to_do_query = replace_entity(original_similar_question, question, similar_query)
217
- else:
218
- to_do_query = None
219
-
220
- triples.append((original_similar_question, similar_query, to_do_query))
221
-
222
- return triples
223
 
224
 
225
 
@@ -238,7 +190,7 @@ def similarity_question(question, questions_queries_dictionary, collection, n_re
238
 
239
  # Store each unique document in the vector embedding database
240
  for i, d in enumerate(masked_documents):
241
- response = ollama.embed(model="mxbai-embed-large", input=d)
242
  embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
243
 
244
  # Check if embedding is unique
@@ -254,7 +206,7 @@ def similarity_question(question, questions_queries_dictionary, collection, n_re
254
 
255
  # Compute the embedding for the input question
256
  masked_question = mask_entities(question, nlp)
257
- response = ollama.embed(model="mxbai-embed-large", input=masked_question)
258
  query_embedding = response["embeddings"][0] # Extract embedding
259
 
260
  results = collection.query(
@@ -297,7 +249,7 @@ def similarity_question_no_masking(question, questions_queries_dictionary, colle
297
 
298
  # Store each unique document in the vector embedding database
299
  for i, d in enumerate(original_documents):
300
- response = ollama.embed(model="mxbai-embed-large", input=d)
301
  embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
302
 
303
  # Check if embedding is unique
@@ -313,7 +265,7 @@ def similarity_question_no_masking(question, questions_queries_dictionary, colle
313
 
314
  # Compute the embedding for the input question
315
 
316
- response = ollama.embed(model="mxbai-embed-large", input=question)
317
  query_embedding = response["embeddings"][0] # Extract embedding
318
 
319
  results = collection.query(
 
1
+ from src.use_llm import main_generate, get_embeddings
2
 
3
  from src.questions_queries import *
4
  import time
 
5
  import uuid
6
  import chromadb
 
7
  import spacy
8
  import numpy as np
9
  nlp = spacy.load("en_core_web_sm")
 
172
 
173
  return capitalized_sentences
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
 
177
 
 
190
 
191
  # Store each unique document in the vector embedding database
192
  for i, d in enumerate(masked_documents):
193
+ response = get_embeddings(d)
194
  embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
195
 
196
  # Check if embedding is unique
 
206
 
207
  # Compute the embedding for the input question
208
  masked_question = mask_entities(question, nlp)
209
+ response = get_embeddings(masked_question)
210
  query_embedding = response["embeddings"][0] # Extract embedding
211
 
212
  results = collection.query(
 
249
 
250
  # Store each unique document in the vector embedding database
251
  for i, d in enumerate(original_documents):
252
+ response = get_embeddings(d)
253
  embedding = response["embeddings"][0] # Extract the first (and only) embedding from the nested list
254
 
255
  # Check if embedding is unique
 
265
 
266
  # Compute the embedding for the input question
267
 
268
+ response = get_embeddings(question)
269
  query_embedding = response["embeddings"][0] # Extract embedding
270
 
271
  results = collection.query(
src/sparql_query_wikibase.py CHANGED
@@ -1,9 +1,3 @@
1
- wikibase_api_url = 'https://fashionwiki.wikibase.cloud/w/api.php'
2
- config = {
3
- "SPARQL_ENDPOINT_URL": "https://fashionwiki.wikibase.cloud/query/sparql",
4
- 'USER_AGENT': 'YourBotName/1.0 (https://yourwebsite.org/bot-info)',
5
- 'WIKIBASE_URL': wikibase_api_url,
6
- }
7
 
8
 
9
  from urllib.parse import urlparse
@@ -16,6 +10,7 @@ from wikibaseintegrator.wbi_helpers import get_user_agent
16
  import pandas as pd
17
  from string import Template
18
  queries = False
 
19
 
20
 
21
  def execute_sparql_query(query: str, prefix: str | None = None, endpoint: str | None = None, user_agent: str | None = None, max_retries: int = 1000, retry_after: int = 60) -> dict:
@@ -71,7 +66,6 @@ def get_results_to_df( query):
71
  return df
72
 
73
  if queries:
74
- from src.new_fct_add_entities import wikibase_properties_id, classes_wikibase
75
  query_fashion_designers_template = Template("""
76
  PREFIX wbt: <https://fashionwiki.wikibase.cloud/prop/direct/>
77
  PREFIX wb: <https://fashionwiki.wikibase.cloud/entity/>
 
 
 
 
 
 
 
1
 
2
 
3
  from urllib.parse import urlparse
 
10
  import pandas as pd
11
  from string import Template
12
  queries = False
13
+ from src.wikibase_helpers import wikibase_properties_id, classes_wikibase, config, wikibase_api_url
14
 
15
 
16
  def execute_sparql_query(query: str, prefix: str | None = None, endpoint: str | None = None, user_agent: str | None = None, max_retries: int = 1000, retry_after: int = 60) -> dict:
 
66
  return df
67
 
68
  if queries:
 
69
  query_fashion_designers_template = Template("""
70
  PREFIX wbt: <https://fashionwiki.wikibase.cloud/prop/direct/>
71
  PREFIX wb: <https://fashionwiki.wikibase.cloud/entity/>
src/use_llm.py CHANGED
@@ -27,4 +27,18 @@ def send_chat_prompt(prompt: str, model: str, system_prompt: str) -> str:
27
  def main_generate(prompt, model=DEFAULT_MODEL, system_prompt="You are a helpful assistant that generates SPARQL queries."):
28
  response = send_chat_prompt(prompt, model, system_prompt)
29
  response = response.replace('```', '').replace('json', '').strip()
30
- return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def main_generate(prompt, model=DEFAULT_MODEL, system_prompt="You are a helpful assistant that generates SPARQL queries."):
28
  response = send_chat_prompt(prompt, model, system_prompt)
29
  response = response.replace('```', '').replace('json', '').strip()
30
+ return response
31
+
32
+
33
+
34
# Use your own token securely via Space secrets or local env
HF_TOKEN = os.getenv("HF_TOKEN")  # define this in Hugging Face Space Secrets
MODEL_ID = "thenlper/gte-large"  # or another embedding model like BAAI/bge-base-en

client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)


def get_embeddings(texts):
    """Embed one or more texts with the Hugging Face Inference API.

    Args:
        texts (str | list[str]): A single text or a list of texts.

    Returns:
        list: One embedding vector per input text.

    NOTE(review): callers in generate_queries_alternative.py index the
    result as ``response["embeddings"][0]`` (dict-style), but this function
    returns a plain list — reconcile one side or the other.
    """
    if isinstance(texts, str):
        texts = [texts]
    # BUG FIX: InferenceClient exposes no `embed` method; the embedding
    # endpoint is `feature_extraction` — confirm against the installed
    # huggingface_hub version.
    embeddings = [client.feature_extraction(text) for text in texts]
    return embeddings
src/wikibase_helpers.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
import logging
from wikibaseintegrator import WikibaseIntegrator, datatypes, wbi_helpers
# BUG FIX: the original `from wikibaseintegrator.wbi_config import config`
# was silently shadowed by the module-level `config` dict below; alias it
# so both names remain usable without the shadow.
from wikibaseintegrator.wbi_config import config as wbi_config
from wikibaseintegrator.wbi_exceptions import MWApiError

# Full MediaWiki API endpoint of the target Wikibase instance.
wikibase_api_url = 'https://fashionwiki.wikibase.cloud/w/api.php'
# Project-level connection settings (exported; imported by sparql_query_wikibase.py).
config = {
    "SPARQL_ENDPOINT_URL": "https://fashionwiki.wikibase.cloud/query/sparql",
    'USER_AGENT': 'YourBotName/1.0 (https://yourwebsite.org/bot-info)',
    'WIKIBASE_URL': wikibase_api_url,
}


# List of valid language codes (can be expanded)
VALID_LANGUAGE_CODES = ['en']
19
+
20
def get_property_id_by_label(property_label, api_url):
    """
    Resolve the property label to its corresponding property ID from Wikibase.

    Args:
        property_label (str): The label of the property to search.
        api_url (str): The full MediaWiki API endpoint of the target
            Wikibase (e.g. 'https://example.wikibase.cloud/w/api.php').

    Returns:
        str: The property ID if found, otherwise None.
    """
    # BUG FIX: every caller passes the full '.../w/api.php' endpoint, so the
    # old f'{api_url}/w/api.php?...' built a doubled path
    # ('.../w/api.php/w/api.php?...').  Query the endpoint directly and let
    # `params=` URL-encode the label (labels contain spaces).
    params = {
        'action': 'wbsearchentities',
        'search': property_label,
        'language': 'en',
        'type': 'property',
        'format': 'json',
    }
    response = requests.get(api_url, params=params, timeout=30)

    if response.status_code == 200:
        search_results = response.json()
        if 'search' in search_results and search_results['search']:
            # Return the first matching property ID
            return search_results['search'][0]['id']
        else:
            logging.info(f"No property found for label: {property_label}")
            return None
    else:
        logging.error(f"Failed to search for property by label in the target Wikibase. HTTP Status Code: {response.status_code}")
        return None
45
+
46
+
47
+ wikibase_properties_id = {"instance of": get_property_id_by_label("instance of", wikibase_api_url),
48
+ "reference URL": get_property_id_by_label("reference URL", wikibase_api_url),
49
+ "start time": get_property_id_by_label("start time", wikibase_api_url),
50
+ "end time": get_property_id_by_label("end time", wikibase_api_url),
51
+ "occupation title": get_property_id_by_label("occupation title", wikibase_api_url),
52
+ "educated at": get_property_id_by_label("educated at", wikibase_api_url),
53
+ "employer": get_property_id_by_label("employer", wikibase_api_url),
54
+ "work location": get_property_id_by_label("work location", wikibase_api_url),
55
+ "award received": get_property_id_by_label("award received", wikibase_api_url),
56
+ "point in time": get_property_id_by_label("point in time", wikibase_api_url),
57
+ "exact match": get_property_id_by_label("exact match", wikibase_api_url),
58
+ "date of birth": get_property_id_by_label("date of birth", wikibase_api_url),
59
+ "place of birth": get_property_id_by_label("place of birth", wikibase_api_url),
60
+ "date of death": get_property_id_by_label("date of death", wikibase_api_url),
61
+ "country of citizenship": get_property_id_by_label("country of citizenship", wikibase_api_url),
62
+ "occupation": get_property_id_by_label("occupation", wikibase_api_url),
63
+ "sex or gender": get_property_id_by_label("sex or gender", wikibase_api_url),
64
+ "official website": get_property_id_by_label("official website", wikibase_api_url),
65
+ "perfumes": get_property_id_by_label("perfumes", wikibase_api_url),
66
+ "who wears it": get_property_id_by_label("who wears it", wikibase_api_url),
67
+ "inception": get_property_id_by_label("inception", wikibase_api_url),
68
+ "headquarters location": get_property_id_by_label("headquarters location", wikibase_api_url),
69
+ "parent organization": get_property_id_by_label("parent organization", wikibase_api_url),
70
+ "founded by": get_property_id_by_label("founded by", wikibase_api_url),
71
+ "owned by": get_property_id_by_label("owned by", wikibase_api_url),
72
+ "industry": get_property_id_by_label("industry", wikibase_api_url),
73
+ "country": get_property_id_by_label("country", wikibase_api_url),
74
+ "total revenue": get_property_id_by_label("total revenue", wikibase_api_url),
75
+ "designer employed": get_property_id_by_label("designer employed", wikibase_api_url),
76
+ "country of origin": get_property_id_by_label("country of origin", wikibase_api_url),
77
+ "fashion collection": get_property_id_by_label("fashion collection", wikibase_api_url),
78
+ "fashion season": get_property_id_by_label("fashion season", wikibase_api_url),
79
+ "fashion show location": get_property_id_by_label("fashion show location", wikibase_api_url),
80
+ "description of fashion collection": get_property_id_by_label("description of fashion collection", wikibase_api_url),
81
+ "image of fashion collection": get_property_id_by_label("image of fashion collection", wikibase_api_url),
82
+ "editor of fashion collection description": get_property_id_by_label("editor of fashion collection description", wikibase_api_url),
83
+ "date of fashion collection": get_property_id_by_label("date of fashion collection", wikibase_api_url),
84
+ "fashion show category": get_property_id_by_label("fashion show category", wikibase_api_url),
85
+ "fashion house X fashion collection": get_property_id_by_label("fashion house X fashion collection", wikibase_api_url),
86
+ "designer of collection": get_property_id_by_label("designer of collection", wikibase_api_url)}