Ask-FashionDB / src /sparql_query_wikibase.py
traopia
Initial commit
cf0b712
raw
history blame
4.47 kB
from urllib.parse import urlparse
import logging
log = logging.getLogger(__name__)
from time import sleep
import requests
helpers_session = requests.Session()
from wikibaseintegrator.wbi_helpers import get_user_agent
import pandas as pd
from string import Template
# Module-level flag; nothing in the visible part of this file reads it.
# NOTE(review): confirm whether `queries` is still used elsewhere before relying on it.
queries = False

# MediaWiki API entry point of the Wikibase instance this module talks to.
wikibase_api_url = 'https://fashionwiki.wikibase.cloud/w/api.php'

# Default settings consulted by execute_sparql_query() when the caller does not
# pass an explicit endpoint / user agent.
config = {
    # SPARQL endpoint that queries are POSTed to by default.
    'SPARQL_ENDPOINT_URL': 'https://fashionwiki.wikibase.cloud/query/sparql',
    # NOTE(review): this looks like a template placeholder -- replace it with a
    # user agent that actually identifies this tool and its maintainer.
    'USER_AGENT': 'YourBotName/1.0 (https://yourwebsite.org/bot-info)',
    'WIKIBASE_URL': wikibase_api_url,
}
# Lookup table: human-readable property label -> property identifier ("P<n>").
# Presumably these are the property IDs of the Wikibase instance configured in
# this module -- verify against the live instance before adding entries.
wikibase_properties_id = {
    'instance of': 'P2',
    'reference URL': 'P24',
    'start time': 'P15',
    'end time': 'P16',
    'occupation title': 'P25',
    'educated at': 'P9',
    'employer': 'P10',
    'work location': 'P7',
    'award received': 'P18',
    'point in time': 'P28',
    'exact match': 'P23',
    'date of birth': 'P3',
    'place of birth': 'P4',
    'date of death': 'P5',
    'country of citizenship': 'P6',
    'occupation': 'P19',
    'sex or gender': 'P8',
    'official website': 'P17',
    'perfumes': 'P27',
    'who wears it': 'P26',
    'inception': 'P11',
    'headquarters location': 'P12',
    'parent organization': 'P13',
    'founded by': 'P14',
    'owned by': 'P22',
    'industry': 'P20',
    # NOTE(review): 'country' and 'country of origin' (below) both resolve to
    # P30 -- confirm this aliasing is intentional and not a copy/paste slip.
    'country': 'P30',
    'total revenue': 'P21',
    'designer employed': 'P29',
    'country of origin': 'P30',
    'fashion collection': 'P31',
    'fashion season': 'P32',
    'fashion show location': 'P33',
    'description of fashion collection': 'P34',
    'image of fashion collection': 'P35',
    'editor of fashion collection description': 'P36',
    'date of fashion collection': 'P37',
    'fashion show category': 'P38',
    'fashion house X fashion collection': 'P39',
}
# Lookup table: human-readable class label -> item identifier ("Q<n>").
# Presumably these are item IDs on the same Wikibase instance as the property
# table in this module -- verify against the live instance before adding entries.
classes_wikibase = {
    'fashion designer': 'Q5',
    'fashion house': 'Q1',
    'business': 'Q9',
    'academic institution': 'Q2',
    'geographic location': 'Q4',
    'fashion award': 'Q8',
    'gender': 'Q6',
    'occupation': 'Q7',
    'human': 'Q36',
    'organization': 'Q3',
    'brand': 'Q38',
    'lifestyle brand': 'Q3417',
    'privately held company': 'Q1729',
    'fashion season': 'Q8199',
    'fashion show category': 'Q8200',
    'fashion season collection': 'Q8201',
    'fashion journalist': 'Q8207',
}
def execute_sparql_query(query: str, prefix: str | None = None, endpoint: str | None = None, user_agent: str | None = None, max_retries: int = 1000, retry_after: int = 60) -> dict:
    """
    Execute a SPARQL query and return the decoded JSON response.

    The query text is POSTed as the raw request body with the
    ``application/sparql-query`` content type. Connection errors, HTTP
    500/502/503/504 and HTTP 429 responses are retried after sleeping; any
    other non-success status is raised immediately.

    :param query: The SPARQL query text.
    :param prefix: Optional text (e.g. PREFIX declarations) prepended to the query on its own line.
    :param endpoint: SPARQL endpoint URL. Defaults to ``config['SPARQL_ENDPOINT_URL']``.
    :param user_agent: User agent passed to ``get_user_agent``. Defaults to ``config['USER_AGENT']``.
    :param max_retries: Maximum number of request attempts before giving up.
    :param retry_after: Seconds to sleep between attempts. For a 429 response a usable
        ``Retry-After`` header overrides this value for that single wait only.
    :return: The JSON body of the successful response, as returned by ``response.json()``.
    :raises requests.exceptions.HTTPError: For a non-success status that is not retried.
    :raises Exception: When no attempt succeeded within ``max_retries`` attempts.
    """
    sparql_endpoint_url = str(endpoint or config['SPARQL_ENDPOINT_URL'])
    user_agent = user_agent or (str(config['USER_AGENT']) if config['USER_AGENT'] is not None else None)

    hostname = urlparse(sparql_endpoint_url).hostname
    if hostname and hostname.endswith(('wikidata.org', 'wikipedia.org', 'wikimedia.org')) and user_agent is None:
        log.warning('WARNING: Please set a user agent if you interact with a Wikimedia Foundation instance.')

    if prefix:
        query = prefix + '\n' + query

    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': get_user_agent(user_agent),
        'Content-Type': 'application/sparql-query'
    }

    # Encode explicitly: a ``str`` body is encoded as Latin-1 by the HTTP layer,
    # which raises UnicodeEncodeError for characters outside Latin-1 and sends
    # accented Latin-1 characters in the wrong encoding. SPARQL bodies are UTF-8.
    payload = query.encode('utf-8')

    for _ in range(max_retries):
        try:
            response = helpers_session.post(sparql_endpoint_url, data=payload, headers=headers)
        except requests.exceptions.ConnectionError as e:
            log.exception("Connection error: %s. Sleeping for %d seconds.", e, retry_after)
            sleep(retry_after)
            continue

        if response.status_code in (500, 502, 503, 504):
            log.error("Service unavailable (HTTP Code %d). Sleeping for %d seconds.", response.status_code, retry_after)
            sleep(retry_after)
            continue

        if response.status_code == 429:
            # Keep the server's hint local to this wait so it does not silently
            # replace the caller's ``retry_after`` for later 5xx/connection retries.
            delay = retry_after
            hinted = response.headers.get('retry-after')
            if hinted is not None:
                try:
                    # Clamp at zero: sleep() rejects negative durations.
                    delay = max(0, int(hinted))
                except ValueError:
                    # Retry-After may also be an HTTP-date; fall back to the
                    # configured delay instead of crashing on int().
                    log.warning("Could not parse Retry-After header %r; using %d seconds.", hinted, retry_after)
            log.error("Too Many Requests (429). Sleeping for %d seconds", delay)
            sleep(delay)
            continue

        response.raise_for_status()  # Any remaining non-success status is not retried.
        return response.json()

    raise Exception(f"No result after {max_retries} retries.")
def get_results_to_df(query):
    """
    Run ``query`` through ``execute_sparql_query`` and tabulate its bindings.

    One DataFrame row is built per entry of ``results["results"]["bindings"]``.
    Every non-null cell is replaced by its ``'value'`` entry; cells that are
    null (a row lacking that column) become ``None``.
    """
    def _unwrap(cell):
        # Non-null cells are subscripted with 'value'; null cells collapse to None.
        if pd.notnull(cell):
            return cell['value']
        return None

    bindings = execute_sparql_query(query)["results"]["bindings"]
    return pd.DataFrame(bindings).map(_unwrap)