# Spaces:
# Running
# Running
# Standard-library imports.
import logging
from string import Template
from time import sleep
from urllib.parse import urlparse

# Third-party imports.
import pandas as pd
import requests
from wikibaseintegrator.wbi_helpers import get_user_agent

# Module-level logger, one per module per the logging convention.
log = logging.getLogger(__name__)

# Shared HTTP session so repeated SPARQL requests reuse connections.
helpers_session = requests.Session()
# Flag read elsewhere in the module; its exact purpose is not visible in this
# chunk — presumably gates optional query behaviour (TODO confirm with callers).
queries = False

# MediaWiki Action API endpoint of the target Wikibase instance.
wikibase_api_url = 'https://fashionwiki.wikibase.cloud/w/api.php'

# Central configuration consumed by the SPARQL helpers below.
config = {
    'SPARQL_ENDPOINT_URL': 'https://fashionwiki.wikibase.cloud/query/sparql',
    'USER_AGENT': 'YourBotName/1.0 (https://yourwebsite.org/bot-info)',
    'WIKIBASE_URL': wikibase_api_url,
}
# Human-readable property labels -> P-ids on the target Wikibase instance.
# NOTE(review): 'country' and 'country of origin' both map to P30 — looks like
# deliberate aliasing, but confirm against the wiki's property list.
wikibase_properties_id = {
    # Generic statement / reference properties
    "instance of": "P2",
    "reference URL": "P24",
    "start time": "P15",
    "end time": "P16",
    "point in time": "P28",
    "exact match": "P23",
    # Person (designer) biography
    "occupation title": "P25",
    "educated at": "P9",
    "employer": "P10",
    "work location": "P7",
    "award received": "P18",
    "date of birth": "P3",
    "place of birth": "P4",
    "date of death": "P5",
    "country of citizenship": "P6",
    "occupation": "P19",
    "sex or gender": "P8",
    "official website": "P17",
    # Brand / company facts
    "perfumes": "P27",
    "who wears it": "P26",
    "inception": "P11",
    "headquarters location": "P12",
    "parent organization": "P13",
    "founded by": "P14",
    "owned by": "P22",
    "industry": "P20",
    "country": "P30",
    "total revenue": "P21",
    "designer employed": "P29",
    "country of origin": "P30",
    # Fashion collection / show modelling
    "fashion collection": "P31",
    "fashion season": "P32",
    "fashion show location": "P33",
    "description of fashion collection": "P34",
    "image of fashion collection": "P35",
    "editor of fashion collection description": "P36",
    "date of fashion collection": "P37",
    "fashion show category": "P38",
    "fashion house X fashion collection": "P39",
}
# Human-readable class labels -> Q-ids on the target Wikibase instance,
# used as values for 'instance of' (P2) statements.
classes_wikibase = {
    "fashion designer": "Q5",
    "fashion house": "Q1",
    "business": "Q9",
    "academic institution": "Q2",
    "geographic location": "Q4",
    "fashion award": "Q8",
    "gender": "Q6",
    "occupation": "Q7",
    "human": "Q36",
    "organization": "Q3",
    "brand": "Q38",
    "lifestyle brand": "Q3417",
    "privately held company": "Q1729",
    "fashion season": "Q8199",
    "fashion show category": "Q8200",
    "fashion season collection": "Q8201",
    "fashion journalist": "Q8207",
}
def execute_sparql_query(query: str, prefix: str | None = None, endpoint: str | None = None, user_agent: str | None = None, max_retries: int = 1000, retry_after: int = 60) -> dict:
    """Execute a SPARQL query, retrying on transient HTTP failures.

    The query is POSTed as the raw request body (Content-Type
    ``application/sparql-query``) to *endpoint*, defaulting to
    ``config['SPARQL_ENDPOINT_URL']``.

    :param query: SPARQL query text.
    :param prefix: Optional prefix block prepended to the query.
    :param endpoint: SPARQL endpoint URL; falls back to the module config.
    :param user_agent: User-Agent string; falls back to the module config.
    :param max_retries: Maximum number of attempts before giving up.
    :param retry_after: Seconds to sleep between retries (a 429 response
        with a numeric Retry-After header overrides this).
    :return: Parsed JSON response (SPARQL results document) as a dict.
    :raises Exception: If no successful response after *max_retries* attempts.
    :raises requests.HTTPError: For non-retryable HTTP error statuses.
    """
    sparql_endpoint_url = str(endpoint or config['SPARQL_ENDPOINT_URL'])
    user_agent = user_agent or (str(config['USER_AGENT']) if config['USER_AGENT'] is not None else None)

    # Wikimedia Foundation endpoints require an identifying User-Agent.
    hostname = urlparse(sparql_endpoint_url).hostname
    if hostname and hostname.endswith(('wikidata.org', 'wikipedia.org', 'wikimedia.org')) and user_agent is None:
        log.warning('WARNING: Please set a user agent if you interact with a Wikimedia Foundation instance.')

    if prefix:
        query = prefix + '\n' + query

    headers = {
        'Accept': 'application/sparql-results+json',
        'User-Agent': get_user_agent(user_agent),
        # Raw query in the body — not form-encoded, hence 'data=' below.
        'Content-Type': 'application/sparql-query'
    }

    for _ in range(max_retries):
        try:
            # 'data' (not 'params') so the query travels as the POST body.
            response = helpers_session.post(sparql_endpoint_url, data=query, headers=headers)
        except requests.exceptions.ConnectionError as e:
            log.exception("Connection error: %s. Sleeping for %d seconds.", e, retry_after)
            sleep(retry_after)
            continue

        # Transient server-side failures: back off and retry.
        if response.status_code in (500, 502, 503, 504):
            log.error("Service unavailable (HTTP Code %d). Sleeping for %d seconds.", response.status_code, retry_after)
            sleep(retry_after)
            continue

        if response.status_code == 429:
            # Honour Retry-After when it is the delta-seconds form. RFC 9110
            # also allows an HTTP-date, which int() cannot parse — previously
            # that raised ValueError and aborted the retry loop; now we fall
            # back to the current delay instead.
            header_value = response.headers.get('retry-after')
            if header_value is not None:
                try:
                    retry_after = int(header_value)
                except ValueError:
                    pass
            log.error("Too Many Requests (429). Sleeping for %d seconds", retry_after)
            sleep(retry_after)
            continue

        response.raise_for_status()  # Raise any non-success status code
        return response.json()  # Return the JSON result if successful

    raise Exception(f"No result after {max_retries} retries.")
def get_results_to_df(query):
    """Run *query* via ``execute_sparql_query`` and return a flat DataFrame.

    Each SPARQL binding cell is a dict like ``{'type': ..., 'value': ...}``;
    only the ``'value'`` entry is kept. Variables that are unbound in a row
    appear as NaN in the raw DataFrame and become ``None``.

    :param query: SPARQL query text.
    :return: ``pd.DataFrame`` with one column per result variable.
    """
    results = execute_sparql_query(query)
    df = pd.DataFrame(results["results"]["bindings"])
    if df.empty:
        # No result rows: nothing to unwrap, return the empty frame as-is.
        return df
    # DataFrame.map is the element-wise API (pandas >= 2.1, successor of
    # applymap); pd.notnull filters the NaN cells of unbound variables.
    return df.map(lambda cell: cell['value'] if pd.notnull(cell) else None)