|
|
|
|
|
from typing import List |
|
import gradio as gr |
|
import duckdb |
|
import pandas as pd |
|
import requests |
|
|
|
# Base URL of the Hugging Face datasets-server API (splits / parquet endpoints).
DATASETS_SERVER_ENDPOINT = "https://datasets-server.huggingface.co"

# Hub revision holding the auto-converted parquet files.
# NOTE(review): unused in this file — presumably kept for future use; confirm.
PARQUET_REVISION="refs/convert/parquet"

# Dataset imported at startup (see import_data) and searched by the app.
EXAMPLE_DATASET_NAME = "LLMs/Alpaca-ShareGPT"
|
|
|
|
|
def get_parquet_urls(dataset: str) -> List[str]:
    """Return the parquet file URLs for the first split of *dataset*.

    Queries the datasets-server ``/splits`` endpoint, takes the first split,
    then fetches ``/parquet`` for that split's config and returns the URLs
    belonging to that split.

    Raises:
        requests.HTTPError: if the splits request fails.
        Exception: if the parquet request fails, the dataset has no splits,
            or no parquet files match the split.
    """
    splits_response = requests.get(
        f"{DATASETS_SERVER_ENDPOINT}/splits?dataset={dataset}", timeout=60
    )
    # Fail loudly here: the original code would crash later with an opaque
    # TypeError on splits[0] when the request failed or returned no splits.
    splits_response.raise_for_status()
    splits = splits_response.json().get("splits")
    if not splits:
        raise Exception(f"No splits found for dataset {dataset}")
    split = splits[0]

    response = requests.get(
        f"{DATASETS_SERVER_ENDPOINT}/parquet?dataset={dataset}&config={split['config']}",
        timeout=60,
    )
    if response.status_code != 200:
        raise Exception(response)

    parquet_files = response.json()["parquet_files"]
    urls = [content["url"] for content in parquet_files if content["split"] == split["split"]]
    if not urls:
        raise Exception("No parquet files found for dataset")
    return urls
|
|
|
def run_command(query: str) -> pd.DataFrame:
    """Run a BM25 full-text search for *query* against the ``data`` table.

    Relies on the FTS index created by ``import_data`` (the ``fts_main_data``
    schema on the global DuckDB connection). Returns matching rows ordered by
    relevance, or a single-column ``Error`` DataFrame describing the failure
    so the Gradio UI shows a message instead of crashing.
    """
    sql = (
        "SELECT fts_main_data.match_bm25(id, ?) AS score, "
        "id, instruction, input, output "
        "FROM data WHERE score IS NOT NULL ORDER BY score DESC;"
    )
    try:
        # Parameterized query: the user-supplied search string is bound, not
        # interpolated into the SQL.
        result = duckdb.execute(sql, [query])
    except Exception as error:
        # Broad catch is intentional at this UI boundary: any DuckDB error is
        # rendered in the result table rather than raised into Gradio.
        print(f"Error: {str(error)}")
        return pd.DataFrame({"Error": [f"β {str(error)}"]})
    return result.df()
|
|
|
def import_data():
    """Load DuckDB extensions, import the example dataset, and build an FTS index.

    Side effects on the global DuckDB connection: creates a ``serial``
    sequence, a ``data`` table populated from the first parquet file of
    ``EXAMPLE_DATASET_NAME``, and a full-text-search index over all of its
    columns (exposed as the ``fts_main_data`` schema used by ``run_command``).
    """
    # httpfs lets DuckDB read parquet over HTTP(S); fts provides BM25 search.
    for extension in ("httpfs", "fts"):
        duckdb.execute(f"INSTALL '{extension}';")
        duckdb.execute(f"LOAD '{extension}';")
    duckdb.sql("select * from duckdb_extensions();").show()

    parquet_url = get_parquet_urls(EXAMPLE_DATASET_NAME)[0]
    print("parquet_url", parquet_url)

    # IF NOT EXISTS / OR REPLACE / overwrite=1 make this idempotent: re-running
    # (e.g. on an app reload) no longer fails with "already exists" errors.
    duckdb.sql("CREATE SEQUENCE IF NOT EXISTS serial START 1;")
    # Synthetic id column: match_bm25 needs a per-row document identifier.
    duckdb.sql(f"CREATE OR REPLACE TABLE data AS SELECT nextval('serial') AS id, * FROM '{parquet_url}';")
    duckdb.sql("PRAGMA create_fts_index('data', 'id', '*', overwrite=1);")

    duckdb.sql("DESCRIBE SELECT * FROM data").show()
|
|
|
|
|
|
|
# UI layout: one search box wired to run_command; results render in a table.
with gr.Blocks() as demo:
    gr.Markdown("""
## Full-text search using DuckDB on top of datasets-server Parquet files π€

Inspired by https://huggingface.co/spaces/asoria/duckdb-parquet-demo
""")
    # Dataset selector. Currently informational only: it is not passed to the
    # click handler, which always queries the single imported dataset.
    # Fixes: dropped a stray trailing comma that wrapped this statement in a
    # throwaway 1-tuple, and made `value` a list as CheckboxGroup expects.
    gr.CheckboxGroup(
        label="Dataset",
        choices=["LLMs/Alpaca-ShareGPT"],
        value=["LLMs/Alpaca-ShareGPT"],
        info="Dataset to query",
    )
    query = gr.Textbox(label="query", placeholder="Full-text search...")
    run_button = gr.Button("Run")
    gr.Markdown("### Result")
    cached_responses_table = gr.DataFrame()
    run_button.click(run_command, inputs=[query], outputs=cached_responses_table)
|
|
|
|
|
if __name__ == "__main__":
    # Build the DuckDB table and FTS index before serving the UI, so the
    # first search already has data to query.
    import_data()
    demo.launch()
|
|