rename files
- Dockerfile +2 -2
- requirements.txt +1 -0
- run.py → run_job.py +3 -1
- app.py → start_app.py +56 -12
Dockerfile
CHANGED
@@ -20,8 +20,8 @@ RUN pip install --no-cache-dir --upgrade pip
 COPY --chown=user run.py app.py requirements.txt README.md $HOME/app/
 
 # Install dependencies
-RUN pip install "gradio[oauth]"
+RUN pip install "gradio[oauth]"
 RUN pip install -r requirements.txt
 
 # Run app
-ENTRYPOINT python app.py
+ENTRYPOINT python start_app.py
requirements.txt
CHANGED
@@ -1,3 +1,4 @@
+fire
 duckdb
 huggingface_hub
 tabulate
run.py → run_job.py
RENAMED
@@ -50,11 +50,13 @@ def sql(src: str, dst: str, query: str, config: str = "default", split: str = "t
     src_kwargs = con.sql(CMD_SRC_KWARGS.format(src=src, config=config, split=split)).df().to_dict(orient="records")
     if not src_kwargs:
         raise ValueError(f'Invalid --config "{config}" for dataset "{src}", please select a valid dataset config/subset.')
+
     con.sql((CMD_SRC_DRY_RUN if dry_run else CMD_SRC).format(**src_kwargs[0]))
     if dry_run:
-        print(f"Sample data from '{src}' that would be written to '{dst}':\n")
+        print(f"Sample data from '{src}' that would be written to dataset '{dst}':\n")
     else:
         con.sql("PRAGMA enable_progress_bar;")
+
     result = con.sql((CMD_DST_DRY_RUN if dry_run else CMD_DST).format(query=query.rstrip("\n ;")))
     if dry_run:
         print(result.df().to_markdown())
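The new `fire` dependency added in requirements.txt suggests that `run_job.py` exposes the `sql()` function shown in this hunk as its command-line interface: the `--src/--config/--split/--dst/--query` flags that `start_app.py` passes map directly onto the function's parameters. A minimal sketch of that wiring, assuming `fire.Fire` is called at module level (the call itself is outside the displayed hunk):

```python
# Hypothetical sketch of run_job.py's CLI entry point; the fire.Fire() call
# is assumed and is not part of the displayed diff.
import fire

def sql(src: str, dst: str, query: str, config: str = "default", split: str = "train", dry_run: bool = False):
    """Run a DuckDB SQL query over the `src` dataset and write the result to the `dst` dataset."""
    ...

if __name__ == "__main__":
    # Enables invocations such as:
    #   python run_job.py --src user/source-dataset --dst user/output-dataset --query "SELECT * FROM src;" --dry_run
    fire.Fire(sql)
```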
app.py → start_app.py
RENAMED
@@ -1,18 +1,20 @@
+import os
 import re
 import subprocess
 import yaml
 
 import gradio as gr
 import requests
-from huggingface_hub import HfApi
+from huggingface_hub import HfApi, get_token
 
 
-CMD = ["python" ,"run.py"]
+CMD = ["python" ,"run_job.py"]
 
 with open("README.md") as f:
     METADATA = yaml.safe_load(f.read().split("---\n")[1])
 TITLE = METADATA["title"]
 EMOJI = METADATA["emoji"]
+spaceId = os.environ.get("SPACE_ID") or "lhoestq/run-duckdb"
 
 try:
     process = subprocess.run(CMD + ["--help"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -22,26 +24,68 @@ except Exception:
 
 DRY_RUN = bool(HELP) and bool(m :=re.search("--dry(-|_)run", HELP)) and m.group(0)
 
-def
+def parse_log(line: str, pbars: dict[str, float]):
     if (percent_match := re.search("\\d+(?:\\.\\d+)?%", line)) and any(c in line.split("%")[1][:10] for c in "|█▌"):
         [pbars.pop(desc) for desc, percent in pbars.items() if percent == 1.]
         percent = float(percent_match.group(0)[:-1]) / 100
         desc = line[:percent_match.start()].strip() or "Progress"
         pbars[desc] = percent
+        yield ""
+    else:
+        yield line
 
 def dry_run(src, config, split, dst, query):
     if not all([src, config, split, dst, query]):
         raise gr.Error("Please fill source, destination and query.")
-
-
+    args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query, DRY_RUN]
+    cmd = CMD + args
+    logs = "Job:\n\n```bash\n" + " ".join('"' + arg.replace('"', '\"""') + '"' if " " in arg else arg for arg in cmd) + "\n```\nOutput:\n\n"
+    yield {output_markdown: logs, progress_labels: gr.Label(visible=False)}
+    process = subprocess.Popen(cmd, stdout=subprocess.PIPE)
     for line in iter(process.stdout.readline, b""):
         logs += line.decode()
-        yield {output_markdown: logs}
+        yield {output_markdown: logs}
 
-def run(src, config, split, dst, query):
+def run(src, config, split, dst, query, oauth_token: gr.OAuthToken | None, profile: gr.OAuthProfile | None):
     if not all([src, config, split, dst, query]):
         raise gr.Error("Please fill source, destination and query.")
-
+    if oauth_token and profile:
+        token = oauth_token.token
+        username = profile.username
+    elif (token := get_token()):
+        username = HfApi().whoami(token=token)["name"]
+    else:
+        raise gr.Error("Please log in to run the job.")
+    args = ["--src", src, "--config", config, "--split", split, "--dst", dst, "--query", query]
+    cmd = CMD + args
+    logs = "Job:\n\n```bash\n" + " ".join('"' + arg.replace('"', '\"""') + '"' if " " in arg else arg for arg in cmd) + "\n```\nOutput:\n\n"
+    pbars = {}
+    yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
+    resp = requests.post(
+        f"https://huggingface.co/api/jobs/{username}",
+        json={
+            "spaceId": spaceId,
+            "arguments": args,
+            "command": CMD,
+            "environment": {},
+            "flavor": "cpu-basic"
+        },
+        headers={"Authorization": f"Bearer {token}"}
+    )
+    if resp.status_code != 200:
+        logs += resp.text
+        pbars = {"Finished with an error ❌": 1.0}
+    else:
+        job_id = resp.json()["metadata"]["job_id"]
+        resp = requests.get(
+            f"https://huggingface.co/api/jobs/{username}/{job_id}/logs-stream",
+            headers={"Authorization": f"Bearer {token}"}
+        )
+        for line in iter(resp.raw.readline, b""):
+            logs += parse_log(line.decode(), pbars=pbars)
+            yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
+    pbars = {"Finished" + (" ✅" if process.returncode == 0 else " with an error ❌"): 1.0}
+    yield {output_markdown: logs, progress_labels: gr.Label(pbars, visible=bool(pbars))}
 
 READ_FUNCTIONS = ("pl.read_parquet", "pl.read_csv", "pl.read_json")
 NUM_TRENDING_DATASETS = 10
@@ -51,17 +95,17 @@ with gr.Blocks() as demo:
         with gr.Column(scale=10):
            gr.Markdown(f"# {TITLE} {EMOJI}")
         with gr.Column():
-            gr.LoginButton(
+            gr.LoginButton()
     with gr.Row():
-        with gr.Column():
+        with gr.Column(scale=10):
             with gr.Row():
                 loading_codes_json = gr.JSON([], visible=False)
                 dataset_dropdown = gr.Dropdown(label="Source Dataset", allow_custom_value=True, scale=10)
                 subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
                 split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
-        with gr.Column(
+        with gr.Column(min_width=60):
             gr.HTML("<div style='font-size: 4em;'>→</div>")
-        with gr.Column():
+        with gr.Column(scale=10):
             dst_dropdown = gr.Dropdown(label="Destination Dataset", allow_custom_value=True)
             query_textarea = gr.TextArea(label="SQL Query", placeholder="SELECT * FROM src;", value="SELECT * FROM src;", container=False, show_label=False)
         with gr.Row():