Spaces:
Sleeping
Sleeping
hide params on reclick
Browse files
app.py
CHANGED
|
@@ -78,52 +78,9 @@ DEFAULT_CODE = dedent(
|
|
| 78 |
|
| 79 |
make_gallery_image_buttons_js = """
|
| 80 |
function load() {
|
| 81 |
-
class ClassWatcher {
|
| 82 |
-
|
| 83 |
-
constructor(targetNode, classToWatch, classAddedCallback, arg) {
|
| 84 |
-
this.targetNode = targetNode
|
| 85 |
-
this.classToWatch = classToWatch
|
| 86 |
-
this.classAddedCallback = classAddedCallback
|
| 87 |
-
this.arg = arg
|
| 88 |
-
this.observer = null
|
| 89 |
-
this.lastClassState = targetNode.classList.contains(this.classToWatch)
|
| 90 |
-
|
| 91 |
-
this.init()
|
| 92 |
-
}
|
| 93 |
-
|
| 94 |
-
init() {
|
| 95 |
-
this.observer = new MutationObserver(this.mutationCallback)
|
| 96 |
-
this.observe()
|
| 97 |
-
}
|
| 98 |
-
|
| 99 |
-
observe() {
|
| 100 |
-
this.observer.observe(this.targetNode, { attributes: true })
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
disconnect() {
|
| 104 |
-
this.observer.disconnect()
|
| 105 |
-
}
|
| 106 |
-
|
| 107 |
-
mutationCallback = mutationsList => {
|
| 108 |
-
for (let mutation of mutationsList) {
|
| 109 |
-
if (mutation.type === 'attributes' && mutation.attributeName === 'class') {
|
| 110 |
-
let currentClassState = mutation.target.classList.contains(this.classToWatch)
|
| 111 |
-
if(this.lastClassState !== currentClassState) {
|
| 112 |
-
this.lastClassState = currentClassState
|
| 113 |
-
if(currentClassState) {
|
| 114 |
-
this.classAddedCallback(this.arg)
|
| 115 |
-
}
|
| 116 |
-
}
|
| 117 |
-
}
|
| 118 |
-
}
|
| 119 |
-
}
|
| 120 |
-
}
|
| 121 |
let buttons = document.getElementsByClassName("block-button");
|
| 122 |
-
function clickButton(i) {
|
| 123 |
-
buttons[i].click();
|
| 124 |
-
}
|
| 125 |
Array.from(document.getElementById("pipeline-gallery").getElementsByClassName("thumbnail-item")).map(
|
| 126 |
-
(b, i) =>
|
| 127 |
)
|
| 128 |
}
|
| 129 |
"""
|
|
@@ -147,7 +104,7 @@ tr td {
|
|
| 147 |
min-height: 600px;
|
| 148 |
max-height: 600px;
|
| 149 |
}
|
| 150 |
-
.
|
| 151 |
overflow: scroll;
|
| 152 |
}
|
| 153 |
"""
|
|
@@ -164,10 +121,10 @@ def non_empty_list_or_none(input_list: list[str]) -> Optional[list[str]]:
|
|
| 164 |
|
| 165 |
|
| 166 |
with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
| 167 |
-
state = gr.State({"selected_block":
|
| 168 |
gr.Markdown("# Common Crawl Pipeline Creator")
|
| 169 |
with gr.Row():
|
| 170 |
-
with gr.Column():
|
| 171 |
gallery = gr.Gallery(
|
| 172 |
blocks,
|
| 173 |
columns=4,
|
|
@@ -344,28 +301,31 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
| 344 |
]
|
| 345 |
|
| 346 |
with gr.Column():
|
| 347 |
-
with gr.
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
with gr.
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
|
|
|
| 360 |
|
| 361 |
|
| 362 |
gr.Markdown("_powered by [datatrove](https://github.com/huggingface/datatrove)_")
|
| 363 |
|
| 364 |
-
def show_block_ui(i):
|
|
|
|
|
|
|
| 365 |
return {**{block_ui: gr.Column(visible=(j == i)) for j, block_ui in enumerate(blocks_uis)}, state: {"selected_block": i}}
|
| 366 |
|
| 367 |
for i, button in enumerate(gallery_image_buttons):
|
| 368 |
-
button.click(partial(show_block_ui, i), outputs=blocks_uis + [state])
|
| 369 |
|
| 370 |
|
| 371 |
inputs = [
|
|
@@ -505,8 +465,8 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
| 505 |
|
| 506 |
if num_warc_samples:
|
| 507 |
yield {
|
| 508 |
-
output_tab: gr.Tab(f"Output (
|
| 509 |
-
excluded_tab: gr.Tab(f"Excluded (
|
| 510 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
| 511 |
**{
|
| 512 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
|
@@ -514,7 +474,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
| 514 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 515 |
},
|
| 516 |
**{
|
| 517 |
-
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (
|
| 518 |
for step_to_run in pipeline_executor.pipeline
|
| 519 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 520 |
},
|
|
@@ -535,8 +495,8 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
| 535 |
},
|
| 536 |
}
|
| 537 |
yield {
|
| 538 |
-
output_tab: gr.Tab(f"Output (
|
| 539 |
-
excluded_tab: gr.Tab(f"Excluded (
|
| 540 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
| 541 |
**{
|
| 542 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
|
@@ -544,7 +504,7 @@ with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
|
| 544 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 545 |
},
|
| 546 |
**{
|
| 547 |
-
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} (
|
| 548 |
for step_to_run in pipeline_executor.pipeline
|
| 549 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 550 |
},
|
|
|
|
| 78 |
|
| 79 |
make_gallery_image_buttons_js = """
|
| 80 |
function load() {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
let buttons = document.getElementsByClassName("block-button");
|
|
|
|
|
|
|
|
|
|
| 82 |
Array.from(document.getElementById("pipeline-gallery").getElementsByClassName("thumbnail-item")).map(
|
| 83 |
+
(b, i) => b.addEventListener("click", () => buttons[i].click())
|
| 84 |
)
|
| 85 |
}
|
| 86 |
"""
|
|
|
|
| 104 |
min-height: 600px;
|
| 105 |
max-height: 600px;
|
| 106 |
}
|
| 107 |
+
.scollabe_tabs .tab-wrapper .tab-container {
|
| 108 |
overflow: scroll;
|
| 109 |
}
|
| 110 |
"""
|
|
|
|
| 121 |
|
| 122 |
|
| 123 |
with gr.Blocks(css=css, js=make_gallery_image_buttons_js) as demo:
|
| 124 |
+
state = gr.State({"selected_block": None})
|
| 125 |
gr.Markdown("# Common Crawl Pipeline Creator")
|
| 126 |
with gr.Row():
|
| 127 |
+
with gr.Column(min_width=640):
|
| 128 |
gallery = gr.Gallery(
|
| 129 |
blocks,
|
| 130 |
columns=4,
|
|
|
|
| 301 |
]
|
| 302 |
|
| 303 |
with gr.Column():
|
| 304 |
+
with gr.Tabs(elem_classes="scollabe_tabs"):
|
| 305 |
+
with gr.Tab("Output (and % of data)") as output_tab:
|
| 306 |
+
output_dataframe = gr.DataFrame(datatype="markdown")
|
| 307 |
+
with gr.Tab("Excluded (and % of data)") as excluded_tab:
|
| 308 |
+
with gr.Tabs(elem_classes="scollabe_tabs"):
|
| 309 |
+
excluded_dataframes: dict[Type, gr.DataFrame] = {}
|
| 310 |
+
excluded_tabs: dict[Type, gr.Tab] = {}
|
| 311 |
+
for step in steps:
|
| 312 |
+
if issubclass(step, BaseFilter) and step is not URLFilter:
|
| 313 |
+
with gr.Tab(step.__name__ + " (and % of data)") as t:
|
| 314 |
+
excluded_dataframes[step] = gr.DataFrame(datatype="markdown")
|
| 315 |
+
excluded_tabs[step] = t
|
| 316 |
+
with gr.Tab("Python code") as code_tab:
|
| 317 |
+
python_code_markdown = gr.Markdown(DEFAULT_CODE)
|
| 318 |
|
| 319 |
|
| 320 |
gr.Markdown("_powered by [datatrove](https://github.com/huggingface/datatrove)_")
|
| 321 |
|
| 322 |
+
def show_block_ui(i, current_state: dict):
|
| 323 |
+
if i == current_state.get("selected_block"):
|
| 324 |
+
i = None
|
| 325 |
return {**{block_ui: gr.Column(visible=(j == i)) for j, block_ui in enumerate(blocks_uis)}, state: {"selected_block": i}}
|
| 326 |
|
| 327 |
for i, button in enumerate(gallery_image_buttons):
|
| 328 |
+
button.click(partial(show_block_ui, i), inputs=[state], outputs=blocks_uis + [state])
|
| 329 |
|
| 330 |
|
| 331 |
inputs = [
|
|
|
|
| 465 |
|
| 466 |
if num_warc_samples:
|
| 467 |
yield {
|
| 468 |
+
output_tab: gr.Tab(f"Output ({len(output_docs)/num_warc_samples*100:.03f}%)"),
|
| 469 |
+
excluded_tab: gr.Tab(f"Excluded ({100 - len(output_docs)/num_warc_samples*100:.03f}%)"),
|
| 470 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
| 471 |
**{
|
| 472 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
|
|
|
| 474 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 475 |
},
|
| 476 |
**{
|
| 477 |
+
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} ({len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}%)")
|
| 478 |
for step_to_run in pipeline_executor.pipeline
|
| 479 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 480 |
},
|
|
|
|
| 495 |
},
|
| 496 |
}
|
| 497 |
yield {
|
| 498 |
+
output_tab: gr.Tab(f"Output ({len(output_docs)/num_warc_samples*100:.03f}%)"),
|
| 499 |
+
excluded_tab: gr.Tab(f"Excluded ({100 - len(output_docs)/num_warc_samples*100:.03f}%)"),
|
| 500 |
output_dataframe: pd.DataFrame({"text": [doc.text for doc in output_docs]}),
|
| 501 |
**{
|
| 502 |
excluded_dataframes[type(step_to_run)]: pd.DataFrame({"text": [doc.text for doc in step_to_run.exclusion_writer.docs]})
|
|
|
|
| 504 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 505 |
},
|
| 506 |
**{
|
| 507 |
+
excluded_tabs[type(step_to_run)]: gr.Tab(f"{type(step_to_run).__name__} ({len(step_to_run.exclusion_writer.docs)/num_warc_samples*100:.03f}%)")
|
| 508 |
for step_to_run in pipeline_executor.pipeline
|
| 509 |
if isinstance(step_to_run, BaseFilter) and type(step_to_run) in excluded_dataframes
|
| 510 |
},
|