Update index.html
Browse files- index.html +7 -7
index.html
CHANGED
@@ -65,7 +65,7 @@
|
|
65 |
|
66 |
<div class="column has-text-centered">
|
67 |
<span class="link-block">
|
68 |
-
<a href="https://arxiv.org/abs/
|
69 |
class="external-link button is-normal is-rounded is-dark">
|
70 |
<span class="icon">
|
71 |
<i class="ai ai-arxiv"></i>
|
@@ -151,9 +151,9 @@
|
|
151 |
<li><strong>✔️ Accuracy:</strong> Spearman’s ρ > 0.87 with human ground truth</li>
|
152 |
<li><strong>📈 Downstream LLM Training:</strong>
|
153 |
<ul>
|
154 |
-
<li
|
155 |
-
<li
|
156 |
-
<li>Effective threshold strategies:
|
157 |
</ul>
|
158 |
</li>
|
159 |
<li><strong>⚡ Annotation Speed:</strong> ~11,000 docs/min (A100 GPU, avg. 690 tokens)</li>
|
@@ -184,10 +184,10 @@
|
|
184 |
<h2 class="title is-3">📜 Citation</h2>
|
185 |
<p>If you use JQL, the annotations, or the pretrained annotators, please cite the paper:</p>
|
186 |
<pre><code>@article{ali2024jql,
|
187 |
-
title={
|
188 |
author={Ali, Mehdi and Brack, Manuel and Lübbering, Max and Wendt, Elias and Khan, Abbas Goher and Rutmann, Richard and Jude, Alex and Kraus, Maurice and Weber, Alexander Arno and Stollenwerk, Felix and Kaczér, David and Mai, Florian and Flek, Lucie and Sifa, Rafet and Flores-Herr, Nicolas and Köhler, Joachim and Schramowski, Patrick and Fromm, Michael and Kersting, Kristian},
|
189 |
-
journal={
|
190 |
-
year={
|
191 |
}</code></pre>
|
192 |
</div>
|
193 |
</section>
|
|
|
65 |
|
66 |
<div class="column has-text-centered">
|
67 |
<span class="link-block">
|
68 |
+
<a href="https://arxiv.org/abs/2505.22232" target="_blank"
|
69 |
class="external-link button is-normal is-rounded is-dark">
|
70 |
<span class="icon">
|
71 |
<i class="ai ai-arxiv"></i>
|
|
|
151 |
<li><strong>✔️ Accuracy:</strong> Spearman’s ρ > 0.87 with human ground truth</li>
|
152 |
<li><strong>📈 Downstream LLM Training:</strong>
|
153 |
<ul>
|
154 |
+
<li>Benchmark performance improvement over FineWeb2</li>
|
155 |
+
<li>Higher document retention vs. FineWeb2 heuristic filter</li>
|
156 |
+
<li>Effective dynamic threshold strategies: Trade-off document quality for quantity</li>
|
157 |
</ul>
|
158 |
</li>
|
159 |
<li><strong>⚡ Annotation Speed:</strong> ~11,000 docs/min (A100 GPU, avg. 690 tokens)</li>
|
|
|
184 |
<h2 class="title is-3">📜 Citation</h2>
|
185 |
<p>If you use JQL, the annotations, or the pretrained annotators, please cite the paper:</p>
|
186 |
<pre><code>@article{ali2024jql,
|
187 |
+
title={Judging Quality Across Languages: A Multilingual Approach to Pretraining Data Filtering with Language Models},
|
188 |
author={Ali, Mehdi and Brack, Manuel and Lübbering, Max and Wendt, Elias and Khan, Abbas Goher and Rutmann, Richard and Jude, Alex and Kraus, Maurice and Weber, Alexander Arno and Stollenwerk, Felix and Kaczér, David and Mai, Florian and Flek, Lucie and Sifa, Rafet and Flores-Herr, Nicolas and Köhler, Joachim and Schramowski, Patrick and Fromm, Michael and Kersting, Kristian},
|
189 |
+
journal={arXiv preprint arXiv:2505.22232},
|
190 |
+
year={2025}
|
191 |
}</code></pre>
|
192 |
</div>
|
193 |
</section>
|