Commit 4ee0173
Parent(s): 5c906aa

update "how to train at 100mbps"

Files changed:
- app.py +44 -8
- st_helpers.py +2 -7
- static/content_style.css +4 -1
app.py
CHANGED

@@ -19,11 +19,11 @@ make_header()
 content_text(f"""
 There was a time when you could comfortably train state-of-the-art vision and language models at home on your workstation.
 The first convolutional neural net to beat ImageNet
-(
+({cite("AlexNet", "https://proceedings.neurips.cc/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf")})
 was trained for 5-6 days on two gamer-grade GPUs. In contrast, today's TOP-1 ImageNet model
-(
+({cite("CoAtNet", "https://arxiv.org/abs/2106.04803")})
 takes 20,000 TPU-v3 days. And things are even worse in the NLP world: training
-
+{cite("GPT-3", "https://arxiv.org/abs/2005.14165")} on a top-tier server
 with 8x A100 would take decades.""")

 content_text(f"""

@@ -34,12 +34,49 @@ All it takes is for a bunch of us to come together. In fact, we're doing it righ
 draw_current_progress()

 content_text(f"""
-We're training a model similar to
+We're training a model similar to {cite("OpenAI DALL-E", "https://openai.com/blog/dall-e/")},
 that is, a transformer "language model" that generates images from text descriptions.
-It is trained on
+It is trained on {cite("LAION-400M", "https://laion.ai/laion-400-open-dataset/")},
 the world's largest openly available image-text-pair dataset with 400 million samples. Our model is based on
-the
-by
+the {cite("dalle-pytorch", "https://github.com/lucidrains/DALLE-pytorch")} implementation
+by {cite("Phil Wang", "https://github.com/lucidrains")} with a few tweaks to make it communication-efficient.
+""", vspace_after=8)
+
+
+with st.expander("How to train efficiently over the internet?"):
+    content_text(f"""
+    Modern distributed training algorithms are designed for HPC networks with 10-100 gigabit per second bandwidth.
+    In turn, a typical Internet connection runs at 10-100 megabits per second: that's three orders of magnitude slower.
+    To make distributed training over the Internet efficient, you need to win back these three orders of magnitude.
+    """)
+    content_text(f"""
+    This may seem daunting at first, but in reality, DL researchers have already built all the pieces needed to solve this puzzle:
+    <table style="border: 0px;"><tbody style="border: 0px;">
+    <tr><td>Speed-up (vs. AllReduce)</td><td>Existing technique</td></tr>
+    <tr><td class=centered><strong>4-16x</strong></td><td>
+    <strong>Large-batch training:</strong> {cite("You et al. (2019)", "https://arxiv.org/abs/1904.00962")} proposed a way to train neural networks efficiently with larger batches, and hence, fewer communication rounds.
+    </td></tr>
+    <tr><td class=centered><strong>4-64x</strong></td><td>
+    <strong>Gradient compression:</strong> from simple {cite("8-bit quantization", "https://arxiv.org/abs/1511.04561")}
+    to advanced techniques such as {cite("Deep Gradient Compression", "https://arxiv.org/abs/1712.01887")},
+    {cite("PowerSGD", "https://arxiv.org/abs/1905.13727")}, {cite("1-bit Adam", "https://arxiv.org/abs/2102.02888")},
+    and many others. As a rule of thumb, you can safely reduce communication by 16-64x. More extreme compression is often
+    possible, but it may affect stability or final quality.
+    </td></tr>
+    <tr><td class=centered><strong>4-24x</strong></td><td>
+    <strong>Parameter sharing:</strong> reusing parameters between model layers results in a model with fewer parameters,
+    and hence, fewer gradients to communicate. {cite("Lan et al. (2019)", "https://arxiv.org/abs/1909.11942")} and
+    {cite("Xue et al. (2021)", "https://arxiv.org/pdf/2107.11817.pdf")} propose efficient parameter sharing techniques
+    for NLP and vision.
+    </td></tr>
+    <tr><td class=centered><strong>1.5-2x</strong></td><td>
+    <strong>Overlapping computation with communication:</strong> running network communication in the background while
+    computing the next portion of gradients. This is a {cite("long-standing trick from HPC", "https://ur.booksc.eu/book/1624068/2d0506")}
+    that was recently adapted for DL training. {cite("Ren et al. (2021)", "https://arxiv.org/abs/2101.06840")} show that
+    updating parameters in the background while computing the next batch of gradients does not reduce convergence.
+    </td></tr>
+    </tbody></table>
+    """)


 content_title("How do I join?")

@@ -71,5 +108,4 @@ content_text("<b> TODO </b> General Story That Weaves Together Three Tabs Below

 make_tabs()

-content_text("<b> TODO UPDATE")
 make_footer()
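The expander added above claims a gap of three orders of magnitude between HPC interconnects and home connections. A quick back-of-the-envelope check in Python (the 1B-parameter model size is an assumption for illustration, not a number from this commit):

# Rough cost of one full gradient synchronization: pure payload / bandwidth,
# ignoring the constant factors that real collective operations add.
param_count = 1_000_000_000                     # hypothetical 1B-parameter model
payload_gbit = param_count * 4 * 8 / 1e9        # 4 bytes per fp32 param -> ~32 gigabits

for name, gbps in [("HPC network (100 Gbit/s)", 100.0), ("home internet (100 Mbit/s)", 0.1)]:
    print(f"{name}: {payload_gbit / gbps:,.1f} s per sync")
# HPC network (100 Gbit/s): 0.3 s per sync
# home internet (100 Mbit/s): 320.0 s per sync

At 100 Mbit/s a single full-precision synchronization already costs minutes, which is why the techniques in the table above are needed.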
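The "Large-batch training" row refers to the LAMB optimizer of You et al. (2019). A minimal sketch of its layer-wise trust-ratio idea, assuming PyTorch; this is illustrative, not the optimizer this Space actually uses:

import torch

def lamb_style_update(param: torch.Tensor, adam_direction: torch.Tensor, lr: float = 1e-3):
    # `adam_direction` is the usual Adam step, m_hat / (sqrt(v_hat) + eps).
    # LAMB rescales it per layer so that very large batches keep stable step sizes.
    w_norm, d_norm = param.norm(), adam_direction.norm()
    trust_ratio = (w_norm / d_norm).item() if w_norm > 0 and d_norm > 0 else 1.0
    param.data.add_(adam_direction, alpha=-lr * trust_ratio)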
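For the "Gradient compression" row, the simplest scheme is uniform 8-bit quantization: each worker sends one int8 per value plus a single fp32 scale, roughly a 4x traffic cut before any of the fancier cited methods. A simplified sketch (real systems add error feedback and per-chunk scales):

import torch

def quantize_8bit(grad: torch.Tensor):
    scale = grad.abs().max().clamp(min=1e-12) / 127.0    # per-tensor scale
    q = (grad / scale).round().clamp(-127, 127).to(torch.int8)
    return q, scale                                      # 1 byte per value + one fp32

def dequantize_8bit(q: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    return q.float() * scale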
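The "Parameter sharing" row is the ALBERT-style trick: one set of layer weights reused at every depth, so only one layer's gradients ever cross the wire. An illustrative sketch with made-up hyperparameters:

import torch.nn as nn

class SharedDepthEncoder(nn.Module):
    def __init__(self, d_model: int = 512, n_heads: int = 8, depth: int = 12):
        super().__init__()
        # One set of weights applied `depth` times: roughly depth-fold fewer
        # parameters (and gradients to communicate) than distinct layers.
        self.layer = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.depth = depth

    def forward(self, x):
        for _ in range(self.depth):
            x = self.layer(x)
        return x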
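And for "Overlapping computation with communication", the standard recipe is to launch an asynchronous all-reduce for each gradient the moment backward produces it, then drain the handles at the end. A sketch assuming an already-initialized torch.distributed process group, the NCCL backend (for ReduceOp.AVG), and PyTorch 2.1+ (for register_post_accumulate_grad_hook):

import torch.distributed as dist

def backward_with_overlap(loss, model):
    handles = []

    def start_reduce(param):             # fires as soon as param.grad is accumulated
        handles.append(dist.all_reduce(param.grad, op=dist.ReduceOp.AVG, async_op=True))

    hooks = [p.register_post_accumulate_grad_hook(start_reduce)
             for p in model.parameters() if p.requires_grad]
    loss.backward()                      # reductions overlap the remaining backward compute
    for h in handles:
        h.wait()                         # ensure every gradient is fully averaged
    for h in hooks:
        h.remove()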
st_helpers.py
CHANGED

@@ -50,10 +50,5 @@ def content_text(text: str, vspace_before: int = 0, vspace_after: int = 0):
         f'{text}</div><center>',
         unsafe_allow_html=True)

-
-
-
-
-def cite(tag):
-    CITATIONS.setdefault(tag, len(CITATIONS) + 1)
-    return f" [{CITATIONS[tag]}]"
+def cite(tag, link):
+    return f"""<a target="_blank" rel="noopener noreferrer" href="{link}">{tag}</a>"""
static/content_style.css
CHANGED

@@ -1,11 +1,14 @@
 .faded {
     margin: 0 auto;
     background: var(--window-color);
-    box-shadow: 0 0
+    box-shadow: 0 0 1px 1px var(--window-color);
     font-family: cursive;
     font-family: "Gill Sans", sans-serif;
     display: inline-block
 }
+.centered {
+    text-align: center;
+}
 .padded {
     width: 100%;
     max-width: 800px;