im committed on
Commit · 84c1553
1 Parent(s): 120ad45
add chapter about Embeddings and vector comparison charts
Browse files
- .gitignore +2 -1
- app.py +75 -1
- requirements.txt +3 -1
.gitignore
CHANGED
@@ -162,4 +162,5 @@ cython_debug/
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
 # and can be added to the global gitignore or merged into this file. For a more nuclear
 # option (not recommended) you can uncomment the following to ignore the entire idea folder.
-.idea/
+.idea/
+.streamlit/secrets.toml
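The new ignore entry pairs with the OpenAI call added in app.py below: Streamlit loads local secrets from .streamlit/secrets.toml and exposes them via st.secrets, so the file holding the API key must stay out of version control. A minimal sketch of the expected setup (the key name matches the one app.py reads; the value is a placeholder):

# .streamlit/secrets.toml (local only, now ignored by git) is assumed to contain:
#
#   OPENAI_API_KEY = "sk-..."
#
# app.py then reads it through Streamlit's secrets API:
import streamlit as st
import openai

openai.api_key = st.secrets["OPENAI_API_KEY"]  # fails at startup if the secret is not set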
app.py
CHANGED
@@ -213,7 +213,7 @@ if tokeniser_name == 'BPE':
 """)


-st.subheader("Try Yourself:")
+st.subheader(":green[Try Yourself:]")
 st.write(f"""\
 *Using text area field below try to find or create a comprehensive vocabulary (training dataset) for Tokenisation, which can enhance the
 efficiency of the process. This approach helps to eliminate unknown tokens, thereby making the token sequence

@@ -445,3 +445,77 @@ with st.expander("References:"):
 divider()
 st.header("Embeddings")
 st.caption("TBD...")
+
+st.write("""\
+Following tokenization, each token is transformed into a vector of numeric characteristics, a process
+known as 'embedding.' In this context, 'embedding' refers to the mapping of the discrete, categorical space of words
+or tokens into a continuous, numeric space, which the model can manipulate more effectively.
+
+Each dimension in this high-dimensional space can encapsulate a different facet of the token's meaning. For instance,
+one dimension might capture the tense of a token if it's a verb, while another dimension might capture the degree of
+positivity or negativity if the token is an adjective expressing sentiment. For example:
+""")
+st.code("""\
+"I" -> [noun, person]
+"love" -> [verb, feeling]
+"machine" -> [noun, automation]
+"learn" -> [verb, knowledge]
+"##ing" -> [gerund, continues]
+""")
+
+st.write("""\
+The actual embeddings in a typical NLP model would be in a much higher-dimensional space (often several hundred dimensions), but the idea is the same.
+Embeddings are dynamically learned from the data, with the model adjusting these embeddings during
+training to minimize the discrepancy between the predicted and actual outputs for a set of training examples.
+Consequently, tokens with similar meanings often end up with similar embeddings.
+
+In the context of Transformers, these embeddings are the inputs that the model uses. Once again, we represent all the
+characteristics using numbers, not words.
+""")
+
+col1, col2 = st.columns(2)
+token_king = col1.text_input("Choose words to compare embeddings:", value="king")
+token_queen = col2.text_input("Choose words to compare embeddings:", value="queen")
+
+from torch import nn
+from transformers import AutoConfig
+from transformers import AutoTokenizer
+import pandas as pd
+import openai
+
+model_ckpt = 'bert-base-uncased'
+tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
+king_id = tokenizer(token_king, return_tensors="pt", add_special_tokens=False)
+queen_id = tokenizer(token_queen, return_tensors="pt", add_special_tokens=False)
+
+config = AutoConfig.from_pretrained(model_ckpt)
+token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
+king_embeddings = token_emb(king_id.input_ids)
+queen_embeddings = token_emb(queen_id.input_ids)
+king_emb_np = king_embeddings.reshape(-1).detach().numpy()
+queen_emb_np = queen_embeddings.reshape(-1).detach().numpy()
+
+
+openai.api_key = st.secrets["OPENAI_API_KEY"]
+EMBEDDING_MODEL = 'text-embedding-ada-002'
+EMBEDDING_CTX_LENGTH = 8191
+EMBEDDING_ENCODING = 'cl100k_base'
+king = openai.Embedding.create(input=token_king, model=EMBEDDING_MODEL)["data"][0]["embedding"]
+queen = openai.Embedding.create(input=token_queen, model=EMBEDDING_MODEL)["data"][0]["embedding"]
+
+st.write("Google's 'bert-base-uncased' model embeddings:")
+df = pd.DataFrame({f'"{token_king}" embeddings': king_emb_np[:50], f'"{token_queen}" embeddings': queen_emb_np[:50]})
+st.line_chart(df)
+
+
+st.write("OpenAI's 'text-embedding-ada-002' model embeddings:")
+df = pd.DataFrame({f'"{token_king}" embeddings': king[:50], f'"{token_queen}" embeddings': queen[:50]})
+st.line_chart(df)
+
+
+
+with st.expander("References:"):
+    st.write("""\
+    - https://huggingface.co/blog/getting-started-with-embeddings
+    - https://huggingface.co/blog/1b-sentence-embeddings
+    """)
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
 streamlit~=1.21.0
 tokenizers~=0.13.3
-transformers~=4.31.0
+transformers~=4.31.0
+torch~=2.0.1
+openai~=0.27.8