Spaces:
Runtime error
Runtime error
File size: 1,839 Bytes
86b7493 17b7b74 86b7493 d40b082 86b7493 1d8c55e 86b7493 17b7b74 86b7493 64df977 86b7493 2737fb5 86b7493 2737fb5 86b7493 2737fb5 86b7493 2737fb5 86b7493 2737fb5 86b7493 2737fb5 86b7493 2737fb5 86b7493 17b7b74 86b7493 17b7b74 86b7493 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import streamlit as st
import tiktoken
from langchain.text_splitter import (
    CharacterTextSplitter,
    RecursiveCharacterTextSplitter,
)
# Code snippets, as plain strings, describing how chunk length is measured for
# each "Length Function" option. One of them is stored in
# ``length_function_str`` once the user has picked an option; the callable
# that actually measures length is bound in that same selection step, so
# nothing has to be loaded here.
CHARACTER_LENGTH = "length_function=lambda x: len(x)"
TOKEN_LENGTH = (
    'enc = tiktoken.get_encoding("cl100k_base")\n'
    "length_function = lambda text: len(enc.encode(text))"
)
# Streamlit UI
st.title("Understand Chunk and Token")

# Target size of each chunk, measured with the selected length function.
chunk_size = st.number_input(
    label="Chunk Size",
    min_value=1,
    value=1000,
)

# Overlap between consecutive chunks, kept strictly below the chunk size.
# The lower bound is 0 (no overlap) so that both the default value
# (int(chunk_size * 0.2), which is 0 for chunk sizes below 5) and the upper
# bound (chunk_size - 1, which is 0 for a chunk size of 1) always lie inside
# the allowed range; a lower bound of 1 made the widget raise for those sizes.
chunk_overlap = st.number_input(
    label="Chunk Overlap",
    min_value=0,
    max_value=chunk_size - 1,
    value=int(chunk_size * 0.2),
)

# Unit used to measure chunk length: characters or tokens.
length_function_option = st.selectbox(
    "Length Function",
    ["Characters", "Tokens"],
)

# Which text splitter implementation to apply.
splitter_choice = st.selectbox(
    "Select a Text Splitter",
    ["RecursiveCharacter", "Character"],
)
# Pick the length function that matches the selected option.
# Reject anything that is not one of the two known options up front.
if length_function_option not in ("Characters", "Tokens"):
    raise ValueError("Ungültige Option für length_function.")

if length_function_option == "Tokens":
    # Measure in tokens using tiktoken
    enc = tiktoken.get_encoding("cl100k_base")

    def length_function(text: str) -> int:
        """Return the number of tokens ``enc`` produces for *text*."""
        return len(enc.encode(text))

    length_function_str = TOKEN_LENGTH
else:
    # Measure in characters
    length_function = len
    length_function_str = CHARACTER_LENGTH
# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# Split the text only when the button is pressed
if st.button("Split Text"):
    # Map every "Select a Text Splitter" option to the class implementing it,
    # so the user's choice is honoured instead of always using the recursive
    # splitter.
    splitter_classes = {
        "RecursiveCharacter": RecursiveCharacterTextSplitter,
        "Character": CharacterTextSplitter,
    }
    splitter = splitter_classes[splitter_choice](
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=length_function,
    )

    # Split the text into chunks
    splits = splitter.split_text(doc)

    # Show every chunk in its own numbered text area
    for idx, split in enumerate(splits, start=1):
        st.text_area(f"Teilstück {idx}", split, height=150)
|