"""Streamlit demo: split a pasted text into chunks measured in characters or tokens.

The user picks a chunk size, a chunk overlap and the unit used to measure
length; pressing the button runs ``RecursiveCharacterTextSplitter`` over the
text and shows every resulting chunk in its own text area.
"""

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken

# Kept from the original script; neither constant is referenced by the
# visible code below.
CHARACTER_LENGTH = "length_function=lambda x: len(x)"
TOKEN_LENGTH = enc = tiktoken.get_encoding("cl100k_base")


def _count_tokens(text: str) -> int:
    """Return the number of ``cl100k_base`` tokens that *text* encodes to."""
    return len(enc.encode(text))


# Streamlit UI
st.title("Understand Chunk and Token")

chunk_size = st.number_input(
    min_value=1,
    label="Chunk Size",
    value=1000,
)

# The minimum is 0 so that "no overlap" is selectable and so the range stays
# valid when chunk_size == 1 (max_value is then 0). The maximum keeps the
# overlap strictly below the chunk size.
chunk_overlap = st.number_input(
    min_value=0,
    max_value=chunk_size - 1,
    label="Chunk Overlap",
    value=int(chunk_size * 0.2),
)

length_function_option = st.selectbox(
    "Length Function",
    ["Characters", "Tokens"],
)

# Honour the user's choice: measure chunks in tokens or in plain characters.
length_function = _count_tokens if length_function_option == "Tokens" else len

# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# Button that triggers splitting of the text
if st.button("Split Text"):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=length_function,
    )

    # Split the text
    splits = splitter.split_text(doc)

    # Show every produced chunk, numbered from 1
    for idx, split in enumerate(splits, start=1):
        st.text_area(f"Teilstück {idx}", split, height=150)