"""Streamlit app for exploring how LangChain text splitters chunk a text.

The user picks a chunk size, a chunk overlap, the unit used to measure
length (characters or ``cl100k_base`` tokens) and a splitter, pastes a text
and gets every resulting chunk rendered in its own text area.
"""

import streamlit as st
import tiktoken
from langchain.text_splitter import (
    CharacterTextSplitter,
    Language,
    RecursiveCharacterTextSplitter,
)

# Source-code snippets describing the selected configuration.
# NOTE(review): nothing in this script renders these templates (nor
# `length_function_str` further down); they are kept unchanged in case
# other code relies on them.
CHARACTER_LENGTH = "length_function=lambda x: len(x)"

TOKEN_LENGTH = """enc = tiktoken.get_encoding("cl100k_base")
length_function = lambda text: len(enc.encode(text))
"""

CHARACTER = """CharacterTextSplitter(
    separator="\\n\\n",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function={length_function}
)
"""

RECURSIVE_CHARACTER = """RecursiveCharacterTextSplitter(
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function={length_function}
)
"""

LANGUAGE = """RecursiveCharacterTextSplitter.from_language(
    language="{language}",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function={length_function}
)
"""

# Prefix of the select-box entries that stand for a `Language` member.
_LANGUAGE_PREFIX = "Language."


def _make_length_function(option):
    """Return ``(length_function, snippet)`` for the chosen length unit.

    Args:
        option: Either ``"Characters"`` or ``"Tokens"``.

    Returns:
        A callable mapping a string to its length in the chosen unit, and
        the matching source-code snippet constant.

    Raises:
        ValueError: If ``option`` is neither of the two supported values.
    """
    if option == "Characters":
        # Measure in characters.
        return len, CHARACTER_LENGTH
    if option == "Tokens":
        # Measure in tokens using tiktoken's cl100k_base encoding.
        enc = tiktoken.get_encoding("cl100k_base")

        def count_tokens(text: str) -> int:
            return len(enc.encode(text))

        return count_tokens, TOKEN_LENGTH
    raise ValueError("Ungültige Option für length_function.")


def _build_splitter(choice, chunk_size, chunk_overlap, length_function):
    """Create the text splitter selected in the UI.

    Args:
        choice: ``"Character"``, ``"RecursiveCharacter"`` or
            ``"Language.<NAME>"`` where ``<NAME>`` is a `Language` member name.
        chunk_size: Maximum chunk length, measured by ``length_function``.
        chunk_overlap: Overlap between consecutive chunks, same unit.
        length_function: Callable returning the length of a string.

    Returns:
        The configured splitter instance.

    Raises:
        ValueError: If ``choice`` is not one of the supported forms.
        KeyError: If a ``Language.<NAME>`` choice names no `Language` member.
    """
    if choice == "Character":
        return CharacterTextSplitter(
            separator="\n\n",
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )
    if choice == "RecursiveCharacter":
        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )
    if choice.startswith(_LANGUAGE_PREFIX):
        # Look the member up by name and pass the enum itself, instead of
        # assuming that the lower-cased name equals the enum's value.
        language = Language[choice[len(_LANGUAGE_PREFIX):]]
        return RecursiveCharacterTextSplitter.from_language(
            language=language,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )
    raise ValueError("Ungültige Wahl beim Text Splitter.")


# Streamlit UI
st.title("Understand Chunk and Token")

col1, col2, col3, col4 = st.columns([1, 1, 1, 2])

with col1:
    chunk_size = st.number_input(
        min_value=1,
        label="Chunk Size",
        value=1000,
    )

with col2:
    # min_value is 0 (not 1): for chunk sizes 1-4 the default
    # int(chunk_size * 0.2) is 0, and for chunk size 1 the maximum is 0 as
    # well, so a minimum of 1 made the widget raise instead of rendering.
    # Zero overlap is also a legitimate setting.
    chunk_overlap = st.number_input(
        min_value=0,
        max_value=chunk_size - 1,
        label="Chunk Overlap",
        value=int(chunk_size * 0.2),
    )
    # The widget's max_value should already prevent this; kept as a safeguard.
    if chunk_overlap >= chunk_size:
        st.warning("Achtung: Chunk Overlap sollte kleiner als die Chunk Size sein!")

with col3:
    length_function_option = st.selectbox(
        "Length Function",
        ["Characters", "Tokens"],
    )

splitter_choices = ["RecursiveCharacter", "Character"] + [
    f"{_LANGUAGE_PREFIX}{v.name}" for v in Language
]

with col4:
    splitter_choice = st.selectbox(
        "Select a Text Splitter",
        splitter_choices,
    )

# Pick the length function matching the selected unit.
length_function, length_function_str = _make_length_function(length_function_option)

# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# Button that triggers splitting of the text
if st.button("Split Text"):
    if not doc.strip():
        # Nothing to split: tell the user instead of rendering nothing.
        st.warning("Bitte gib zuerst einen Text ein.")
    else:
        # Build the splitter object based on the selection.
        splitter = _build_splitter(
            splitter_choice,
            chunk_size,
            chunk_overlap,
            length_function,
        )

        # Split the text.
        splits = splitter.split_text(doc)

        # Render every produced chunk in its own text area.
        for idx, split in enumerate(splits, start=1):
            st.text_area(f"Teilstück {idx}", split, height=150)