Spaces:
Runtime error
Runtime error
import streamlit as st | |
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language | |
import tiktoken | |
#
# EXAMPLE CODE SNIPPETS
# =====================
# The original example refers to "code_snippets"; put your own code examples
# or strings here. For this demo a few plain strings are used.
#
# Every splitter template is filled with str.format() and accepts the fields
# {chunk_size}, {chunk_overlap} and {length_function} (LANGUAGE additionally
# takes {language}). {length_function} receives one of the two *_LENGTH
# snippets below, i.e. complete source that defines a name `length_function`.
# It is therefore emitted as a preamble and the splitter call refers to that
# name; substituting it directly after `length_function=` would render
# `length_function=length_function=...`, which is not valid Python.
#

# Preamble measuring chunk length in characters.
CHARACTER_LENGTH = "length_function = len"

# Preamble measuring chunk length in tokens.
TOKEN_LENGTH = """import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
length_function = lambda text: len(enc.encode(text))"""

CHARACTER = """from langchain.text_splitter import CharacterTextSplitter

{length_function}

splitter = CharacterTextSplitter(
    separator="\\n\\n",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""

RECURSIVE_CHARACTER = """from langchain.text_splitter import RecursiveCharacterTextSplitter

{length_function}

splitter = RecursiveCharacterTextSplitter(
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""

LANGUAGE = """from langchain.text_splitter import RecursiveCharacterTextSplitter

{length_function}

splitter = RecursiveCharacterTextSplitter.from_language(
    language="{language}",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""
# Streamlit UI: page heading followed by an info box that explains each of
# the four settings offered below.
st.title("Text Splitter Playground")

intro_text = """\
Splitte einen Text in Teilstücke (Chunks), basierend auf deinen Einstellungen:
- **Chunk Size**: Maximalgröße eines Teilstücks (in Zeichen oder Tokens)
- **Chunk Overlap**: Überlappung zwischen den Teilstücken
- **Length Function**: Gibt an, ob die Teilstück-Größe in Zeichen oder Tokens gemessen werden soll
- **Splitter Choice**: Definiert den Text-Splitter (Charakter-basiert, rekursiv oder basierend auf einer Sprache)
"""
st.info(intro_text)
# Four input columns; the last one is twice as wide as the others.
col1, col2, col3, col4 = st.columns([1, 1, 1, 2])

with col1:
    # Maximum size of a chunk, measured with the selected length function.
    chunk_size = st.number_input(
        min_value=1,
        label="Chunk Size",
        value=1000
    )

with col2:
    # The overlap has to stay below the chunk size. For a chunk size of 1
    # the only admissible overlap is therefore 0.
    max_overlap = max(chunk_size - 1, 0)
    chunk_overlap = st.number_input(
        # 0 is a valid setting (no overlap at all) and keeps the range
        # non-empty even for the smallest chunk size.
        min_value=0,
        max_value=max_overlap,
        label="Chunk Overlap",
        # Default to 20 % of the chunk size, clamped into the allowed range
        # so that small chunk sizes never yield an out-of-range default.
        value=min(int(chunk_size * 0.2), max_overlap)
    )
    # Defensive check; the widget limits above should already prevent this.
    if chunk_overlap >= chunk_size:
        st.warning("Achtung: Chunk Overlap sollte kleiner als die Chunk Size sein!")
# Selectable splitters: the two generic ones first, then one entry per member
# of the Language enum, labelled "Language.<MEMBER_NAME>".
splitter_choices = [
    "RecursiveCharacter",
    "Character",
    *(f"Language.{member.name}" for member in Language),
]

with col3:
    # Unit in which chunk sizes are measured.
    length_function_option = st.selectbox("Length Function", ["Characters", "Tokens"])

with col4:
    splitter_choice = st.selectbox("Select a Text Splitter", splitter_choices)
# Pick the length function that matches the selected unit, together with the
# source snippet that is shown for it in the example code.
if length_function_option == "Tokens":
    # Measure in tokens using tiktoken's "cl100k_base" encoding.
    enc = tiktoken.get_encoding("cl100k_base")
    length_function_str = TOKEN_LENGTH

    def length_function(text: str) -> int:
        """Return the number of tokens `enc` produces for `text`."""
        return len(enc.encode(text))

elif length_function_option == "Characters":
    # Measure in characters.
    length_function_str = CHARACTER_LENGTH
    length_function = len
else:
    raise ValueError("Ungültige Option für length_function.")
# Build the example source text for the splitter the user selected.
# These template fields are shared by all three templates.
snippet_fields = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": length_function_str,
}

if splitter_choice == "RecursiveCharacter":
    import_text = RECURSIVE_CHARACTER.format(**snippet_fields)
elif splitter_choice == "Character":
    import_text = CHARACTER.format(**snippet_fields)
elif "Language." in splitter_choice:
    # "Language.PYTHON" -> "python"
    lang = splitter_choice.split(".")[1].lower()
    import_text = LANGUAGE.format(language=lang, **snippet_fields)
else:
    raise ValueError("Ungültige Wahl beim Text Splitter.")
# Show the generated example code. The snippet is wrapped in a fenced code
# block so that Markdown keeps its line breaks and indentation instead of
# reflowing the source as ordinary paragraph text.
st.info(f"**Beispielcode:**\n\n```python\n{import_text.rstrip()}\n```")
# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# Split only when the user presses the button.
if st.button("Split Text"):
    if not doc.strip():
        # Nothing to split: give feedback instead of silently showing no output.
        st.warning("Bitte gib zuerst einen Text ein.")
    else:
        # Create the splitter object that matches the selection.
        if splitter_choice == "Character":
            splitter = CharacterTextSplitter(
                separator="\n\n",
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=length_function
            )
        elif splitter_choice == "RecursiveCharacter":
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=length_function
            )
        elif "Language." in splitter_choice:
            # The choices are labelled "Language.<MEMBER_NAME>", so look the
            # member up by name rather than assuming that the lower-cased
            # name equals the enum value.
            language = Language[splitter_choice.split(".", 1)[1]]
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=language,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=length_function
            )
        else:
            raise ValueError("Ungültige Wahl beim Text Splitter.")

        # Split the text.
        splits = splitter.split_text(doc)

        # Show every chunk in its own text area, numbered from 1.
        for idx, split in enumerate(splits, start=1):
            st.text_area(f"Teilstück {idx}", split, height=150)