Spaces:
Runtime error
Runtime error
File size: 5,047 Bytes
86b7493 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language
import tiktoken
#
# EXAMPLE CODE SNIPPETS
# =====================
# The original example referred to a separate "code_snippets" module; the
# snippets are defined inline here instead and can be replaced by your own.
#
# Each splitter template is rendered with str.format() using the keys
# `chunk_size`, `chunk_overlap`, `length_function` and (LANGUAGE only)
# `language`. The value passed for `length_function` is one of the *_LENGTH
# snippets below: a complete piece of code that defines the name
# `length_function`. The templates therefore emit that definition first and
# then pass the name to the splitter, so the rendered text is valid Python.
#
CHARACTER_LENGTH = "length_function = len"
TOKEN_LENGTH = """import tiktoken

enc = tiktoken.get_encoding("cl100k_base")


def length_function(text: str) -> int:
    return len(enc.encode(text))"""
CHARACTER = """from langchain.text_splitter import CharacterTextSplitter

{length_function}

splitter = CharacterTextSplitter(
    separator="\\n\\n",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""
RECURSIVE_CHARACTER = """from langchain.text_splitter import RecursiveCharacterTextSplitter

{length_function}

splitter = RecursiveCharacterTextSplitter(
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""
LANGUAGE = """from langchain.text_splitter import RecursiveCharacterTextSplitter

{length_function}

splitter = RecursiveCharacterTextSplitter.from_language(
    language="{language}",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""
# Page header: title followed by an info box explaining the four settings.
intro_lines = (
    "Splitte einen Text in Teilstücke (Chunks), basierend auf deinen Einstellungen:",
    "- **Chunk Size**: Maximalgröße eines Teilstücks (in Zeichen oder Tokens)",
    "- **Chunk Overlap**: Überlappung zwischen den Teilstücken",
    "- **Length Function**: Gibt an, ob die Teilstück-Größe in Zeichen oder Tokens gemessen werden soll",
    "- **Splitter Choice**: Definiert den Text-Splitter (Charakter-basiert, rekursiv oder basierend auf einer Sprache)",
)

st.title("Text Splitter Playground")
# The trailing newline mirrors the original triple-quoted literal.
st.info("\n".join(intro_lines) + "\n")
# Four controls side by side; the splitter selector column is twice as wide
# as the others.
col1, col2, col3, col4 = st.columns([1, 1, 1, 2])

with col1:
    chunk_size = st.number_input(
        min_value=1,
        label="Chunk Size",
        value=1000,
    )

with col2:
    # The lower bound is 0, not 1: the default int(chunk_size * 0.2) is 0 for
    # chunk sizes 1-4 and the upper bound chunk_size - 1 is 0 for chunk size
    # 1. With a minimum of 1 those values fall below the minimum, which
    # Streamlit rejects. A minimum of 0 also lets the user pick no overlap.
    chunk_overlap = st.number_input(
        min_value=0,
        max_value=chunk_size - 1,
        label="Chunk Overlap",
        value=int(chunk_size * 0.2),
    )
    # Safety net only: max_value above already keeps the overlap below the
    # chunk size.
    if chunk_overlap >= chunk_size:
        st.warning("Achtung: Chunk Overlap sollte kleiner als die Chunk Size sein!")

with col3:
    length_function_option = st.selectbox(
        "Length Function",
        ["Characters", "Tokens"],
    )

# Two generic splitters plus one "Language.<MEMBER_NAME>" entry per member of
# the Language enum.
splitter_choices = ["RecursiveCharacter", "Character"] + [
    f"Language.{v.name}" for v in Language
]

with col4:
    splitter_choice = st.selectbox(
        "Select a Text Splitter",
        splitter_choices,
    )
# Pick the callable used to measure chunk length, together with the matching
# snippet text for the sample code.
if length_function_option not in ("Characters", "Tokens"):
    raise ValueError("Ungültige Option für length_function.")

if length_function_option == "Tokens":
    # Measure in tokens using tiktoken's cl100k_base encoding.
    enc = tiktoken.get_encoding("cl100k_base")

    def length_function(text: str) -> int:
        """Return the number of tokens *text* encodes to."""
        return len(enc.encode(text))

    length_function_str = TOKEN_LENGTH
else:
    # Measure in characters.
    length_function = len
    length_function_str = CHARACTER_LENGTH
# Build the sample code that mirrors the splitter configuration selected by
# the user. The three templates share these format arguments.
snippet_params = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": length_function_str,
}

if splitter_choice == "Character":
    import_text = CHARACTER.format(**snippet_params)
elif splitter_choice == "RecursiveCharacter":
    import_text = RECURSIVE_CHARACTER.format(**snippet_params)
elif splitter_choice.startswith("Language."):
    # Option labels look like "Language.<MEMBER_NAME>"; the template shows
    # the lower-cased name.
    lang = splitter_choice.split(".", 1)[1].lower()
    import_text = LANGUAGE.format(language=lang, **snippet_params)
else:
    raise ValueError("Ungültige Wahl beim Text Splitter.")

# Show the generated sample code. st.info renders Markdown, so the code is
# wrapped in a fenced block; unfenced, its line breaks would collapse and the
# code would be displayed as running prose.
st.info("**Beispielcode:**\n\n```python\n" + import_text.strip() + "\n```")
# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# The text is only split when the user presses the button.
if st.button("Split Text"):
    # Create the splitter object that matches the selection.
    if splitter_choice == "Character":
        splitter = CharacterTextSplitter(
            separator="\n\n",
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )
    elif splitter_choice == "RecursiveCharacter":
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )
    elif splitter_choice.startswith("Language."):
        # Option labels look like "Language.<MEMBER_NAME>". Resolve the enum
        # member by name instead of assuming that the lower-cased name equals
        # the member's value.
        language = Language[splitter_choice.split(".", 1)[1]]
        splitter = RecursiveCharacterTextSplitter.from_language(
            language=language,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )
    else:
        raise ValueError("Ungültige Wahl beim Text Splitter.")

    # Split the text.
    splits = splitter.split_text(doc)

    # Give feedback when nothing was produced (e.g. the input was left
    # empty) instead of silently showing nothing.
    if not splits:
        st.warning("Es wurden keine Teilstücke erzeugt. Bitte füge zuerst einen Text ein.")

    # Show every chunk in its own numbered text area.
    for idx, split in enumerate(splits, start=1):
        st.text_area(f"Teilstück {idx}", split, height=150)
|