File size: 5,047 Bytes
86b7493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, Language
import tiktoken

#
# EXAMPLE CODE SNIPPETS
# =====================
# The original example refers to a "code_snippets" module; the snippet
# templates are defined inline here instead. Replace them with your own
# examples if needed.
#
# Each *_LENGTH constant is a preamble that binds a name `length_function`.
# The splitter templates emit that preamble first and then pass the name by
# keyword, so the rendered snippet is valid Python for both the character
# and the token variant. All templates are filled via str.format() with the
# placeholders chunk_size, chunk_overlap and length_function (LANGUAGE
# additionally takes language).
#
CHARACTER_LENGTH = "length_function = len"
TOKEN_LENGTH = """enc = tiktoken.get_encoding("cl100k_base")
length_function = lambda text: len(enc.encode(text))
"""
CHARACTER = """{length_function}
CharacterTextSplitter(
    separator="\\n\\n",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""
RECURSIVE_CHARACTER = """{length_function}
RecursiveCharacterTextSplitter(
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""
LANGUAGE = """{length_function}
RecursiveCharacterTextSplitter.from_language(
    language="{language}",
    chunk_size={chunk_size},
    chunk_overlap={chunk_overlap},
    length_function=length_function,
)
"""

# Streamlit UI: page title, a short explanation, and the four controls
# (chunk size, chunk overlap, length function, splitter type) in one row.
st.title("Text Splitter Playground")
st.info("""\
Splitte einen Text in Teilstücke (Chunks), basierend auf deinen Einstellungen:

- **Chunk Size**: Maximalgröße eines Teilstücks (in Zeichen oder Tokens)
- **Chunk Overlap**: Überlappung zwischen den Teilstücken
- **Length Function**: Gibt an, ob die Teilstück-Größe in Zeichen oder Tokens gemessen werden soll
- **Splitter Choice**: Definiert den Text-Splitter (Charakter-basiert, rekursiv oder basierend auf einer Sprache)
""")

col1, col2, col3, col4 = st.columns([1, 1, 1, 2])

with col1:
    chunk_size = st.number_input(
        min_value=1,
        label="Chunk Size",
        value=1000
    )

with col2:
    # An overlap of 0 is a valid setting, and the default below evaluates to
    # 0 for chunk sizes under 5. The lower bound therefore has to be 0;
    # otherwise the default (and, for chunk_size == 1, the upper bound)
    # would fall below the minimum and the widget would be rejected.
    chunk_overlap = st.number_input(
        min_value=0,
        max_value=chunk_size - 1,
        label="Chunk Overlap",
        value=int(chunk_size * 0.2)
    )

    # Safety net: max_value above is expected to keep the overlap below the
    # chunk size already.
    if chunk_overlap >= chunk_size:
        st.warning("Achtung: Chunk Overlap sollte kleiner als die Chunk Size sein!")

with col3:
    length_function_option = st.selectbox(
        "Length Function",
        ["Characters", "Tokens"]
    )

# Generic splitters first, followed by one entry per member of Language.
splitter_choices = ["RecursiveCharacter", "Character"] + [f"Language.{v.name}" for v in Language]

with col4:
    splitter_choice = st.selectbox(
        "Select a Text Splitter",
        splitter_choices
    )

# Pick the length function matching the selected option, together with the
# snippet text that describes it in the generated example code.
if length_function_option not in ("Characters", "Tokens"):
    raise ValueError("Ungültige Option für length_function.")

if length_function_option == "Tokens":
    # Measure in tokens using tiktoken.
    enc = tiktoken.get_encoding("cl100k_base")

    def length_function(text: str) -> int:
        """Return the number of tokens `enc` produces for `text`."""
        return len(enc.encode(text))

    length_function_str = TOKEN_LENGTH
else:
    # Measure in characters.
    length_function = len
    length_function_str = CHARACTER_LENGTH

# Build the example code that shows the splitter the user selected.
# All templates share these placeholders; LANGUAGE additionally needs
# `language`.
snippet_args = {
    "chunk_size": chunk_size,
    "chunk_overlap": chunk_overlap,
    "length_function": length_function_str,
}

if splitter_choice == "Character":
    import_text = CHARACTER.format(**snippet_args)
elif splitter_choice == "RecursiveCharacter":
    import_text = RECURSIVE_CHARACTER.format(**snippet_args)
elif splitter_choice.startswith("Language."):
    # Choices look like "Language.<NAME>"; the snippet shows the lower-cased
    # name as the language string.
    lang = splitter_choice.split(".", 1)[1].lower()
    import_text = LANGUAGE.format(language=lang, **snippet_args)
else:
    raise ValueError("Ungültige Wahl beim Text Splitter.")

# Show the generated example code. It is wrapped in a fenced code block
# because plain Markdown would collapse the snippet's line breaks and
# indentation into a single paragraph.
st.info(f"**Beispielcode:**\n\n```python\n{import_text.rstrip()}\n```")

# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# Button that triggers splitting of the entered text
if st.button("Split Text"):
    # Arguments shared by every splitter variant.
    splitter_kwargs = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": length_function,
    }

    # Create the splitter object based on the selection.
    if splitter_choice == "Character":
        splitter = CharacterTextSplitter(separator="\n\n", **splitter_kwargs)
    elif splitter_choice == "RecursiveCharacter":
        splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
    elif splitter_choice.startswith("Language."):
        # The choices are built as "Language.<member name>", so look the
        # member up by name and pass the enum itself rather than a string
        # that merely happens to equal the member's value.
        language = Language[splitter_choice.split(".", 1)[1]]
        splitter = RecursiveCharacterTextSplitter.from_language(
            language=language,
            **splitter_kwargs
        )
    else:
        raise ValueError("Ungültige Wahl beim Text Splitter.")

    # Split the text.
    splits = splitter.split_text(doc)

    # Give feedback instead of silently rendering nothing when the input
    # produced no chunks (e.g. the text area was left empty).
    if not splits:
        st.warning("Es wurden keine Teilstücke erzeugt. Bitte füge zuerst einen Text ein.")

    # Render each resulting chunk in its own text area.
    for idx, split in enumerate(splits, start=1):
        st.text_area(f"Teilstück {idx}", split, height=150)