File size: 1,839 Bytes
86b7493
17b7b74
86b7493
 
d40b082
86b7493
1d8c55e
86b7493
17b7b74
86b7493
64df977
86b7493
 
 
2737fb5
86b7493
 
 
2737fb5
86b7493
2737fb5
 
86b7493
 
 
 
2737fb5
86b7493
 
2737fb5
86b7493
 
 
 
2737fb5
86b7493
2737fb5
86b7493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17b7b74
 
86b7493
 
 
17b7b74
 
86b7493
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken


# Textual descriptions of the two available length functions. One of them is
# assigned to `length_function_str` further down, so both must be strings.
# (Previously TOKEN_LENGTH was accidentally bound to the tiktoken Encoding
# object through a chained assignment.)
CHARACTER_LENGTH = "length_function=lambda x: len(x)"
TOKEN_LENGTH = (
    'enc = tiktoken.get_encoding("cl100k_base"); '
    "length_function=lambda text: len(enc.encode(text))"
)

# Tokenizer used to measure text length in tokens.
enc = tiktoken.get_encoding("cl100k_base")


def length_function(text: str) -> int:
    """Return the number of cl100k_base tokens in *text*."""
    return len(enc.encode(text))

# Streamlit UI: page title
st.title("Understand Chunk and Token")

# Size of each chunk (at least 1); passed to the text splitter below.
chunk_size = st.number_input(label="Chunk Size", min_value=1, value=1000)


# Overlap between consecutive chunks. It must stay below chunk_size, and 0
# (no overlap) is a valid choice. The lower bound is 0 and the default is
# clamped so that small chunk sizes (1-4) no longer yield a default below the
# minimum, or a maximum below the minimum, both of which Streamlit rejects.
max_chunk_overlap = chunk_size - 1
chunk_overlap = st.number_input(
    label="Chunk Overlap",
    min_value=0,
    max_value=max_chunk_overlap,
    value=min(int(chunk_size * 0.2), max_chunk_overlap),
)


# How chunk length is measured: in characters or in tokens.
length_function_option = st.selectbox(
    label="Length Function",
    options=["Characters", "Tokens"],
)

# Which text splitter the user selected.
splitter_choice = st.selectbox(
    label="Select a Text Splitter",
    options=["RecursiveCharacter", "Character"],
)
# Pick the length function that matches the selected option.
if length_function_option not in ("Characters", "Tokens"):
    raise ValueError("Ungültige Option für length_function.")

if length_function_option == "Tokens":
    # Measure length in tokens using tiktoken.
    enc = tiktoken.get_encoding("cl100k_base")

    def length_function(text: str) -> int:
        return len(enc.encode(text))

    length_function_str = TOKEN_LENGTH
else:
    # Measure length in characters.
    length_function = len
    length_function_str = CHARACTER_LENGTH


# Text input
doc = st.text_area("Füge hier deinen Text ein:")

# Button that triggers splitting of the text
if st.button("Split Text"):
    if not doc.strip():
        # Nothing to split: tell the user instead of silently rendering nothing.
        st.warning("Bitte zuerst einen Text eingeben.")
    else:
        # Honour the splitter selected in the UI; previously the choice was
        # ignored and RecursiveCharacterTextSplitter was always used.
        if splitter_choice == "Character":
            # Imported locally so this block does not depend on changes to
            # the file-level import list.
            from langchain.text_splitter import CharacterTextSplitter

            splitter_cls = CharacterTextSplitter
        else:
            splitter_cls = RecursiveCharacterTextSplitter

        splitter = splitter_cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
        )

        # Split the text into chunks
        splits = splitter.split_text(doc)

        # Render each chunk in its own text area, numbered from 1
        for idx, split in enumerate(splits, start=1):
            st.text_area(f"Teilstück {idx}", split, height=150)