Sebbe33 committed on
Commit
042f436
·
verified ·
1 Parent(s): d565db2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -32
app.py CHANGED
@@ -1,56 +1,32 @@
1
  import streamlit as st
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
- import tiktoken
4
-
5
-
6
- CHARACTER_LENGTH = "length_function=lambda x: len(x)"
7
- TOKEN_LENGTH = enc = tiktoken.get_encoding("cl100k_base")
8
- length_function = lambda text: len(enc.encode(text))
9
-
10
- # Streamlit UI
11
- st.title("Understand Chunk and Token")
12
-
13
 
 
14
 
15
  chunk_size = st.number_input(
16
  min_value=1,
17
  label="Chunk Size",
18
- value=1000
19
  )
20
 
21
-
22
  chunk_overlap = st.number_input(
23
  min_value=1,
24
  max_value=chunk_size - 1,
25
  label="Chunk Overlap",
26
- value=int(chunk_size * 0.2)
27
  )
28
 
 
29
 
30
- length_function_option = st.selectbox(
31
- "Length Function",
32
- ["Characters", "Tokens"]
33
- )
34
-
35
-
36
- length_function_option = Tokens
37
-
38
- # Text-Eingabe
39
- doc = st.text_area("Füge hier deinen Text ein:")
40
-
41
- # Button zum Splitten des Textes
42
- if st.button("Split Text"):
43
 
44
  splitter = RecursiveCharacterTextSplitter(
45
  chunk_size=chunk_size,
46
  chunk_overlap=chunk_overlap,
47
- length_function=length_function
48
  )
49
 
50
-
51
- # Aufteilen des Textes
52
- splits = splitter.split_text(doc)
53
 
54
- # Ausgabe der erstellten Textsplitter
55
  for idx, split in enumerate(splits, start=1):
56
- st.text_area(f"Teilstück {idx}", split, height=150)
 
1
  import streamlit as st
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
 
 
 
 
 
 
3
 
4
+ st.title("Understand Chunking")
5
 
6
  chunk_size = st.number_input(
7
  min_value=1,
8
  label="Chunk Size",
9
+ value=50
10
  )
11
 
 
12
  chunk_overlap = st.number_input(
13
  min_value=1,
14
  max_value=chunk_size - 1,
15
  label="Chunk Overlap",
16
+ value=10
17
  )
18
 
19
+ docs = st.text_area("Put your text:")
20
 
21
+ if st.button("Split"):
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  splitter = RecursiveCharacterTextSplitter(
24
  chunk_size=chunk_size,
25
  chunk_overlap=chunk_overlap,
26
+ length_function=len
27
  )
28
 
29
+ splits = splitter.split_text(docs)
 
 
30
 
 
31
  for idx, split in enumerate(splits, start=1):
32
+ st.text_area(f"Chunk {idx}", split)