# Sebbe33's picture
# Create app.py
# aa4d076 verified
# chunker_app.py
import streamlit as st
import math
# --- Page Configuration ---
# Sets the browser-tab title and icon, selects the wide layout, and starts
# with the sidebar expanded.
st.set_page_config(
    page_title="Text Chunker Demo",
    # U+1F4C4 (page facing up). The literal had been stored as mis-decoded
    # UTF-8 bytes, which rendered as garbage instead of an emoji.
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
# --- Styling (optional CSS for a fancier look) ---
# Injects a single <style> block via st.markdown with unsafe_allow_html=True.
# The rules cover three areas: text areas (monospace font, line height),
# expanders (border, spacing, background, header weight/colour, inner padding),
# and the sidebar (background colour and h1 size/colour).
st.markdown("""
<style>
/* Improve readability and spacing */
.stTextArea textarea {
font-family: monospace; /* Consistent font for text areas */
line-height: 1.5;
}
.stExpander {
border: 1px solid #e6eaf1; /* Subtle border for expanders */
border-radius: 0.5rem;
margin-bottom: 10px; /* Space between chunks */
background-color: #f8f9fa; /* Slightly different background for chunks */
}
.stExpander header {
font-weight: bold;
color: #262730; /* Darker header text */
}
.stExpander div[data-testid="stExpanderDetails"] {
padding: 15px; /* Padding inside the chunk view */
}
/* Sidebar styling */
[data-testid="stSidebar"] {
background-color: #f0f2f6; /* Light grey sidebar */
}
[data-testid="stSidebar"] h1 {
font-size: 1.5em; /* Smaller sidebar header */
color: #007bff; /* Blue sidebar title */
}
/* Button styling (if any buttons were used) */
/* .stButton>button { ... } */
</style>
""", unsafe_allow_html=True)
# --- Core Chunking Function ---
def chunk_text(text, chunk_size, chunk_overlap):
    """
    Split ``text`` into fixed-size character windows that overlap.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart.
    Chunking stops as soon as a chunk reaches the end of the text, so no
    trailing chunk is emitted that consists only of characters already
    contained in the previous chunk.

    Args:
        text (str): The input text to chunk.
        chunk_size (int): Maximum size of each chunk, in characters. Must be
            positive.
        chunk_overlap (int): Number of characters shared between consecutive
            chunks. Must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        list[str]: The chunks in order. An empty list is returned when the
        text is empty or when a parameter is invalid; in the latter case the
        problem is also reported to the page through ``st.error``.
    """
    if chunk_size <= 0:
        st.error("Chunk Size must be a positive integer.")
        return []
    if chunk_overlap < 0:
        st.error("Chunk Overlap cannot be negative.")
        return []
    if chunk_overlap >= chunk_size:
        st.error("Chunk Overlap must be smaller than Chunk Size to prevent infinite loops or empty chunks.")
        return []
    if not text:
        return []

    text_length = len(text)
    # The validation above guarantees stride >= 1, so the scan always advances.
    stride = chunk_size - chunk_overlap

    chunks = []
    for start_index in range(0, text_length, stride):
        end_index = start_index + chunk_size
        chunks.append(text[start_index:end_index])
        # Once a chunk covers the tail of the text, any further window would
        # start inside the overlap region and merely repeat its content.
        if end_index >= text_length:
            break
    return chunks
# --- Example Text ---
# Default content for the input text area: four paragraphs about Streamlit and
# text chunking, long enough to yield several chunks at the default settings.
EXAMPLE_TEXT = """Streamlit is an open-source Python library that makes it easy to create and share beautiful, custom web apps for machine learning and data science. In just a few minutes you can build and deploy powerful data apps.
Let's consider the process of chunking text. This is a common technique in Natural Language Processing (NLP), especially when dealing with large documents that need to be fed into models with fixed input size limits, like many transformer models (e.g., BERT, GPT).
Chunking involves breaking down a large piece of text into smaller, manageable segments or 'chunks'. The size of these chunks is a critical parameter. Another important parameter is the 'overlap'. Overlap means that consecutive chunks will share some amount of text. This is useful to ensure that semantic context is not lost at the boundaries of chunks. For example, if a sentence is split exactly between two chunks, having an overlap allows the model processing the chunks to see the full sentence eventually, potentially across two adjacent chunks.
Choosing the right chunk size and overlap depends heavily on the specific application, the model being used, and the nature of the text. Smaller chunks capture finer details but might lose broader context. Larger chunks retain more context but might exceed model limits or smooth over important local information. Overlap helps mitigate context loss at boundaries but increases the total number of chunks and computational overhead. Experimentation is often required to find the optimal settings.
"""
# --- Streamlit App Layout ---
# Sidebar: the two chunking parameters (chunk_size, chunk_overlap) are read
# here and used by the main content area below.
with st.sidebar:
    # Gear emoji (U+2699 U+FE0F). The literal had been stored as mis-decoded
    # UTF-8 bytes and rendered as garbage.
    st.markdown("<h1>⚙️ Chunking Settings</h1>", unsafe_allow_html=True)
    st.markdown("Configure how the text should be split.")

    chunk_size = st.number_input(
        "Chunk Size (characters)",
        min_value=1,
        value=250,
        step=50,
        help="Maximum number of characters per chunk.",
    )

    chunk_overlap = st.number_input(
        "Overlap (characters)",
        min_value=0,
        # Keep the initial overlap strictly below the chosen chunk size
        # (0 when the chunk size is 1).
        value=min(50, max(chunk_size - 1, 0)),
        step=10,
        help="Number of characters shared between consecutive chunks. Must be less than Chunk Size.",
    )

    # Warn straight away in the sidebar. chunk_size is at least 1 because of
    # min_value above, so no separate positivity check is needed here.
    if chunk_overlap >= chunk_size:
        st.warning("Overlap should be smaller than Chunk Size.")

    st.markdown("---")
    st.markdown("Built with [Streamlit](https://streamlit.io)")
# Main content area: title, short instructions, and the input text box.
# The emoji literals below had been stored as mis-decoded UTF-8 bytes; they
# are restored as U+1F4C4 (page) and U+1F4DD (memo).
st.title("📄 Text Chunking Demonstrator")
st.markdown(
    "Enter text and adjust the settings in the sidebar to see how it's divided into chunks."
)
st.divider()

# Input Text Area (pre-filled with EXAMPLE_TEXT; the label is hidden because
# the subheader already names the field).
st.subheader("📝 Input Text")
input_text = st.text_area(
    "Paste your text here or use the example:",
    value=EXAMPLE_TEXT,
    height=300,
    label_visibility="collapsed",
)
st.divider()
# Display Chunks: one expander per chunk, titled with its index and length.
st.subheader(f"🧩 Generated Chunks (Size: {chunk_size}, Overlap: {chunk_overlap})")
if input_text:
    # chunk_text validates the settings itself; on invalid settings it reports
    # the error on the page and returns an empty list.
    generated_chunks = chunk_text(input_text, chunk_size, chunk_overlap)
    if generated_chunks:
        st.info(f"Successfully generated **{len(generated_chunks)}** chunks.")
        for i, chunk in enumerate(generated_chunks):
            expander_title = f"Chunk {i+1} (Length: {len(chunk)})"
            overlap_indicator = ""
            if chunk_overlap > 0 and i > 0:
                # A short final chunk cannot share more characters with its
                # predecessor than it contains, so cap the reported overlap.
                shared_chars = min(chunk_overlap, len(chunk))
                overlap_indicator = f" (Overlaps previous by {shared_chars} chars)"
            with st.expander(expander_title + overlap_indicator):
                # st.text shows the chunk as plain text, preserving whitespace.
                st.text(chunk)
    elif chunk_size > 0 and 0 <= chunk_overlap < chunk_size:
        # Settings were valid (so no error was shown) yet nothing came back.
        st.warning("No chunks were generated. The input text might be empty or shorter than the chunk size with zero overlap.")
else:
    st.info("Please enter some text in the input area above to generate chunks.")