# chunker_app.py import streamlit as st import math # --- Page Configuration --- st.set_page_config( page_title="Text Chunker Demo", page_icon="📄", layout="wide", initial_sidebar_state="expanded" ) # --- Styling (Optional: Add some CSS for a fancier look) --- st.markdown(""" """, unsafe_allow_html=True) # --- Core Chunking Function --- def chunk_text(text, chunk_size, chunk_overlap): """ Splits the text into chunks of a specified size with a given overlap. Args: text (str): The input text to chunk. chunk_size (int): The desired maximum size of each chunk (in characters). chunk_overlap (int): The number of characters to overlap between consecutive chunks. Returns: list[str]: A list of text chunks. """ if chunk_size <= 0: st.error("Chunk Size must be a positive integer.") return [] if chunk_overlap < 0: st.error("Chunk Overlap cannot be negative.") return [] if chunk_overlap >= chunk_size: st.error("Chunk Overlap must be smaller than Chunk Size to prevent infinite loops or empty chunks.") return [] if not text: return [] chunks = [] start_index = 0 text_length = len(text) while start_index < text_length: end_index = start_index + chunk_size # Slice the text to get the current chunk chunk = text[start_index:end_index] chunks.append(chunk) # Move the start index for the next chunk next_start_index = start_index + chunk_size - chunk_overlap # If the next start index is the same as the current one (e.g., overlap == size) # and we haven't reached the end, increment by 1 to avoid infinite loop. # This case is technically prevented by the overlap < chunk_size check above, # but good to be robust. if next_start_index <= start_index and start_index < text_length: # This scenario should not happen with valid inputs due to the check above. # If it somehow does, break to prevent potential infinite loop. st.warning(f"Potential loop detected. Breaking chunking at chunk {len(chunks)}.") break # Or force progress: next_start_index = start_index + 1 start_index = next_start_index # Optimization: If the overlap is so large that the next start is past the text end, # and we already captured the last part, we can break early. # The `while start_index < text_length` condition handles this naturally. return chunks # --- Example Text --- EXAMPLE_TEXT = """Streamlit is an open-source Python library that makes it easy to create and share beautiful, custom web apps for machine learning and data science. In just a few minutes you can build and deploy powerful data apps. Let's consider the process of chunking text. This is a common technique in Natural Language Processing (NLP), especially when dealing with large documents that need to be fed into models with fixed input size limits, like many transformer models (e.g., BERT, GPT). Chunking involves breaking down a large piece of text into smaller, manageable segments or 'chunks'. The size of these chunks is a critical parameter. Another important parameter is the 'overlap'. Overlap means that consecutive chunks will share some amount of text. This is useful to ensure that semantic context is not lost at the boundaries of chunks. For example, if a sentence is split exactly between two chunks, having an overlap allows the model processing the chunks to see the full sentence eventually, potentially across two adjacent chunks. Choosing the right chunk size and overlap depends heavily on the specific application, the model being used, and the nature of the text. Smaller chunks capture finer details but might lose broader context. Larger chunks retain more context but might exceed model limits or smooth over important local information. Overlap helps mitigate context loss at boundaries but increases the total number of chunks and computational overhead. Experimentation is often required to find the optimal settings. """ # --- Streamlit App Layout --- # Sidebar for Settings with st.sidebar: st.markdown("