from abc import ABC, abstractmethod
import re
from collections import Counter
import string
from .model_loader import load_nltk_punkt
from .utils import *

# Define the abstract base class for chunking strategies
class ChunkingStrategy(ABC):
    """
    Abstract base class for chunking strategies.
    """

    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Abstract method to chunk the given text.

        Args:
            text (str): The text to chunk.

        Returns:
            list: A list of chunks.
        """
        pass

# Create an identity chunking strategy f(x) = [x]
class IdentityChunking(ChunkingStrategy):
    """
    Chunking strategy that returns the input text as a single chunk.
    """
    def chunk(self, text: str) -> list:
        return [text]
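# Usage sketch (illustrative only, not part of the module's API):
#   IdentityChunking().chunk("some text")  ->  ["some text"]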
# Regex-based chunking
class RegexChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text based on regular expression patterns.
    """
    def __init__(self, patterns=None, **kwargs):
        """
        Initialize the RegexChunking object.

        Args:
            patterns (list): A list of regular expression patterns to split text.
        """
        if patterns is None:
            patterns = [r'\n\n']  # Default split pattern
        self.patterns = patterns

    def chunk(self, text: str) -> list:
        paragraphs = [text]
        for pattern in self.patterns:
            new_paragraphs = []
            for paragraph in paragraphs:
                new_paragraphs.extend(re.split(pattern, paragraph))
            paragraphs = new_paragraphs
        return paragraphs
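# Usage sketch for RegexChunking (illustrative; the example text is made up).
# Each pattern is applied in turn to the pieces produced by the previous one:
#   chunker = RegexChunking(patterns=[r'\n\n'])
#   chunker.chunk("First paragraph.\n\nSecond paragraph.")
#   ->  ['First paragraph.', 'Second paragraph.']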
# NLP-based sentence chunking
class NlpSentenceChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
    """
    def __init__(self, **kwargs):
        """
        Initialize the NlpSentenceChunking object.
        """
        load_nltk_punkt()

    def chunk(self, text: str) -> list:
        # Lazy import so NLTK is only loaded when this strategy is used
        from nltk.tokenize import sent_tokenize
        sentences = sent_tokenize(text)
        sens = [sent.strip() for sent in sentences]
        # Deduplicate while preserving the original sentence order
        return list(dict.fromkeys(sens))
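# Usage sketch for NlpSentenceChunking (illustrative; requires the NLTK punkt
# data, which load_nltk_punkt() is assumed to fetch):
#   chunker = NlpSentenceChunking()
#   chunker.chunk("First sentence. Second sentence.")
#   ->  ['First sentence.', 'Second sentence.']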
# Topic-based segmentation using TextTiling
class TopicSegmentationChunking(ChunkingStrategy):
    """
    Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.

    How it works:
    1. Segment the text into topics using TextTilingTokenizer
    2. Extract keywords for each topic segment
    """
    def __init__(self, num_keywords=3, **kwargs):
        """
        Initialize the TopicSegmentationChunking object.

        Args:
            num_keywords (int): The number of keywords to extract for each topic segment.
        """
        import nltk as nl
        self.tokenizer = nl.tokenize.TextTilingTokenizer()
        self.num_keywords = num_keywords

    def chunk(self, text: str) -> list:
        # Use the TextTilingTokenizer to segment the text
        segmented_topics = self.tokenizer.tokenize(text)
        return segmented_topics

    def extract_keywords(self, text: str) -> list:
        # Tokenize and remove stopwords and punctuation
        import nltk as nl
        stop_words = set(nl.corpus.stopwords.words('english'))
        tokens = nl.tokenize.word_tokenize(text)
        tokens = [token.lower() for token in tokens
                  if token not in stop_words and token not in string.punctuation]

        # Calculate frequency distribution and keep the most common words
        freq_dist = Counter(tokens)
        keywords = [word for word, freq in freq_dist.most_common(self.num_keywords)]
        return keywords

    def chunk_with_topics(self, text: str) -> list:
        # Segment the text into topics
        segments = self.chunk(text)
        # Extract keywords for each topic segment
        segments_with_topics = [(segment, self.extract_keywords(segment)) for segment in segments]
        return segments_with_topics
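# Usage sketch for TopicSegmentationChunking (illustrative; TextTiling needs a
# reasonably long, multi-paragraph input to find topic boundaries, and the
# keyword step needs the NLTK stopwords corpus):
#   chunker = TopicSegmentationChunking(num_keywords=3)
#   pairs = chunker.chunk_with_topics(long_multi_paragraph_text)
#   # pairs  ->  [(segment_text, ['keyword1', 'keyword2', 'keyword3']), ...]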
# Fixed-length word chunks
class FixedLengthWordChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into fixed-length word chunks.

    How it works:
    1. Split the text into words
    2. Create chunks of fixed length
    3. Return the list of chunks
    """
    def __init__(self, chunk_size=100, **kwargs):
        """
        Initialize the fixed-length word chunking strategy with the given chunk size.

        Args:
            chunk_size (int): The size of each chunk in words.
        """
        self.chunk_size = chunk_size

    def chunk(self, text: str) -> list:
        words = text.split()
        return [' '.join(words[i:i + self.chunk_size]) for i in range(0, len(words), self.chunk_size)]
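# Usage sketch for FixedLengthWordChunking (illustrative): the final chunk may
# be shorter than chunk_size.
#   chunker = FixedLengthWordChunking(chunk_size=2)
#   chunker.chunk("one two three four five")
#   ->  ['one two', 'three four', 'five']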
# Sliding window chunking
class SlidingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words
    2. Slide a window of window_size words forward by step words at a time
    3. Return the list of overlapping chunks
    """
    def __init__(self, window_size=100, step=50, **kwargs):
        """
        Initialize the sliding window chunking strategy with the given window size and
        step size.

        Args:
            window_size (int): The size of the sliding window in words.
            step (int): The step size for sliding the window in words.
        """
        self.window_size = window_size
        self.step = step

    def chunk(self, text: str) -> list:
        words = text.split()
        chunks = []

        # Texts shorter than one window are returned as a single chunk
        if len(words) <= self.window_size:
            return [text]

        for i in range(0, len(words) - self.window_size + 1, self.step):
            chunk = ' '.join(words[i:i + self.window_size])
            chunks.append(chunk)

        # Handle the last chunk if it doesn't align perfectly
        if i + self.window_size < len(words):
            chunks.append(' '.join(words[-self.window_size:]))

        return chunks
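# Usage sketch for SlidingWindowChunking (illustrative): with window_size=3 and
# step=2, consecutive chunks share one word.
#   chunker = SlidingWindowChunking(window_size=3, step=2)
#   chunker.chunk("a b c d e")
#   ->  ['a b c', 'c d e']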
# Overlapping window chunking
class OverlappingWindowChunking(ChunkingStrategy):
    """
    Chunking strategy that splits text into overlapping word chunks.

    How it works:
    1. Split the text into words using whitespace
    2. Create chunks of fixed length equal to the window size
    3. Start the next window 'overlap' words before the end of the previous one
    4. Return the list of chunks
    """
    def __init__(self, window_size=1000, overlap=100, **kwargs):
        """
        Initialize the overlapping window chunking strategy with the given window size
        and overlap size. Assumes overlap < window_size; otherwise the window cannot
        advance and chunk() would loop forever.

        Args:
            window_size (int): The size of the window in words.
            overlap (int): The size of the overlap between consecutive chunks in words.
        """
        self.window_size = window_size
        self.overlap = overlap

    def chunk(self, text: str) -> list:
        words = text.split()
        chunks = []

        # Texts shorter than one window are returned as a single chunk
        if len(words) <= self.window_size:
            return [text]

        start = 0
        while start < len(words):
            end = start + self.window_size
            chunk = ' '.join(words[start:end])
            chunks.append(chunk)

            if end >= len(words):
                break
            # Step back so the next chunk shares self.overlap words with this one
            start = end - self.overlap

        return chunks
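# Usage sketch for OverlappingWindowChunking (illustrative): with window_size=3
# and overlap=1, each chunk starts one word before the previous chunk ended.
#   chunker = OverlappingWindowChunking(window_size=3, overlap=1)
#   chunker.chunk("a b c d e")
#   ->  ['a b c', 'c d e']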