|
""" |
|
Text processing utilities for the efficient-context library. |
|
""" |
|
|
|
import re |
|
from typing import List, Dict, Any |
|
import logging |
|
|
|
|
|
# Configure root logging at import time so fallback warnings below are visible;
# module-level logger follows the standard `__name__` convention.
logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)
|
|
|
def split_into_sentences(text: str) -> List[str]:
    """
    Split text into sentences.

    Prefers NLTK's tokenizer when it is installed and working; otherwise
    falls back to a lightweight regex-based splitter.

    Args:
        text: Text to split

    Returns:
        sentences: List of sentences
    """
    # Newlines are treated as ordinary whitespace, not sentence boundaries.
    flattened = text.replace('\n', ' ')

    try:
        import nltk
    except ImportError:
        logger.warning("NLTK not available, using fallback sentence splitter")
        return _simple_sentence_split(flattened)

    try:
        return nltk.sent_tokenize(flattened)
    except Exception as e:
        # e.g. the punkt model is missing and cannot be loaded.
        logger.warning(f"NLTK sentence tokenizer error: {e}. Using fallback.")
        return _simple_sentence_split(flattened)
|
|
|
def _simple_sentence_split(text: str) -> List[str]: |
|
"""Fallback sentence splitter without dependencies.""" |
|
|
|
|
|
for abbr in ['Mr.', 'Mrs.', 'Dr.', 'vs.', 'e.g.', 'i.e.', 'etc.']: |
|
text = text.replace(abbr, abbr.replace('.', '<POINT>')) |
|
|
|
|
|
sentences = re.split(r'(?<=[.!?])\s+', text) |
|
|
|
|
|
sentences = [s.replace('<POINT>', '.') for s in sentences] |
|
|
|
|
|
return [s for s in sentences if s.strip()] |
|
|
|
def get_sentence_importance(sentences: List[str]) -> List[float]:
    """
    Calculate importance scores for sentences based on heuristics.

    Each sentence is scored by a weighted blend of three signals:
    length (up to ~20 words), presence of salience keywords, and
    presence of digits. Weights sum to 1.0, so scores stay in [0, 1].

    Args:
        sentences: List of sentences to score

    Returns:
        importances: List of importance scores (0.0 to 1.0)
    """
    # Hoisted out of the loop; frozenset gives O(1) membership tests.
    keywords = frozenset([
        'important', 'significant', 'key', 'critical', 'crucial',
        'essential', 'main', 'major', 'primary', 'central',
        'result', 'conclusion', 'finding', 'discovered', 'shows',
    ])
    digit_pattern = re.compile(r'\d')

    importances: List[float] = []

    for sentence in sentences:
        words = sentence.split()

        # Longer sentences (up to 20 words) are assumed more informative.
        length_score = min(len(words) / 20, 1.0)

        # Bug fix: strip surrounding punctuation before matching, so tokens
        # like "important," or "finding." still count as keywords.
        keyword_score = 0.0
        for word in words:
            if word.lower().strip('.,;:!?"\'()') in keywords:
                keyword_score += 0.2
        keyword_score = min(keyword_score, 0.6)

        # Sentences containing numbers often carry concrete facts.
        number_score = 0.2 if digit_pattern.search(sentence) else 0.0

        score = 0.5 * length_score + 0.3 * keyword_score + 0.2 * number_score
        importances.append(min(score, 1.0))

    return importances
|
|
|
def calculate_text_overlap(text1: str, text2: str) -> float:
    """
    Calculate simple text overlap between two strings.

    Uses the overlap coefficient: shared unique tokens divided by the
    size of the smaller token set. Comparison is case-insensitive.

    Args:
        text1: First text
        text2: Second text

    Returns:
        overlap_ratio: Ratio of shared tokens (0.0 to 1.0)
    """
    words_a = set(text1.lower().split())
    words_b = set(text2.lower().split())

    # Either side empty -> no meaningful overlap (avoids division by zero).
    if not (words_a and words_b):
        return 0.0

    shared = words_a & words_b
    smaller = min(len(words_a), len(words_b))
    return len(shared) / smaller
|
|