import nltk
import logging
import spacy
from nltk.corpus import stopwords
from nltk.util import ngrams
from collections import Counter
import re
from tqdm import tqdm

# Logging setup
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

class NgramProcessor:
    def __init__(self, models=None):
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        
        # Default to the transformer pipeline if no models are specified
        if models is None:
            models = ["en_core_web_trf"]
        
        # Load each requested model
        self.models = {}
        for model_name in models:
            try:
                self.models[model_name] = spacy.load(model_name)
                tqdm.write(f"[NgramProcessor] Loaded model: {model_name}")
            except IOError:
                tqdm.write(f"[NgramProcessor] Error: Model '{model_name}' not found. Please install it with:")
                tqdm.write(f"python -m spacy download {model_name}")
            except Exception as e:
                tqdm.write(f"[NgramProcessor] Error loading model '{model_name}': {str(e)}")
        
        # Set primary NLP model for other processes
        if "en_core_web_trf" in self.models:
            self.nlp = self.models["en_core_web_trf"]
        elif len(self.models) > 0:
            # Use first available model as primary if preferred one isn't available
            self.nlp = next(iter(self.models.values()))
        else:
            raise ValueError("No spaCy model was successfully loaded")
            
        # Add custom entity patterns for numerical ranges to primary model
        if "entity_ruler" not in self.nlp.pipe_names:
            ruler = self.nlp.add_pipe("entity_ruler", before="ner")
            patterns = [
                {"label": "CARDINAL", "pattern": [{"TEXT": {"REGEX": "\\d+-\\d+"}}]},  # Pattern for ranges like "7-10"
                {"label": "PERCENT", "pattern": [{"TEXT": {"REGEX": "\\d+%"}}]}           # Pattern for percentages
            ]
            ruler.add_patterns(patterns)
        
        # Create special pattern for numerical ranges
        self.number_range_pattern = re.compile(r'\b(\d+(?:-\d+)+)\b')
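        # Illustrative: "7-10" and "7-10-12" both match; "7 - 10" does not, since
        # the pattern requires digits joined directly by hyphens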
        
        tqdm.write("[NgramProcessor] Initialized with stopwords, spaCy NLP model, and numerical range detection.")

    def remove_stopwords(self, text):
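        """Lowercase `text`, keep word characters only, and drop NLTK English stopwords.

        Illustrative (assuming NLTK's default stopword list):
            remove_stopwords("The teams battle for a spot") -> "teams battle spot"
        """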
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    def extract_number_ranges(self, sentences):
        """Extract numerical ranges like '7-10' from sentences"""
        tqdm.write("[NgramProcessor] Extracting numerical ranges...")
        
        number_ranges = []
        range_counts = Counter()
        
        for sentence in sentences:
            # Find all numerical ranges in the sentence
            matches = self.number_range_pattern.findall(sentence)
            for match in matches:
                range_counts[match] += 1
                
        # Keep every range that appears at least once (ranges are high-signal,
        # so the threshold is deliberately low)
        for range_text, count in range_counts.items():
            if count >= 1:
                number_ranges.append(range_text)
                
        tqdm.write(f"[NgramProcessor] Found {len(number_ranges)} numerical ranges: {number_ranges}")

        return number_ranges

    def extract_standalone_numbers(self, sentences):
        """Extract standalone numerical values from sentences"""
        tqdm.write("[NgramProcessor] Extracting standalone numbers...")
        
        # Two patterns: one for percentages, one for regular numbers.
        # Note: no trailing \b after '%' -- since '%' is a non-word character, a
        # trailing \b would require a letter or digit right after the sign, and
        # values like "20% " or "20%." would never match.
        percentage_pattern = re.compile(r'\b\d+%')   # Numbers with a % sign
        number_pattern = re.compile(r'\b\d+\b')      # Standalone numbers
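        # Illustrative: in "Scores rose 20% from 45 points", percentage_pattern finds
        # "20%", number_pattern finds "20" and "45"; percentage_values ("20") then
        # keeps "20" from being double-counted in the second pass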
        
        percentage_counts = Counter()
        number_counts = Counter()
        percentage_values = set()  # Store the numeric part of percentages for cross-reference
        
        # First pass: Find all percentages
        for sentence in sentences:
            # Extract all percentages first
            percentage_matches = percentage_pattern.findall(sentence)
            for match in percentage_matches:
                percentage_counts[match] += 1
                # Store the numeric part for later comparison
                numeric_part = match.rstrip('%')
                percentage_values.add(numeric_part)
        
        # Second pass: Find standalone numbers
        for sentence in sentences:
            # Only look for standalone numbers now
            number_matches = number_pattern.findall(sentence)
            for match in number_matches:
                # Avoid double counting numbers that we already counted as percentages
                if match not in percentage_values:
                    number_counts[match] += 1
        
        # Process percentages first (they have priority); a value must appear in
        # every sentence to qualify
        threshold = max(1, int(len(sentences) * 1.0))
        standalone_numbers = []
        
        # Add percentages that meet the threshold
        for num, count in percentage_counts.items():
            if count >= threshold:
                standalone_numbers.append(num)  # Already has % sign
        
        # Then add standalone numbers, converting to percentage format if needed
        for num, count in number_counts.items():
            if count >= threshold:
                # If this number also appeared as part of a percentage, use the percentage format
                if num in percentage_values:
                    standalone_numbers.append(f"{num}%")
                else:
                    standalone_numbers.append(num)
        
        tqdm.write(f"[NgramProcessor] Found {len(standalone_numbers)} standalone numbers: {standalone_numbers}")
        return standalone_numbers

    def extract_regex_subsequences(self, sentences):
        """Extract potential subsequences using regex patterns before applying NLP"""
        tqdm.write("[NgramProcessor] Extracting regex subsequences...")
        
        # Find potential multi-word subsequences (2-5 words) that occur across sentences
        potential_subsequences = set()
        
        # Process each sentence to find multi-word phrases
        for sentence in sentences:
            # First, clean the sentence by removing punctuation and converting to lowercase
            clean_sentence = re.sub(r'[^\w\s&-./\'()[\]$€£¥+%]', ' ', sentence.lower())
            
            # Extract sequences of 2-6 words
            for i in range(2, 7):  # Try sequences of length 2-6 words 
                pattern = r'\b(\w+(?:[-&\s./\'()[\]$€£¥+%]+\w+){' + str(i-1) + r'})\b'
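                # For i == 2 this builds r'\b(\w+(?:[-&\s./\'()[\]$€£¥+%]+\w+){1})\b':
                # two word tokens joined by whitespace, hyphens, or the listed symbols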
                matches = re.findall(pattern, clean_sentence)
                potential_subsequences.update(matches)
        
        # Filter out sequences that consist only of stopwords (but preserve numbers)
        filtered_subsequences = []
        for subseq in potential_subsequences:
            words = re.split(r'[\s-]+', subseq)  # Split on spaces or hyphens
            
            # Function to check if a word is a number or percentage
            def is_numeric(word):
                return bool(re.match(r'^\d+(\.\d+)?%?$|^\d+-\d+$', word))
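            # is_numeric("45") -> True; is_numeric("3.5%") -> True;
            # is_numeric("7-10") -> True; is_numeric("teams") -> False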
            
            # Skip if ALL words are stopwords and none are numeric
            if all((word in self.stop_words and not is_numeric(word)) for word in words):
                tqdm.write(f"[NgramProcessor] Skipping all-stopword phrase: {subseq}")
                continue
                
            # Keep multi-word sequences that contain at least one significant word
            # (a non-stopword longer than two characters) or a number/percentage;
            # the len(words) > 1 guard already rules out standalone stopwords
            if len(words) > 1 and any(
                word not in self.stop_words and (len(word) > 2 or is_numeric(word)) for word in words
            ):
                filtered_subsequences.append(subseq)
        
        # Count occurrences across all sentences
        subseq_counts = Counter()
        for subseq in filtered_subsequences:
            for sentence in sentences:
                if re.search(r'\b' + re.escape(subseq) + r'\b', sentence.lower()):
                    subseq_counts[subseq] += 1
        
        # Keep only subsequences that appear in multiple sentences
        threshold = max(2, int(len(sentences) * 1.0))  # must appear in every sentence (minimum 2)
        regex_candidates = [subseq for subseq, count in subseq_counts.items() 
                        if count >= threshold]
        
        tqdm.write(f"[NgramProcessor] Found {len(regex_candidates)} regex subsequences")
        return regex_candidates
    
    def filter_standalone_stopwords(self, ngrams_dict):
        """Remove standalone stopwords and very short terms from the ngrams dictionary"""
        filtered_dict = {}
        for sentence, ngrams in ngrams_dict.items():
            filtered_dict[sentence] = {}
            for ngram, indices in ngrams.items():
                words = ngram.split()
                # Skip single stopwords and very short terms, except bare numbers
                if len(words) == 1 and (words[0] in self.stop_words or len(words[0]) < 3):
                    if re.match(r'^\d+$', words[0]):
                        filtered_dict[sentence][ngram] = indices
                    continue
                # Skip if ALL words are stopwords
                if all(word in self.stop_words for word in words):
                    continue
                filtered_dict[sentence][ngram] = indices
        return filtered_dict

    def extract_named_entities(self, sentences):
        entity_counter = Counter()
        
        # Process each sentence with each model
        for model_name, nlp_model in self.models.items():
            tqdm.write(f"[NgramProcessor] Extracting entities with model: {model_name}")
            docs = list(nlp_model.pipe(sentences))
            
            # Process each sentence
            for doc in docs:
                for ent in doc.ents:
                    # Include entity types relevant to this model
                    # This is a comprehensive list - some models may not use all these types
                    if ent.label_ in {
                        # People, organizations, locations
                        "PERSON", "ORG", "GPE", "LOC", "NORP",
                        
                        # Facilities and products
                        "FAC", "PRODUCT", "WORK_OF_ART", "EVENT",
                        
                        # Numeric entities
                        "DATE", "TIME", "MONEY", "QUANTITY", "PERCENT", "CARDINAL", "ORDINAL",
                        
                        # Others
                        "LAW", "LANGUAGE",
                        
                        # Scientific entities
                        "SCIENTIFIC", "SUBSTANCE", "CHEMICAL", "TECHNOLOGY",
                        
                        # Medical entities
                        "DISEASE", "MEDICAL", "CLINICAL", "TREATMENT", "SYMPTOM", "DIAGNOSTIC", 
                        "ANATOMICAL", "BIOLOGY", "GENE", "PROTEIN", "DRUG",
                        
                        # Legal entities
                        "LEGAL", "COURT", "STATUTE", "PROVISION", "CASE_CITATION", "JUDGE",
                        "LEGAL_ROLE", "REGULATION", "CONTRACT"
                    }:
                        # Handle possessive forms by stripping 's
                        clean_entity = re.sub(r"'s\b", "", ent.text.lower()).strip()
                        # Counts are pooled across models: the same entity found by
                        # several models accumulates into a single counter entry
                        entity_counter[clean_entity] += 1
        
        threshold = max(1, len(sentences) * 1.0)  # Entity must appear in every sentence
        return [ent for ent, count in entity_counter.items() if count >= threshold]

    def extract_domain_specific_entities(self, text):
        """Extract entities from all models and categorize by domain"""
        domain_entities = {}
        
        for model_name, nlp_model in self.models.items():
            doc = nlp_model(text)
            domain_entities[model_name] = [(ent.text, ent.label_) for ent in doc.ents]
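        # Illustrative output (labels depend on the loaded models):
        #   {"en_core_web_trf": [("NBA", "ORG"), ("tonight", "TIME")]}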
            
        return domain_entities

    def is_substring_of_any(self, ngram, common_ngrams):
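        """Return True if `ngram` occurs inside a longer string in `common_ngrams`.

        Illustrative: is_substring_of_any("play-in", ["nba play-in tournament"]) -> True
        """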
        for other_ngram in common_ngrams:
            if ngram != other_ngram and ngram in other_ngram:
                return True
        return False

    def find_filtered_ngrams(self, sentences):
        tqdm.write("[NgramProcessor] Processing...")

        # Step 1: First extract numerical ranges or standalone numbers (special priority)
        number_ranges = self.extract_number_ranges(sentences)
        standalone_numbers = self.extract_standalone_numbers(sentences)  
        
        # Step 2: Use regex to find common subsequences
        regex_subsequences = self.extract_regex_subsequences(sentences)
        tqdm.write(f"[NgramProcessor] Regex Subsequences: {regex_subsequences}")
        
        # Step 3: Then apply spaCy to detect named entities
        named_entities = self.extract_named_entities(sentences)
        # Make sure percentage values have proper format
        for i, entity in enumerate(named_entities):
            if re.match(r'\d+$', entity) and any(f"{entity}%" in sentence for sentence in sentences):
                # Replace standalone digit with percentage if it appears as percentage in text
                named_entities[i] = f"{entity}%"

        tqdm.write(f"[NgramProcessor] Named Entities: {named_entities}")
        
        # Step 4: Consolidate and filter all detected patterns
        # Collect all patterns in one list
        all_patterns = number_ranges + regex_subsequences + named_entities + standalone_numbers
        
        # Sort by length (longer first) to prioritize more specific patterns
        all_patterns.sort(key=len, reverse=True)
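        # e.g. ["nba play-in tournament", "play-in", "7-10"]: the longest, most
        # specific pattern is considered first in the dedup pass below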
        
        # Remove duplicates while preserving order
        unique_patterns = []
        seen = set()
        for pattern in all_patterns:
            if pattern not in seen:
                # Check if this pattern is a substring of any already selected pattern
                is_substring = False
                for selected_pattern in unique_patterns:
                    if pattern in selected_pattern and pattern != selected_pattern:
                        is_substring = True
                        break
                if not is_substring:
                    unique_patterns.append(pattern)
                    seen.add(pattern)
        # Re-index sequentially
        indexed_patterns = [(i + 1, pattern) for i, pattern in enumerate(unique_patterns)]
        self.indexed_patterns = indexed_patterns
        tqdm.write(f"[NgramProcessor] Filtered non-melting points ({len(unique_patterns)}): {unique_patterns}")

        # Second substring pass: unique_patterns is sorted longest-first, so any
        # pattern contained in an already-kept longer pattern is dropped -- except
        # standalone numbers, which are always kept
        standalone_numbers_set = set(standalone_numbers)
        non_melting_points = []
        for pattern in unique_patterns:
            is_substring = any(pattern in longer for longer in non_melting_points)
            if not is_substring or pattern in standalone_numbers_set:
                non_melting_points.append(pattern)
                
        # For remaining cases that might have been missed, apply NLTK n-gram extraction
        # Only on cleaned sentences (less computationally expensive now)
        clean_to_original = {}
        sentences_cleaned = []
        
        # Process sentences with spaCy to preserve entity information
        docs = list(self.nlp.pipe(sentences))
        
        for i, doc in enumerate(docs):
            original_sentence = sentences[i]
            entity_texts = {ent.text.lower() for ent in doc.ents if len(ent.text.split()) > 1}

            # Tokenize while preserving entities and numerical ranges
            tokens = []
            j = 0
            words = [token.text for token in doc]
            
            while j < len(words):
                # First check for numerical ranges
                current_word = words[j].lower()
                if self.number_range_pattern.match(current_word):
                    tokens.append(current_word)
                    j += 1
                    continue
                
                # Then check for entities
                matched_entity = None
                for ent in sorted(entity_texts, key=len, reverse=True):
                    ent_words = ent.split()
                    if j + len(ent_words) <= len(words) and [w.lower() for w in words[j:j+len(ent_words)]] == ent_words:
                        matched_entity = " ".join(words[j:j+len(ent_words)])
                        tokens.append(matched_entity.lower())  # preserve full entity
                        j += len(ent_words)
                        break
                
                if not matched_entity:
                    word = words[j].lower()
                    if word not in self.stop_words and re.match(r'\w+', word):
                        tokens.append(word)
                    j += 1

            cleaned = " ".join(tokens)
            sentences_cleaned.append(cleaned)
            clean_to_original[cleaned] = original_sentence
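        # Illustrative cleaning (assuming no multi-word entities are detected here):
        #   "The teams battle for a spot tonight." -> "teams battle spot tonight"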

        # Step 5: Only run n-gram extraction on gaps not covered by regex and named entities
        ngram_lengths = [4, 3, 2, 1]  # Consider shorter n-grams now since we already have longer phrases
        all_ngrams_by_length = {}
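        # nltk.util.ngrams yields sliding windows of n tokens, e.g.
        #   list(ngrams(["teams", "battle", "spot"], 2)) -> [("teams", "battle"), ("battle", "spot")]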

        for n in ngram_lengths:
            all_ngrams = []
            for sentence in sentences_cleaned:
                tokens = sentence.split()
                if len(tokens) >= n:
                    sent_ngrams = list(ngrams(tokens, n))
                    all_ngrams.extend(sent_ngrams)
            all_ngrams_by_length[n] = Counter(all_ngrams)

        # Step 6: Add additional n-grams that are frequent but weren't caught by regex or named entities
        threshold_factor = 1.0  # Gap n-grams must appear in every sentence
        
        for n_size in sorted(ngram_lengths, reverse=True):
            ngram_counts = all_ngrams_by_length[n_size]
            threshold = max(2, int(len(sentences) * threshold_factor))
            
            # Sort by count for efficiency
            for ngram, count in ngram_counts.most_common():
                if count >= threshold:
                    ngram_str = ' '.join(ngram)
                    # Skip if it is already collected or is a substring of an existing pattern
                    if ngram_str not in non_melting_points and not self.is_substring_of_any(ngram_str, non_melting_points):
                        non_melting_points.append(ngram_str)
        
        # Create a sorted view for efficient lookup (longest patterns first)
        sorted_non_melting_points = sorted(non_melting_points, key=len, reverse=True)
        final_indexed_patterns = [(i + 1, pattern) for i, pattern in enumerate(sorted_non_melting_points)]
        
        # Filter out n-grams that consist entirely of stop words
        filtered_patterns = []
        for idx, pattern in final_indexed_patterns:
            words = pattern.lower().split()
            
            # Check if the pattern is a number or contains a number
            has_number = any(re.match(r'.*\d+.*', word) for word in words)
            
            # If the pattern has a number OR has any non-stop word, keep it
            if has_number or any(word not in self.stop_words for word in words):
                filtered_patterns.append((idx, pattern))
            else:
                tqdm.write(f"[NgramProcessor] Removing n-gram with all stop words: {pattern}")
        
        # Reassign filtered patterns with reindexed values
        self.indexed_patterns = [(i+1, pattern) for i, (_, pattern) in enumerate(filtered_patterns)]
        
        # Generate the results with more efficient regex matching
        result = {}
        for sentence in sentences:
            sentence_result = {}
            for _, ngram in self.indexed_patterns:  # Use the filtered patterns
                # Skip single word stopwords and short terms
                words = ngram.split()
                if len(words) == 1 and (words[0] in self.stop_words or len(words[0]) < 3):
                    continue
                    
                # Handle numerical ranges differently - need exact matching
                if self.number_range_pattern.match(ngram):
                    pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
                else:
                    # Compile the regex pattern once per n-gram - modified to handle special characters
                    pattern = re.compile(r'(?<!\w)' + re.escape(ngram) + r'(?!\w)', re.IGNORECASE)
                
                matches = list(pattern.finditer(sentence))
                
                if matches:
                    indices = []
                    for match in matches:
                        # Calculate word indices with improved handling for hyphenated terms
                        start_pos = match.start()
                        text_before = sentence[:start_pos]
                        
                        # 0-based word index = number of whitespace-separated words
                        # before the match (a hyphenated term counts as one word)
                        start_idx = len(text_before.split())
                        
                        # Count words in the matched n-gram (handling hyphens as single terms)
                        if self.number_range_pattern.match(ngram):
                            # Numerical ranges count as one term
                            ngram_word_count = 1
                        else:
                            ngram_word_count = len(re.findall(r'\S+', ngram))
                        
                        end_idx = start_idx + ngram_word_count - 1
                        
                        indices.append((start_idx, end_idx))
                    
                    if indices:  # Only add if we found valid indices
                        sentence_result[ngram] = indices
            
            result[sentence] = sentence_result
        
        # Apply the stopword filter before returning 
        result = self.filter_standalone_stopwords(result)
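        # Return shape (illustrative):
        #   result  -> {sentence: {ngram: [(start_word_idx, end_word_idx), ...], ...}}
        #   indexed -> {1: "longest pattern", 2: "next longest", ...}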
        return result, dict(self.indexed_patterns)

    def find_relative_order(self, sentence, common_ngrams):
        # First, identify all possible matches without modifying the sentence
        all_matches = []
        
        for ngram in common_ngrams:
            # Special handling for n-grams containing symbols (currency, punctuation, %)
            if any(char in ngram for char in '&-/.\'()[]$€£¥+%'):
                pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
            # Handle numerical ranges
            elif self.number_range_pattern.match(ngram):
                pattern = re.compile(r'\b' + re.escape(ngram) + r'\b', re.IGNORECASE)
            else:
                pattern = re.compile(r'(?<!\w)' + re.escape(ngram) + r"(?:'s)?(?!\w)", re.IGNORECASE)
            
            for match in pattern.finditer(sentence):
                start, end = match.span()
                # Store the character span, the ngram text, and its token count
                all_matches.append((start, end, ngram, len(ngram.split())))
        
        # Pre-process: identify all word spans in the original sentence
        words = []
        word_spans = []
        for match in re.finditer(r'\S+', sentence):
            words.append(match.group())
            word_spans.append((match.start(), match.end()))
        
        # Create a mapping from character positions to word indices
        char_to_word_idx = {}
        for i, (start, end) in enumerate(word_spans):
            for pos in range(start, end + 1):
                char_to_word_idx[pos] = i
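        # e.g. for "NBA tips off": positions 0-3 -> word 0 ("NBA"), 4-8 -> word 1
        # ("tips"), 9-12 -> word 2 ("off"); end + 1 lets match.end() offsets resolve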
        
        # Sort by length in characters first, then by word count
        all_matches.sort(key=lambda x: (-len(x[2]), -x[3], x[0])) 
        
        # Filter out ngrams that overlap with already claimed ranges
        filtered_matches = []
        claimed_ranges = []
        
        for start, end, ngram, length in all_matches:
            # Check if this match overlaps with any existing claimed range
            is_overlapping = False
            for c_start, c_end in claimed_ranges:
                # Check for any overlap
                if max(start, c_start) < min(end, c_end):
                    is_overlapping = True
                    break
            
            if not is_overlapping:
                # Add this ngram to our filtered list
                filtered_matches.append((start, end, ngram, length))
                # Claim its range
                claimed_ranges.append((start, end))
        
        # Sort filtered matches by position for final ordering
        filtered_matches.sort(key=lambda x: x[0])
        
        # Create word-level indices for the final matches
        word_level_matches = []
        for start, end, ngram, _ in filtered_matches:
            # Find the word index for the start and end positions
            try:
                start_word_idx = char_to_word_idx.get(start, char_to_word_idx.get(start+1))
                end_word_idx = char_to_word_idx.get(end-1, char_to_word_idx.get(end-2))
                
                if start_word_idx is not None and end_word_idx is not None:
                    word_level_matches.append((start_word_idx, end_word_idx, ngram))
            except (KeyError, IndexError):
                # Skip this match if we can't determine word indices
                continue
        
        # Create the final order with 1-based indexing
        ngram_to_index = {pattern: idx for idx, pattern in self.indexed_patterns}
        relative_order = [(ngram_to_index.get(ngram, i+1), ngram) for i, (_, _, ngram) in enumerate(word_level_matches)]
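        # Illustrative result: [(3, "nba play-in tournament"), (7, "7-10"), ...];
        # indices come from self.indexed_patterns, so they are stable across sentences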
        
        return relative_order, sentence


# Example usage
if __name__ == "__main__":
    # Test with NBA Play-In Tournament example
    sentences = [
        "The NBA Play-In Tournament tips off tonight as the No. 7-10 teams in each conference battle for a spot in the playoffs. Here's everything you need to know as the action unfolds.",
        "Tonight the NBA Play-In Tournament begins with No. 7-10 teams from each conference competing for playoff spots. Here's your guide to following all the action.",
        "The NBA Play-In Tournament kicks off this evening featuring the No. 7-10 teams across both conferences fighting for playoff positions. Here's what you should know about the upcoming games.",
        "Starting tonight, the NBA Play-In Tournament will showcase the No. 7-10 teams from each conference as they compete for remaining playoff berths. Here's your complete guide to the action.",
        "The NBA Play-In Tournament begins tonight with the No. 7-10 teams in both conferences battling for playoff spots. Here's everything you need to know about the upcoming games.",
        "Tonight marks the start of the NBA Play-In Tournament where No. 7-10 teams in each conference compete for playoff positions. Here's your essential guide to following the action.",
        "The NBA Play-In Tournament tips off tonight, featuring No. 7-10 teams from both conferences fighting for playoff berths. Here's what you need to know about the tournament.",
        "Beginning tonight, the NBA Play-In Tournament will pit the No. 7-10 teams in each conference against each other for playoff spots. Here's everything you should know about the games.",
        "The NBA Play-In Tournament starts tonight with No. 7-10 teams across both conferences competing for playoff positions. Here's your complete guide to all the action.",
        "Tonight is the tip-off of the NBA Play-In Tournament where the No. 7-10 teams from each conference battle for remaining playoff spots. Here's what you need to know as the games unfold."
    ]
    
    # Initialize with multiple models
    processor = NgramProcessor(models=["en_core_web_trf"])
    
    # Process with all models combined
    common_ngrams, indexed_ngrams = processor.find_filtered_ngrams(sentences)

    # Print results
    print("Common n-grams with indices per sentence:")
    for sentence in sentences:
        order, updated_sentence = processor.find_relative_order(sentence, common_ngrams[sentence])
        print(f"Sentence: {sentence}")
        print(f"Order: {order}")
        print()
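
    # The pattern index map is shared across sentences and can be printed directly:
    print("Indexed patterns:")
    for idx, pattern in indexed_ngrams.items():
        print(f"  {idx}: {pattern}")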