File size: 15,371 Bytes
d86b25e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
"""

Optimized Game Fuzzy Matcher for G-Assist LLM Integration

Handles game name variations, prioritizes Steam API data, and provides intelligent matching.

"""

import re
import logging
from collections import defaultdict
from typing import List, Tuple, Dict, Optional
import asyncio
from src.rtx_llm_analyzer import GAssistLLMAnalyzer


class OptimizedGameFuzzyMatcher:
    """

    Ultra-optimized fuzzy matcher for video game titles

    Designed for G-Assist LLM integration with game-specific optimizations

    """

    def __init__(self, threshold: float = 0.75):
        self.threshold = threshold
        self.cache = {}  # Unified cache for all preprocessing
        self.logger = logging.getLogger(__name__)
        self.llm_analyzer = GAssistLLMAnalyzer()
        
        # Direct game name mapping with numbers preserved
        self.game_map = {
            # Diablo games with numbers preserved
            'diablo': 'diablo',
            'diablo i': 'diablo',
            'diablo 2': 'diablo ii',
            'diablo 3': 'diablo iii',
            'diablo 4': 'diablo iv',
            
            'grand theft auto': 'grand theft auto',
            'gta': 'grand theft auto',
            'gta 3': 'grand theft auto 3',
            'gta iii': 'grand theft auto 3',
            'gta 4': 'grand theft auto 4',
            'gta iv': 'grand theft auto 4',
            'gta 5': 'grand theft auto 5',
            'gta v': 'grand theft auto 5',           
        }

        # Expanded game-specific mappings for common abbreviations
        self.acronym_map = {
            'gta': ['grand', 'theft', 'auto'],
            'cod': ['call', 'of', 'duty'],
            'cs': ['counter', 'strike'],
            'csgo': ['counter', 'strike', 'global', 'offensive'],
            'pubg': ['playerunknowns', 'battlegrounds'],
            'ac': ['assassins', 'creed'],
            'ds': ['dark', 'souls'],
            'gow': ['god', 'of', 'war'],
            'hzd': ['horizon', 'zero', 'dawn'],
            'botw': ['breath', 'of', 'the', 'wild'],
            'mw': ['modern', 'warfare'],
            'nfs': ['need', 'for', 'speed'],
            'ff': ['final', 'fantasy'],
            'lol': ['league', 'of', 'legends'],
            'wow': ['world', 'of', 'warcraft'],
            'diablo': ['diablo']
        }

        # Number normalization patterns (critical for Diablo 4 -> Diablo IV)
        self.roman_map = {
            'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5',
            'vi': '6', 'vii': '7', 'viii': '8', 'ix': '9', 'x': '10',
            'xi': '11', 'xii': '12', 'xiii': '13', 'xiv': '14', 'xv': '15'
        }
        
        # Reverse mapping for number to roman conversion
        self.number_to_roman = {v: k for k, v in self.roman_map.items()}

        # Common game subtitles/editions to handle gracefully
        self.edition_words = {
            'edition', 'remastered', 'remake', 'definitive', 'ultimate',
            'goty', 'complete', 'deluxe', 'special', 'anniversary', 'enhanced'
        }

    def preprocess_title(self, title: str) -> List[str]:
        """Preprocess and tokenize game title with aggressive normalization."""
        cache_key = f"prep_{title}"
        if cache_key in self.cache:
            return self.cache[cache_key]

        # Lowercase and remove special chars
        clean = re.sub(r'[^a-z0-9\s]', ' ', title.lower())
        
        # Keep original game name with numbers intact
        original_tokens = clean.split()
        
        # Split into tokens
        tokens = clean.split()
        
        # Always preserve original game name with number intact
        if len(original_tokens) >= 2 and any(t.isdigit() or t in self.roman_map for t in original_tokens):
            self.logger.info(f"Preserving numbered game title: {title}")
            self.cache[cache_key] = original_tokens
            return original_tokens

        # Process each token
        processed_tokens = []
        i = 0
        while i < len(tokens):
            token = tokens[i]

            # Check if token is a known acronym
            if token in self.acronym_map:
                processed_tokens.extend(self.acronym_map[token])
            
            # Check for multi-token acronyms (like cs:go -> csgo)
            elif i + 1 < len(tokens):
                combined = token + tokens[i + 1]
                if combined in self.acronym_map:
                    processed_tokens.extend(self.acronym_map[combined])
                    i += 1  # Skip next token
                else:
                    processed_tokens.append(self.normalize_token(token))
            else:
                processed_tokens.append(self.normalize_token(token))

            i += 1

        self.cache[cache_key] = processed_tokens
        return processed_tokens

    def normalize_token(self, token: str) -> str:
        """Normalize individual token with bidirectional roman/number conversion."""
        # Convert Roman numerals to numbers
        if token in self.roman_map:
            return self.roman_map[token]
        
        # Convert numbers to roman numerals for reverse matching
        if token in self.number_to_roman:
            return self.number_to_roman[token]

        # Handle year formats (23 -> 2023)
        if token.isdigit() and len(token) == 2:
            year = int(token)
            if year < 50:
                return f"20{token}"
            else:
                return f"19{token}"

        # Preserve numbers to avoid losing them in game titles
        if token.isdigit() or token in self.roman_map:
            return token
            
        return token

    def fuzzy_match_with_variants(self, query: str, target: str) -> float:
        """

        Enhanced fuzzy matching that creates multiple variants for comparison.

        Specifically handles cases like "Diablo 4" -> "Diablo IV"

        """
        # Quick exact match check
        if query.lower() == target.lower():
            return 1.0

        # Generate variants of both query and target
        query_variants = self.generate_variants(query)
        target_variants = self.generate_variants(target)

        # Find best match among all combinations
        max_score = 0.0
        
        for q_variant in query_variants:
            for t_variant in target_variants:
                score = self.basic_fuzzy_match(q_variant, t_variant)
                max_score = max(max_score, score)
                
                # Early termination for perfect matches
                if score >= 0.95:
                    return score

        return max_score

    def generate_variants(self, title: str) -> List[str]:
        """Generate multiple variants of a game title for robust matching."""
        variants = [title]
        
        # Original preprocessing
        tokens = self.preprocess_title(title)
        if tokens:
            variants.append(' '.join(tokens))
        
        # Roman/Number conversion variants
        lower_title = title.lower()
        
        # Convert numbers to roman numerals
        for num, roman in self.number_to_roman.items():
            if num in lower_title:
                variant = lower_title.replace(num, roman)
                variants.append(variant)
        
        # Convert roman numerals to numbers  
        for roman, num in self.roman_map.items():
            if roman in lower_title:
                variant = lower_title.replace(roman, num)
                variants.append(variant)
        
        # Remove duplicates while preserving order
        seen = set()
        unique_variants = []
        for variant in variants:
            if variant not in seen:
                seen.add(variant)
                unique_variants.append(variant)
        
        return unique_variants

    def basic_fuzzy_match(self, title1: str, title2: str) -> float:
        """Basic fuzzy matching with token-based similarity."""
        tokens1 = self.preprocess_title(title1)
        tokens2 = self.preprocess_title(title2)
        
        if not tokens1 or not tokens2:
            return 0.0

        set1, set2 = set(tokens1), set(tokens2)
        
        # Calculate Jaccard similarity
        intersection = set1 & set2
        union = set1 | set2
        
        if not union:
            return 0.0

        # Base score
        jaccard = len(intersection) / len(union)

        # Weight adjustments for game-specific patterns
        weight = 1.0

        # Boost score if main game words match
        main_words = intersection - self.edition_words
        if main_words:
            weight += 0.2

        # Small penalty for missing edition words
        edition_diff = (set1 ^ set2) & self.edition_words
        if edition_diff:
            weight -= 0.05 * len(edition_diff)

        return min(1.0, jaccard * weight)

    def normalize_game_name(self, game_name: str) -> str:
        """

        Normalize game name for consistent caching and matching.

        

        Args:

            game_name: Original game name

            

        Returns:

            Normalized game name with roman numerals and standardized format

        """
        # Use the preprocess_title method and join back the tokens
        tokens = self.preprocess_title(game_name)
        return ' '.join(tokens)

    async def find_best_match(self, query: str, candidates: List[str],

                       steam_priority: bool = True) -> Optional[Tuple[str, float]]:
        """

        Find the best match with Steam API prioritization and simplified mapping.

        

        Args:

            query: Game name to search for

            candidates: List of candidate game names

            steam_priority: Whether to prioritize results that look like Steam data

            

        Returns:

            Tuple of (best_match, confidence_score) or None

        """
        if not candidates:
            return None
            
        # First, try direct mapping using the game_map
        query_lower = query.lower()
        if query_lower in self.game_map:
            mapped_name = self.game_map[query_lower]
            self.logger.info(f"Direct game map match: '{query}' -> '{mapped_name}'")
            
            # Find this mapped name in the candidates (case-insensitive)
            # First, try exact match
            for candidate in candidates:
                if candidate.lower() == mapped_name.lower():
                    # For 'diablo 3', return 'Diablo III' from candidates with proper capitalization
                    return candidate, 1.0
                    
            # Then try contains - needed for games like "Diablo III" which might be "Diablo III: Reaper of Souls" in candidates
            for candidate in candidates:
                if mapped_name.lower() in candidate.lower().split():
                    self.logger.info(f"Partial match for mapped name: '{mapped_name}' found in '{candidate}'")
                    return candidate, 0.95
                    
        # Look for exact match in candidates
        for candidate in candidates:
            if candidate.lower() == query_lower:
                return candidate, 1.0
                
        # For games with numbers, preserve the exact numbered version from the query
        query_words = query_lower.split()
        if len(query_words) >= 2 and any(w.isdigit() or w in self.roman_map for w in query_words):
            # Keep the exact query if it contains a number
            self.logger.info(f"Preserving numbered query: {query}")
            
            # Check if any candidate contains both the base name and the number
            for candidate in candidates:
                candidate_lower = candidate.lower()
                # Check if candidate contains all words from the query
                if all(word in candidate_lower for word in query_words):
                    return candidate, 1.0
                
            # Do NOT strip numbers for partial matching - numbered games are distinct entries
            # Instead, try to find candidates that have at least the base name
            # but return None if no exact match with number is found
            self.logger.info(f"No exact match for numbered game: '{query}'")
            return None
        
        # For simple fuzzy matching, use the best candidate with high enough score
        matches = []
        for candidate in candidates:
            # Simple similarity score
            query_set = set(query_lower.split())
            candidate_set = set(candidate.lower().split())
            
            intersection = query_set & candidate_set
            if intersection and len(intersection) / len(query_set) > 0.5:
                score = len(intersection) / max(len(query_set), len(candidate_set))
                matches.append((candidate, score))
        
        if matches:
            # Sort by score and return best match
            matches.sort(key=lambda x: x[1], reverse=True)
            best_match, best_score = matches[0]
            if best_score > 0.6:
                return best_match, best_score
                
        # If all else fails, return the first candidate
        if candidates:
            return candidates[0], 0.5
            
        return None

    def looks_like_steam_title(self, title: str) -> bool:
        """Heuristic to identify Steam-style game titles."""
        # Steam titles tend to be more formal and complete
        return (
            len(title) > 5 and  # Not just abbreviations
            not any(abbrev in title.lower() for abbrev in ['gta', 'cod', 'cs']) and
            ':' not in title  # Steam tends to use cleaner formatting
        )

    async def match_with_steam_fallback(self, query: str, steam_candidates: List[str], 

                                      cache_candidates: List[str]) -> Optional[Tuple[str, float, str]]:
        """

        Match with Steam API prioritization and local cache fallback.

        

        Returns:

            Tuple of (matched_name, confidence_score, source) or None

        """
        # First try Steam API candidates
        if steam_candidates:
            steam_match = await self.find_best_match(query, steam_candidates, steam_priority=True)
            if steam_match and steam_match[1] >= self.threshold:
                return steam_match[0], steam_match[1], "Steam API"

        # Fallback to local cache
        if cache_candidates:
            cache_match = await self.find_best_match(query, cache_candidates, steam_priority=False)
            if cache_match and cache_match[1] >= self.threshold:
                return cache_match[0], cache_match[1], "Local Cache"

        # Log failed match for debugging
        self.logger.warning(f"No fuzzy match found for '{query}' in {len(steam_candidates)} Steam + {len(cache_candidates)} cache candidates")
        return None


# Singleton instance for use across the application
game_fuzzy_matcher = OptimizedGameFuzzyMatcher(threshold=0.7)