"""
Similarity metrics for text comparison
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """
    Calculate cosine similarity between two texts using TF-IDF vectorization.
    
    Args:
        text1 (str): First text
        text2 (str): Second text
        
    Returns:
        float: Cosine similarity score between 0 and 1
    """
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    
    try:
        # Transform texts into TF-IDF vectors
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        
        # Calculate cosine similarity
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        return float(similarity)
    except Exception as e:
        print(f"Error calculating cosine similarity: {e}")
        return 0.0

def calculate_jaccard_similarity(text1, text2):
    """
    Calculate Jaccard similarity between two texts (word-level).
    
    Args:
        text1 (str): First text
        text2 (str): Second text
        
    Returns:
        float: Jaccard similarity score between 0 and 1
    """
    # Convert to sets of words
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())
    
    # Calculate Jaccard similarity
    if not words1 and not words2:
        return 1.0  # If both are empty, they're identical
    
    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))
    return intersection / union

def calculate_semantic_similarity(text1, text2):
    """
    Calculate pseudo-semantic similarity by comparing word overlap patterns.
    
    This is a simplified approach that doesn't use embedding models like Word2Vec or BERT.
    
    Args:
        text1 (str): First text
        text2 (str): Second text
        
    Returns:
        float: Semantic similarity score between 0 and 1
    """
    # Pseudo-semantic score: a weighted blend of the cosine and Jaccard metrics above.
    # A production system would use sentence embeddings (e.g., BERT-based models) instead.
    cosine = calculate_cosine_similarity(text1, text2)
    jaccard = calculate_jaccard_similarity(text1, text2)
    
    # Weight more towards cosine similarity
    return 0.7 * cosine + 0.3 * jaccard

def calculate_similarity(text1, text2, metrics=None):
    """
    Calculate various similarity metrics between two texts.
    
    Args:
        text1 (str): First text
        text2 (str): Second text
        metrics (list): List of metrics to calculate
        
    Returns:
        dict: Dictionary of similarity scores
    """
    if metrics is None:
        metrics = ["Cosine Similarity", "Jaccard Similarity", "Semantic Similarity"]
    
    results = {}
    
    if "Cosine Similarity" in metrics:
        results["cosine_similarity"] = calculate_cosine_similarity(text1, text2)
    
    if "Jaccard Similarity" in metrics:
        results["jaccard_similarity"] = calculate_jaccard_similarity(text1, text2)
    
    if "Semantic Similarity" in metrics:
        results["semantic_similarity"] = calculate_semantic_similarity(text1, text2)
    
    return results
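

# Illustrative usage sketch (not part of the original module): shows how the dispatcher
# combines the three metrics. The sample strings below are assumptions for demonstration only.
if __name__ == "__main__":
    sample_a = "The quick brown fox jumps over the lazy dog"
    sample_b = "A quick brown dog leaps over the lazy fox"

    scores = calculate_similarity(sample_a, sample_b)
    for name, score in scores.items():
        print(f"{name}: {score:.3f}")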