File size: 4,296 Bytes
532392b
 
 
 
7cfdc4c
532392b
 
 
 
 
 
 
 
 
 
 
 
 
99b887a
 
 
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4bc190
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4bc190
 
 
 
 
 
 
532392b
c4bc190
 
 
532392b
 
 
c4bc190
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4bc190
532392b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import streamlit as st
import pandas as pd
import json
import os
import gzip
from sentence_transformers import SentenceTransformer, util
from loguru import logger

# ================== CONFIGURATION ==================
# Must be the first Streamlit call in the script (Streamlit requires
# set_page_config before any other st.* call).
st.set_page_config(page_title="Problem Deduplication Explorer", layout="wide")

# Load a pre-trained model for embeddings.
# NOTE(review): loaded at module import on every cold start; first run will
# download the weights — assumes network access or a local model cache.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# Load preloaded dataset
@st.cache_data
def load_data():
    """Load and decompress the merged dataset.

    Returns:
        pd.DataFrame: the full merged dataset read from
        ``data/merged_dataset.csv.zip``.

    Cached by Streamlit so the file is only read once per session state.
    """
    file_path = "data/merged_dataset.csv.zip"
    # NOTE(review): extension says .zip but the file is opened as gzip —
    # confirm the actual compression; if the name is accurate,
    # pd.read_csv(file_path) would infer zip compression on its own.
    with gzip.open(file_path, "rt") as f:
        # BUG FIX: the original function never returned the frame, so the
        # module-level `df` was None and every df[...] access below crashed.
        return pd.read_csv(f)

df = load_data()

# ================== FUNCTION DEFINITIONS ==================
def compute_embeddings(problems):
    """Encode problem texts into L2-normalized sentence embeddings."""
    vectors = model.encode(problems, normalize_embeddings=True)
    return vectors

def find_similar_problems(df, similarity_threshold=0.9):
    """Group near-duplicate problems by pairwise cosine similarity.

    Returns a dict mapping a row's uuid to a list of
    (other_uuid, similarity) pairs for every LATER row whose similarity
    exceeds ``similarity_threshold``. Only pairs (i, j) with j > i are
    emitted, so each duplicate pair is reported exactly once.
    """
    texts = df['problem'].tolist()
    embeddings = compute_embeddings(texts)
    sim = util.cos_sim(embeddings, embeddings).numpy()

    clusters = {}
    n = len(df)
    for i in range(n):
        matches = []
        for j in range(i + 1, n):
            score = sim[i][j]
            if score > similarity_threshold:
                # float() strips numpy.float32 so the result is JSON-friendly
                matches.append((df["uuid"][j], float(score)))
        if matches:
            clusters[df["uuid"][i]] = matches
    return clusters

def analyze_clusters(df, similarity_threshold=0.9):
    """Analyze duplicate problem clusters.

    For every base problem with near-duplicates, compares each non-uuid
    column of the base row against the duplicate row and records the two
    values plus whether they match. All numpy scalars are unwrapped to
    native Python types so the structure can be fed to json.dumps directly.
    """
    detailed_analysis = {}
    for base_uuid, matches in find_similar_problems(df, similarity_threshold).items():
        base_row = df[df["uuid"] == base_uuid].iloc[0]
        entries = []
        for other_uuid, score in matches:
            other_row = df[df["uuid"] == other_uuid].iloc[0]
            diffs = {}
            for col in df.columns:
                if col == "uuid":
                    continue
                base_val = base_row[col]
                other_val = other_row[col]
                # .item() unwraps numpy scalars into plain Python values
                if hasattr(base_val, 'item'):
                    base_val = base_val.item()
                if hasattr(other_val, 'item'):
                    other_val = other_val.item()
                diffs[col] = {
                    'base': base_val,
                    'comparison': other_val,
                    # bool() keeps numpy.bool_ out of the JSON export
                    'match': bool(base_val == other_val),
                }
            entries.append({
                'uuid': other_uuid,
                # float() strips numpy.float32 for serialization
                'similarity_score': float(score),
                'column_differences': diffs,
            })
        detailed_analysis[base_uuid] = entries
    return detailed_analysis

# ================== STREAMLIT UI ==================
st.title("πŸ” Problem Deduplication Explorer")

st.sidebar.header("Settings")
# Cosine-similarity cutoff above which two problems count as near-duplicates.
similarity_threshold = st.sidebar.slider(
    "Similarity Threshold", min_value=0.5, max_value=1.0, value=0.9, step=0.01
)

# The analysis computes an all-pairs similarity matrix, so it only runs
# on explicit demand rather than on every rerun.
if st.sidebar.button("Run Deduplication Analysis"):
    with st.spinner("Analyzing..."):
        results = analyze_clusters(df, similarity_threshold)
    st.success("Analysis Complete!")
    
    st.subheader("πŸ“Š Duplicate Problem Clusters")
    # One section per base problem; each lists its near-duplicates with scores
    # and an expandable per-column diff.
    for base_uuid, cluster in results.items():
        base_problem = df[df["uuid"] == base_uuid]["problem"].values[0]
        st.markdown(f"### Problem: {base_problem}")
        for entry in cluster:
            similar_problem = df[df["uuid"] == entry["uuid"]]["problem"].values[0]
            st.write(f"**Similar to:** {similar_problem}")
            st.write(f"**Similarity Score:** {entry['similarity_score']:.4f}")
            with st.expander("Show Column Differences"):
                st.json(entry["column_differences"])
            st.markdown("---")

    # Export results
    # analyze_clusters already unwrapped numpy types to native Python,
    # so json.dumps needs no custom encoder here.
    st.sidebar.download_button(
        label="Download Results as JSON",
        data=json.dumps(results, indent=2),
        file_name="deduplication_results.json",
        mime="application/json"
    )

# ================== DATAFRAME DISPLAY ==================
# Always-visible raw view of the loaded dataset, independent of the analysis.
st.subheader("πŸ“„ Explore the Dataset")
st.dataframe(df)