rahideer committed on
Commit
12646dd
·
verified ·
1 Parent(s): a3f5843

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -61
app.py CHANGED
@@ -23,7 +23,7 @@ warnings.filterwarnings("ignore")
23
  MODEL_NAME = "microsoft/codebert-base"
24
  MAX_LENGTH = 512
25
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
26
- DATASET_PATH = "archive (1).zip" # Update this path if needed
27
 
28
  # Initialize models with caching
29
  @st.cache_resource
@@ -39,36 +39,30 @@ def load_models():
39
  @st.cache_resource
40
  def load_dataset():
41
  try:
42
- # Extract dataset if needed
43
  if not os.path.exists("Subject_CloneTypes_Directories"):
44
  with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
45
  zip_ref.extractall(".")
46
 
47
- # Load sample pairs (modify this based on your dataset structure)
48
  clone_pairs = []
49
  base_path = "Subject_CloneTypes_Directories"
50
 
51
- # Example: Load one pair from each clone type
52
  for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
53
  type_path = os.path.join(base_path, clone_type)
54
  if os.path.exists(type_path):
55
  for root, _, files in os.walk(type_path):
56
- if files:
57
- # Take first two files as a pair
58
- if len(files) >= 2:
59
- with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1:
60
- code1 = f1.read()
61
- with open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
62
- code2 = f2.read()
63
- clone_pairs.append({
64
- "type": clone_type,
65
- "code1": code1,
66
- "code2": code2
67
- })
68
- break # Just take one pair per type for demo
69
-
70
- return clone_pairs[:10] # Return first 10 pairs for demo
71
 
 
72
  except Exception as e:
73
  st.error(f"Error loading dataset: {str(e)}")
74
  return []
@@ -76,17 +70,15 @@ def load_dataset():
76
  tokenizer, code_model = load_models()
77
  dataset_pairs = load_dataset()
78
 
79
- # Normalization function
80
  def normalize_code(code):
81
  try:
82
- code = re.sub(r'//.*', '', code) # Remove single-line comments
83
- code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL) # Multi-line comments
84
- code = re.sub(r'\s+', ' ', code).strip() # Normalize whitespace
85
  return code
86
  except Exception:
87
  return code
88
 
89
- # Embedding generation
90
  def get_embedding(code):
91
  try:
92
  code = normalize_code(code)
@@ -101,12 +93,11 @@ def get_embedding(code):
101
  with torch.no_grad():
102
  outputs = code_model(**inputs)
103
 
104
- return outputs.last_hidden_state.mean(dim=1) # Pooled embedding
105
  except Exception as e:
106
  st.error(f"Error processing code: {str(e)}")
107
  return None
108
 
109
- # Comparison function
110
  def compare_code(code1, code2):
111
  if not code1 or not code2:
112
  return None
@@ -125,9 +116,7 @@ def compare_code(code1, code2):
125
 
126
  # UI Elements
127
  st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
128
- st.markdown("""
129
- Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.
130
- """)
131
 
132
  # Dataset selector
133
  selected_pair = None
@@ -154,52 +143,51 @@ with col2:
154
  value=selected_pair["code2"] if selected_pair else "",
155
  help="Enter the second Java code snippet"
156
  )
157
- # Threshold slider with proper value handling
158
  threshold = st.slider(
159
  "Clone Detection Threshold",
160
  min_value=0.50,
161
  max_value=1.00,
162
- value=0.75, # Default middle value
163
  step=0.01,
164
  help="Similarity score needed to consider code as cloned (0.5-1.0)"
165
  )
166
 
167
- # In your comparison logic:
168
- if similarity is not None:
169
- # Display results with threshold comparison
170
- is_clone = similarity >= threshold
171
-
172
- st.subheader("Results")
173
- col1, col2, col3 = st.columns(3)
174
 
175
- with col1:
176
- st.metric("Similarity Score", f"{similarity:.3f}")
177
-
178
- with col2:
179
- # Show the current threshold being used
180
- st.metric("Current Threshold", f"{threshold:.3f}")
181
-
182
- with col3:
183
- # Visual clone decision
184
- st.metric(
185
  "Verdict",
186
  "✅ CLONE" if is_clone else "❌ NOT CLONE",
187
  delta=f"{similarity-threshold:+.3f}",
188
  help=f"Score {'≥' if is_clone else '<'} threshold"
189
  )
190
-
191
- # Visual indicator
192
- st.progress(similarity)
193
-
194
- # Interpretation guide
195
- with st.expander("Interpretation Guide"):
196
- st.markdown("""
197
- - **> 0.95**: Nearly identical (Type-1 clone)
198
- - **0.85-0.95**: Very similar (Type-2 clone)
199
- - **0.70-0.85**: Similar structure (Type-3 clone)
200
- - **< 0.70**: Different code
201
- """)
202
- # Footer
 
 
 
 
 
203
  st.markdown("---")
204
  st.markdown("""
205
  **Dataset Information**:
 
23
  MODEL_NAME = "microsoft/codebert-base"
24
  MAX_LENGTH = 512
25
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
26
+ DATASET_PATH = "archive (1).zip"
27
 
28
  # Initialize models with caching
29
  @st.cache_resource
 
39
  @st.cache_resource
40
  def load_dataset():
41
  try:
 
42
  if not os.path.exists("Subject_CloneTypes_Directories"):
43
  with zipfile.ZipFile(DATASET_PATH, 'r') as zip_ref:
44
  zip_ref.extractall(".")
45
 
 
46
  clone_pairs = []
47
  base_path = "Subject_CloneTypes_Directories"
48
 
 
49
  for clone_type in ["Clone_Type1", "Clone_Type2", "Clone_Type3 - ST"]:
50
  type_path = os.path.join(base_path, clone_type)
51
  if os.path.exists(type_path):
52
  for root, _, files in os.walk(type_path):
53
+ if files and len(files) >= 2:
54
+ with open(os.path.join(root, files[0]), 'r', encoding='utf-8') as f1:
55
+ code1 = f1.read()
56
+ with open(os.path.join(root, files[1]), 'r', encoding='utf-8') as f2:
57
+ code2 = f2.read()
58
+ clone_pairs.append({
59
+ "type": clone_type,
60
+ "code1": code1,
61
+ "code2": code2
62
+ })
63
+ break
 
 
 
 
64
 
65
+ return clone_pairs[:10]
66
  except Exception as e:
67
  st.error(f"Error loading dataset: {str(e)}")
68
  return []
 
70
  tokenizer, code_model = load_models()
71
  dataset_pairs = load_dataset()
72
 
 
73
  def normalize_code(code):
74
  try:
75
+ code = re.sub(r'//.*', '', code)
76
+ code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
77
+ code = re.sub(r'\s+', ' ', code).strip()
78
  return code
79
  except Exception:
80
  return code
81
 
 
82
  def get_embedding(code):
83
  try:
84
  code = normalize_code(code)
 
93
  with torch.no_grad():
94
  outputs = code_model(**inputs)
95
 
96
+ return outputs.last_hidden_state.mean(dim=1)
97
  except Exception as e:
98
  st.error(f"Error processing code: {str(e)}")
99
  return None
100
 
 
101
  def compare_code(code1, code2):
102
  if not code1 or not code2:
103
  return None
 
116
 
117
  # UI Elements
118
  st.title("🔍 Java Code Clone Detector (IJaDataset 2.1)")
119
+ st.markdown("Compare Java code snippets from the IJaDataset 2.1 using CodeBERT embeddings.")
 
 
120
 
121
  # Dataset selector
122
  selected_pair = None
 
143
  value=selected_pair["code2"] if selected_pair else "",
144
  help="Enter the second Java code snippet"
145
  )
146
+
147
  threshold = st.slider(
148
  "Clone Detection Threshold",
149
  min_value=0.50,
150
  max_value=1.00,
151
+ value=0.75,
152
  step=0.01,
153
  help="Similarity score needed to consider code as cloned (0.5-1.0)"
154
  )
155
 
156
+ # Only perform comparison when button is clicked
157
+ if st.button("Compare Code"):
158
+ similarity = compare_code(code1, code2)
 
 
 
 
159
 
160
+ if similarity is not None:
161
+ is_clone = similarity >= threshold
162
+
163
+ st.subheader("Results")
164
+ cols = st.columns(3)
165
+ cols[0].metric("Similarity Score", f"{similarity:.3f}")
166
+ cols[1].metric("Current Threshold", f"{threshold:.3f}")
167
+ cols[2].metric(
 
 
168
  "Verdict",
169
  "✅ CLONE" if is_clone else "❌ NOT CLONE",
170
  delta=f"{similarity-threshold:+.3f}",
171
  help=f"Score {'≥' if is_clone else '<'} threshold"
172
  )
173
+
174
+ st.progress(similarity)
175
+
176
+ with st.expander("Interpretation Guide"):
177
+ st.markdown("""
178
+ - **> 0.95**: Nearly identical (Type-1 clone)
179
+ - **0.85-0.95**: Very similar (Type-2 clone)
180
+ - **0.70-0.85**: Similar structure (Type-3 clone)
181
+ - **< 0.70**: Different code
182
+ """)
183
+
184
+ with st.expander("Show normalized code"):
185
+ tab1, tab2 = st.tabs(["First Code", "Second Code"])
186
+ with tab1:
187
+ st.code(normalize_code(code1))
188
+ with tab2:
189
+ st.code(normalize_code(code2))
190
+
191
  st.markdown("---")
192
  st.markdown("""
193
  **Dataset Information**: