Ryan committed
Commit 30bc4e7 · 1 Parent(s): fc52d23
.idea/workspace.xml CHANGED
@@ -4,7 +4,9 @@
     <option name="autoReloadType" value="SELECTIVE" />
   </component>
   <component name="ChangeListManager">
-    <list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="" />
+    <list default="true" id="8e67814c-7f04-433c-ab7a-2b65a1106d4c" name="Changes" comment="">
+      <change beforePath="$PROJECT_DIR$/processors/topic_modeling.py" beforeDir="false" afterPath="$PROJECT_DIR$/processors/topic_modeling.py" afterDir="false" />
+    </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -53,7 +55,7 @@
       <option name="presentableId" value="Default" />
       <updated>1745170754325</updated>
       <workItem from="1745170755404" duration="245000" />
-      <workItem from="1745172030020" duration="11341000" />
+      <workItem from="1745172030020" duration="11940000" />
     </task>
     <servers />
   </component>
processors/topic_modeling.py CHANGED
@@ -83,10 +83,12 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
     # Create document-term matrix
     if method == "nmf":
         # For NMF, use TF-IDF vectorization
-        vectorizer = TfidfVectorizer(max_features=1000, min_df=2, max_df=0.85)
+        # Adjust min_df and max_df for small document sets
+        vectorizer = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0)
     else:
         # For LDA, use CountVectorizer
-        vectorizer = CountVectorizer(max_features=1000, min_df=2, max_df=0.85)
+        # Adjust min_df and max_df for small document sets
+        vectorizer = CountVectorizer(max_features=1000, min_df=1, max_df=1.0)
 
     X = vectorizer.fit_transform(preprocessed_texts)
     feature_names = vectorizer.get_feature_names_out()
@@ -113,96 +115,4 @@ def extract_topics(texts, n_topics=3, n_top_words=10, method="lda"):
             "distribution": normalized_dist.tolist()
         })
 
-    return result
-
-def compare_topics(response_texts, model_names, n_topics=3, n_top_words=10, method="lda"):
-    """
-    Compare topic distributions between different model responses
-
-    Args:
-        response_texts (list): List of response texts to compare
-        model_names (list): Names of models corresponding to responses
-        n_topics (int): Number of topics to extract
-        n_top_words (int): Number of top words per topic
-        method (str): Topic modeling method ('lda' or 'nmf')
-
-    Returns:
-        dict: Comparative topic analysis
-    """
-    # Initialize results
-    result = {
-        "models": model_names,
-        "method": method,
-        "n_topics": n_topics,
-        "topics": [],
-        "model_topics": {},
-        "comparisons": {}
-    }
-
-    # Extract topics
-    topic_model = extract_topics(response_texts, n_topics, n_top_words, method)
-    result["topics"] = topic_model["topics"]
-
-    # Map topic distributions to models
-    for i, model_name in enumerate(model_names):
-        if i < len(topic_model["document_topics"]):
-            result["model_topics"][model_name] = topic_model["document_topics"][i]["distribution"]
-
-    # Calculate topic distribution differences for pairs of models
-    if len(model_names) >= 2:
-        for i in range(len(model_names)):
-            for j in range(i+1, len(model_names)):
-                model1, model2 = model_names[i], model_names[j]
-
-                # Get topic distributions
-                dist1 = result["model_topics"].get(model1, [])
-                dist2 = result["model_topics"].get(model2, [])
-
-                # Skip if distributions are not available
-                if not dist1 or not dist2 or len(dist1) != len(dist2):
-                    continue
-
-                # Calculate Jensen-Shannon divergence (approximation using average of KL divergences)
-                dist1 = np.array(dist1)
-                dist2 = np.array(dist2)
-
-                # Add small epsilon to avoid division by zero
-                epsilon = 1e-10
-                dist1 = dist1 + epsilon
-                dist2 = dist2 + epsilon
-
-                # Normalize
-                dist1 = dist1 / np.sum(dist1)
-                dist2 = dist2 / np.sum(dist2)
-
-                # Calculate average distribution
-                avg_dist = (dist1 + dist2) / 2
-
-                # Calculate KL divergences
-                kl_div1 = np.sum(dist1 * np.log(dist1 / avg_dist))
-                kl_div2 = np.sum(dist2 * np.log(dist2 / avg_dist))
-
-                # Jensen-Shannon divergence
-                js_div = (kl_div1 + kl_div2) / 2
-
-                # Topic-wise differences
-                topic_diffs = []
-                for t in range(len(dist1)):
-                    topic_diffs.append({
-                        "topic_id": t,
-                        "model1_weight": float(dist1[t]),
-                        "model2_weight": float(dist2[t]),
-                        "diff": float(abs(dist1[t] - dist2[t]))
-                    })
-
-                # Sort by difference
-                topic_diffs.sort(key=lambda x: x["diff"], reverse=True)
-
-                # Store comparison
-                comparison_key = f"{model1} vs {model2}"
-                result["comparisons"][comparison_key] = {
-                    "js_divergence": float(js_div),
-                    "topic_differences": topic_diffs
-                }
-
-    return result
+    return result
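Note on the vectorizer change above: with only a handful of documents to model (for example, one response per model), min_df=2 combined with max_df=0.85 can prune every term, and scikit-learn refuses to fit at all, which is presumably what this commit works around. A minimal sketch of the failure and of the relaxed settings, using a made-up two-document corpus:

# With 2 documents, max_df=0.85 caps document frequency at 0.85 * 2 = 1.7
# documents, which is below min_df=2, so scikit-learn raises
# "max_df corresponds to < documents than min_df" before fitting anything.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first model response about topics", "second model response about topics"]

try:
    TfidfVectorizer(max_features=1000, min_df=2, max_df=0.85).fit(docs)
except ValueError as err:
    print("old settings:", err)

# The relaxed settings from this commit keep every term, so tiny corpora fit.
X = TfidfVectorizer(max_features=1000, min_df=1, max_df=1.0).fit_transform(docs)
print("new settings, matrix shape:", X.shape)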
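For reference, the compare_topics helper removed in this commit scored each pair of models with a Jensen-Shannon divergence computed as the average of the two KL divergences against the mixture of the distributions. A standalone sketch of that calculation with made-up example distributions; SciPy's jensenshannon is shown only as a cross-check and returns the corresponding distance (the square root of the divergence, natural-log base by default):

import numpy as np
from scipy.spatial.distance import jensenshannon

def js_divergence(p, q, epsilon=1e-10):
    # Smooth, renormalize, and average the two KL divergences to the mixture,
    # mirroring the removed compare_topics code.
    p = np.asarray(p, dtype=float) + epsilon
    q = np.asarray(q, dtype=float) + epsilon
    p, q = p / p.sum(), q / q.sum()
    m = (p + q) / 2
    return float(0.5 * np.sum(p * np.log(p / m)) + 0.5 * np.sum(q * np.log(q / m)))

dist1, dist2 = [0.6, 0.3, 0.1], [0.2, 0.5, 0.3]
print(js_divergence(dist1, dist2))       # average-of-KL form used by the removed code
print(jensenshannon(dist1, dist2) ** 2)  # SciPy's JS distance squared agrees closely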