# Spaces: blazingbunny / cluster-vis-marimo
# Status shown on the page: Runtime error
# File size: 5,673 Bytes
# 40e2e7e (presumably the revision hash of this file)

import sqlite3
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import community
import matplotlib.pyplot as plt
import random
import time
from datetime import datetime
from googleapiclient.discovery import build
import langid
from tld import get_tld
from fuzzywuzzy import fuzz

# ---------------------------
# Utility Functions
# ---------------------------

def language_detection(text):
    """Classify *text* with ``langid`` and return the first element of the result.

    ``langid.classify`` yields a sequence whose first item is the predicted
    language; any remaining items are ignored here.
    """
    prediction = langid.classify(text)
    return prediction[0]

def extract_mainDomain(url):
    """Return the ``fld`` attribute that ``get_tld`` reports for *url*.

    This is a best-effort lookup: if anything goes wrong while parsing the
    URL, an empty string is returned instead of propagating the error.
    """
    try:
        return get_tld(url, as_object=True).fld
    except Exception:
        # Deliberate catch-all: one bad URL must not abort a whole batch.
        return ""

def fuzzy_ratio(str1, str2):
    """Return the ``fuzz.ratio`` similarity score of *str1* and *str2*."""
    score = fuzz.ratio(str1, str2)
    return score

def fuzzy_token_set_ratio(str1, str2):
    """Return the ``fuzz.token_set_ratio`` similarity score of *str1* and *str2*."""
    score = fuzz.token_set_ratio(str1, str2)
    return score

# ---------------------------
# Google Custom Search
# ---------------------------

def google_search(query, api_key, cse_id, hl, gl):
    """Run one Custom Search request and return the raw response.

    Args:
        query: Search string sent as ``q``.
        api_key: Developer key used to build the ``customsearch`` v1 client.
        cse_id: Search engine id sent as ``cx``.
        hl: Sent as the ``hl`` request parameter.
        gl: Sent as the ``gl`` request parameter.

    Returns:
        The value produced by ``execute()`` on success, or ``None`` when any
        step raised (the error is printed, not re-raised).
    """
    # Restrict the response to the fields the rest of the pipeline reads.
    response_fields = 'queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)'
    try:
        client = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        request = client.cse().list(
            q=query,
            cx=cse_id,
            hl=hl,
            gl=gl,
            num=10,
            fields=response_fields,
        )
        response = request.execute()
        # Pause after each successful call to space out consecutive requests.
        time.sleep(1)
    except Exception as err:
        print("Search error:", err)
        return None
    return response

# ---------------------------
# Fetch and Store Search Results
# ---------------------------

def getSearchResult(keywords, hl, gl, api_key, cse_id, database, table):
    """Search every keyword and append the scored result rows to a SQLite table.

    For each keyword, ``google_search`` is called and every returned item
    becomes one row holding the request metadata, the item's link/domain,
    its 1-based position, and fuzzy match scores of the snippet and title
    against the keyword.

    Args:
        keywords: Iterable of query strings.
        hl: Forwarded to ``google_search`` and stored in the ``hl`` column.
        gl: Forwarded to ``google_search`` and stored in the ``gl`` column.
        api_key: Forwarded to ``google_search``.
        cse_id: Forwarded to ``google_search``.
        database: Path handed to ``sqlite3.connect``.
        table: Name of the table the rows are appended to.

    Returns:
        None. Keywords whose search failed or produced no items contribute
        no rows; if no rows were collected at all, nothing is written.
    """
    # A single timestamp tags the whole batch so it can be selected as a unit.
    timestamp = datetime.now()
    rows = []

    for query in keywords:
        result = google_search(query, api_key, cse_id, hl, gl)
        if not result or not result.get("items"):
            continue

        # Identical for every item of this response, so read it once.
        total_results = result["queries"]["request"][0]["totalResults"]

        for position, item in enumerate(result["items"], start=1):
            snippet = item.get("snippet", "")
            title = item.get("title", "")

            rows.append({
                "requestTimestamp": timestamp,
                "searchTerms": query,
                "gl": gl,
                "hl": hl,
                "totalResults": total_results,
                "link": item["link"],
                "displayLink": item["displayLink"],
                "main_domain": extract_mainDomain(item["link"]),
                "position": position,
                "snippet": snippet,
                "snipped_language": language_detection(snippet),
                "snippet_matchScore_order": fuzzy_ratio(snippet, query),
                "snippet_matchScore_token": fuzzy_token_set_ratio(snippet, query),
                "title": title,
                "title_matchScore_order": fuzzy_ratio(title, query),
                "title_matchScore_token": fuzzy_token_set_ratio(title, query),
            })

    if not rows:
        # Nothing to store; writing a column-less frame would only fail.
        return

    df = pd.DataFrame(rows)
    # sqlite3's connection context manager only commits/rolls back; it does
    # not close, so close explicitly.
    conn = sqlite3.connect(database)
    try:
        # `con` is a required argument of to_sql; the original call omitted it.
        df.to_sql(
            table,
            con=conn,
            index=False,
            if_exists="append",
            dtype={"requestTimestamp": "DateTime"},
        )
        conn.commit()
    finally:
        conn.close()

# ---------------------------
# Cluster Graphs
# ---------------------------

def com_postion(n, scale=1, center=(0, 0)):
    """Return *n* points evenly spaced on a circle as an ``(n, 2)`` array.

    Args:
        n: Number of points; angles start at 0 and exclude 2*pi.
        scale: Radius of the circle.
        center: ``(x, y)`` offset added to every point.
    """
    angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
    unit_circle = np.stack((np.cos(angles), np.sin(angles)), axis=-1)
    offset = np.array(center)
    return scale * unit_circle + offset

def node_postion(nodes, scale=1, center=(0, 0)):
    """Map each node to a point on a circle, evenly spaced in input order.

    Args:
        nodes: Sized sequence of hashable node labels.
        scale: Radius of the circle.
        center: ``(x, y)`` offset added to every point.

    Returns:
        Dict from node to a length-2 coordinate array.
    """
    angles = np.linspace(0, 2 * np.pi, len(nodes), endpoint=False)
    unit_circle = np.stack((np.cos(angles), np.sin(angles)), axis=-1)
    coords = scale * unit_circle + np.array(center)
    return {node: xy for node, xy in zip(nodes, coords)}

def getClustersWithGraph(database, serp_table, timestamp="max"):
    """Cluster keywords by shared result links and draw the cluster graph.

    Rows of one search batch are loaded from *serp_table*. Each distinct
    ``searchTerms`` value becomes a node, and two keywords are connected
    when at least one ``link`` appears in the results of both. Communities
    are found with greedy modularity maximisation; each community is laid
    out on its own small circle, and the circles are themselves arranged on
    a larger circle.

    Args:
        database: Path handed to ``sqlite3.connect``.
        serp_table: Table to read. It is interpolated into the SQL text
            (identifiers cannot be bound as parameters), so it must come
            from trusted code, never from user input.
        timestamp: ``"max"`` selects the rows with the largest
            ``requestTimestamp``; any other value selects rows whose
            ``requestTimestamp`` equals ``str(timestamp)``.

    Returns:
        ``(fig, df_clusters)`` where ``fig`` is the matplotlib figure and
        ``df_clusters`` has one row per keyword with the columns
        ``searchTerms``, ``cluster`` (index of its community) and
        ``requestTimestamp``.
    """
    # Local import keeps this block self-contained.
    from itertools import combinations

    if timestamp == "max":
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = (SELECT MAX(requestTimestamp) FROM {serp_table})
        '''
        params = None
    else:
        # Bind the value instead of splicing it into the statement: this
        # avoids SQL injection and SQLite's double-quoted-string fallback.
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = ?
        '''
        params = (str(timestamp),)

    # sqlite3's connection context manager does not close the connection,
    # so close it explicitly once the rows are loaded.
    conn = sqlite3.connect(database)
    try:
        df = pd.read_sql(query, conn, params=params)
    finally:
        conn.close()

    G = nx.Graph()
    G.add_nodes_from(df["searchTerms"])

    # Group keywords by link once and connect every pair inside a group.
    # This yields the same edge set as comparing every row with every other
    # row, without the quadratic scan of the whole frame.
    for terms in df.groupby("link", sort=False)["searchTerms"].unique():
        G.add_edges_from(combinations(terms, 2))

    if G.number_of_edges() == 0:
        # Modularity normalises by the edge count, so it is undefined here;
        # treat every keyword as its own cluster.
        communities = [{node} for node in G]
    else:
        communities = community.greedy_modularity_communities(G)

    degrees = dict(G.degree())
    # One random hex colour per community.
    colors = ["#" + ''.join(random.choices('0123456789ABCDEF', k=6)) for _ in communities]

    pos = {}
    centers = com_postion(len(communities), scale=3)
    for i, group in enumerate(communities):
        pos.update(node_postion(list(group), scale=0.8, center=centers[i]))

    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    # Faint base layer: labels and edges, with tiny placeholder nodes.
    nx.draw(G, pos, with_labels=True, ax=ax, node_size=10, font_size=8, edge_color='gray', alpha=0.2)

    # Overlay the nodes per community, sized by degree.
    for i, group in enumerate(communities):
        members = list(group)
        nx.draw_networkx_nodes(
            G, pos, nodelist=members, node_color=colors[i],
            node_size=[degrees[n] * 10 for n in members], ax=ax
        )

    ax.axis('off')

    # NOTE(review): the `timestamp` argument is echoed as given, so the
    # default call stores the literal "max" rather than the resolved value.
    cluster_rows = [
        {"searchTerms": kw, "cluster": i, "requestTimestamp": timestamp}
        for i, group in enumerate(communities)
        for kw in group
    ]
    df_clusters = pd.DataFrame(cluster_rows)

    return fig, df_clusters

# ---------------------------
# Compare Clusters
# ---------------------------

def compare_clusters(df1, df2):
    """Return the keywords whose cluster id differs between two assignments.

    The frames are inner-joined on ``searchTerms``, so only keywords present
    in both are compared. Columns shared by both frames (including
    ``cluster``) get the suffixes ``_1`` and ``_2``.

    Args:
        df1: First assignment; needs ``searchTerms`` and ``cluster`` columns.
        df2: Second assignment with the same columns.

    Returns:
        The merged rows where ``cluster_1 != cluster_2``.
    """
    # The original used backslash-escaped quotes outside a string literal,
    # which is a SyntaxError; plain quotes are what was intended.
    merged = pd.merge(df1, df2, on="searchTerms", suffixes=("_1", "_2"))
    moved = merged[merged["cluster_1"] != merged["cluster_2"]]
    return moved