"""Collect Google Custom Search results into SQLite and cluster keywords.

Search terms whose result pages share at least one identical URL are linked in
a graph; communities of that graph are reported as keyword clusters.
"""

import random
import sqlite3
import time
from contextlib import closing
from datetime import datetime
from itertools import combinations

import langid
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from googleapiclient.discovery import build
from networkx.algorithms import community
from tld import get_tld


# ---------------------------
# Utility Functions
# ---------------------------
def language_detection(text):
    """Return the language label that ``langid.classify`` predicts for *text*."""
    return langid.classify(text)[0]


def extract_mainDomain(url):
    """Return the ``fld`` attribute that ``tld.get_tld`` reports for *url*.

    Best effort: any failure while parsing the URL yields an empty string
    instead of raising, so one bad link does not abort a whole crawl.
    """
    try:
        res = get_tld(url, as_object=True)
        return res.fld
    except Exception:
        return ""


def fuzzy_ratio(str1, str2):
    """Return ``fuzz.ratio`` for the two strings."""
    return fuzz.ratio(str1, str2)


def fuzzy_token_set_ratio(str1, str2):
    """Return ``fuzz.token_set_ratio`` for the two strings."""
    return fuzz.token_set_ratio(str1, str2)


# ---------------------------
# Google Custom Search
# ---------------------------
def google_search(query, api_key, cse_id, hl, gl):
    """Run one Custom Search request and return the raw response dict.

    Args:
        query: Search terms to send as ``q``.
        api_key: Developer key passed to the API client.
        cse_id: Custom search engine id, sent as ``cx``.
        hl: Value forwarded as the ``hl`` request parameter.
        gl: Value forwarded as the ``gl`` request parameter.

    Returns:
        The response returned by ``execute()``, or ``None`` when any exception
        occurred (the error is printed, not raised).
    """
    try:
        service = build(
            "customsearch", "v1", developerKey=api_key, cache_discovery=False
        )
        res = service.cse().list(
            q=query,
            hl=hl,
            gl=gl,
            cx=cse_id,
            fields='queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)',
            num=10,
        ).execute()
        # Pause after every successful request before the caller issues the next.
        time.sleep(1)
        return res
    except Exception as e:
        print("Search error:", e)
        return None


# ---------------------------
# Fetch and Store Search Results
# ---------------------------
def getSearchResult(keywords, hl, gl, api_key, cse_id, database, table):
    """Search every keyword and append the result rows to a SQLite table.

    All rows of one call share a single ``requestTimestamp`` taken when the
    call starts. Keywords whose search failed or returned no items are skipped;
    if nothing was collected at all, the database is not touched.

    Args:
        keywords: Iterable of search terms.
        hl: Forwarded to ``google_search`` and stored in the ``hl`` column.
        gl: Forwarded to ``google_search`` and stored in the ``gl`` column.
        api_key: Forwarded to ``google_search``.
        cse_id: Forwarded to ``google_search``.
        database: Path handed to ``sqlite3.connect``.
        table: Name of the table the rows are appended to.
    """
    timestamp = datetime.now()
    rows = []
    for query in keywords:
        result = google_search(query, api_key, cse_id, hl, gl)
        items = result.get("items") if result else None
        if not items:
            continue
        total_results = result["queries"]["request"][0]["totalResults"]
        for position, item in enumerate(items, start=1):
            snippet = item.get("snippet", "")
            title = item.get("title", "")
            rows.append({
                "requestTimestamp": timestamp,
                "searchTerms": query,
                "gl": gl,
                "hl": hl,
                "totalResults": total_results,
                "link": item["link"],
                "displayLink": item["displayLink"],
                "main_domain": extract_mainDomain(item["link"]),
                "position": position,
                "snippet": snippet,
                "snipped_language": language_detection(snippet),
                "snippet_matchScore_order": fuzzy_ratio(snippet, query),
                "snippet_matchScore_token": fuzzy_token_set_ratio(snippet, query),
                "title": title,
                "title_matchScore_order": fuzzy_ratio(title, query),
                "title_matchScore_token": fuzzy_token_set_ratio(title, query),
            })

    if not rows:
        # An empty frame has no columns; writing it would only create a
        # malformed table, so there is nothing useful to store.
        return

    df = pd.DataFrame(rows)
    # sqlite3's own context manager only commits/rolls back; closing() is what
    # actually releases the connection.
    with closing(sqlite3.connect(database)) as conn:
        with conn:
            df.to_sql(
                table,
                conn,
                index=False,
                if_exists="append",
                dtype={"requestTimestamp": "DateTime"},
            )


# ---------------------------
# Cluster Graphs
# ---------------------------
def com_postion(n, scale=1, center=(0, 0)):
    """Return an ``(n, 2)`` array of points evenly spaced on a circle.

    Args:
        n: Number of points.
        scale: Radius of the circle.
        center: ``(x, y)`` offset added to every point.
    """
    theta = np.linspace(0, 2 * np.pi, n, endpoint=False)
    pos = np.column_stack((np.cos(theta), np.sin(theta)))
    return scale * pos + np.array(center)


def node_postion(nodes, scale=1, center=(0, 0)):
    """Map each node to a point on a circle of radius *scale* around *center*.

    Args:
        nodes: Sequence of node identifiers; its order fixes the placement.
        scale: Radius of the circle.
        center: ``(x, y)`` offset added to every point.

    Returns:
        Dict from node to a length-2 coordinate array.
    """
    return dict(zip(nodes, com_postion(len(nodes), scale=scale, center=center)))


def getClustersWithGraph(database, serp_table, timestamp="max"):
    """Cluster search terms by shared result links and draw the clusters.

    Two search terms are connected when at least one identical ``link`` appears
    in the stored results of both. Communities of that graph are the clusters.

    Args:
        database: Path handed to ``sqlite3.connect``.
        serp_table: Table holding the stored search results. The name is
            interpolated into the SQL text (identifiers cannot be bound as
            parameters), so it must come from a trusted source.
        timestamp: ``"max"`` to use the most recent ``requestTimestamp`` in the
            table, otherwise the exact timestamp value to select.

    Returns:
        ``(fig, df_clusters)`` where ``fig`` is the matplotlib figure and
        ``df_clusters`` has the columns ``searchTerms``, ``cluster`` and
        ``requestTimestamp``. When *timestamp* is ``"max"`` and rows were
        found, ``requestTimestamp`` holds the resolved timestamp rather than
        the literal ``"max"``.
    """
    if timestamp == "max":
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = (SELECT MAX(requestTimestamp) FROM {serp_table})
        '''
        params = None
    else:
        # Bind the value instead of formatting it into the statement.
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = ?
        '''
        params = (str(timestamp),)

    with closing(sqlite3.connect(database)) as conn:
        df = pd.read_sql(query, conn, params=params)

    G = nx.Graph()
    G.add_nodes_from(df["searchTerms"])
    # Every pair of distinct search terms that returned the same link is joined.
    for _, terms in df.groupby("link", sort=False)["searchTerms"]:
        G.add_edges_from(combinations(terms.unique(), 2))

    if G.number_of_edges() == 0:
        # Without edges there is nothing to merge: each term is its own cluster.
        # NOTE(review): recent networkx releases appear to return the same
        # result themselves, older ones may fail on an edgeless graph.
        communities = [frozenset([node]) for node in G.nodes]
    else:
        communities = community.greedy_modularity_communities(G)

    degrees = dict(G.degree())
    colors = [
        "#" + ''.join(random.choices('0123456789ABCDEF', k=6))
        for _ in communities
    ]

    # Communities sit on a large circle; members sit on a small circle around
    # their community's centre.
    pos = {}
    centers = com_postion(len(communities), scale=3)
    for i, group in enumerate(communities):
        pos.update(node_postion(list(group), scale=0.8, center=centers[i]))

    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    nx.draw(
        G, pos, with_labels=True, ax=ax, node_size=10, font_size=8,
        edge_color='gray', alpha=0.2,
    )
    for i, group in enumerate(communities):
        members = list(group)
        nx.draw_networkx_nodes(
            G,
            pos,
            nodelist=members,
            node_color=colors[i],
            node_size=[degrees[n] * 10 for n in members],
            ax=ax,
        )
    ax.axis('off')

    # All selected rows share one requestTimestamp, so the first one is the
    # concrete value behind "max".
    resolved_timestamp = timestamp
    if timestamp == "max" and not df.empty:
        resolved_timestamp = df["requestTimestamp"].iloc[0]

    # Return cluster assignments
    cluster_rows = [
        {"searchTerms": kw, "cluster": i, "requestTimestamp": resolved_timestamp}
        for i, group in enumerate(communities)
        for kw in group
    ]
    df_clusters = pd.DataFrame(
        cluster_rows, columns=["searchTerms", "cluster", "requestTimestamp"]
    )
    return fig, df_clusters


# ---------------------------
# Compare Clusters
# ---------------------------
def compare_clusters(df1, df2):
    """Return the search terms whose cluster id differs between two runs.

    Args:
        df1: Cluster frame with ``searchTerms`` and ``cluster`` columns.
        df2: Second cluster frame with the same columns.

    Returns:
        The rows of the inner merge on ``searchTerms`` (columns suffixed ``_1``
        and ``_2``) where ``cluster_1`` and ``cluster_2`` are not equal.
    """
    merged = pd.merge(df1, df2, on="searchTerms", suffixes=("_1", "_2"))
    moved = merged[merged["cluster_1"] != merged["cluster_2"]]
    return moved