# Keyword clustering toolkit: fetch Google Custom Search results for a list of
# keywords, store them in SQLite, and cluster keywords that share result URLs.
import random
import sqlite3
import time
from datetime import datetime

import langid
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from googleapiclient.discovery import build
from networkx.algorithms import community
from tld import get_tld
# ---------------------------
# Utility Functions
# ---------------------------
def language_detection(text):
    """Return the language code langid considers most likely for *text*."""
    lang, _confidence = langid.classify(text)
    return lang
def extract_mainDomain(url):
    """Best-effort extraction of the registrable domain (e.g. 'example.com') from *url*.

    Returns "" when tld cannot parse the URL.
    """
    try:
        return get_tld(url, as_object=True).fld
    except Exception:
        # Malformed or schemeless URLs are expected in scraped data; treat as unknown.
        return ""
def fuzzy_ratio(str1, str2):
    """Similarity of the two strings as an integer 0-100 (order-sensitive)."""
    return fuzz.ratio(str1, str2)
def fuzzy_token_set_ratio(str1, str2):
    """Similarity of the two strings as an integer 0-100, ignoring word order/duplicates."""
    return fuzz.token_set_ratio(str1, str2)
# ---------------------------
# Google Custom Search
# ---------------------------
def google_search(query, api_key, cse_id, hl, gl):
    """Run one Custom Search API query and return the raw response dict.

    On any failure (auth, quota, network) the error is printed and None is
    returned so callers can skip the query. Only the fields the pipeline
    consumes are requested, and a 1s pause follows each successful call.
    """
    wanted_fields = 'queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)'
    try:
        service = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        request = service.cse().list(
            q=query, hl=hl, gl=gl, cx=cse_id,
            fields=wanted_fields,
            num=10,
        )
        response = request.execute()
        time.sleep(1)  # crude rate limiting between API calls
        return response
    except Exception as exc:
        print("Search error:", exc)
        return None
# ---------------------------
# Fetch and Store Search Results
# ---------------------------
def getSearchResult(keywords, hl, gl, api_key, cse_id, database, table):
    """Query Google CSE for every keyword and append the results to a SQLite table.

    Each result row records the query, rank position, link/domain, snippet and
    title, the detected snippet language, and fuzzy match scores between the
    query and the snippet/title. All rows from one run share one timestamp.

    Parameters: keywords (iterable of query strings), hl/gl (interface
    language / geolocation passed to the API), api_key/cse_id (credentials),
    database (SQLite file path), table (destination table name).
    """
    timestamp = datetime.now()
    rows = []
    for query in keywords:
        result = google_search(query, api_key, cse_id, hl, gl)
        if not result or "items" not in result:
            continue  # failed query or zero hits: skip, keep going
        total_results = result["queries"]["request"][0]["totalResults"]
        for position, item in enumerate(result["items"], start=1):
            snippet = item.get("snippet", "")
            title = item.get("title", "")
            rows.append({
                "requestTimestamp": timestamp,
                "searchTerms": query,
                "gl": gl,
                "hl": hl,
                "totalResults": total_results,
                "link": item["link"],
                "displayLink": item["displayLink"],
                "main_domain": extract_mainDomain(item["link"]),
                "position": position,
                "snippet": snippet,
                # key name kept as-is ("snipped") for schema compatibility
                "snipped_language": language_detection(snippet),
                "snippet_matchScore_order": fuzzy_ratio(snippet, query),
                "snippet_matchScore_token": fuzzy_token_set_ratio(snippet, query),
                "title": title,
                "title_matchScore_order": fuzzy_ratio(title, query),
                "title_matchScore_token": fuzzy_token_set_ratio(title, query),
            })
    if not rows:
        # Nothing was fetched; don't create/append an empty, column-less table.
        return
    df = pd.DataFrame(rows)
    with sqlite3.connect(database) as conn:
        # BUG FIX: to_sql requires the connection as its second argument; the
        # original omitted it, so every run raised TypeError and wrote nothing.
        df.to_sql(table, conn, index=False, if_exists="append",
                  dtype={"requestTimestamp": "DateTime"})
# ---------------------------
# Cluster Graphs
# ---------------------------
def com_postion(n, scale=1, center=(0, 0)):
    """Return an (n, 2) array of points evenly spaced on a circle.

    The circle has radius *scale* and is centred at *center*; used to place
    community centres on the cluster plot.
    """
    angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
    ring = np.stack((np.cos(angles), np.sin(angles)), axis=1)
    return ring * scale + np.asarray(center)
def node_postion(nodes, scale=1, center=(0, 0)):
    """Map each node to a 2-D point on a circle of radius *scale* around *center*."""
    angles = np.linspace(0, 2 * np.pi, len(nodes), endpoint=False)
    coords = scale * np.column_stack((np.cos(angles), np.sin(angles))) + np.asarray(center)
    return {node: point for node, point in zip(nodes, coords)}
def getClustersWithGraph(database, serp_table, timestamp="max"):
    """Cluster keywords that share SERP result URLs and plot the graph.

    Loads one snapshot of SERP rows (the latest when timestamp == "max",
    otherwise the rows matching *timestamp*), links any two keywords that
    returned the same URL, and partitions the graph with greedy modularity.

    Returns (matplotlib Figure, DataFrame with columns searchTerms, cluster,
    requestTimestamp).

    NOTE: *serp_table* is interpolated into the SQL and must come from trusted
    code, never user input (table names cannot be bound as parameters).
    """
    with sqlite3.connect(database) as conn:
        if timestamp == "max":
            query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = (SELECT MAX(requestTimestamp) FROM {serp_table})
            '''
            df = pd.read_sql(query, conn)
        else:
            # FIX: bind the timestamp as a parameter. The original f-string
            # interpolation was injectable and used non-standard double-quoted
            # SQL string literals.
            query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = ?
            '''
            df = pd.read_sql(query, conn, params=(timestamp,))
    G = nx.Graph()
    G.add_nodes_from(df["searchTerms"])
    # FIX: the original rescanned the entire frame for every row (quadratic
    # full-table scans). Grouping by link once and connecting every pair of
    # keywords within a group builds the identical graph (nx.Graph de-dupes
    # repeated edges) in a single pass.
    for _, terms in df.groupby("link")["searchTerms"]:
        shared = list(dict.fromkeys(terms))  # unique, order-preserving
        for i, kw_a in enumerate(shared):
            for kw_b in shared[i + 1:]:
                G.add_edge(kw_a, kw_b)
    communities = community.greedy_modularity_communities(G)
    degrees = dict(G.degree())
    colors = ["#" + ''.join(random.choices('0123456789ABCDEF', k=6)) for _ in communities]
    # Lay each community out on its own small circle around a big circle.
    pos = {}
    centers = com_postion(len(communities), scale=3)
    for i, group in enumerate(communities):
        pos.update(node_postion(list(group), scale=0.8, center=centers[i]))
    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    nx.draw(G, pos, with_labels=True, ax=ax, node_size=10, font_size=8,
            edge_color='gray', alpha=0.2)
    for i, group in enumerate(communities):
        nx.draw_networkx_nodes(
            G, pos, nodelist=list(group), node_color=colors[i],
            node_size=[degrees[n] * 10 for n in group], ax=ax
        )
    ax.axis('off')
    # Cluster assignments, one row per keyword.
    cluster_rows = [
        {"searchTerms": kw, "cluster": i, "requestTimestamp": timestamp}
        for i, group in enumerate(communities)
        for kw in group
    ]
    df_clusters = pd.DataFrame(cluster_rows)
    return fig, df_clusters
# ---------------------------
# Compare Clusters
# ---------------------------
def compare_clusters(df1, df2):
    """Return the keywords whose cluster assignment differs between two runs.

    Both frames must have 'searchTerms' and 'cluster' columns. The result is
    an inner join on searchTerms restricted to rows where the cluster changed;
    overlapping columns are suffixed '_1' / '_2'.
    """
    # FIX: the original source contained literal backslash-escaped quotes
    # (\" ) outside any string context, which is a SyntaxError in Python.
    merged = pd.merge(df1, df2, on="searchTerms", suffixes=("_1", "_2"))
    moved = merged[merged["cluster_1"] != merged["cluster_2"]]
    return moved