blazingbunny committed on
Commit
40e2e7e
·
verified ·
1 Parent(s): 2dff648

Create script.py

Browse files
Files changed (1) hide show
  1. script.py +168 -0
script.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import pandas as pd
3
+ import numpy as np
4
+ import networkx as nx
5
+ from networkx.algorithms import community
6
+ import matplotlib.pyplot as plt
7
+ import random
8
+ import time
9
+ from datetime import datetime
10
+ from googleapiclient.discovery import build
11
+ import langid
12
+ from tld import get_tld
13
+ from fuzzywuzzy import fuzz
14
+
15
+ # ---------------------------
16
+ # Utility Functions
17
+ # ---------------------------
18
+
19
def language_detection(text):
    """Return the language label that ``langid`` assigns to ``text``.

    Only the first element of the ``langid.classify`` result is returned;
    whatever else the classifier reports is discarded.
    """
    classification = langid.classify(text)
    return classification[0]
21
+
22
def extract_mainDomain(url):
    """Return the ``fld`` attribute of the parsed ``url``.

    Parsing is best-effort: any exception raised while parsing the URL or
    reading the attribute yields an empty string instead of propagating.
    """
    try:
        parsed = get_tld(url, as_object=True)
        main_domain = parsed.fld
    except Exception:
        # Deliberate catch-all: a URL that cannot be parsed simply has no domain.
        main_domain = ""
    return main_domain
28
+
29
def fuzzy_ratio(str1, str2):
    """Return the ``fuzz.ratio`` similarity score of ``str1`` against ``str2``."""
    score = fuzz.ratio(str1, str2)
    return score
31
+
32
def fuzzy_token_set_ratio(str1, str2):
    """Return the ``fuzz.token_set_ratio`` score of ``str1`` against ``str2``."""
    score = fuzz.token_set_ratio(str1, str2)
    return score
34
+
35
+ # ---------------------------
36
+ # Google Custom Search
37
+ # ---------------------------
38
+
39
def google_search(query, api_key, cse_id, hl, gl):
    """Run one Custom Search request and return the raw response.

    Args:
        query: Search terms, sent as ``q``.
        api_key: Developer key used to build the API client.
        cse_id: Search engine id, sent as ``cx``.
        hl: Value forwarded as the ``hl`` request parameter.
        gl: Value forwarded as the ``gl`` request parameter.

    Returns:
        The object returned by ``execute()``, or ``None`` when building the
        client or running the request raises; the error is printed.
    """
    try:
        client = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        request = client.cse().list(
            q=query,
            hl=hl,
            gl=gl,
            cx=cse_id,
            fields='queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)',
            num=10,
        )
        response = request.execute()
        # Pause after every successful request before the caller issues the next one.
        time.sleep(1)
    except Exception as err:
        print("Search error:", err)
        return None
    return response
52
+
53
+ # ---------------------------
54
+ # Fetch and Store Search Results
55
+ # ---------------------------
56
+
57
def getSearchResult(keywords, hl, gl, api_key, cse_id, database, table):
    """Search every keyword and append the scored results to a SQLite table.

    One row is stored per returned item. Each row holds the request
    metadata, the item's link/domain/position, and fuzzy match scores of
    the snippet and title against the keyword. All rows of one call share
    the same ``requestTimestamp``.

    Args:
        keywords: Iterable of search terms.
        hl: Value forwarded to ``google_search`` and stored in the ``hl`` column.
        gl: Value forwarded to ``google_search`` and stored in the ``gl`` column.
        api_key: Developer key forwarded to ``google_search``.
        cse_id: Search engine id forwarded to ``google_search``.
        database: Path of the SQLite database file.
        table: Name of the table to append to.

    Returns:
        None. When no keyword produced any item, nothing is written.
    """
    timestamp = datetime.now()
    rows = []

    for query in keywords:
        result = google_search(query, api_key, cse_id, hl, gl)
        items = result.get("items") if result else None
        if not items:
            # Failed request or a response without results: nothing to record.
            continue

        # The total is a property of the request, not of each item.
        total_results = result["queries"]["request"][0]["totalResults"]

        for position, item in enumerate(items, start=1):
            snippet = item.get("snippet", "")
            title = item.get("title", "")

            rows.append({
                "requestTimestamp": timestamp,
                "searchTerms": query,
                "gl": gl,
                "hl": hl,
                "totalResults": total_results,
                "link": item["link"],
                "displayLink": item["displayLink"],
                "main_domain": extract_mainDomain(item["link"]),
                "position": position,
                "snippet": snippet,
                "snipped_language": language_detection(snippet),
                "snippet_matchScore_order": fuzzy_ratio(snippet, query),
                "snippet_matchScore_token": fuzzy_token_set_ratio(snippet, query),
                "title": title,
                "title_matchScore_order": fuzzy_ratio(title, query),
                "title_matchScore_token": fuzzy_token_set_ratio(title, query),
            })

    if not rows:
        # An empty frame has no columns, so there is no table to create or extend.
        return

    df = pd.DataFrame(rows)

    # ``with conn`` only commits or rolls back; the connection is closed explicitly.
    conn = sqlite3.connect(database)
    try:
        with conn:
            df.to_sql(
                table,
                conn,
                index=False,
                if_exists="append",
                dtype={"requestTimestamp": "DateTime"},
            )
    finally:
        conn.close()
90
+
91
+ # ---------------------------
92
+ # Cluster Graphs
93
+ # ---------------------------
94
+
95
def com_postion(n, scale=1, center=(0, 0)):
    """Return ``n`` points spaced evenly on a circle.

    Args:
        n: Number of points.
        scale: Multiplier applied to the unit-circle coordinates.
        center: Offset added to every point after scaling.

    Returns:
        Array with one ``(x, y)`` row per point, starting at angle 0.
    """
    angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
    unit_circle = np.stack([np.cos(angles), np.sin(angles)], axis=1)
    offset = np.array(center)
    return scale * unit_circle + offset
99
+
100
def node_postion(nodes, scale=1, center=(0, 0)):
    """Map each node to a point on a circle, in the order given.

    Args:
        nodes: Sized, ordered collection of node labels.
        scale: Multiplier applied to the unit-circle coordinates.
        center: Offset added to every point after scaling.

    Returns:
        Dict from node label to its ``(x, y)`` coordinate array.
    """
    count = len(nodes)
    angles = np.linspace(0, 2 * np.pi, count, endpoint=False)
    unit_circle = np.array([np.cos(angles), np.sin(angles)]).T
    coords = scale * unit_circle + np.array(center)
    return {node: coords[idx] for idx, node in enumerate(nodes)}
105
+
106
def getClustersWithGraph(database, serp_table, timestamp="max"):
    """Cluster the search terms of one stored run and draw the cluster graph.

    Search terms are the graph nodes. Two terms are connected when the
    selected rows contain the same ``link`` for both of them. Communities
    are then detected on that graph and drawn, one circle of nodes per
    community.

    Args:
        database: Path of the SQLite database file.
        serp_table: Name of the table holding the stored search results.
            NOTE(review): the name is interpolated into the SQL text because
            table names cannot be bound as parameters; it must not come from
            untrusted input.
        timestamp: ``"max"`` selects the rows with the greatest
            ``requestTimestamp``; any other value selects the rows whose
            ``requestTimestamp`` equals it.

    Returns:
        ``(fig, df_clusters)``: the matplotlib figure and a DataFrame with
        the columns ``searchTerms``, ``cluster`` and ``requestTimestamp``.
        The last column echoes the ``timestamp`` argument unchanged, so it
        holds the literal ``"max"`` when the default is used.
    """
    if timestamp == "max":
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = (SELECT MAX(requestTimestamp) FROM {serp_table})
        '''
        params = None
    else:
        # Bind the value instead of pasting it into the SQL text.
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = ?
        '''
        params = (str(timestamp),)

    # ``with sqlite3.connect(...)`` would not close the connection, so close it here.
    conn = sqlite3.connect(database)
    try:
        df = pd.read_sql(query, conn, params=params)
    finally:
        conn.close()

    G = nx.Graph()
    G.add_nodes_from(df["searchTerms"])

    # Rows without a link never match any other row, so they add no edges.
    linked = df.dropna(subset=["link"])
    terms_by_link = {}
    for term, link in zip(linked["searchTerms"], linked["link"]):
        terms_by_link.setdefault(link, []).append(term)

    # Walk the rows in their stored order so edges are inserted in the same
    # sequence as a row-by-row scan, without rescanning the frame per row.
    for term, link in zip(linked["searchTerms"], linked["link"]):
        for other in terms_by_link[link]:
            if term != other:
                G.add_edge(term, other)

    if G.number_of_edges() == 0:
        # Guard the edgeless case explicitly so the result does not depend on
        # how the installed networkx handles a graph with no edges.
        communities = [frozenset([node]) for node in G]
    else:
        communities = community.greedy_modularity_communities(G)

    degrees = dict(G.degree())
    colors = ["#" + ''.join(random.choices('0123456789ABCDEF', k=6)) for _ in communities]

    # Communities sit on a large circle; their members on a small circle around each centre.
    pos = {}
    centers = com_postion(len(communities), scale=3)
    for i, group in enumerate(communities):
        pos.update(node_postion(list(group), scale=0.8, center=centers[i]))

    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    nx.draw(G, pos, with_labels=True, ax=ax, node_size=10, font_size=8, edge_color='gray', alpha=0.2)

    # Overlay each community in its own colour, sized by node degree.
    for i, group in enumerate(communities):
        members = list(group)
        nx.draw_networkx_nodes(
            G, pos, nodelist=members, node_color=colors[i],
            node_size=[degrees[n] * 10 for n in members], ax=ax
        )

    ax.axis('off')

    cluster_rows = [
        {"searchTerms": kw, "cluster": i, "requestTimestamp": timestamp}
        for i, group in enumerate(communities)
        for kw in group
    ]
    df_clusters = pd.DataFrame(cluster_rows)

    return fig, df_clusters
160
+
161
+ # ---------------------------
162
+ # Compare Clusters
163
+ # ---------------------------
164
+
165
def compare_clusters(df1, df2):
    """Return the search terms whose cluster differs between two assignments.

    Args:
        df1: DataFrame with at least ``searchTerms`` and ``cluster`` columns.
        df2: DataFrame with the same columns, from another run.

    Returns:
        The rows of the inner merge on ``searchTerms`` where ``cluster_1``
        (from ``df1``) differs from ``cluster_2`` (from ``df2``). Terms
        present in only one input are dropped by the merge.
    """
    merged = pd.merge(df1, df2, on="searchTerms", suffixes=("_1", "_2"))
    moved = merged[merged["cluster_1"] != merged["cluster_2"]]
    return moved