Based on this blog post: https://medium.com/applied-data-science/the-google-vs-trick-618c8fd5359f
See another implementation here: https://github.com/ekerstein/egograph
import math
import re
import time

import requests
def get_top(seed, orig=None, limit=5, timeout=10):
    """
    Get the top suggestions for a given seed word.

    Asks the Google suggest endpoint to complete "<seed> vs " and returns
    the terms that follow "vs" in the suggestions.

    Args:
        seed: The term to complete.
        orig: The term the overall search started from. Suggestions that
            contain it (or `seed`) are discarded. Defaults to `seed`.
        limit: Maximum number of suggestions to return.
        timeout: Seconds to wait for the HTTP response before `requests`
            raises, so a stalled connection cannot hang the crawl.

    Returns:
        A list of at most `limit` unique suggestions, in the order the API
        returned them, with spaces replaced by hyphens.
    """
    if orig is None:
        orig = seed

    params = {
        'output': 'toolbar',
        'hl': 'en',
        'q': f"{seed} vs ",
    }

    # Rate limit a bit.
    time.sleep(0.5)

    URL = "http://suggestqueries.google.com/complete/search"
    r = requests.get(URL, params, timeout=timeout)

    top = []
    for s in re.findall(r'suggestion data="(.+?)"', r.text):
        # Stop when we have enough (or before, if the API runs out).
        if len(top) >= limit:
            break

        # Keep only what follows the first ' vs '.
        parts = s.split(' vs ')
        if len(parts) < 2:
            continue
        sugg = parts[1]

        # Skip suggestions that just echo the seed or the original term.
        if (orig in sugg) or (seed in sugg):
            continue

        sugg = sugg.replace(' 2020', '').replace(' ', '-')

        # Nothing left to use as a term.
        if not sugg:
            continue

        # Get rid of things that are just numbers. float() also parses the
        # words 'nan', 'inf' and 'infinity'; those are kept, because only
        # finite values count as "just a number".
        try:
            is_number = math.isfinite(float(sugg))
        except ValueError:
            is_number = False
        if is_number:
            continue

        # Add it, once.
        if sugg not in top:
            top.append(sugg)

    return top
# Seed term; the cells below reuse it for the crawl and the ego graph.
seed = 'geophysics'
# Notebook cell: show the top 'vs' suggestions for the seed.
get_top(seed)
['geology', 'physics']
...and count everything.
from collections import Counter
def get_nodes(seed, depth=3):
    """
    Crawl 'vs' suggestions outward from a seed term and count everything.

    Starting from `seed`, fetch its suggestions, then the suggestions of
    each suggestion, and so on for `depth` levels. The walk is depth-first
    and every term is followed at most once, the first time it is met.

    Args:
        seed: The term at the centre of the crawl.
        depth: How many levels of suggestions to follow from the seed.
            With a value below 1, only the seed itself is returned.

    Returns:
        A tuple `(c, d)` of two Counters:
        c: term -> count. The seed starts at 5; every term gains 1 each
            time it appears in a fetched suggestion list.
        d: (parent, child) -> weight, where weight is 5 minus the child's
            rank (0-based) in the parent's suggestion list.
    """
    # Presumably matches the five suggestions get_top returns by default,
    # which makes the weights run from 5 down to 1 -- TODO confirm.
    top_weight = 5

    done = set()
    c = Counter({seed: 5})  # Why 5?
    d = Counter()

    if depth < 1:
        return c, d

    def expand(parent, suggestions, level):
        # Link `parent` to each unseen suggestion, then recurse into it.
        for rank, word in enumerate(suggestions):
            if word in done:
                continue
            done.add(word)
            d[(parent, word)] += top_weight - rank
            # Terms on the last level are linked but not followed further.
            if level < depth:
                children = get_top(word, seed)
                c.update(children)
                expand(word, children, level + 1)

    top = get_top(seed)
    c.update(top)
    expand(seed, top, 1)
    return c, d
# Crawl from the seed: c holds the per-term counts, d the weighted (parent, child) edges.
c, d = get_nodes(seed)
This part is taken straight from the blog post.
# One (term, attributes) pair per counted term.
nodes = [(term, {'count': seen}) for term, seen in c.items()]
# One (source, target, attributes) triple per edge; a heavier edge gets a
# shorter 'distance'.
edges = [
    (source, target, {'weight': weight, 'distance': 11 - weight})
    for (source, target), weight in d.items()
]
import networkx as nx
# Build the undirected graph of terms from the prepared node and edge lists.
G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

# Keep what lies within a summed edge 'distance' of 22 from the seed.
E = nx.ego_graph(G, seed, radius=22, distance='distance')

# Walk the 3-edge-connected subgraphs of the ego graph and stop at the one
# that contains the seed; `sub` is left bound to that set of nodes.
subgraphs = nx.algorithms.connectivity.edge_kcomponents.k_edge_subgraphs(E, k=3)
for sub in subgraphs:
    if seed in sub:
        break

# Restrict the ego graph to the seed's component.
pruned_E = E.subgraph(sub)
ego_nodes = pruned_E.nodes()
ego_edges = pruned_E.edges()

# NOTE(review): this draws the full ego graph E, not pruned_E -- confirm
# that is intended.
nx.draw(E)
%matplotlib inline
import matplotlib.pyplot as plt
# Line width of each edge follows its 'weight' attribute.
edges = E.edges()
weights = [E[u][v]['weight'] for u, v in edges]

# Fetch the 'count' attribute of every node once, instead of rebuilding the
# whole attribute dict for each node; marker size scales with the count.
node_counts = nx.get_node_attributes(E, 'count')
counts = [200 * node_counts[u] for u in E.nodes()]

params = {
    'node_size': counts,
    'with_labels': True,
    'width': weights,
}

# Lay the graph out and draw it on an 8x8 inch figure.
pos = nx.spring_layout(E)
fig = plt.figure(figsize=(8, 8))
nx.draw(E, pos, **params)