The goal of this post is to describe how to make wordclouds using Python.
Good sources for a description of the algorithm can be found here:
We will use the Peter Norvig approach to coding: describe with English words, then implement later.
Words and weights will be modelled as strings (unicode) and floats.
We will use a rectangle based approach. So we need rectangles. Rectangles must be testable for intersection. Use a quadtree for that.
from copy import deepcopy as copy
def build_wordcloud(words, weights):
    """Builds a wordcloud from words and weights.

    Words are placed one at a time, heaviest first. Each word starts at the
    location proposed by ``place_word`` and is pushed along a spiral around
    that location until it no longer intersects the words already placed.

    Args:
        words: iterable of strings.
        weights: iterable of numbers, paired with ``words`` by position.

    Returns:
        The ``Wordcloud`` the words were inserted into.
    """
    wordcloud = Wordcloud()
    # Heaviest first: large words need the free space near their preferred
    # location, and the small ones can then fill the remaining gaps.
    ranked = sorted(zip(words, weights), key=lambda item: item[1], reverse=True)
    for word, weight in ranked:
        ideal_location = place_word(wordcloud)
        location = copy(ideal_location)
        # Drift away from the preferred spot until nothing overlaps.
        while intersection(word, weight, location, wordcloud):
            location = move_location_along_spiral(location, ideal_location)
        insert_word(word, wordcloud, location)
    return wordcloud
# class: Wordcloud, functions: place_word, move_location_along_spiral
# TODO intersection insert_word
Let's move on to placing words. The strategy described in the PDF is as follows:
Wordle offers the user a choice of placement strategies. These strategies influence the shape and texture of the completed Wordle, by determining where each word “wants” to go. On the Wordle website, the choices are center-line and alphabetical center-line. Both strategies place words near the horizontal center-line of the playing field (not necessarily upon it, but scattered away from it by a random distribution). The alphabetical strategy sorts the words alphabetically and then distributes their preferred x coordinates across the playing field.
Here, we need to define the location datatype. Since a location is just an $(x, y)$ coordinate, we will build it on Python's built-in complex numbers. This will simplify vector operations later on.
class Location(complex):
    """A point of the (x, y) plane, stored as the complex number x + iy."""

    @property
    def x(self):
        """Horizontal coordinate, i.e. the real part."""
        return self.real

    @property
    def y(self):
        """Vertical coordinate, i.e. the imaginary part."""
        return self.imag

    def __repr__(self):
        coordinates = (self.x, self.y)
        return 'Location({}, {})'.format(*coordinates)
Now let's implement `place_word`.
import random
def place_word(wordcloud):
    """Pick a preferred location for a word using the random center-line strategy.

    The y coordinate is the vertical center of the wordcloud; the x
    coordinate is drawn uniformly between its horizontal bounds.
    """
    center_line = get_vertical_center(wordcloud)
    x_min, x_max = get_horizontal_bounds(wordcloud)
    return Location(random.uniform(x_min, x_max), center_line)
# functions get_vertical_center, get_horizontal_bounds
We now need to make some design decisions. Our wordcloud will be a class that keeps track of positioned words as rectangular bounding boxes. So we will define one class for positioned words and one for the wordcloud itself.
class PositionedWord:
    """A word together with its location and font size on the playing field."""

    def __init__(self, location, word, fontsize):
        self.location, self.word, self.fontsize = location, word, fontsize
        # Computed last: compute_bbox receives the fully populated instance.
        self.bbox = compute_bbox(self)
# TODO compute_bbox
class Wordcloud:
    """Class that keeps track of the positioned words and the playing field."""

    def __init__(self, width=3, height=2):
        """Create an empty wordcloud on a ``width`` x ``height`` field."""
        # Words placed so far; starts empty.
        self.words = []
        self.width = width
        self.height = height

    def __repr__(self):
        """Return a one-line summary: field size and number of words."""
        # __repr__ must return the string; printing it and returning None
        # makes repr() raise TypeError.
        return "Wordcloud (w={}, h={}, words={})".format(
            self.width, self.height, len(self.words))
Now, let's implement the low-level functions we've just used:
def get_vertical_center(wordcloud):
    """Return the y coordinate halfway up the wordcloud's height."""
    half_height = wordcloud.height / 2.0
    return half_height
def get_horizontal_bounds(wordcloud):
    """Return the horizontal extent of the wordcloud as (x_min, x_max).

    The field starts at x = 0 and ends at x = wordcloud.width.
    """
    x_min = 0
    x_max = wordcloud.width
    return x_min, x_max
This being done, we now turn to moving the location along if the intersection with existing words is not empty.
from cmath import polar, pi, exp
def move_location_along_spiral(location, center, initial_radius=0.01):
    """Moves location one step outwards along a spiral around center.

    The offset ``location - center`` is converted to polar form, its radius
    is grown by 5 % and its angle advanced by 5 % of pi. The new offset is
    then added back to ``center`` so the result is an absolute position.

    Args:
        location: current position, as a complex number.
        center: point the spiral winds around, as a complex number.
        initial_radius: radius used in place of zero when ``location``
            coincides with ``center``.

    Returns:
        The next position on the spiral, as a complex number.
    """
    FACTOR = 0.05
    r, theta = polar(location - center)
    if r == 0:
        # The growth is multiplicative, so a zero radius would stay zero
        # forever and callers looping until "no overlap" would never finish.
        r = initial_radius
    offset = r * (1 + FACTOR) * exp(1j * (theta + pi * FACTOR))
    # Translate back: the polar form above is relative to center.
    return center + offset
Let's now move to the intersection routine. We'll do something naive: loop over the existing words in the wordcloud and, if the bounding box of our word overlaps the box of any of them, flag it as an intersection.
def intersection(word, weight, location, wordcloud):
    """Tell whether a word placed at location would hit an already placed word.

    A tentative PositionedWord is built from the arguments (the weight is
    used as its font size) and tested against every word of the wordcloud.
    Returns True on the first bounding-box intersection found, else False.
    """
    candidate = PositionedWord(location, word, weight)
    return any(
        bbox_intersection(candidate, placed) for placed in wordcloud.words
    )
Now, let's define the function that checks if the bounding boxes intersect.
def bbox_intersection(tentative_positioning, existing_word):
    """Checks whether the bboxes of the two words intersect or not.

    NOTE(review): not implemented yet. The body holds only a placeholder
    comment, so the function currently returns None (falsy) for every pair
    and callers will never see an intersection.
    """
    # check for x axis
%matplotlib inline
import matplotlib.pyplot as plt
We can test the spiraling here:
def plot_cplx(z):
    """Draw the complex number z as a round marker at (Re z, Im z)."""
    x, y = z.real, z.imag
    plt.plot(x, y, 'o')
# Spiral demo: start one unit above the origin and take 100 steps around it,
# plotting the center, the start point and every intermediate location.
center = Location(0, 0)
location = Location(0, 1)
plot_cplx(center)
plot_cplx(location)
for i in range(100):
    location = move_location_along_spiral(location, center)
    plot_cplx(location)
# Clamp the view to a 20 x 20 window around the origin.
plt.xlim(-10, 10)
plt.ylim(-10, 10)
(-10, 10)
Let's test the whole process:
# Sample input: one sentence split on single spaces, every word weighted 1.
# NOTE: split(" ") leaves punctuation attached to the words and yields an
# empty string wherever two spaces are adjacent (the recorded output of
# `words` contains one).
words = "William Shakespeare was an English poet, playwright, and actor, widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist.".split(" ")
weigths = [1 for word in words]
words
['William', 'Shakespeare', '', 'was', 'an', 'English', 'poet,', 'playwright,', 'and', 'actor,', 'widely', 'regarded', 'as', 'the', 'greatest', 'writer', 'in', 'the', 'English', 'language', 'and', 'the', "world's", 'pre-eminent', 'dramatist.']
weigths
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# NOTE: in the recorded run this call fails with
# "NameError: name 'compute_bbox' is not defined" (raised while building a
# PositionedWord), because compute_bbox is still a TODO.
build_wordcloud(words, weigths)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-18-6ab906b1f878> in <module>() ----> 1 build_wordcloud(words, weigths) <ipython-input-1-e54e08f4cdcd> in build_wordcloud(words, weights) 7 ideal_location = place_word(wordcloud) 8 location = copy(ideal_location) ----> 9 while intersection(word, weight, location, wordcloud): 10 location = move_location_along_spiral(location, ideal_location) 11 insert_word(word, wordcloud, location) <ipython-input-9-b5b4073eca8e> in intersection(word, weight, location, wordcloud) 1 def intersection(word, weight, location, wordcloud): 2 """Checks for intersections between word at a given location and the existing wordcloud.""" ----> 3 tentative_positioning = PositionedWord(location, word, weight) 4 for existing_word in wordcloud.words: 5 if bbox_intersection(tentative_positioning, existing_word): <ipython-input-4-4aea2ffcdd52> in __init__(self, location, word, fontsize) 5 self.word = word 6 self.fontsize = fontsize ----> 7 self.bbox = compute_bbox(self) 8 9 # TODO compute_bbox NameError: name 'compute_bbox' is not defined
# Experiment: add a text artist and a rectangle to the current axes.
txt = plt.text(0.5, 0.5, "blabla")
plt.gca().add_artist(txt)
# NOTE: Rectangle rejects the xycoords keyword; the recorded run stops on the
# next line with "AttributeError: Unknown property xycoords".
rect = plt.Rectangle((246, 162), 22.5, 10, xycoords='data')
plt.gca().add_artist(rect)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-19-daf2f847ea24> in <module>() 1 txt = plt.text(0.5, 0.5, "blabla") 2 plt.gca().add_artist(txt) ----> 3 rect = plt.Rectangle((246, 162), 22.5, 10, xycoords='data') 4 plt.gca().add_artist(rect) C:\Anaconda3\lib\site-packages\matplotlib\patches.py in __init__(self, xy, width, height, angle, **kwargs) 637 """ 638 --> 639 Patch.__init__(self, **kwargs) 640 641 self._x = float(xy[0]) C:\Anaconda3\lib\site-packages\matplotlib\patches.py in __init__(self, edgecolor, facecolor, color, linewidth, linestyle, antialiased, hatch, fill, capstyle, joinstyle, **kwargs) 121 122 if len(kwargs): --> 123 self.update(kwargs) 124 125 def get_verts(self): C:\Anaconda3\lib\site-packages\matplotlib\artist.py in update(self, props) 854 func = getattr(self, 'set_' + k, None) 855 if func is None or not six.callable(func): --> 856 raise AttributeError('Unknown property %s' % k) 857 func(v) 858 changed = True AttributeError: Unknown property xycoords
txt
<matplotlib.text.Text at 0x7b81b00>
txt.get_window_extent()
Bbox([[246.0, 162.0], [280.0, 172.0]])
txt.get_window_extent()
Bbox([[246.0, 162.0], [280.0, 172.0]])
plt.Rectangle((246, 162), 22.5, 10)
<matplotlib.patches.Rectangle at 0x7b50048>
import matplotlib.pyplot as plt
# Experiment: check that autoscale takes into account a circle added to the
# current axes with add_patch (add_artist is the commented-out alternative).
ax = plt.gca()
circ = plt.Circle((0,0), 5)
ax.add_patch(circ) #autoscale works for this
#ax.add_artist(circ) #autoscale does not work for this
#ax.relim() # not strictly necessary
ax.autoscale(True)
# Experiment: try the same autoscale check with a Text artist.
ax = plt.gca()
text = plt.Text(0, 0, "bla")
# NOTE: add_line does not accept a Text; the recorded run fails on the next
# line with "AttributeError: 'Text' object has no attribute 'get_path'".
ax.add_line(text)
ax.autoscale(True)
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-35-49db46b17576> in <module>() 2 text = plt.Text(0, 0, "bla") 3 ----> 4 ax.add_line(text) #autoscale works for this 5 6 ax.autoscale(True) C:\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in add_line(self, line) 1706 line.set_clip_path(self.patch) 1707 -> 1708 self._update_line_limits(line) 1709 if not line.get_label(): 1710 line.set_label('_line%d' % len(self.lines)) C:\Anaconda3\lib\site-packages\matplotlib\axes\_base.py in _update_line_limits(self, line) 1728 Figures out the data limit of the given line, updating self.dataLim. 1729 """ -> 1730 path = line.get_path() 1731 if path.vertices.size == 0: 1732 return AttributeError: 'Text' object has no attribute 'get_path'