In this Jupyter Notebook we illustrate how to generate the Plotly plot of a phylogram with rectangular layout.
from Bio import Phylo
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)
Read the Zika virus tree file, in Newick format, downloaded from Nextstrain:
# Parse the Newick file into a Bio.Phylo tree; `tree` is the input to the coordinate helpers below.
tree = Phylo.read('Data/nextstrain_zika_tree.new', "newick")
The next cells contain commented lines that were typed to inspect the file structure:
#print(tree)
#dir(tree)
#tree.get_terminals()
#tree.count_terminals()# Counts the number of terminal (leaf) nodes within this tree.
The functions `get_x_coordinates()` and `get_y_coordinates()` are essentially the functions with the same names
from Biopython: https://github.com/biopython/biopython/blob/master/Bio/Phylo/_utils.py.
They assign Cartesian coordinates to the tree nodes.
def get_x_coordinates(tree):
    """Map every clade of ``tree`` to its x-coordinate.

    The x-coordinate of a clade is its depth as reported by ``tree.depths()``,
    i.e. the distance from the root to the clade measured by branch length.

    Returns:
        A dict ``{clade: x-coord}`` keyed by the clades of the tree.
    """
    depth_by_clade = tree.depths()
    deepest = max(depth_by_clade.values())
    if deepest:
        return depth_by_clade
    # Every depth is zero, so the tree carries no usable branch lengths:
    # fall back to unit branch lengths.
    return tree.depths(unit_branch_lengths=True)
def get_y_coordinates(tree, dist=1.3):
    """Map every clade of ``tree`` to its y-coordinate.

    Leaves are placed ``dist`` apart, counting down from the number of leaves;
    the leaves are walked in reverse order, so the last leaf gets the largest
    y. Each internal clade is placed midway between its first and last child.

    Args:
        tree: tree object providing ``count_terminals()``, ``get_terminals()``
            and ``root``.
        dist: vertical distance between two consecutive leaves, chosen to get
            a tree of reasonable height.

    Returns:
        A dict ``{clade: y-coord}``.
    """
    leaf_count = tree.count_terminals()
    ycoords = {}
    for position, leaf in enumerate(reversed(tree.get_terminals())):
        ycoords[leaf] = leaf_count - position * dist

    def calc_row(clade):
        # Children must be placed before their parent can be centred on them.
        for child in clade:
            if child in ycoords:
                continue
            calc_row(child)
        first_child = clade.clades[0]
        last_child = clade.clades[-1]
        ycoords[clade] = (ycoords[first_child] + ycoords[last_child]) / 2

    # A root without children is itself a leaf and already has a coordinate.
    if tree.root.clades:
        calc_row(tree.root)
    return ycoords
# Compute the coordinate lookups once; draw_clade() reads these module-level dicts.
x_coords = get_x_coordinates(tree)
y_coords = get_y_coordinates(tree)
def get_clade_lines(orientation='horizontal', y_curr=0, x_start=0, x_curr=0, y_bot=0, y_top=0,
                    line_color='rgb(25,25,25)', line_width=0.5):
    """Build the Plotly shape (type 'line') for one branch segment.

    Args:
        orientation: 'horizontal' for a segment from ``x_start`` to ``x_curr``
            at height ``y_curr``; 'vertical' for a segment from ``y_bot`` to
            ``y_top`` at ``x_curr``.
        y_curr, x_start, x_curr, y_bot, y_top: segment coordinates; only the
            ones relevant to the chosen orientation are used.
        line_color: color of the line.
        line_width: width of the line.

    Returns:
        A dict describing the shape, drawn on the 'below' layer.

    Raises:
        ValueError: if ``orientation`` is neither 'horizontal' nor 'vertical'.
    """
    if orientation == 'horizontal':
        endpoints = {'x0': x_start, 'y0': y_curr, 'x1': x_curr, 'y1': y_curr}
    elif orientation == 'vertical':
        endpoints = {'x0': x_curr, 'y0': y_bot, 'x1': x_curr, 'y1': y_top}
    else:
        raise ValueError("Line type can be 'horizontal' or 'vertical'")

    return {
        'type': 'line',
        'layer': 'below',
        'line': {'color': line_color, 'width': line_width},
        **endpoints,
    }
def draw_clade(clade, x_start, line_shapes, line_color='rgb(15,15,15)', line_width=1):
    """Recursively append the Plotly line shapes for ``clade`` and its descendants.

    For ``clade`` a horizontal segment is added from ``x_start`` to the clade's
    own x-coordinate. If the clade has children, a vertical segment connecting
    the first and last child is added at the clade's x-coordinate, and every
    child is then drawn starting from that x-coordinate.

    Coordinates are read from the module-level ``x_coords`` and ``y_coords``
    dicts.

    Args:
        clade: the clade to draw; iterating it yields its children.
        x_start: x-coordinate where the clade's horizontal branch begins.
        line_shapes: list that receives the shape dicts (mutated in place).
        line_color: color applied to every branch in this subtree.
        line_width: width applied to every branch in this subtree.

    Returns:
        None.
    """
    x_curr = x_coords[clade]
    y_curr = y_coords[clade]

    # Horizontal branch leading to this clade.
    line_shapes.append(
        get_clade_lines(orientation='horizontal', y_curr=y_curr, x_start=x_start, x_curr=x_curr,
                        line_color=line_color, line_width=line_width)
    )

    if not clade.clades:
        return

    # Vertical line connecting all children.
    y_top = y_coords[clade.clades[0]]
    y_bot = y_coords[clade.clades[-1]]
    line_shapes.append(
        get_clade_lines(orientation='vertical', x_curr=x_curr, y_bot=y_bot, y_top=y_top,
                        line_color=line_color, line_width=line_width)
    )

    # Draw descendants. The style is forwarded explicitly; otherwise only the
    # clade passed by the caller would use the requested color/width and the
    # rest of the tree would silently fall back to the defaults.
    for child in clade:
        draw_clade(child, x_curr, line_shapes, line_color=line_color, line_width=line_width)
# Accumulator for the branch shapes; draw_clade() appends to it in place.
line_shapes = []
# Start at the root with x_start=0, so the root's horizontal segment begins at x = 0.
draw_clade(tree.root, 0, line_shapes, line_color='rgb(25,25,25)', line_width=1)
Get the node coordinates:
# Every clade that received an x-coordinate becomes one plotted node.
my_tree_clades = x_coords.keys()
# The three lists are parallel: entry j of each refers to the same clade.
X = [x_coords[node] for node in my_tree_clades]  # node x-coordinates
Y = [y_coords[node] for node in my_tree_clades]  # node y-coordinates
text = [node.name for node in my_tree_clades]    # text displayed on hover over nodes
Read the metadata file to record more information about the nodes:
# Load the metadata CSV into a DataFrame.
df = pd.read_csv('Data/nextstrain_zika_metadata.csv')
# Inspect the available columns. The line after this expression appears to be the
# recorded notebook output rather than code.
# NOTE(review): `Index` is not imported in the visible code, so that line would fail
# if this file were executed as a plain script.
df.columns
Index(['Strain', 'Accession', 'Date', 'Region', 'Country', 'Division', 'Authors', 'Journal', 'Title', 'Url', 'Num Date', 'Db', 'Raw Date'], dtype='object')
# Number of metadata rows; the bare number below appears to be the recorded output.
len(df)
377
# Distinct values of the 'Region' column; the set literal below appears to be the recorded output.
set(list(df['Region']))
{'China', 'Japan Korea', 'North America', 'Oceania', 'South America', 'Southeast Asia'}
Set the intermediate node color:
# Initial color given to every node; nodes matched to a metadata row are recolored later.
intermediate_node_color = 'rgb(100,100,100)'
Set the node colors depending on region (continent):
# Per-country node colors, one table per metadata region.

# North America
NA_color = {
    'Cuba': 'rgb(252, 196, 174)',
    'Dominican Republic': 'rgb(201, 32, 32)',
    'El Salvador': 'rgb(253, 202, 181)',
    'Guadeloupe': 'rgb(253, 202, 181)',
    'Guatemala': 'rgb(252, 190, 167)',
    'Haiti': 'rgb(252, 145, 114)',
    'Honduras': 'rgb(239, 66, 49)',
    'Jamaica': 'rgb(252, 185, 161)',
    'Martinique': 'rgb(252, 190, 167)',
    'Mexico': 'rgb(247, 109, 82)',
    'Nicaragua': 'rgb(249, 121, 92)',
    'Panama': 'rgb(252, 185, 161)',
    'Puerto Rico': 'rgb(252, 174, 148)',
    'Saint Barthelemy': 'rgb(253, 202, 181)',
    'USA': 'rgb(188, 20, 26)',
    'USVI': 'rgb(206, 36, 34)',
}

# South America
SAmer_color = {
    'Brazil': 'rgb(21, 127, 59)',
    'Colombia': 'rgb(153, 213, 149)',
    'Ecuador': 'rgb(208, 237, 202)',
    'French Guiana': 'rgb(211, 238, 205)',
    'Peru': 'rgb(208, 237, 202)',
    'Suriname': 'rgb(206, 236, 200)',
    'Venezuela': 'rgb(202, 234, 196)',
}

# Southeast Asia (the metadata region is named 'Southeast Asia')
SAsia_color = {'Singapore': '#0000EE', 'Vietnam': '#1E90FF'}

# Two-step list of [position, color] pairs built from the same two colors as
# SAsia_color.
# NOTE(review): presumably a discrete Plotly colorscale; it is not referenced by
# the visible plotting code - confirm before removing.
pl_SAsia = [
    [0.0, '#1E90FF'],
    [0.5, '#1E90FF'],
    [0.5, '#0000EE'],
    [1.0, '#0000EE'],
]

# Oceania
Oceania_color = {
    'American Samoa': 'rgb(209,95,238)',
    'Fiji': 'rgb(238,130, 238)',
    'French Polynesia': 'rgb(148,0,211)',
    'Tonga': 'rgb(238,130, 238)',
}

# China. The closing parenthesis was missing, which made this an invalid CSS color.
China_color = {'China': 'rgb(255,185,15)'}

# Japan / Korea
# NOTE(review): the visible coloring loop hard-codes the colors for the 'China'
# and 'Japan Korea' regions instead of reading China_color / JapanKorea_color.
JapanKorea_color = {'Japan': '#fcdd04'}
Assign color to nodes according to region and country:
# Country and region of each metadata row, in metadata order.
country = []
region = []
# Every node starts with the intermediate-node color; nodes that match a
# metadata strain are recolored in the loop below.
color = [intermediate_node_color] * len(X)

# Regions whose nodes are colored per country ...
palette_by_region = {
    'North America': NA_color,
    'South America': SAmer_color,
    'Southeast Asia': SAsia_color,
    'Oceania': Oceania_color,
}
# ... and regions whose nodes all share one fixed color.
flat_color_by_region = {
    'China': '#fecc00',
    'Japan Korea': '#dc7928',
}

for row, strain in enumerate(df['Strain']):
    # Position of the node whose hover text is still exactly the strain name.
    node = text.index(strain)
    row_country = df.loc[row, 'Country']
    row_region = df.loc[row, 'Region']

    # Extend the hover text of the node with the metadata of its strain.
    hover_details = (
        f"<br>Country: {row_country}"
        f"<br>Region: {row_region}"
        f"<br>Collection date: {df.loc[row, 'Date']}"
        f"<br>Journal: {df.loc[row, 'Journal']}"
        f"<br>Authors: {df.loc[row, 'Authors']}"
    )
    text[node] = text[node] + hover_details

    country.append(row_country)
    region.append(row_region)

    # Rows from any other region keep the intermediate-node color.
    if row_region in palette_by_region:
        color[node] = palette_by_region[row_region][row_country]
    elif row_region in flat_color_by_region:
        color[node] = flat_color_by_region[row_region]
Define the Plotly trace for the tree nodes:
# Scatter trace holding one marker per tree node. Hover shows only the
# per-node text assembled above.
nodes = {
    'type': 'scatter',
    'x': X,
    'y': Y,
    'mode': 'markers',
    'marker': {'color': color, 'size': 5},
    'opacity': 1.0,
    'text': text,
    'hoverinfo': 'text',
}
The branches are already defined and stored as Plotly shapes that are included in the plot layout below:
# Figure layout. The tree branches are not traces: they are passed as layout
# shapes, so only the x axis (branch length) is shown and the y axis is hidden.
layout = {
    'title': 'Phylogeny of Zika Virus<br>377 genomes colored according to region and country',
    'font': {'family': 'Balto', 'size': 14},
    'width': 1000,
    'height': 3000,
    'autosize': False,
    'showlegend': False,
    'xaxis': {
        'showline': True,
        'zeroline': False,
        'showgrid': False,
        'ticklen': 4,
        'showticklabels': True,
        'title': 'branch length',
    },
    'yaxis': {'visible': False},
    'hovermode': 'closest',
    'plot_bgcolor': 'rgb(250,250,250)',
    'margin': {'l': 10},
    'shapes': line_shapes,  # lines for tree branches
}
# Assemble the figure as a plain dict: the node scatter trace plus the layout.
fig = dict(data=[nodes], layout=layout)
# Render the figure inline in the notebook.
iplot(fig)#offline plot
from IPython.core.display import HTML
def css_styling():
    """Load ``./custom.css`` and wrap its contents for display in the notebook.

    Returns:
        An ``HTML`` object built from the text of the file.
    """
    # The context manager closes the file handle even if reading fails;
    # a bare open(...).read() leaves closing to the garbage collector.
    with open("./custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


css_styling()