%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import scipy
Defining the column names:
names = ["Individual", "PC1", "PC2", "PC3", "PC4", "Population"]
Loading the eigenVec file from the pca run
pcaDat = pd.read_csv("/data/pca/results/pca.WestEurasia.evec",
delim_whitespace=True, skiprows=1, names=names)
Looking at the data, we find that it is a matrix, with each individual on one row, and the columns denoting the first 10 principal components. The last column contains the population for each individual:
pcaDat
Individual | PC1 | PC2 | PC3 | PC4 | Population | |
---|---|---|---|---|---|---|
0 | Yuk_009 | 0.0123 | 0.1252 | 0.1147 | 0.0567 | Yukagir |
1 | Yuk_025 | 0.0120 | 0.1258 | 0.1168 | 0.0576 | Yukagir |
2 | Yuk_022 | 0.0136 | 0.1303 | 0.1186 | 0.0564 | Yukagir |
3 | Yuk_020 | 0.0170 | 0.1278 | 0.1176 | 0.0584 | Yukagir |
4 | MC_40 | 0.0183 | 0.1226 | 0.1123 | 0.0537 | Chukchi |
5 | Yuk_024 | 0.0144 | 0.1271 | 0.1124 | 0.0584 | Yukagir |
6 | Yuk_023 | 0.0124 | 0.1348 | 0.1238 | 0.0642 | Yukagir |
7 | MC_16 | 0.0144 | 0.1266 | 0.1169 | 0.0541 | Chukchi |
8 | MC_15 | 0.0146 | 0.1250 | 0.1119 | 0.0559 | Chukchi |
9 | MC_18 | 0.0175 | 0.1238 | 0.1167 | 0.0523 | Chukchi |
10 | Yuk_004 | 0.0110 | 0.1273 | 0.1117 | 0.0573 | Yukagir |
11 | MC_08 | 0.0187 | 0.1253 | 0.1185 | 0.0564 | Chukchi |
12 | Nov_005 | 0.0152 | 0.1349 | 0.1285 | 0.0618 | Nganasan |
13 | MC_25 | 0.0182 | 0.1258 | 0.1196 | 0.0532 | Chukchi |
14 | Yuk_019 | 0.0161 | 0.1327 | 0.1229 | 0.0617 | Yukagir |
15 | Yuk_011 | 0.0152 | 0.1217 | 0.1148 | 0.0569 | Yukagir |
16 | Sesk_47 | 0.0167 | 0.1241 | 0.1177 | 0.0549 | Chukchi1 |
17 | MC_17 | 0.0180 | 0.1268 | 0.1147 | 0.0544 | Chukchi |
18 | Yuk_021 | 0.0141 | 0.1329 | 0.1210 | 0.0653 | Yukagir |
19 | MC_06 | 0.0159 | 0.1264 | 0.1135 | 0.0557 | Chukchi |
20 | MC_38 | 0.0178 | 0.1240 | 0.1143 | 0.0534 | Chukchi |
21 | MC_14 | 0.0165 | 0.1238 | 0.1114 | 0.0524 | Chukchi |
22 | Ul5 | 0.0070 | 0.1306 | 0.1144 | 0.0540 | Ulchi |
23 | Ul31 | 0.0056 | 0.1289 | 0.1182 | 0.0550 | Ulchi |
24 | Ul65 | 0.0051 | 0.1331 | 0.1117 | 0.0599 | Ulchi |
25 | Tuba12 | 0.0172 | 0.0906 | 0.0790 | 0.0362 | Tubalar |
26 | Tuba20 | 0.0129 | 0.0894 | 0.0767 | 0.0308 | Tubalar |
27 | Nel19 | 0.0273 | 0.0605 | 0.0608 | 0.0333 | Yukagir |
28 | Nlk16 | 0.0217 | 0.0744 | 0.0753 | 0.0360 | Even |
29 | Kor66 | 0.0148 | 0.1259 | 0.1157 | 0.0531 | Koryak |
... | ... | ... | ... | ... | ... | ... |
1259 | I0429 | 0.0413 | 0.0447 | 0.0440 | 0.0098 | Yamnaya_Samara |
1260 | I0438 | 0.0384 | 0.0497 | 0.0399 | 0.0020 | Yamnaya_Samara |
1261 | I0585 | 0.0770 | -0.0424 | 0.0372 | 0.0355 | WHG |
1262 | I0797 | -0.0101 | -0.0452 | -0.0342 | -0.0124 | LBK_EN |
1263 | I0795 | -0.0057 | -0.0495 | -0.0429 | 0.0098 | LBK_EN |
1264 | I0022 | -0.0133 | -0.0433 | -0.0356 | -0.0089 | LBK_EN |
1265 | I0026 | -0.0142 | -0.0438 | -0.0430 | -0.0027 | LBK_EN |
1266 | I1507 | 0.0866 | -0.0455 | 0.0393 | 0.0311 | WHG |
1267 | I0025 | -0.0103 | -0.0449 | -0.0404 | -0.0023 | LBK_EN |
1268 | I0443 | 0.0350 | 0.0401 | 0.0412 | 0.0028 | Yamnaya_Samara |
1269 | I0054 | -0.0054 | -0.0413 | -0.0410 | -0.0124 | LBK_EN |
1270 | I0046 | -0.0066 | -0.0446 | -0.0386 | -0.0092 | LBK_EN |
1271 | I0048 | -0.0128 | -0.0367 | -0.0388 | -0.0129 | LBK_EN |
1272 | I0056 | -0.0067 | -0.0472 | -0.0388 | -0.0054 | LBK_EN |
1273 | I0057 | -0.0113 | -0.0442 | -0.0357 | -0.0008 | LBK_EN |
1274 | I0100 | -0.0063 | -0.0455 | -0.0410 | -0.0051 | LBK_EN |
1275 | I0659 | -0.0084 | -0.0437 | -0.0431 | -0.0099 | LBK_EN |
1276 | I0821 | -0.0071 | -0.0428 | -0.0380 | -0.0103 | LBK_EN |
1277 | I1550 | -0.0107 | -0.0386 | -0.0402 | -0.0039 | LBK_EN |
1278 | BOO001 | 0.0399 | 0.0760 | 0.0915 | 0.0453 | BolshoyOleniOstrov |
1279 | BOO002 | 0.0445 | 0.0735 | 0.0925 | 0.0379 | BolshoyOleniOstrov |
1280 | BOO003 | 0.0466 | 0.0765 | 0.0862 | 0.0415 | BolshoyOleniOstrov |
1281 | BOO004 | 0.0411 | 0.0723 | 0.0938 | 0.0419 | BolshoyOleniOstrov |
1282 | BOO005 | 0.0461 | 0.0731 | 0.0909 | 0.0401 | BolshoyOleniOstrov |
1283 | BOO006 | 0.0394 | 0.0917 | 0.1002 | 0.0438 | BolshoyOleniOstrov |
1284 | CHV001 | 0.0441 | 0.0331 | 0.0587 | 0.0325 | ChalmnyVarre |
1285 | CHV002 | 0.0442 | 0.0351 | 0.0610 | 0.0373 | ChalmnyVarre |
1286 | JK1968 | 0.0398 | 0.0385 | 0.0661 | 0.0299 | Levanluhta |
1287 | JK1970 | 0.0408 | 0.0466 | 0.0600 | 0.0363 | Levanluhta |
1288 | JK2065 | 0.0392 | -0.0065 | 0.0195 | 0.0043 | JK2065 |
1289 rows × 6 columns
We can quickly plot the first two PCs for all individuals:
plt.figure(figsize=(10, 10))
plt.scatter(x=pcaDat["PC1"], y=pcaDat["PC2"])
plt.xlabel("PC1");
plt.ylabel("PC2");
which is not very helpful, because we can't see where each population falls. We can highlight a few populations to get a bit more of a feeling:
plt.figure(figsize=(10, 10))
plt.scatter(x=pcaDat["PC1"], y=pcaDat["PC2"], label="")
for pop in ["Finnish", "Sardinian", "Armenian", "BedouinB"]:
d = pcaDat[pcaDat["Population"] == pop]
plt.scatter(x=d["PC1"], y=d["PC2"], label=pop)
plt.legend()
plt.xlabel("PC1");
plt.ylabel("PC2");
We can improve the plot further by plotting only the population that we had in the population list. So we need to load the population list and while we're at it, also add color- and symbol numbers for plotting.
popListDat = pd.read_csv("/data/pca/WestEurasia.poplist.txt",
names=["Population"]).sort_values(by="Population")
nPops = len(popListDat)
nCols = 8
nSymbols = int(nPops / nCols)
colorIndices = [int(i / nSymbols) for i in range(nPops)]
symbolIndices = [i % nSymbols for i in range(nPops)]
popListDat = popListDat.assign(colorIndex=colorIndices, symbolIndex=symbolIndices)
popListDat
Population | colorIndex | symbolIndex | |
---|---|---|---|
1 | Abkhasian | 0 | 0 |
2 | Adygei | 0 | 1 |
3 | Albanian | 0 | 2 |
4 | Armenian | 0 | 3 |
5 | Assyrian | 0 | 4 |
6 | Balkar | 0 | 5 |
7 | Basque | 0 | 6 |
8 | BedouinA | 0 | 7 |
9 | BedouinB | 1 | 0 |
10 | Belarusian | 1 | 1 |
11 | Bulgarian | 1 | 2 |
12 | Canary_Islander | 1 | 3 |
13 | Chechen | 1 | 4 |
0 | Chuvash | 1 | 5 |
14 | Croatian | 1 | 6 |
15 | Cypriot | 1 | 7 |
16 | Czech | 2 | 0 |
17 | Druze | 2 | 1 |
18 | English | 2 | 2 |
19 | Estonian | 2 | 3 |
20 | Finnish | 2 | 4 |
21 | French | 2 | 5 |
22 | Georgian | 2 | 6 |
23 | German | 2 | 7 |
24 | Greek | 3 | 0 |
25 | Hungarian | 3 | 1 |
26 | Icelandic | 3 | 2 |
27 | Iranian | 3 | 3 |
28 | Irish | 3 | 4 |
29 | Irish_Ulster | 3 | 5 |
... | ... | ... | ... |
38 | Jew_Tunisian | 4 | 6 |
39 | Jew_Turkish | 4 | 7 |
40 | Jew_Yemenite | 5 | 0 |
41 | Jordanian | 5 | 1 |
42 | Kumyk | 5 | 2 |
44 | Lebanese | 5 | 3 |
43 | Lebanese_Christian | 5 | 4 |
45 | Lebanese_Muslim | 5 | 5 |
46 | Lezgin | 5 | 6 |
47 | Lithuanian | 5 | 7 |
48 | Maltese | 6 | 0 |
49 | Mordovian | 6 | 1 |
50 | North_Ossetian | 6 | 2 |
51 | Norwegian | 6 | 3 |
52 | Orcadian | 6 | 4 |
53 | Palestinian | 6 | 5 |
54 | Polish | 6 | 6 |
55 | Romanian | 6 | 7 |
56 | Russian | 7 | 0 |
57 | Sardinian | 7 | 1 |
58 | Saudi | 7 | 2 |
59 | Scottish | 7 | 3 |
60 | Shetlandic | 7 | 4 |
61 | Sicilian | 7 | 5 |
62 | Sorb | 7 | 6 |
64 | Spanish | 7 | 7 |
63 | Spanish_North | 8 | 0 |
65 | Syrian | 8 | 1 |
66 | Turkish | 8 | 2 |
67 | Ukrainian | 8 | 3 |
68 rows × 3 columns
and we end up with only 720 points. Note that I'm here flipping the x axis to make the correlation to Geography more obvious. We can now plot all points with colors and symbols:
plt.figure(figsize=(10,10))
symbolVec = ["8", "s", "p", "P", "*", "h", "H", "+", "x", "X", "D", "d"]
colorVec = [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd',
u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf']
for i, row in popListDat.iterrows():
d = pcaDat[pcaDat.Population == row["Population"]]
plt.scatter(x=-d["PC1"], y=d["PC2"], c=colorVec[row["colorIndex"]],
marker=symbolVec[row["symbolIndex"]], label=row["Population"])
plt.xlabel("PC1");
plt.ylabel("PC2");
plt.legend(loc=(1.1, 0), ncol=3)
<matplotlib.legend.Legend at 0x7f5700f54ba8>
Adding ancient populations:
plt.figure(figsize=(10,10))
symbolVec = ["8", "s", "p", "P", "*", "h", "H", "+", "x", "X", "D", "d", "v", "<", ">", "^"]
colorVec = [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd',
u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf']
for i, row in popListDat.iterrows():
d = pcaDat[pcaDat.Population == row["Population"]]
plt.scatter(x=-d["PC1"], y=d["PC2"], c=colorVec[row["colorIndex"]],
marker=symbolVec[row["symbolIndex"]], label=row["Population"])
for i, pop in enumerate(["Levanluhta", "Saami.DG", "BolshoyOleniOstrov", "Yamnaya_Samara", "LBK_EN", "WHG"]):
d = pcaDat[pcaDat.Population == pop]
plt.scatter(x=-d["PC1"], y=d["PC2"], c="black", marker=symbolVec[i], label=pop)
plt.xlabel("PC1");
plt.ylabel("PC2");
plt.legend(loc=(1.1, 0), ncol=3)
<matplotlib.legend.Legend at 0x7f5700a1d2e8>
popListDat = pd.read_csv("/data/pca/AllEurasia.poplist.txt",
names=["Population"]).sort_values(by="Population")
nPops = len(popListDat)
nCols = 9
nSymbols = int(nPops / nCols)
colorIndices = [int(i / nSymbols) for i in range(nPops)]
symbolIndices = [i % nSymbols for i in range(nPops)]
popListDat = popListDat.assign(colorIndex=colorIndices, symbolIndex=symbolIndices)
popListDat
Population | colorIndex | symbolIndex | |
---|---|---|---|
0 | Abkhasian | 0 | 0 |
1 | Adygei | 0 | 1 |
2 | Albanian | 0 | 2 |
3 | Aleut | 0 | 3 |
4 | Aleut_Tlingit | 0 | 4 |
5 | Altaian | 0 | 5 |
6 | Ami | 0 | 6 |
7 | Armenian | 0 | 7 |
8 | Assyrian | 0 | 8 |
9 | Atayal | 0 | 9 |
10 | Avar | 0 | 10 |
11 | Azeri | 0 | 11 |
12 | Balkar | 0 | 12 |
13 | Basque | 1 | 0 |
14 | BedouinA | 1 | 1 |
15 | BedouinB | 1 | 2 |
16 | Belarusian | 1 | 3 |
17 | Borneo | 1 | 4 |
18 | Bulgarian | 1 | 5 |
19 | Buryat | 1 | 6 |
20 | Cambodian | 1 | 7 |
21 | Chechen | 1 | 8 |
22 | Chukchi | 1 | 9 |
23 | Chukchi1 | 1 | 10 |
24 | Chuvash | 1 | 11 |
25 | Croatian | 1 | 12 |
26 | Cypriot | 2 | 0 |
27 | Czech | 2 | 1 |
28 | Dai | 2 | 2 |
29 | Daur | 2 | 3 |
... | ... | ... | ... |
89 | Saami.DG | 6 | 11 |
90 | Saami_WGA | 6 | 12 |
91 | Sardinian | 7 | 0 |
92 | Saudi | 7 | 1 |
93 | Scottish | 7 | 2 |
94 | Selkup | 7 | 3 |
95 | Semende | 7 | 4 |
96 | She | 7 | 5 |
97 | Sherpa.DG | 7 | 6 |
98 | Sicilian | 7 | 7 |
99 | Spanish | 7 | 8 |
100 | Spanish_North | 7 | 9 |
101 | Syrian | 7 | 10 |
102 | Tajik | 7 | 11 |
103 | Thai | 7 | 12 |
104 | Tibetan.DG | 8 | 0 |
105 | Tu | 8 | 1 |
106 | Tubalar | 8 | 2 |
107 | Tujia | 8 | 3 |
108 | Turkish | 8 | 4 |
109 | Turkmen | 8 | 5 |
110 | Tuvinian | 8 | 6 |
111 | Ukrainian | 8 | 7 |
112 | Ulchi | 8 | 8 |
113 | Uygur | 8 | 9 |
114 | Uzbek | 8 | 10 |
115 | Xibo | 8 | 11 |
116 | Yakut | 8 | 12 |
117 | Yi | 9 | 0 |
118 | Yukagir | 9 | 1 |
119 rows × 3 columns
pcaDat = pd.read_csv("/data/pca/results/pca.AllEurasia.evec",
delim_whitespace=True, skiprows=1, names=names)
plt.figure(figsize=(10,10))
symbolVec = ["8", "s", "p", "P", "*", "h", "H", "+", "x", "X", "D", "d", "v", "<", ">", "^"]
colorVec = [u'#1f77b4', u'#ff7f0e', u'#2ca02c', u'#d62728', u'#9467bd',
u'#8c564b', u'#e377c2', u'#7f7f7f', u'#bcbd22', u'#17becf']
for i, row in popListDat.iterrows():
d = pcaDat[pcaDat.Population == row["Population"]]
plt.scatter(x=-d["PC1"], y=d["PC2"], c=colorVec[row["colorIndex"]],
marker=symbolVec[row["symbolIndex"]], label=row["Population"])
for i, pop in enumerate(["Levanluhta", "Saami.DG", "BolshoyOleniOstrov", "Yamnaya_Samara", "LBK_EN", "WHG"]):
d = pcaDat[pcaDat.Population == pop]
plt.scatter(x=-d["PC1"], y=d["PC2"], c="black", marker=symbolVec[i], label=pop)
plt.xlabel("PC1");
plt.ylabel("PC2");
plt.legend(loc=(1.1, 0), ncol=3)
<matplotlib.legend.Legend at 0x7f7086207a58>