# Output file names. json_filename is not referenced in the visible part of
# this script; per the original note, the JSON step was lost and is still to
# be restored.
json_filename = 'x.json' # accidentally erased json part, re-integrate later
gexf_filename = 'gh43.gexf'
import os
# Alternative working directory, currently disabled.
#os.chdir("C:/Users/David/Documents/Dropbox")
# Change the working directory so the relative file names used below (the
# input table and the GEXF output) resolve against this folder.
os.chdir("C:/_Dropbox/Dropbox")
import pandas as pd
# Load the tree table; dtype=str reads every column as text, so numeric-looking
# node IDs such as '217' stay strings.
df = pd.read_csv("gh43treetable.txt", dtype=str)
# Assign a flat list of names. A nested list ([[...]]) makes newer pandas
# versions build a MultiIndex, after which df['ancid'] / df.ancid no longer
# return a plain column.
df.columns = ['ancid', 'desc1', 'desc2', 'branchlength1', 'branchlength2']

# Strip surrounding whitespace from each ID and replace spaces with
# underscores. Whole columns are reassigned rather than written through
# df[field][idx]: that chained assignment may update a temporary copy and
# leave df unchanged.
id_fields = ['ancid', 'desc1', 'desc2']
for field in id_fields:
    df[field] = df[field].map(lambda value: value.strip().replace(" ", "_"))

# Make a list of all IDs with duplicates included (row by row, in field
# order), then remove duplicates.
id_list = [
    value
    for row_values in zip(df['ancid'], df['desc1'], df['desc2'])
    for value in row_values
]
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(len(id_list))
# NOTE: set() does not preserve order, so the position of an ID in id_list
# is arbitrary.
id_list = list(set(id_list))
print(len(id_list))
# Output of the two length prints above:
#   639   (IDs collected, duplicates included)
#   427   (unique IDs)
# Show the third row of the table and the first ten unique IDs as a quick
# sanity check. print(x) with a single argument behaves the same on
# Python 2 and Python 3, unlike the print statement.
print(df.iloc[2])
print(id_list[:10])
# Output:
#   ancid                     217
#   desc1                     216
#   desc2            ASPNI_A2QT85
#   branchlength1    0.0000000000
#   branchlength2    0.0000000000
#   Name: 2, dtype: object
#   ['Paeby1p7_018872', 'Paeby1p7_018871', 'bri_CHGT_02150', 'ABN43C_PENCH',
#    '344', '345', 'ABN43A_PENCH', 'CORTH_1_02834', '340', '341']
# Two lookup tables over id_list:
#   dict_pti maps a position in id_list to the ID stored there,
#   dict_itp maps an ID back to its position in id_list.
dict_pti = dict(enumerate(id_list))
dict_itp = {identifier: position for position, identifier in enumerate(id_list)}
# Make the link list: for every table row, one [ancestor, descendant] pair
# for desc1 followed by one for desc2. Iterating the three columns together
# avoids a label lookup per cell and the unused row object from iterrows().
link_list = []
for ancestor, first_desc, second_desc in zip(df['ancid'], df['desc1'], df['desc2']):
    link_list.append([ancestor, first_desc])
    link_list.append([ancestor, second_desc])
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(link_list[:10])
# Output:
#   [['215', 'ABN43A_ASPNG'], ['215', 'ANIG203143G'], ['216', '215'],
#    ['216', 'ASPNI_P42256'], ['217', '216'], ['217', 'ASPNI_A2QT85'],
#    ['218', 'PENCH_Q5H7M8'], ['218', 'ABN43A_PENCH'],
#    ['219', 'Paeby1p7_018872'], ['219', '218']]
# Write the graph to gexf_filename as a GEXF 1.2 document: a fixed header,
# one <node> per entry of id_list (id = position, label = ID) and one <edge>
# per [source, target] pair of link_list, with both ends given as positions
# looked up in dict_itp.
from xml.sax.saxutils import escape

# escape() handles &, < and >; this extra entity also makes the text safe
# inside a double-quoted attribute value.
attr_entities = {'"': '&quot;'}

# The file is opened once in "w" mode instead of "w" followed by "a"; for IDs
# without XML special characters the bytes written are unchanged.
# NOTE(review): lastmodifieddate and creator are fixed values in the header;
# confirm whether they should describe this file instead.
with open(gexf_filename, "w") as f:
    f.write('<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2">\n'
            ' <meta lastmodifieddate="2009-03-20">\n'
            ' <creator>Gexf.net</creator>\n')
    f.write(' <description>{0}</description>\n'.format(escape(gexf_filename)))
    f.write(' </meta>\n'
            ' <graph mode="static" defaultedgetype="directed">\n')
    f.write(' <nodes>\n')
    # id_list[position] is the same value dict_pti[position] holds.
    for node_id, label in enumerate(id_list):
        f.write(' <node id="{0}" label="{1}" />\n'.format(
            node_id, escape(str(label), attr_entities)))
    f.write(' </nodes>\n')
    f.write(' <edges>\n')
    for edge_id, (source, target) in enumerate(link_list):
        f.write(' <edge id="{0}" source="{1}" target="{2}" />\n'.format(
            edge_id, dict_itp[source], dict_itp[target]))
    f.write(' </edges>\n')
    f.write(' </graph>\n')
    f.write('</gexf>\n')