#read in data from csv file #using Pkg #Pkg.add("CSV") using CSV # CSV File with oxyegn adsoprtion energies, columns are crystal number, crystal chemical formula # adsorption site and oxyegn adsoprtion energies #Oads=CSV.File("/Users/yusuliu/Downloads/18.337_2018/project/3colOads.csv") Oads=CSV.File("3colOads.csv") # CSV File with data from the periodic table # columns are element name, A site or B site element in a perovskite, atomic number, period # covalent radius, d-electron count, electronegativity, most common oxidation state # polarizability, ionization energy PD=CSV.File("PeriodicData.csv") #Make dictionaries from Periodic date AorB =Dict{String,String}() Z=Dict{String,Int64}() period=Dict{String,Int64}() radius=Dict{String,Float64}() d_elect=Dict{String,Int64}() Eneg=Dict{String,Float64}() Ox=Dict{String,Float64}() Pol=Dict{String,Float64}() IE=Dict{String,Float64}() for row in PD AorB[row.Element]=row.AorB Z[row.Element]=row.Z period[row.Element]=row.period radius[row.Element]=row.covalent_radius d_elect[row.Element]=row.d_elect Eneg[row.Element]=row.Eneg Ox[row.Element]=row.Ox Pol[row.Element]=row.Pol IE[row.Element]=row.IE end # functions to parse a chemical formula # get captical letters which are the indicator for the start of a new element function getcaps(string) pos=[] for (index, letter) in enumerate(string) if (letter==uppercase(letter)) append!(pos,index) end end return pos end # determine if an element is A or B site function getelements(string) set=getcaps(string) j=length(set) element_list_A=String[] element_list_B=String[] for i in 1:j-2 Astart=set[i] Bstart=set[i+1] new_el=string[Astart:(Bstart-1)] if AorB[new_el]=="A" push!(element_list_A, String(new_el)) elseif AorB[new_el]=="B" push!(element_list_B, String(new_el)) end end return element_list_A, element_list_B end # function to vectorize each element function makevec(element) vec=zeros(8) vec[1]=Z[element] vec[2]=period[element] vec[3]=radius[element] vec[4]=d_elect[element] vec[5]=Eneg[element] vec[6]=Ox[element] vec[7]=Pol[element] vec[8]=IE[element] return vec end # make input array, 24 attributes per cyrstal, 1000 crystals using Statistics input_mat=zeros(24,1000) for i=1:1003 for row in Oads if (row.Count)==i set_A,set_B=getelements(row.Name) size_A=length(set_A) size_B=length(set_B) vecsA=[] vecsB=[] for i in 1:size_A newvec=makevec((set_A[i])) push!(vecsA,newvec) end for i in 1:size_B newvec=makevec((set_B[i])) push!(vecsB,newvec) end input_mat[1:8,i]=mean(vecsA) input_mat[9:16,i]=mean(vecsB) input_mat[17:24,i]=makevec(row.Site) end end end display("text/plain",input_mat) # make target vector: oxygen adsorption energies target=Float64[] for row in Oads append!(target,row.E) end y=zeros(1,1000) for i in 1:1000 y[1,i]=target[i] end display("text/plain",y) # normalize x input array using Statistics, Random x=copy(input_mat) x_norm=copy(x) for i=1:size(x,1) m=mean(x[i,:]) stdev=std(x[i,:]) for j=1:size(x,2) x_norm[i,j]=(x[i,j]-m)/stdev end end # randomize and seperate into 700 training points and 300 test points index=randperm(Random.seed!(1),size(x,2)) xtrn=x_norm[:,index[1:700]] xtst=x_norm[:,index[701:end]] ytrn=y[:,index[1:700]] ytst=y[:,index[701:end]] # make weight matrix using Pkg #Pkg.add("Distributions") using Distributions d=Normal() weights=rand(d,24)*0.1 w=(weights,[0.0]) # function to take in weight and input, gives output prediction vector function predict_lr(w_mat, att_mat) # takes weight matrix and attributes matrix as arguments v_out=zeros(size(att_mat,2)) # initialize output matrix v_out=w_mat[1]'*att_mat.+w_mat[2] return v_out end # function to calculate mean absolute error function MAE_loss(w,x,y) # takes in weight, input matrix and ground truth ypred=predict_lr(w,x) summ=sum(abs(ypred[i]-y[i]) for i=1:size(y,2)) J=summ/(size(y,2)) return J end # training function using autograd and knet #Pkg.add("AutoGrad") #Pkg.add("Knet") using AutoGrad, Knet function train(ww,lr, epochs) lossgradient=grad(MAE_loss) println((0, :trnloss, MAE_loss(ww,xtrn,ytrn), :tstloss, MAE_loss(ww,xtst,ytst))) for epoch=1:epochs dw=lossgradient(ww, xtrn, ytrn) for i in 1:length(ww) for j in 1:length(ww[i]) ww[i][j] -=lr*dw[i][j] end end println((epoch, :trnloss, MAE_loss(ww,xtrn,ytrn), :tstloss, MAE_loss(ww,xtst,ytst))) end return ww end # 0.27 learning rate, 200 epochs d=Normal() weights=rand(d,24)*0.1 w=(weights,[0.0]) train(w,0.27,200) println(w) # analyze performance of test set # first look at by A and B numbers test_set=index[701:end] #retrieve formula from Oads table, categorize into # of As and Bs AB=Int64[] A2B=Int64[] AB2=Int64[] A2B2=Int64[] for i in test_set for row in Oads if (row.Count)==i set_A,set_B=getelements(row.Name) size_A=length(set_A) size_B=length(set_B) if size_A==1 if size_B==1 push!(AB,i) elseif size_B==2 push!(AB2,i) end elseif size_A==2 if size_B==1 push!(A2B,i) elseif size_B==2 push!(A2B2,i) end end end end end #Evaluate performance of each category by # of A sites and # of B sites loss_AB=[] for i in AB loss=abs((w[1]'*x_norm[:,i].+w[2])[1]-y[i]) push!(loss_AB,loss) end MAE_AB=mean(loss_AB) loss_A2B=[] for i in A2B loss=abs((w[1]'*x_norm[:,i].+w[2])[1]-y[i]) push!(loss_A2B,loss) end MAE_A2B=mean(loss_A2B) loss_A2B2=[] for i in A2B2 loss=abs((w[1]'*x_norm[:,i].+w[2])[1]-y[i]) push!(loss_A2B2,loss) end MAE_A2B2=mean(loss_A2B2) loss_AB2=[] for i in AB2 loss=abs((w[1]'*x_norm[:,i].+w[2])[1]-y[i]) push!(loss_AB2,loss) end MAE_AB2=mean(loss_AB2) using Plots x_dat=["AB", "A2B", "AB2", "A2B2"] y_dat=[MAE_AB, MAE_A2B, MAE_AB2,MAE_A2B2] plot(x_dat,y_dat,seriestype=:bar,xlabel = "Perovskite Type",ylabel = "MAE",legend=:none) #Evaluate performance of each category by elements set_el=String[] for row in PD push!(set_el,row.Element) end el_matt=zeros(length(set_el),1000) el_dict=Dict{String,Int64}() for i in 1:28 el_dict[set_el[i]]=i end # el_matt: row number is the dict value of each element, if entry=1, then entry # has that element # for example, row.Count=1, (22,1)=1 because it has Eu for i in test_set for row in Oads if (row.Count)==i set_A,set_B=getelements(row.Name) bigset=vcat(set_A,set_B) for el in set_el if el in bigset el_matt[el_dict[el],i]=1 end end end end end #collect all non-zero entries el_vec=[] count_1=[] dic_rev=Dict(value => key for (key, value) in el_dict) # get average MAE vector by elements for i in 1:28 count=[] for j in 1:1000 if el_matt[i,j]==1.0 loss=abs((w[1]'*x_norm[:,j].+w[2])[1]-y[j]) push!(count,loss) end end push!(el_vec,mean(count)) end el_vec using Plots x_dat_el=String[] for i in 1:28 element=dic_rev[i] push!(x_dat_el,element) end y_dat_el=el_vec plot(x_dat_el,y_dat_el,seriestype=:bar,xlabel = "Element",ylabel = "MAE",legend=:none, xticks = :all,xtickfont = font(8, "Courier"))