#!/usr/bin/env python
# coding: utf-8

# In this notebook we will see how to learn multiple BNs from multiple datasets where we
# will put a bias on the BNs to be similar. First we need a few datasets. The easiest way
# to do this is to grab a dataset and then sample from it:

# In[1]:

from pygobnilp.gobnilp import Gobnilp
import numpy as np

# Read the data in, but go no further: stop the pipeline as soon as the data is loaded.
m = Gobnilp()
m.learn('discrete.dat', end='data')

np.random.seed(12)  # fixed seed so we get the same results each time
# Draw 4 bootstrap samples (rows sampled with replacement), 1000 rows each.
datas = []
for _ in range(4):
    row_idx = np.random.randint(0, len(m.rawdata), size=1000)
    datas.append(m.rawdata[row_idx, :])

# OK, so now we have 4 numpy arrays of data each with 1000 rows and 6 columns.
# Now we can learn a BN for each of these data sets.

# In[2]:

ms = []
for i in range(4):
    learner = Gobnilp()
    ms.append(learner)
    learner.learn(datas[i],
                  varnames=[f'X_{i}_{j}' for j in range(6)],
                  pruning=False,  # keep all parent sets for later
                  abbrev=False)   # want to see full variable names on plots

# OK, so we have learned 4 BNs from 4 datasets. Now suppose we want to impose a
# preference that the 4 BNs be similar. To do this all MIP variables from the 4 separate
# problems need to be in the same MIP instance. We can do this by creating a new model
# which takes the union of the 4 sets of local scores as input. First off, we do this and
# then just learn a BN from this input without any preference for the BNs to be similar.
# (Note that the size of the plotted figure is made bigger so that we can see it well.)

# In[3]:

global_local_scores = {}
for learner in ms:
    global_local_scores.update(learner.local_scores)

m = Gobnilp()
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]  # increase size of figure (plot not interactive)
m.learn(local_scores_source=global_local_scores, abbrev=False)

# Before expressing a *preference* for the 4 BNs to be similar, let's do something
# easier: put in a hard constraint that each of the 4 BNs are exactly the same.
# We can do this most easily by requiring that the arrow variables for each learned BN
# are the same.

# In[4]:

def namei(u, i):
    """Return the name of the variable in BN `i` corresponding to variable `u`.

    Variable names have the form 'X_<bn>_<var>' (see cell In[2]); this swaps the
    BN-index component for `i`.  Splitting on '_' (rather than relying on fixed
    character positions as before) stays correct even when the BN index has more
    than one digit.
    """
    parts = u.split('_')
    parts[1] = str(i)
    return '_'.join(parts)

hard_conss = []
for (u, v) in ms[0].arrow:
    mipvar = m.arrow[u, v]
    for i in range(1, 4):
        # Force the corresponding arrow in BN i to equal its counterpart in BN 0.
        hard_conss.append(m.addConstr(mipvar == m.arrow[namei(u, i), namei(v, i)]))
m.learn(start='MIP model', abbrev=False)

# OK, sure enough we get 4 identical Bayesian networks. Now let's just put a bias
# towards similarity. To do this we will create, for each pair of arrow variables in
# distinct BNs, a binary variable which takes the value 1 if the two arrow variables
# have different values.

# In[5]:

from gurobipy import GRB

def xorvar(m, x, y, obj=0.0):
    """Return a binary variable which is `x` xor `y`.

    Adds the four inequalities of the convex-hull (XOR) formulation from
    Achterberg's thesis (14.29).  `obj` is the objective coefficient of the
    new variable (negative values penalise disagreement).
    """
    r = m.addVar(obj=obj, vtype=GRB.BINARY)
    m.addConstr( r - x - y <= 0)  # disallow r=1,x=0,y=0
    m.addConstr(-r + x - y <= 0)  # disallow r=0,x=1,y=0
    m.addConstr(-r - x + y <= 0)  # disallow r=0,x=0,y=1
    m.addConstr( r + x + y <= 2)  # disallow r=1,x=1,y=1
    return r

# get rid of hard constraints
m.remove(hard_conss)

# add in soft constraints: one xor variable per arrow per unordered pair of BNs,
# with a small objective penalty (-0.1) whenever the pair disagrees on that arrow
xors = []
for i in range(4):
    for j in range(i + 1, 4):
        for (u, v) in ms[0].arrow:
            arrowi = m.arrow[namei(u, i), namei(v, i)]
            arrowj = m.arrow[namei(u, j), namei(v, j)]
            xors.append(xorvar(m, arrowi, arrowj, -0.1))
m.learn(start='MIP model', abbrev=False)

# So with only a 0.1 penalty for pairs of nodes in different BNs to be connected
# differently, we can still get not all BNs to be the same. Let's increase this penalty
# to 1 and see what happens.

# In[6]:

for v in xors:
    v.Obj = -1  # strengthen the disagreement penalty
m.learn(start='MIP model', abbrev=False)

# So with a stronger preference for similarity all 4 BNs are the same.