Originally from: Data Origami.
import pandas as pd
import patsy
# Small example frame: five rows with integer 'patient', 'obs' and
# 'treatment' columns and a string 'score' column whose values are
# 'strong', 'weak' and 'normal'.
raw_data = {
    'patient': [1, 1, 1, 0, 0],
    'obs': [1, 2, 3, 1, 2],
    'treatment': [0, 1, 0, 1, 0],
    'score': ['strong', 'weak', 'normal', 'weak', 'strong'],
}

# Pass the column order explicitly so the layout does not depend on the
# dict's iteration order.
df = pd.DataFrame(
    raw_data,
    columns=['patient', 'obs', 'treatment', 'score'],
)
df
| | patient | obs | treatment | score |
---|---|---|---|---|
0 | 1 | 1 | 0 | strong |
1 | 1 | 2 | 1 | weak |
2 | 1 | 3 | 0 | normal |
3 | 0 | 1 | 1 | weak |
4 | 0 | 2 | 0 | strong |
# Encode the 'score' column of df as a design matrix, returned as a DataFrame.
# patsy treats the string column as categorical. Because the formula keeps the
# default intercept, the result has an Intercept column plus indicator columns
# for 'strong' and 'weak' only; 'normal' is the reference level and gets no
# column of its own.
patsy.dmatrix(
    'score',
    data=df,
    return_type='dataframe',
)
| | Intercept | score[T.strong] | score[T.weak] |
---|---|---|---|
0 | 1 | 1 | 0 |
1 | 1 | 0 | 1 |
2 | 1 | 0 | 0 |
3 | 1 | 0 | 1 |
4 | 1 | 1 | 0 |
This is likely what you will want to do: remove the intercept with `- 1` so that every level of `score` gets its own indicator column.
# Encode the 'score' column of df as a design matrix, returned as a DataFrame.
# The '- 1' in the formula removes the intercept, so patsy emits one indicator
# column per level ('normal', 'strong', 'weak') instead of dropping a
# reference level.
patsy.dmatrix(
    'score - 1',
    data=df,
    return_type='dataframe',
)
| | score[normal] | score[strong] | score[weak] |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 0 | 1 |
2 | 1 | 0 | 0 |
3 | 0 | 0 | 1 |
4 | 0 | 1 | 0 |
# Design matrix with the numeric 'patient' and 'treatment' columns plus their
# interaction term 'patient:treatment' (the row-wise product of the two).
# The trailing '-1' removes the intercept column.
patsy.dmatrix(
    'patient + treatment + patient:treatment-1',
    data=df,
    return_type='dataframe',
)
| | patient | treatment | patient:treatment |
---|---|---|---|
0 | 1 | 0 | 0 |
1 | 1 | 1 | 1 |
2 | 1 | 0 | 0 |
3 | 0 | 1 | 0 |
4 | 0 | 0 | 0 |