Generating a data set


In [1]:
import numpy as np
import pandas as pd

Dog breeds

In [2]:
# Number of observations (rows) to generate; every array below has this length.
noobs = 100

# Seed NumPy's global RNG so that "Restart Kernel -> Run All" regenerates the
# exact same data set. All later cells draw from the global np.random state.
SEED = 42
np.random.seed(SEED)
In [3]:
# Boolean `pure` flag per observation, True with probability 0.5.
# One vectorized draw of `noobs` uniform samples compared against 0.5 yields
# the boolean array directly (no Python-level loop or redundant ternary).
pure = np.random.random(noobs) < 0.5
pure
Out[3]:
array([ True,  True,  True, False, False,  True, False,  True,  True,
       False, False,  True, False, False,  True,  True,  True, False,
       False, False, False, False,  True, False, False, False, False,
        True, False, False,  True, False, False,  True,  True, False,
        True, False,  True,  True, False, False,  True,  True, False,
        True, False, False,  True,  True, False, False, False, False,
       False,  True,  True,  True, False,  True, False,  True, False,
       False, False,  True,  True, False,  True,  True,  True, False,
       False, False, False,  True, False, False,  True,  True,  True,
        True, False,  True,  True, False,  True, False,  True,  True,
       False, False,  True,  True, False, False, False, False, False,
        True])
In [4]:
# Integer ages: round draws from a normal distribution (mean 4, sd 2) and
# floor any negative value at 0.
# The builtin `int` is used instead of the `np.int` alias, which was deprecated
# in NumPy 1.20 and removed in NumPy 1.24 (it raises AttributeError there).
age = np.maximum(np.around(np.random.normal(4, 2, noobs)).astype(int), 0)
age
Out[4]:
array([3, 7, 6, 3, 3, 5, 8, 3, 4, 5, 5, 6, 3, 3, 1, 4, 3, 8, 7, 2, 5, 3,
       6, 5, 3, 4, 4, 8, 2, 5, 4, 2, 9, 2, 3, 2, 2, 3, 1, 4, 3, 0, 8, 8,
       4, 7, 3, 3, 5, 5, 3, 2, 2, 6, 6, 6, 5, 2, 8, 4, 5, 5, 4, 5, 7, 4,
       0, 5, 1, 2, 3, 5, 2, 0, 3, 0, 2, 3, 6, 4, 6, 6, 3, 4, 5, 6, 4, 2,
       3, 6, 7, 6, 1, 6, 4, 4, 4, 6, 3, 7])
In [5]:
# Weights: normal draws (mean 15, sd 5), folded to non-negative values with
# abs() and rounded to 2 decimal places.
weight = np.abs(np.random.normal(loc=15, scale=5, size=noobs)).round(2)
weight
Out[5]:
array([ 7.94,  5.05, 16.15, 15.5 , 12.17, 14.11,  9.51, 15.6 , 12.36,
       19.18, 15.98,  4.85, 19.52, 11.21, 14.08, 14.96,  6.84, 15.76,
       17.78, 21.49, 16.26, 17.82,  7.9 , 18.26, 18.56, 11.95, 11.09,
       15.57,  6.97, 19.78, 18.58, 19.68, 16.92,  3.19, 21.78, 13.49,
        7.29, 11.74, 17.27, 13.89, 15.38, 12.44, 17.66, 12.55, 11.32,
       15.95, 10.89, 22.06,  8.11, 12.89, 14.57, 15.37, 20.71, 16.72,
       20.06, 16.1 , 16.87,  9.41, 19.5 , 16.18, 18.06, 15.5 , 14.61,
        4.82, 10.92,  9.79, 21.48, 16.14, 15.26, 16.36, 15.49,  8.84,
       13.85, 17.1 , 16.12,  3.67, 14.98,  8.28, 16.25, 21.81, 21.88,
       14.21,  5.05, 19.55,  6.6 , 13.39, 20.9 ,  8.02, 19.38, 16.52,
        9.02, 13.38, 21.5 , 10.55, 11.  , 20.76, 17.4 , 15.6 ,  8.37,
        9.09])
In [6]:
# Heights: normal draws (mean 50, sd 10), folded to non-negative values with
# abs() and rounded to 1 decimal place.
height = np.abs(np.random.normal(loc=50, scale=10, size=noobs)).round(1)
height
Out[6]:
array([49.6, 63.5, 68.2, 64.7, 59.7, 26.9, 49.2, 49.9, 48.1, 54.2, 57.2,
       56.7, 43. , 57.2, 47.7, 59.3, 42.5, 46.8, 39.8, 59.5, 42.8, 45.1,
       44.1, 50.4, 61. , 54.9, 46.1, 48. , 40.1, 49. , 48.2, 50.3, 39.9,
       47.9, 41.6, 41. , 55.8, 45.2, 55.2, 47.1, 47.6, 55.9, 37.2, 58.6,
       48.3, 56. , 47.6, 57. , 34.4, 36.5, 68.4, 39.1, 45.8, 48.3, 61.3,
       53.4, 51.3, 48.6, 51.9, 36.9, 41.9, 66.8, 55.5, 44.9, 43.2, 52.3,
       17.2, 55.8, 63.5, 55.2, 41.9, 53.4, 36. , 60.3, 28.4, 46.9, 46.7,
       41.3, 44.6, 52. , 43. , 63.4, 55.7, 41. , 48.1, 65.1, 52. , 48.2,
       63.5, 51.5, 58.3, 63.3, 55.7, 44.6, 43.7, 52.2, 47.5, 51.4, 54.1,
       50.3])
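$$ l = \operatorname{round}\left(5 + \varepsilon + 1.2\,(1 - p) + \frac{|40.0 - w|^2}{100}\right), \qquad \varepsilon \sim \mathcal{N}(0,\ 0.1^2) $$

where $p = 1$ if `pure` is true and $0$ otherwise, and $w$ is the weight. The height $h$ does not enter the lifespan computed below.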
In [7]:
# Integer lifespan per observation, the rounded sum of three terms:
#   * a base value drawn from a normal distribution (mean 5, sd 0.1),
#   * a bonus of 1.2 for observations where `pure` is False,
#   * the squared distance of `weight` from 40.0, divided by 100.
# `height` is not used in this computation.
# The builtin `int` replaces the `np.int` alias, which was removed in
# NumPy 1.24 and raises AttributeError on current releases.
lifespan = np.around(
    np.random.normal(5, 0.1, noobs)
    + (~pure).astype(int) * 1.2
    + np.absolute(40.0 - weight) ** 2 / 100.0
).astype(int)
lifespan
Out[7]:
array([15, 17, 11, 12, 14, 12, 15, 11, 13, 10, 12, 17, 10, 14, 12, 11, 16,
       12, 11, 10, 12, 11, 15, 11, 11, 14, 15, 11, 17, 10, 10, 11, 11, 19,
        8, 13, 16, 14, 10, 12, 12, 14, 10, 13, 14, 11, 15, 10, 15, 12, 13,
       12, 10, 11, 10, 11, 10, 14, 10, 11, 11, 11, 13, 19, 14, 14,  8, 12,
       11, 11, 11, 16, 13, 12, 12, 18, 12, 16, 11,  8,  8, 12, 18,  9, 16,
       13,  9, 16,  9, 11, 16, 13,  8, 14, 15, 10, 11, 12, 16, 14])
In [8]:
# Assemble the generated arrays into one table, one column per variable.
# Keyword order fixes the column order: pure, age, weight, height, lifespan.
df = pd.DataFrame(
    dict(
        pure=pure,
        age=age,
        weight=weight,
        height=height,
        lifespan=lifespan,
    )
)
df
Out[8]:
pure age weight height lifespan
0 True 3 7.94 49.6 15
1 True 7 5.05 63.5 17
2 True 6 16.15 68.2 11
3 False 3 15.50 64.7 12
4 False 3 12.17 59.7 14
... ... ... ... ... ...
95 False 4 20.76 52.2 10
96 False 4 17.40 47.5 11
97 False 6 15.60 51.4 12
98 False 3 8.37 54.1 16
99 True 7 9.09 50.3 14

100 rows × 5 columns

In [9]:
# Persist the generated table as CSV. `index=True` (the pandas default, spelled
# out here) writes the row index as an unnamed first column.
# NOTE(review): the relative `data/` directory presumably must already exist
# in the working directory - confirm before running on a fresh checkout.
df.to_csv("data/dogs.csv", index=True)

End