# Import required libraries
import numpy as np
import pandas as pd
# Fix the seed so the generated population is the same on every run
np.random.seed(42)

# Population size
number_of_products = 10

# One normally distributed measurement per product (mean 10, sd 0.5),
# rounded to three decimals; product ids run from 1 to number_of_products
data = {
    'product_id': list(range(1, number_of_products + 1)),
    'measure': np.random.normal(
        loc=10, scale=0.5, size=number_of_products
    ).round(3),
}

# Assemble the population data frame
df = pd.DataFrame(data)

# Keep the population mean so the sample means can be compared against it
real_mean = round(df['measure'].mean(), 3)
print(real_mean)

# View data frame
df
10.224
product_id | measure | |
---|---|---|
0 | 1 | 10.248 |
1 | 2 | 9.931 |
2 | 3 | 10.324 |
3 | 4 | 10.762 |
4 | 5 | 9.883 |
5 | 6 | 9.883 |
6 | 7 | 10.790 |
7 | 8 | 10.384 |
8 | 9 | 9.765 |
9 | 10 | 10.271 |
# Draw 4 rows at random from the population, then order the drawn rows by
# product id so the sample is easier to read
simple_random_sample = (
    df.sample(n=4)
    .sort_values('product_id')
)
# Record the mean measure of the sample for the later comparison
simple_random_mean = round(simple_random_sample['measure'].mean(), 3)
print(simple_random_mean)
# Display the sample
simple_random_sample
9.866
product_id | measure | |
---|---|---|
1 | 2 | 9.931 |
4 | 5 | 9.883 |
5 | 6 | 9.883 |
8 | 9 | 9.765 |
# Define systematic sampling function
def systematic_sampling(df, step, start=0):
    """Select every ``step``-th row of ``df``, beginning at position ``start``.

    Args:
        df: Data frame holding the population, one unit per row.
        step: Sampling interval; must be a positive integer.
        start: Zero-based position of the first selected row. Defaults to 0,
            which always selects the first row. Pass a randomly chosen value
            in ``range(step)`` to get a random starting point.

    Returns:
        A data frame with the rows at positions ``start``, ``start + step``,
        ``start + 2 * step``, ... that lie inside ``df``.

    Raises:
        ValueError: If ``step`` is smaller than 1 or ``start`` is negative.
    """
    # A zero step cannot advance and a negative one would silently select
    # nothing, so reject both up front with a clear message.
    if step < 1:
        raise ValueError("step must be a positive integer")
    # A negative start would make iloc count from the end of the frame.
    if start < 0:
        raise ValueError("start must be a non-negative integer")
    indexes = np.arange(start, len(df), step=step)
    systematic_sample = df.iloc[indexes]
    return systematic_sample
# Take every third product from the population
systematic_sample = systematic_sampling(df, step=3)
# Record the mean measure of the sample for the later comparison
systematic_mean = round(systematic_sample['measure'].mean(), 3)
# Display the sample
systematic_sample
product_id | measure | |
---|---|---|
0 | 1 | 10.248 |
3 | 4 | 10.762 |
6 | 7 | 10.790 |
9 | 10 | 10.271 |
def cluster_sampling(df, number_of_clusters):
    """Return the rows of ``df`` that fall into even-numbered clusters.

    The rows are split, in their current order, into ``number_of_clusters``
    consecutive clusters of equal size, numbered from 1. Every row whose
    cluster number is even is kept. The caller's frame is not modified; the
    ``cluster_id`` column is added to a copy.

    Args:
        df: Data frame holding the population, one unit per row.
        number_of_clusters: Number of equally sized clusters to form.

    Returns:
        A data frame with the selected rows, including the ``cluster_id``
        column, or ``None`` (after printing a message) when the population
        cannot be divided into ``number_of_clusters`` clusters of equal size.
    """
    # Check divisibility explicitly rather than relying on a bare ``except``,
    # which would also hide unrelated errors behind the same message.
    if number_of_clusters < 1 or len(df) % number_of_clusters != 0:
        print("The population cannot be divided into clusters of equal size!")
        return None

    # Integer division: np.repeat needs an integer repeat count, and true
    # division would hand it a float.
    cluster_size = len(df) // number_of_clusters

    # Label a copy so the caller's data frame keeps its original columns.
    clustered = df.copy()
    clustered['cluster_id'] = np.repeat(
        np.arange(1, number_of_clusters + 1), cluster_size
    )
    print(clustered)

    # Positions of the rows that belong to an even-numbered cluster.
    in_even_cluster = (clustered['cluster_id'] % 2 == 0).to_numpy()
    indexes = np.flatnonzero(in_even_cluster).tolist()
    print(indexes)

    cluster_sample = clustered.iloc[indexes]
    return cluster_sample
# Split the population into 5 clusters and keep the rows the function selects
cluster_sample = cluster_sampling(df, number_of_clusters=5)
# Record the mean measure of the sample for the later comparison
cluster_mean = round(cluster_sample['measure'].mean(), 3)
# Display the sample
cluster_sample
product_id measure cluster_id 0 1 10.248 1 1 2 9.931 1 2 3 10.324 2 3 4 10.762 2 4 5 9.883 3 5 6 9.883 3 6 7 10.790 4 7 8 10.384 4 8 9 9.765 5 9 10 10.271 5 [2, 3, 6, 7]
product_id | measure | cluster_id | |
---|---|---|---|
2 | 3 | 10.324 | 2 |
3 | 4 | 10.762 | 2 |
6 | 7 | 10.790 | 4 |
7 | 8 | 10.384 | 4 |
# Build a new population in which every product belongs to a stratum:
# the first half of the products gets stratum 1, the second half stratum 2.
# Integer division keeps the repeat count an int; true division would hand
# np.repeat a float. Assumes number_of_products is even; otherwise the
# column lengths differ and the DataFrame constructor raises.
data = {
    'product_id': np.arange(1, number_of_products + 1).tolist(),
    'product_strata': np.repeat([1, 2], number_of_products // 2).tolist(),
    'measure': np.round(
        np.random.normal(loc=10, scale=0.5, size=number_of_products), 3
    ),
}
# Transform dictionary into a data frame (this replaces the earlier df)
df = pd.DataFrame(data)
# View data frame
df
product_id | product_strata | measure | |
---|---|---|---|
0 | 1 | 1 | 8.780 |
1 | 2 | 1 | 10.302 |
2 | 3 | 1 | 9.874 |
3 | 4 | 1 | 9.918 |
4 | 5 | 1 | 9.262 |
5 | 6 | 2 | 10.743 |
6 | 7 | 2 | 9.988 |
7 | 8 | 2 | 10.178 |
8 | 9 | 2 | 10.209 |
9 | 10 | 2 | 10.416 |
# Import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit
# Configure a single stratified shuffle split whose test fold holds 4 rows
split = StratifiedShuffleSplit(n_splits=1, test_size=4)
# With n_splits=1 the splitter yields exactly one (train, test) pair of
# positional indices; the test fold is used as the stratified sample
_, sampled_positions = next(split.split(df, df['product_strata']))
stratified_random_sample = (
    df.iloc[sampled_positions]
    .sort_values(by='product_id')
)
# Display the sample
stratified_random_sample
# Mean measure per stratum; the product_id column is dropped from the result
stratified_random_sample.groupby('product_strata').mean().drop(columns=['product_id'])
measure | |
---|---|
product_strata | |
1 | 9.327 |
2 | 10.476 |
Once samples have been obtained with each sampling technique, let’s compare the sample means with the population mean (which is usually unknown, but not in this case) to determine which sampling technique gives the best approximation of the population mean.
# One row per sampling method: its sample mean next to the population mean
# (the scalar real_mean is repeated on every row)
outcomes = pd.DataFrame(
    {
        'sample_mean': [simple_random_mean, systematic_mean, cluster_mean],
        'real_mean': real_mean,
    },
    index=['Simple Random Sampling', 'Systematic Sampling', 'Cluster Sampling'],
)
# Absolute distance between each sample mean and the population mean
outcomes['abs_error'] = (outcomes['real_mean'] - outcomes['sample_mean']).abs()
# Show the methods ordered from smallest to largest error
outcomes.sort_values(by='abs_error')
sample_mean | real_mean | abs_error | |
---|---|---|---|
Simple Random Sampling | 10.316 | 10.224 | 0.092 |
Systematic Sampling | 10.518 | 10.224 | 0.294 |
Cluster Sampling | 10.565 | 10.224 | 0.341 |