In a large dataset, a relatively small group of points might be overplotted by the dominant group. In this case, stratified sampling can help.
import numpy as np
import pandas as pd
from lets_plot import *
# Set up Lets-Plot's HTML output so that the plots below can be rendered
# inline (this script appears to be a notebook export).
LetsPlot.setup_html()
# Sample sizes: N points in total, of which only `small_group` are labelled
# 'A'; every remaining point is labelled 'B'.
N = 5000
small_group = 3
large_group = N - small_group

# Seed NumPy's global generator so the generated points are reproducible.
np.random.seed(123)

# 'x' is drawn before 'y' (dict displays evaluate in order), which keeps the
# sequence of random draws fixed. The labels list all 'A' entries first,
# followed by the 'B' entries.
data = {
    'x': np.random.normal(loc=0, scale=1, size=N),
    'y': np.random.normal(loc=0, scale=1, size=N),
    'cond': ['A'] * small_group + ['B'] * large_group,
}
# Base plot shared by all examples below: x/y positions coloured by 'cond',
# with group 'A' mapped to red and group 'B' to green.
# Drawn in full, the few 'A' points are buried under the dominant group 'B'.
p = (
    ggplot(data, aes('x', 'y', color='cond'))
    + scale_color_manual(values=["red", "#1C9E77"], breaks=['A', 'B'])
)
p + geom_point(size=5, alpha=.2)

# Plain random sampling down to 50 points: group 'A' is lost altogether.
p + geom_point(
    size=5,
    sampling=sampling_random(50, seed=2),
)

# Stratified random sampling down to 50 points: group 'A' stays represented.
p + geom_point(
    size=5,
    sampling=sampling_random_stratified(50, seed=2),
)