import pandas as pd
# Sample data to validate: an integer column, a float column and a string
# column, five rows each.
df = pd.DataFrame(
    {
        "column1": [1, 4, 0, 10, 9],
        "column2": [-1.3, -1.4, -2.9, -10.1, -20.4],
        "column3": ["value_1", "value_2", "value_3", "value_2", "value_1"],
    }
)
# Bare expression: in a notebook cell this displays the frame.
df
|   | column1 | column2 | column3 |
|---|---|---|---|
| 0 | 1 | -1.3 | value_1 |
| 1 | 4 | -1.4 | value_2 |
| 2 | 0 | -2.9 | value_3 |
| 3 | 10 | -10.1 | value_2 |
| 4 | 9 | -20.4 | value_1 |
# pandera must be in scope before the schema is built.
import pandera as pa

# Object-based schema: one pa.Column per DataFrame column, giving the expected
# dtype and the check(s) its values must pass.
schema = pa.DataFrameSchema({
    "column1": pa.Column(int, checks=pa.Check.le(10)),
    "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
    "column3": pa.Column(str, checks=[
        pa.Check.str_startswith("value_"),
        # define custom checks as functions that take a series as input and
        # outputs a boolean or boolean Series
        #
        # Exactly one "_" in a value means splitting it on "_" yields two
        # elements. Returning a boolean Series tests every value on its own;
        # a single scalar from `split(..., expand=True).shape[1]` only
        # reflects the widest split, so a value without "_" could slip
        # through next to a well-formed one.
        pa.Check(lambda s: s.str.count("_") == 1),
    ]),
})
# Calling the schema validates df; the returned frame is displayed below.
schema(df)
|   | column1 | column2 | column3 |
|---|---|---|---|
| 0 | 1 | -1.3 | value_1 |
| 1 | 4 | -1.4 | value_2 |
| 2 | 0 | -2.9 | value_3 |
| 3 | 10 | -10.1 | value_2 |
| 4 | 9 | -20.4 | value_1 |
import pandera as pa
from pandera.typing import Series
class Schema(pa.DataFrameModel):
    """Class-based schema declaring dtype and checks for column1..column3."""

    # Each annotation gives the column dtype; pa.Field attaches built-in checks.
    column1: Series[int] = pa.Field(le=10)
    column2: Series[float] = pa.Field(lt=-1.2)
    column3: Series[str] = pa.Field(str_startswith="value_")

    @pa.check("column3")
    def column_3_check(cls, series: Series[str]) -> Series[bool]:
        """Check that each column3 value has two elements after being split with '_'"""
        # Exactly one "_" in a value <=> two elements after splitting on "_".
        # The comparison is element-wise, so the result really is a boolean
        # Series (as annotated) and each value is judged on its own rather
        # than by the widest split in the column.
        return series.str.count("_") == 1


# Validate df against the class-based schema; the returned frame is displayed
# below.
Schema.validate(df)
|   | column1 | column2 | column3 |
|---|---|---|---|
| 0 | 1 | -1.3 | value_1 |
| 1 | 4 | -1.4 | value_2 |
| 2 | 0 | -2.9 | value_3 |
| 3 | 10 | -10.1 | value_2 |
| 4 | 9 | -20.4 | value_1 |