Hi, this notebook will show you almost all the columns operation availables in Optimus. For row operation, IO, ML and DL please go to the examples folder in the repo
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
In this notebook you can see a detailed overview ok all the columns operations available in Optimus. You can access the operation via df.cols.
from optimus import Optimus
C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working from collections import Callable You are using PySparkling of version 2.4.10, but your PySpark is of version 2.3.1. Please make sure Spark and PySparkling versions are compatible. `formatargspec` is deprecated since Python 3.5. Use `signature` and the `Signature` object directly
# Create optimus
op = Optimus()
from pyspark.sql.types import StringType, IntegerType, ArrayType
df = op.create.df(
[
("words", "str", True),
("num", "int", True),
("animals", "str", True),
("thing", StringType(), True),
("two strings", StringType(), True),
("filter", StringType(), True),
("num 2", "string", True),
("col_array", ArrayType(StringType()), True),
("col_int", ArrayType(IntegerType()), True)
]
,
[
(" I like fish ", 1, "dog", "housé", "cat-car", "a", "1", ["baby", "sorry"], [1, 2, 3]),
(" zombies", 2, "cat", "tv", "dog-tv", "b", "2", ["baby 1", "sorry 1"], [3, 4]),
("simpsons cat lady", 2, "frog", "table", "eagle-tv-plus", "1", "3", ["baby 2", "sorry 2"], [5, 6, 7]),
(None, 3, "eagle", "glass", "lion-pc", "c", "4", ["baby 3", "sorry 3"], [7, 8])
])
df.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df = df.cols.append("new_col_1", 1)
df.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
not nullable
|
---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
1
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
1
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
1
|
df.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
not nullable
|
---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
1
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
1
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
1
|
df_col = op.create.df(
[
("new_col", "str", True),
],[
("q"),("w"), ("e"), ("r"),
])
df.cols.append(df_col).table()
run profiler send to queue <dispatched append> run profiler send to queue <function cols.<locals>.select at 0x000002335A5F1C80> run profiler send to queue <function cols.<locals>.count at 0x000002335A7611E0>
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
nullable
|
new_col
11 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
1
|
q
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
1
|
r
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
1
|
e
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
1
|
w
|
from pyspark.sql.functions import *
df.cols.append([
("new_col_2", 2.22),
("new_col_3", lit(3))
]).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
new_col_2
11 (double)
|
new_col_3
12 (int)
|
---|---|---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
2.22
|
3
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
2.22
|
3
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
2.22
|
3
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
2.22
|
3
|
df.cols.append([
("new_col_4", "test"),
("new_col_5", df['num'] * 2),
("new_col_6", [1, 2, 3])
]).table()
df.table()
columns = ["words", 1, "animals", 3]
df.cols.select(columns).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
None
|
3
|
eagle
|
glass
|
df.cols.select("n.*", regex=True).table()
num
1 (int)
nullable
|
num 2
2 (string)
nullable
|
new_col_1
3 (int)
|
---|---|---|
1
|
1
|
1
|
2
|
2
|
1
|
2
|
3
|
1
|
3
|
4
|
1
|
df.cols.select("*", data_type="str").table()
['col_int', 'num', 'new_col_1', 'col_array'] column(s) was not processed because is not ['str']
words
1 (string)
nullable
|
num 2
2 (string)
nullable
|
two strings
3 (string)
nullable
|
animals
4 (string)
nullable
|
thing
5 (string)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
cat-car
|
dog
|
housé
|
a
|
⸱⸱⸱⸱zombies
|
2
|
dog-tv
|
cat
|
tv
|
b
|
simpsons⸱⸱⸱cat⸱lady
|
3
|
eagle-tv-plus
|
frog
|
table
|
1
|
None
|
4
|
lion-pc
|
eagle
|
glass
|
c
|
You can not rename multiple columns using Spark Vanilla API
df.cols.rename('num', 'number').table()
words
1 (string)
nullable
|
number
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
df.cols.rename([('num', 'number'), ("animals", "gods")], str.upper).table()
WORDS
1 (string)
nullable
|
NUM
2 (int)
nullable
|
ANIMALS
3 (string)
nullable
|
THING
4 (string)
nullable
|
TWO STRINGS
5 (string)
nullable
|
FILTER
6 (string)
nullable
|
NUM 2
7 (string)
nullable
|
COL_ARRAY
8 (array<string>)
nullable
|
COL_INT
9 (array<int>)
nullable
|
NEW_COL_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
df.cols.rename(str.lower).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
df.cols.rename(str.upper).table()
WORDS
1 (string)
nullable
|
NUM
2 (int)
nullable
|
ANIMALS
3 (string)
nullable
|
THING
4 (string)
nullable
|
TWO STRINGS
5 (string)
nullable
|
FILTER
6 (string)
nullable
|
NUM 2
7 (string)
nullable
|
COL_ARRAY
8 (array<string>)
nullable
|
COL_INT
9 (array<int>)
nullable
|
NEW_COL_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
This is a opinionated way to handle column casting. One of the first thing that every data cleaning process need to acomplish is define a data dictionary. Because of that we prefer to create a tuple like this:
df.cols().cast( [("words","str"), ("num","int"), ("animals","float"), ("thing","str")] )
with astype() https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.astype.html
df.cols.cast(columns=[("num", "string"), ("num 2", "integer")]).dtypes
[('words', 'string'), ('num', 'string'), ('animals', 'string'), ('thing', 'string'), ('two strings', 'string'), ('filter', 'string'), ('num 2', 'int'), ('col_array', 'array<string>'), ('col_int', 'array<int>')]
df.cols.cast("num", "string").dtypes
[('words', 'string'), ('num', 'string'), ('animals', 'string'), ('thing', 'string'), ('two strings', 'string'), ('filter', 'string'), ('num 2', 'string'), ('col_array', 'array<string>'), ('col_int', 'array<int>'), ('new_col_1', 'int')]
df.cols.cast("*", "string").dtypes
[('words', 'string'), ('num', 'string'), ('animals', 'string'), ('thing', 'string'), ('two strings', 'string'), ('filter', 'string'), ('num 2', 'string'), ('col_array', 'string'), ('col_int', 'string'), ('new_col_1', 'string')]
from pyspark.ml.linalg import Vectors
df.cols.cast("col_int", Vectors)
DataFrame[words: string, num: int, animals: string, thing: string, two strings: string, filter: string, num 2: string, col_array: array<string>, col_int: vector, new_col_1: int]
from pyspark.sql.functions import *
df.withColumn("num", col("num").cast(StringType()))
df.table()
df.cols.keep("num").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
num
1 (int)
nullable
|
---|
1
|
2
|
2
|
3
|
df.cols.move("words", "after", "thing").table()
num
1 (int)
nullable
|
animals
2 (string)
nullable
|
thing
3 (string)
nullable
|
words
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
1
|
dog
|
housé
|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
2
|
cat
|
tv
|
⸱⸱⸱⸱zombies
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
2
|
frog
|
table
|
simpsons⸱⸱⸱cat⸱lady
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
3
|
eagle
|
glass
|
None
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
df.cols.sort().table()
animals
1 (string)
nullable
|
col_array
2 (array<string>)
nullable
|
col_int
3 (array<int>)
nullable
|
filter
4 (string)
nullable
|
new_col_1
5 (int)
|
num
6 (int)
nullable
|
num 2
7 (string)
nullable
|
thing
8 (string)
nullable
|
two strings
9 (string)
nullable
|
words
10 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|
dog
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
a
|
1
|
1
|
1
|
housé
|
cat-car
|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
cat
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
b
|
1
|
2
|
2
|
tv
|
dog-tv
|
⸱⸱⸱⸱zombies
|
frog
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
1
|
2
|
3
|
table
|
eagle-tv-plus
|
simpsons⸱⸱⸱cat⸱lady
|
eagle
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
c
|
1
|
3
|
4
|
glass
|
lion-pc
|
None
|
df.cols.sort(order="desc").table()
words
1 (string)
nullable
|
two strings
2 (string)
nullable
|
thing
3 (string)
nullable
|
num 2
4 (string)
nullable
|
num
5 (int)
nullable
|
new_col_1
6 (int)
|
filter
7 (string)
nullable
|
col_int
8 (array<int>)
nullable
|
col_array
9 (array<string>)
nullable
|
animals
10 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
cat-car
|
housé
|
1
|
1
|
1
|
a
|
[1,⸱2,⸱3]
|
['baby',⸱'sorry']
|
dog
|
⸱⸱⸱⸱zombies
|
dog-tv
|
tv
|
2
|
2
|
1
|
b
|
[3,⸱4]
|
['baby⸱1',⸱'sorry⸱1']
|
cat
|
simpsons⸱⸱⸱cat⸱lady
|
eagle-tv-plus
|
table
|
3
|
2
|
1
|
1
|
[5,⸱6,⸱7]
|
['baby⸱2',⸱'sorry⸱2']
|
frog
|
None
|
lion-pc
|
glass
|
4
|
3
|
1
|
c
|
[7,⸱8]
|
['baby⸱3',⸱'sorry⸱3']
|
eagle
|
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html
df2 = df.cols.drop("num")
df2.table()
words
1 (string)
nullable
|
animals
2 (string)
nullable
|
thing
3 (string)
nullable
|
two strings
4 (string)
nullable
|
filter
5 (string)
nullable
|
num 2
6 (string)
nullable
|
col_array
7 (array<string>)
nullable
|
col_int
8 (array<int>)
nullable
|
new_col_1
9 (int)
|
---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
df2 = df.cols.drop(["num", "words"])
df2.table()
animals
1 (string)
nullable
|
thing
2 (string)
nullable
|
two strings
3 (string)
nullable
|
filter
4 (string)
nullable
|
num 2
5 (string)
nullable
|
col_array
6 (array<string>)
nullable
|
col_int
7 (array<int>)
nullable
|
new_col_1
8 (int)
|
---|---|---|---|---|---|---|---|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
df.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
.cols y .rows attributes are used to organize and encapsulate optimus
functionality apart from Apache Spark Dataframe API.
At the same time it can be helpfull when you look at the code because every line is self explained.
The past transformations were done step by step, but this can be achieved by chaining all operations into one line of code, like the cell below. This way is much more efficient and scalable because it uses all optimization issues from the lazy evaluation approach.
df.table()
df \
.cols.rename([('num', 'number')]) \
.cols.drop(["number", "words"]) \
.withColumn("new_col_2", lit("spongebob")) \
.cols.append("new_col_1", 1) \
.cols.sort(order="desc") \
.rows.drop(df["num 2"] == 3) \
.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
two strings
1 (string)
nullable
|
thing
2 (string)
nullable
|
num 2
3 (string)
nullable
|
new_col_2
4 (string)
|
new_col_1
5 (int)
|
filter
6 (string)
nullable
|
col_int
7 (array<int>)
nullable
|
col_array
8 (array<string>)
nullable
|
animals
9 (string)
nullable
|
---|---|---|---|---|---|---|---|---|
cat-car
|
housé
|
1
|
spongebob
|
1
|
a
|
[1,⸱2,⸱3]
|
['baby',⸱'sorry']
|
dog
|
dog-tv
|
tv
|
2
|
spongebob
|
1
|
b
|
[3,⸱4]
|
['baby⸱1',⸱'sorry⸱1']
|
cat
|
lion-pc
|
glass
|
4
|
spongebob
|
1
|
c
|
[7,⸱8]
|
['baby⸱3',⸱'sorry⸱3']
|
eagle
|
df.table()
df.cols.unnest("two strings", "-") \
.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
two strings_0
11 (string)
nullable
|
two strings_1
12 (string)
nullable
|
two strings_2
13 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
cat
|
car
|
None
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
dog
|
tv
|
None
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
eagle
|
tv
|
plus
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
lion
|
pc
|
None
|
df.cols.unnest("two strings", "-", index=1).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
two strings_1
11 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
car
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
tv
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
tv
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
pc
|
df \
.cols.unnest("col_array") \
.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
col_array_0
11 (string)
nullable
|
col_array_1
12 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
baby
|
sorry
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
baby⸱1
|
sorry⸱1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
baby⸱2
|
sorry⸱2
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
baby⸱3
|
sorry⸱3
|
df \
.cols.unnest(["col_int"]) \
.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
col_int_0
11 (int)
nullable
|
col_int_1
12 (int)
nullable
|
col_int_2
13 (int)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
1
|
2
|
3
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
3
|
4
|
None
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
5
|
6
|
7
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
7
|
8
|
None
|
df \
.cols.unnest(["two strings"], splits=3, separator="-") \
.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
two strings_0
11 (string)
nullable
|
two strings_1
12 (string)
nullable
|
two strings_2
13 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
cat
|
car
|
None
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
dog
|
tv
|
None
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
eagle
|
tv
|
plus
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
lion
|
pc
|
None
|
# from pyspark.ml.linalg import Vectors
# df1 = op.sc.parallelize([
# ("assert", Vectors.dense([1, 2, 3])),
# ("require", Vectors.sparse(3, {1: 2}))
# ]).toDF()
# df1 \
# .cols.unnest(["vector"]) \
# .table()
# df = df.cols.append("new_col_1", 1)
df_fill = op.spark.createDataFrame([(1.0, float("nan"), "1"),
(2.0, float("nan"), "nan"),
(float("nan"), 3.0, None),
(4.0, 4.0, "2"),
(5.0, 5.0, "2")
], ["a", "b", "c"]
)
df_fill.table()
a
1 (double)
nullable
|
b
2 (double)
nullable
|
c
3 (string)
nullable
|
---|---|---|
1.0
|
nan
|
1
|
2.0
|
nan
|
nan
|
nan
|
3.0
|
None
|
4.0
|
4.0
|
2
|
5.0
|
5.0
|
2
|
df_fill.cols.impute(["a", "b"], "continuous", "median").table()
1 values imputed for column(s) 'a' 2 values imputed for column(s) 'b'
a
1 (float)
nullable
|
b
2 (float)
nullable
|
c
3 (string)
nullable
|
a_imputed
4 (float)
nullable
|
b_imputed
5 (float)
nullable
|
---|---|---|---|---|
1.0
|
nan
|
1
|
1.0
|
4.0
|
2.0
|
nan
|
nan
|
2.0
|
4.0
|
nan
|
3.0
|
None
|
2.0
|
3.0
|
4.0
|
4.0
|
2
|
4.0
|
4.0
|
5.0
|
5.0
|
2
|
5.0
|
5.0
|
df_fill.cols.impute(["c"], "categorical").table()
Including 'nan' as Null in processing 'c' 2 values imputed for column(s) 'c' with '2'
a
1 (double)
nullable
|
b
2 (double)
nullable
|
c
3 (string)
nullable
|
---|---|---|
1.0
|
nan
|
1
|
2.0
|
nan
|
2
|
nan
|
3.0
|
2
|
4.0
|
4.0
|
2
|
5.0
|
5.0
|
2
|
df.cols.select_by_dtypes("int").table()
['words', 'filter', 'two strings', 'animals', 'col_array', 'col_int', 'thing', 'num 2'] column(s) was not processed because is not ['int']
num
1 (int)
nullable
|
new_col_1
2 (int)
|
---|---|
1
|
1
|
2
|
1
|
2
|
1
|
3
|
1
|
Spark have few ways to transform data rdd, Columns Expression, UDF and Pandas UDF. apply() and apply_expr() try to make a consistent way to call this expression without knowing the implementation details.
You need to declare a UDF Spark function
Almost the same behavior that Optimus
df.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
Sometimes there are columns with for example with numbers even when are supposed to be only of words or letters.
In order to solve this problem, apply_by_dtypes() function can be used.
In the next example we replace a number in a string column with "new string"
def func(val, attr):
return attr
df.cols.apply_by_dtypes("filter", func, "string", "new string", data_type="integer").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
new⸱string
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
def func(val, attr):
return val + attr
df.cols.apply(["num", "new_col_1"], func, "int", 32, "udf").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
nullable
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
33
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
33
|
⸱⸱⸱⸱zombies
|
34
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
33
|
simpsons⸱⸱⸱cat⸱lady
|
34
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
33
|
None
|
35
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
33
|
def func(val, attr):
return val + attr
df.cols.apply(["num", "new_col_1"], func, "int", 10).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
nullable
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
11
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
11
|
⸱⸱⸱⸱zombies
|
12
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
11
|
simpsons⸱⸱⸱cat⸱lady
|
12
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
11
|
None
|
13
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
11
|
from optimus.audf import filter_row_by_data_type as fbdt
df.rows.select(fbdt("filter", "integer")).table()
from optimus.audf import abstract_udf as audf
def func(val, attr):
return val > 1
df.rows.select(audf("num", func, "boolean")).table()
from optimus.audf import abstract_udf as audf
def func(val, attr):
return val + attr[0] + attr[1]
df.withColumn("num_sum", audf("num", func, "int", [10, 20])).table()
from pyspark.sql import functions as F
def func(col_name, attr):
return F.when(F.col(col_name) > 2, 10).otherwise(1)
df.cols.apply_expr(["num", "num 2"], func).table()
def func(col_name, attr):
return F.upper(F.col(col_name))
df.cols.apply_expr(["two strings", "animals"], func).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
DOG
|
housé
|
CAT-CAR
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
CAT
|
tv
|
DOG-TV
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
FROG
|
table
|
EAGLE-TV-PLUS
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
EAGLE
|
glass
|
LION-PC
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
def func(val, attr):
return 10
col = "num"
df.cols.apply(col, func, "int", when=df["num"] > 1).table()
df.cols.apply(col, func, "int", when=fbdt(col, "int")).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
10
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
10
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
10
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
10
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
10
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
10
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
10
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
import numpy as np
df_null = op.spark.createDataFrame(
[(1, 1, None), (1, 2, float(5)), (1, 3, np.nan), (1, 4, None), (1, 5, float(10)), (1, 6, float('nan')),
(1, 6, float('nan'))],
('session', "timestamp1", "id2"))
df_null.table()
session
1 (bigint)
nullable
|
timestamp1
2 (bigint)
nullable
|
id2
3 (double)
nullable
|
---|---|---|
1
|
1
|
None
|
1
|
2
|
5.0
|
1
|
3
|
nan
|
1
|
4
|
None
|
1
|
5
|
10.0
|
1
|
6
|
nan
|
1
|
6
|
nan
|
df_null.cols.count_na("id2")
5
df_null.cols.count_na("*")
{'session': 0, 'timestamp1': 0, 'id2': 5}
df.cols.count_uniques("*")
{'words': 3, 'num': 3, 'animals': 4, 'thing': 4, 'two strings': 4, 'filter': 4, 'num 2': 4, 'col_array': 3, 'col_int': 4, 'new_col_1': 1}
df.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
new_col_1
10 (int)
|
---|---|---|---|---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⸱'sorry']
|
[1,⸱2,⸱3]
|
1
|
⸱⸱⸱⸱zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⸱1',⸱'sorry⸱1']
|
[3,⸱4]
|
1
|
simpsons⸱⸱⸱cat⸱lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⸱2',⸱'sorry⸱2']
|
[5,⸱6,⸱7]
|
1
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⸱3',⸱'sorry⸱3']
|
[7,⸱8]
|
1
|
df_distinct = op.create.df(
[
("words", "str", True),
("num", "int", True)
],
[
(" I like fish ", 1),
(" zombies", 2),
("simpsons cat lady", 2),
(None, 3),
(None, 0)
])
df_distinct.cols.unique("num").table()
num
1 (int)
nullable
|
---|
1
|
3
|
2
|
0
|
df_zeros = df_distinct
df_zeros.cols.count_zeros("*")
[{'num': 1}]
df.cols.dtypes('*')
{'words': 'string', 'num': 'int', 'animals': 'string', 'thing': 'string', 'two strings': 'string', 'filter': 'string', 'num 2': 'string', 'col_array': 'array<string>', 'col_int': 'array<int>'}
df.table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace("animals", ["dog", "cat"], "animals").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
animals
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
animals
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace(["two strings", "animals"], ["dog-tv", "cat", "eagle", "fish"], "animals").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
animals-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
animals
|
tv
|
animals
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
animals-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
animals
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace("animals", [("dog", "dog_1"), ("cat", "cat_1")]).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace("animals", "dog", "animal").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
animal
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace("*", ["a", "b", "c"], "%").table()
words
1 (string)
nullable
|
num
2 (string)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
%%t-%%r
|
%
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zom%ies
|
2
|
%%t
|
tv
|
dog-tv
|
%
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅%%t⋅l%dy
|
2
|
frog
|
t%%le
|
e%gle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
e%gle
|
gl%ss
|
lion-p%
|
%
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace('num', ["3", 2], 10).table()
words
1 (string)
nullable
|
num
2 (string)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
10
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
10
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
10
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace('num', [("3", 6), (2, 12)]).table()
words
1 (string)
nullable
|
num
2 (string)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace("animals", "dog", "animal", search_by="words").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
animal
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace("animals", "dog", "animal", search_by="words", output_cols="friends").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
friends
10 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
animal
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
dog
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
cat
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
frog
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
eagle
|
df.cols.replace("*", "1", "11", search_by="chars", output_cols="new").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
wordsnew
10 (string)
nullable
|
numnew
11 (string)
nullable
|
animalsnew
12 (string)
nullable
|
thingnew
13 (string)
nullable
|
two stringsnew
14 (string)
nullable
|
filternew
15 (string)
nullable
|
num 2new
16 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
11
|
dog
|
housé
|
cat-car
|
a
|
11
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
11
|
3
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
df.cols.replace("animals", [("dog", "dog_1"), ("cat", "cat_1")], "words").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace("animals", ["dog", "cat"], "animals", "words").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
animals
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
animals
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.replace_regex('*', '.*[Cc]at.*', 'cat_1').table()
words
1 (string)
nullable
|
num
2 (string)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat_1
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat_1
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
cat_1
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df = df.cols.copy("num", "num_1")
df.cols.nest(["num", "num_1"], output_col="col_nested", shape="vector").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
num_1
10 (int)
nullable
|
col_nested
11 (vector)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
1
|
[1.0,1.0]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
2
|
[2.0,2.0]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
2
|
[2.0,2.0]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
3
|
[3.0,3.0]
|
df.cols.nest(["animals", "two strings"], output_col="col_nested", shape="string").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
num_1
10 (int)
nullable
|
animalscol_nested
11 (string)
not nullable
|
two stringscol_nested
12 (string)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
1
|
dogcat-car
|
dogcat-car
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
2
|
catdog-tv
|
catdog-tv
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
2
|
frogeagle-tv-plus
|
frogeagle-tv-plus
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
3
|
eaglelion-pc
|
eaglelion-pc
|
df.cols.nest(["animals", "two strings", "num 2"], output_col="col_nested", shape="array").table()
words
1 (string)
nullable
|
num
2 (string)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (string)
nullable
|
col_int
9 (string)
nullable
|
animalscol_nested
10 (array<string>)
not nullable
|
two stringscol_nested
11 (array<string>)
not nullable
|
num 2col_nested
12 (array<string>)
not nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
1
|
[baby,⋅sorry]
|
[1,⋅2,⋅3]
|
['dog',⋅'cat-car',⋅'1']
|
['dog',⋅'cat-car',⋅'1']
|
['dog',⋅'cat-car',⋅'1']
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
2
|
[baby⋅1,⋅sorry⋅1]
|
[3,⋅4]
|
['cat',⋅'dog-tv',⋅'2']
|
['cat',⋅'dog-tv',⋅'2']
|
['cat',⋅'dog-tv',⋅'2']
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
[baby⋅2,⋅sorry⋅2]
|
[5,⋅6,⋅7]
|
['frog',⋅'eagle-tv-plus',⋅'3']
|
['frog',⋅'eagle-tv-plus',⋅'3']
|
['frog',⋅'eagle-tv-plus',⋅'3']
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
[baby⋅3,⋅sorry⋅3]
|
[7,⋅8]
|
['eagle',⋅'lion-pc',⋅'4']
|
['eagle',⋅'lion-pc',⋅'4']
|
['eagle',⋅'lion-pc',⋅'4']
|
df = op.load.csv("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv")
df.table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
df.cols.hist("price", 10)
[{'count': 1, 'lower': 1.0, 'upper': 1.9}, {'count': 1, 'lower': 1.9, 'upper': 2.8}, {'count': 4, 'lower': 2.8, 'upper': 3.6999999999999997}, {'count': 1, 'lower': 3.6999999999999997, 'upper': 4.6}, {'count': 2, 'lower': 4.6, 'upper': 5.5}, {'count': 0, 'lower': 5.5, 'upper': 6.4}, {'count': 0, 'lower': 6.4, 'upper': 7.300000000000001}, {'count': 6, 'lower': 7.300000000000001, 'upper': 8.200000000000001}, {'count': 2, 'lower': 8.200000000000001, 'upper': 9.100000000000001}, {'count': 2, 'lower': 9.100000000000001, 'upper': 10.0}]
df.cols.frequency("billingId")
{'billingId': [{'value': 992, 'count': 1}, {'value': 912, 'count': 1}, {'value': 886, 'count': 1}, {'value': 875, 'count': 1}, {'value': 812, 'count': 1}, {'value': 735, 'count': 1}, {'value': 672, 'count': 1}, {'value': 634, 'count': 1}, {'value': 624, 'count': 1}, {'value': 553, 'count': 1}]}
print(df.cols.min("billingId"))
print(df.cols.percentile(['billingId', 'price'], [0.05, 0.25, 0.5, 0.75, 0.95]))
print(df.cols.max("billingId"))
print(df.cols.median(["billingId", "price"]))
print(df.cols.range(["billingId", "price"]))
print(df.cols.std(["billingId", "price"]))
111 {'billingId': {'0.05': 111.0, '0.25': 111.0, '0.5': 111.0, '0.75': 111.0, '0.95': 111.0}, 'price': {'0.05': 1.0, '0.25': 1.0, '0.5': 1.0, '0.75': 1.0, '0.95': 1.0}} 992 {'billingId': 111.0, 'price': 1.0} {'billingId': {'min': 111, 'max': 992}, 'price': {'min': 1, 'max': 10}} {'billingId': 280.19735, 'price': 2.95285}
print(df.cols.min("*"))
{'id': 1, 'firstName': '((( Heinrich )))))', 'lastName': ' GALiLEI', 'billingId': 111, 'product': '110790', 'price': 1, 'birth': '1899/01/01', 'dummyCol': '#'}
print(df.cols.kurt("billingId"))
print(df.cols.mean("billingId"))
print(df.cols.skewness("billingId"))
print(df.cols.sum("billingId"))
print(df.cols.variance("billingId"))
print(df.cols.mad("billingId"))
-1.04115 556.0 -0.2137 10564 78510.55556 0.0
df.cols.mad("price")
0.0
df.cols.mad("price", more=True)
{'mad': 0.0, 'median': 1.0}
print(df.cols.percentile(['price'], [0.05, 0.25, 0.5, 0.75, 0.95]))
{'0.05': 1.0, '0.25': 1.0, '0.5': 1.0, '0.75': 1.0, '0.95': 1.0}
print(df.cols.mode(["price", "billingId"]))
[{'price': 8}, {'billingId': None}]
df.table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
df \
.cols.trim("lastName") \
.cols.lower("lastName") \
.cols.upper(["product", "firstName"]) \
.cols.reverse("firstName") \
.table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
SIUL
|
alvarez$$%!
|
123
|
CAKE
|
10
|
1980/07/07
|
never
|
2
|
ÉRDNA
|
ampère
|
423
|
PIZA
|
8
|
1950/07/08
|
gonna
|
3
|
SLEIN
|
böhr//((%%
|
551
|
PIZZA
|
8
|
1990/07/09
|
give
|
4
|
LUAP
|
dirac$
|
521
|
PIZZA
|
8
|
1954/07/10
|
you
|
5
|
TREBLA
|
einstein
|
634
|
PIZZA
|
8
|
1990/07/11
|
up
|
6
|
OELILAG
|
galilei
|
672
|
AREPA
|
5
|
1930/08/12
|
never
|
7
|
LRAC
|
ga%%%uss
|
323
|
TACO
|
3
|
1970/07/13
|
gonna
|
8
|
DIVAD
|
h$$$ilbert
|
624
|
TAAACCOO
|
3
|
1950/07/14
|
let
|
9
|
SENNAHOJ
|
kepler
|
735
|
TACO
|
3
|
1920/04/22
|
you
|
10
|
SEMAJ
|
m$$ax%%well
|
875
|
TACO
|
3
|
1923/03/12
|
down
|
df.cols.iqr("price")
5.0
df.cols.iqr("price", more=True)
{'iqr': 5.0, 'q1': 3.0, 'q2': 8.0, 'q3': 8.0}
df.cols.z_score("price").table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (double)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
1.336800040638705
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
0.6594882909731277
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
0.6594882909731277
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
0.6594882909731277
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
0.6594882909731277
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
0.3564793335252382
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
1.0337910831908155
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
1.0337910831908155
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
1.0337910831908155
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
1.0337910831908155
|
1923/03/12
|
down
|
df.cols.years_between("birth", "yyyyMMdd", ).table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (float)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
None
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
None
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
None
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
None
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
None
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
None
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
None
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
None
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
None
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
None
|
down
|
df.cols.remove("*", ["&", "%"]).table()
id
1 (string)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (string)
nullable
|
product
5 (string)
nullable
|
price
6 (string)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Gauss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$axwell
|
875
|
taco
|
3
|
1923/03/12
|
down
|
df.cols.remove_accents("lastName").table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampere
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Bohr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
df.cols.remove_special_chars("lastName").table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Gauss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
Hilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
Maxwell
|
875
|
taco
|
3
|
1923/03/12
|
down
|
df.cols.clip("billingId", 100, 200).table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
200
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
200
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
200
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
200
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
200
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
200
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
200
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
200
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
200
|
taco
|
3
|
1923/03/12
|
down
|
df_abs = op.create.df(
[
("words", "str", True),
("num", "int", True),
("animals", "str", True),
("thing", StringType(), True),
("two strings", StringType(), True),
("filter", StringType(), True),
("num 2", "string", True),
("col_array", ArrayType(StringType()), True),
("col_int", ArrayType(IntegerType()), True)
]
,
[
(" I like fish ", -1, "dog", "housé", "cat-car", "a", "-1", ["baby", "sorry"], [1, 2, 3]),
(" zombies", -2, "cat", "tv", "dog-tv", "b", "-2", ["baby 1", "sorry 1"], [3, 4]),
("simpsons cat lady", -2, "frog", "table", "eagle-tv-plus", "1", "3", ["baby 2", "sorry 2"], [5, 6, 7]),
(None, 3, "eagle", "glass", "lion-pc", "c", "4", ["baby 3", "sorry 3"], [7, 8])
])
df_abs.cols.abs(["num", "num 2"]).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
two strings
5 (string)
nullable
|
filter
6 (string)
nullable
|
num 2
7 (string)
nullable
|
col_array
8 (array<string>)
nullable
|
col_int
9 (array<int>)
nullable
|
---|---|---|---|---|---|---|---|---|
⋅⋅I⋅like⋅⋅⋅⋅⋅fish⋅⋅
|
1
|
dog
|
housé
|
cat-car
|
a
|
-1
|
['baby',⋅'sorry']
|
[1,⋅2,⋅3]
|
⋅⋅⋅⋅zombies
|
2
|
cat
|
tv
|
dog-tv
|
b
|
-2
|
['baby⋅1',⋅'sorry⋅1']
|
[3,⋅4]
|
simpsons⋅⋅⋅cat⋅lady
|
2
|
frog
|
table
|
eagle-tv-plus
|
1
|
3
|
['baby⋅2',⋅'sorry⋅2']
|
[5,⋅6,⋅7]
|
None
|
3
|
eagle
|
glass
|
lion-pc
|
c
|
4
|
['baby⋅3',⋅'sorry⋅3']
|
[7,⋅8]
|
df.cols.qcut("billingId", 5).table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
billingId_QCUT
9 (double)
nullable
|
---|---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
0.0
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
1.0
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
2.0
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
2.0
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
3.0
|
6
|
Galileo
|
⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅⋅GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
3.0
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
1.0
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
2.0
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
3.0
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
4.0
|