%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType
# Create the Optimus entry point; `op` is used below to build dataframes.
op = Optimus()

# Six-column sample dataframe used by every example that follows.
# Each schema entry is (column name, type, nullable); types are given either
# as short strings ("str", "int") or as Spark type objects (StringType()).
# The irregular leading/inner/trailing spaces in the "words" values are
# deliberate: they match the whitespace markers shown in the recorded table
# output for this dataframe.
df = op.create.df(
    [
        ("words", "str", True),
        ("num", "int", True),
        ("animals", "str", True),
        ("thing", StringType(), True),
        ("second", "int", True),
        ("filter", StringType(), True),
    ],
    [
        ("  I like     fish  ", 1, "dog dog", "housé", 5, "a"),
        ("    zombies", 2, "cat", "tv", 6, "b"),
        ("simpsons   cat lady", 2, "frog", "table", 7, "1"),
        (None, 3, "eagle", "glass", 8, "c"),
    ],
)
df.show()
+-------------------+---+-------+-----+------+------+ | words|num|animals|thing|second|filter| +-------------------+---+-------+-----+------+------+ | I like fish | 1|dog dog|housé| 5| a| | zombies| 2| cat| tv| 6| b| |simpsons cat lady| 2| frog|table| 7| 1| | null| 3| eagle|glass| 8| c| +-------------------+---+-------+-----+------+------+
# Inspect the schema: a list of (column name, Spark type string) pairs.
df.dtypes
[('words', 'string'), ('num', 'int'), ('animals', 'string'), ('thing', 'string'), ('second', 'int'), ('filter', 'string')]
# Append a single row, given as a plain list of values in column order,
# and render the resulting dataframe as a table.
new_row = [
    "this is a word",
    2,
    "this is an animal",
    "this is a thing",
    64,
    "this is a filter",
]
df.rows.append(new_row).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b |
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1 |
None | 3 | eagle | glass | 8 | c |
this⸱is⸱a⸱word | 2 | this⸱is⸱an⸱animal | this⸱is⸱a⸱thing | 64 | this⸱is⸱a⸱filter |
# Single-row dataframe with a nine-column schema, including two array
# columns (one of strings, one of integers).
bat_schema = [
    ("words", "str", True),
    ("num", "int", True),
    ("animals", "str", True),
    ("thing", StringType(), True),
    ("two strings", StringType(), True),
    ("filter", StringType(), True),
    ("num 2", "string", True),
    ("col_array", ArrayType(StringType()), True),
    ("col_int", ArrayType(IntegerType()), True),
]
bat_rows = [
    ("I am batman", 1, "bat", "housé", "cat-car", "z", "10", ["screen", "sorry"], [11, 21, 31]),
]
df_bat = op.create.df(bat_schema, bat_rows)
# Append the rows of another dataframe (df_bat) to df and render the result.
# NOTE(review): df_bat declares nine columns while df has six, and no output
# is recorded for this call -- confirm how append reconciles the two schemas.
df.rows.append(df_bat).table()
# Sort rows by the "animals" column without naming an order.
# NOTE(review): the recorded output (frog, eagle, dog dog, cat) is identical to
# the explicit "desc" call further down, so the default order appears to be
# descending -- confirm against the Optimus documentation.
df.rows.sort("animals").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1 |
None | 3 | eagle | glass | 8 | c |
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b |
# Sort rows by the "animals" column in explicitly descending order.
df.rows.sort("animals", "desc").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1 |
None | 3 | eagle | glass | 8 | c |
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b |
# Sort by several columns in one call: a list of (column name, order) tuples,
# here "animals" descending, then "thing" ascending.
df.rows.sort([("animals","desc"),("thing","asc")]).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1 |
None | 3 | eagle | glass | 8 | c |
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b |
# Scratch check of the (column name, order) structure accepted by rows.sort:
# print just the column name from each pair.
a = [("animals", "desc"), ("thing", "asc")]
for c in a:
    # c is a (column, order) tuple; index 0 is the column name.
    print(c[0])
animals thing
# Keep only the rows matching a column expression: here, rows where "num" is 1.
df.rows.select(df["num"]==1).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |
# Keep only the rows whose value in the string column "filter" is recognised
# as an integer; in the recorded output only the row holding "1" is kept.
df.rows.select_by_dtypes("filter", "integer").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1 |
# Drop every row where "num" is 2 or "second" is 5; each comparison is
# parenthesised because `|` binds tighter than `==`.
df.rows.drop((df["num"]==2) | (df["second"]==5)).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
None | 3 | eagle | glass | 8 | c |
# Drop the rows whose "filter" value is recognised as an integer, using the
# short type name "int"; the recorded output keeps the rows with a, b and c.
df.rows.drop_by_dtypes("filter", "int").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b |
None | 3 | eagle | glass | 8 | c |
# Same drop using the long type name "integer"; the recorded output is
# identical to the "int" call (rows with a, b and c remain).
df.rows.drop_by_dtypes("filter", "integer").table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b |
None | 3 | eagle | glass | 8 | c |
from optimus.audf import abstract_udf as audf
def func_data_type(value, attr):
    """Return True when *value* is greater than 1.

    Used as the row predicate passed to ``audf`` for the "num" column.

    Args:
        value: The cell value to test.
        attr: Not used by this function. Presumably required by the
            ``abstract_udf`` callback signature -- TODO confirm.

    Returns:
        The result of ``value > 1``.
    """
    return value > 1
# Drop rows using a Python function as the predicate: audf applies
# func_data_type to the "num" column with a declared "boolean" return type.
# In the recorded output only the row with num == 1 remains.
df.rows.drop(audf("num", func_data_type, "boolean")).table()
words
1 (string)
nullable
|
num
2 (int)
nullable
|
animals
3 (string)
nullable
|
thing
4 (string)
nullable
|
second
5 (int)
nullable
|
filter
6 (string)
nullable
|
---|---|---|---|---|---|
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a |