# UNCOMMENT THIS IF YOU'RE USING GOOGLE COLAB!
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q http://apache.osuosl.org/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
#!tar xf spark-2.4.1-bin-hadoop2.7.tgz
#!pip install optimuspyspark
# AFTER RUNNING THIS CELL, YOU MUST RESTART THE RUNTIME TO USE UPDATED VERSIONS OF PACKAGES!
# UNCOMMENT THIS IF YOU'RE USING GOOGLE COLAB!
#import os
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["SPARK_HOME"] = "/content/spark-2.3.3-bin-hadoop2.7"
# Run if you clone Optimus and want to hack it from the notebook else you must install
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
from optimus import Optimus
C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working from collections import Callable You are using PySparkling of version 2.4.10, but your PySpark is of version 2.3.1. Please make sure Spark and PySparkling versions are compatible.
# Create optimus
op = Optimus(master="local", app_name= "optimus", verbose = True)
INFO:optimus:Just check that Spark and all necessary environments vars are present... INFO:optimus:----- INFO:optimus:SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7 INFO:optimus:HADOOP_HOME=C:\opt\hadoop-2.7.7 INFO:optimus:PYSPARK_PYTHON=C:\Users\argenisleon\Anaconda3\python.exe INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter INFO:optimus:PYSPARK_SUBMIT_ARGS=--packages com.databricks:spark-avro_2.11:4.0.0 --conf "spark.sql.catalogImplementation=hive" pyspark-shell INFO:optimus:JAVA_HOME=C:\java INFO:optimus:Pyarrow Installed INFO:optimus:----- INFO:optimus:Starting or getting SparkSession and SparkContext... INFO:optimus:Spark Version:2.3.1 INFO:optimus: ____ __ _ / __ \____ / /_(_)___ ___ __ _______ / / / / __ \/ __/ / __ `__ \/ / / / ___/ / /_/ / /_/ / /_/ / / / / / / /_/ (__ ) \____/ .___/\__/_/_/ /_/ /_/\__,_/____/ /_/ INFO:optimus:Transform and Roll out... INFO:optimus:Optimus successfully imported. Have fun :).
op = Optimus(packages="datastax:spark-cassandra-connector:1.6.1-s_2.10")
op.spark
SparkSession - hive
op.sc
op.packages
['com.databricks:spark-avro_2.11:4.0.0']
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType
from datetime import date, datetime
df = op.create.df(
[
"names",
"height(ft)",
"function",
"rank",
"age",
"weight(t)",
"japanese name",
"last position seen",
"date arrival",
"last date seen",
"attributes",
"DateType",
"Tiemstamp",
"Cybertronian",
"function(binary)",
"NullType",
],
[
("Optim'us", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
None),
("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
"2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
None),
("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
"2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
None),
("Jazz", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
"2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
bytearray("First Lieutenant", "utf-8"), None),
("Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
("Metroplex_)^$", 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
[91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
]).h_repartition(1)
df.table()
names
1 (string)
nullable
|
height(ft)
2 (int)
nullable
|
function
3 (string)
nullable
|
rank
4 (int)
nullable
|
age
5 (int)
nullable
|
weight(t)
6 (float)
nullable
|
japanese name
7 (array<string>)
nullable
|
last position seen
8 (string)
nullable
|
date arrival
9 (string)
nullable
|
last date seen
10 (string)
nullable
|
attributes
11 (array<float>)
nullable
|
DateType
12 (date)
nullable
|
Tiemstamp
13 (timestamp)
nullable
|
Cybertronian
14 (boolean)
nullable
|
function(binary)
15 (binary)
nullable
|
NullType
16 (null)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Optim'us
|
28
|
Leader
|
10
|
5000000
|
4.300000190734863
|
['Inochi',⋅'Convoy']
|
19.442735,-99.201111
|
1980/04/10
|
2016/09/10
|
[8.53439998626709,⋅4300.0]
|
2016-09-10
|
2014-06-24⋅00:00:00
|
True
|
None
|
None
|
bumbl#ebéé⋅⋅
|
17
|
Espionage
|
7
|
5000000
|
2.0
|
['Bumble',⋅'Goldback']
|
10.642707,-71.612534
|
1980/04/10
|
2015/08/10
|
[5.334000110626221,⋅2000.0]
|
2015-08-10
|
2014-06-24⋅00:00:00
|
True
|
None
|
None
|
ironhide&
|
26
|
Security
|
7
|
5000000
|
4.0
|
['Roadbuster']
|
37.789563,-122.400356
|
1980/04/10
|
2014/07/10
|
[7.924799919128418,⋅4000.0]
|
2014-06-24
|
2014-06-24⋅00:00:00
|
True
|
None
|
None
|
Jazz
|
13
|
First⋅Lieutenant
|
8
|
5000000
|
1.7999999523162842
|
['Meister']
|
33.670666,-117.841553
|
1980/04/10
|
2013/06/10
|
[3.962399959564209,⋅1800.0]
|
2013-06-24
|
2014-06-24⋅00:00:00
|
True
|
None
|
None
|
Megatron
|
None
|
None
|
10
|
5000000
|
5.699999809265137
|
['Megatron']
|
None
|
1980/04/10
|
2012/05/10
|
[None,⋅5700.0]
|
2012-05-10
|
2014-06-24⋅00:00:00
|
True
|
None
|
None
|
Metroplex_)^$
|
300
|
Battle⋅Station
|
8
|
5000000
|
None
|
['Metroflex']
|
None
|
1980/04/10
|
2011/04/10
|
[91.44000244140625,⋅None]
|
2011-04-10
|
2014-06-24⋅00:00:00
|
True
|
None
|
None
|
import pandas as pd
pdf = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
'B': {0: 1, 1: 3, 2: 5},
'C': {0: 2, 1: 4, 2: 6}})
sdf = op.create.df(pdf=pdf)
Concat dataframes two or multiple dataframes in a row or column way
op.append([df,df], like="rows").table()
df_csv =op.load.csv("data/foo.csv").table(5)
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
df_csv =op.load.tsv("data/foo.tsv").table(5)
Sepal length
1 (string)
nullable
|
Sepal width
2 (double)
nullable
|
Petal length
3 (double)
nullable
|
Petal width
4 (string)
nullable
|
Species
5 (string)
nullable
|
---|---|---|---|---|
5.0
|
3.6
|
1.4
|
0.2
|
I.⋅setosa
|
5.0
|
3.6
|
1.4
|
0.2
|
I.⋅setosa
|
5.0
|
3.6
|
1.4
|
0.2
|
I.⋅setosa
|
5.0
|
3.6
|
1.4
|
0.2
|
I.⋅setosa
|
df_json =op.load.json("data/foo.json").table(5)
billingId
1 (bigint)
nullable
|
birth
2 (string)
nullable
|
dummyCol
3 (string)
nullable
|
firstName
4 (string)
nullable
|
id
5 (bigint)
nullable
|
lastName
6 (string)
nullable
|
price
7 (bigint)
nullable
|
product
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
123
|
1980/07/07
|
never
|
Luis
|
1
|
Alvarez$$%!
|
10
|
Cake
|
423
|
1950/07/08
|
gonna
|
André
|
2
|
Ampère
|
8
|
piza
|
551
|
1990/07/09
|
give
|
NiELS
|
3
|
Böhr//((%%
|
8
|
pizza
|
521
|
1954/07/10
|
you
|
PAUL
|
4
|
dirac$
|
8
|
pizza
|
634
|
1990/07/11
|
up
|
Albert
|
5
|
Einstein
|
8
|
pizza
|
df_parquet =op.load.parquet("data/foo.parquet").table(5)
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
df_avro =op.load.avro("data/foo.avro").table(5)
df_avro =op.load.excel("data/titanic3.xls").table(5)
pclass
1 (bigint)
nullable
|
survived
2 (bigint)
nullable
|
name
3 (string)
nullable
|
sex
4 (string)
nullable
|
age
5 (double)
nullable
|
sibsp
6 (bigint)
nullable
|
parch
7 (bigint)
nullable
|
ticket
8 (string)
nullable
|
fare
9 (double)
nullable
|
cabin
10 (string)
nullable
|
embarked
11 (string)
nullable
|
boat
12 (string)
nullable
|
body
13 (double)
nullable
|
home_dest
14 (string)
nullable
|
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1
|
1
|
Allen,⸱Miss.⸱Elisabeth⸱Walton
|
female
|
29.0
|
0
|
0
|
24160
|
211.3375
|
B5
|
S
|
2
|
nan
|
St⸱Louis,⸱MO
|
1
|
1
|
Allison,⸱Master.⸱Hudson⸱Trevor
|
male
|
0.9167
|
1
|
2
|
113781
|
151.55
|
C22⸱C26
|
S
|
11
|
nan
|
Montreal,⸱PQ⸱/⸱Chesterville,⸱ON
|
1
|
0
|
Allison,⸱Miss.⸱Helen⸱Loraine
|
female
|
2.0
|
1
|
2
|
113781
|
151.55
|
C22⸱C26
|
S
|
nan
|
nan
|
Montreal,⸱PQ⸱/⸱Chesterville,⸱ON
|
1
|
0
|
Allison,⸱Mr.⸱Hudson⸱Joshua⸱Creighton
|
male
|
30.0
|
1
|
2
|
113781
|
151.55
|
C22⸱C26
|
S
|
nan
|
135.0
|
Montreal,⸱PQ⸱/⸱Chesterville,⸱ON
|
1
|
0
|
Allison,⸱Mrs.⸱Hudson⸱J⸱C⸱(Bessie⸱Waldo⸱Daniels)
|
female
|
25.0
|
1
|
2
|
113781
|
151.55
|
C22⸱C26
|
S
|
nan
|
nan
|
Montreal,⸱PQ⸱/⸱Chesterville,⸱ON
|
df_csv =op.load.csv("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv")
df_csv.table()
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
11
|
Isaac
|
Newton
|
992
|
pasta
|
9
|
1999/02/15
|
never⸱
|
12
|
Emmy%%
|
Nöether$
|
234
|
pasta
|
9
|
1993/12/08
|
gonna
|
13
|
Max!!!
|
Planck!!!
|
111
|
hamburguer
|
4
|
1994/01/04
|
run⸱
|
14
|
Fred
|
Hoy&&&le
|
553
|
pizzza
|
8
|
1997/06/27
|
around
|
15
|
(((⸱⸱⸱Heinrich⸱)))))
|
Hertz
|
116
|
pizza
|
8
|
1956/11/30
|
and
|
16
|
William
|
Gilbert###
|
886
|
BEER
|
2
|
1958/03/26
|
desert
|
17
|
Marie
|
CURIE
|
912
|
Rice
|
1
|
2000/03/22
|
you
|
18
|
Arthur
|
COM%%%pton
|
812
|
110790
|
5
|
1899/01/01
|
#
|
19
|
JAMES
|
Chadwick
|
467
|
null
|
10
|
1921/05/03
|
#
|
df_json =op.load.json("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json")
df_json.table()
billingId
1 (bigint)
nullable
|
birth
2 (string)
nullable
|
dummyCol
3 (string)
nullable
|
firstName
4 (string)
nullable
|
id
5 (bigint)
nullable
|
lastName
6 (string)
nullable
|
price
7 (bigint)
nullable
|
product
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
123
|
1980/07/07
|
never
|
Luis
|
1
|
Alvarez$$%!
|
10
|
Cake
|
423
|
1950/07/08
|
gonna
|
André
|
2
|
Ampère
|
8
|
piza
|
551
|
1990/07/09
|
give
|
NiELS
|
3
|
Böhr//((%%
|
8
|
pizza
|
521
|
1954/07/10
|
you
|
PAUL
|
4
|
dirac$
|
8
|
pizza
|
634
|
1990/07/11
|
up
|
Albert
|
5
|
Einstein
|
8
|
pizza
|
672
|
1930/08/12
|
never
|
Galileo
|
6
|
⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI
|
5
|
arepa
|
323
|
1970/07/13
|
gonna
|
CaRL
|
7
|
Ga%%%uss
|
3
|
taco
|
624
|
1950/07/14
|
let
|
David
|
8
|
H$$$ilbert
|
3
|
taaaccoo
|
735
|
1920/04/22
|
you
|
Johannes
|
9
|
KEPLER
|
3
|
taco
|
875
|
1923/03/12
|
down
|
JaMES
|
10
|
M$$ax%%well
|
3
|
taco
|
992
|
1999/02/15
|
never⸱
|
Isaac
|
11
|
Newton
|
9
|
pasta
|
234
|
1993/12/08
|
gonna
|
Emmy%%
|
12
|
Nöether$
|
9
|
pasta
|
111
|
1994/01/04
|
run⸱
|
Max!!!
|
13
|
Planck!!!
|
4
|
hamburguer
|
553
|
1997/06/27
|
around
|
Fred
|
14
|
Hoy&&&le
|
8
|
pizzza
|
116
|
1956/11/30
|
and
|
(((⸱⸱⸱Heinrich⸱)))))
|
15
|
Hertz
|
8
|
pizza
|
886
|
1958/03/26
|
desert
|
William
|
16
|
Gilbert###
|
2
|
BEER
|
912
|
2000/03/22
|
you
|
Marie
|
17
|
CURIE
|
1
|
Rice
|
812
|
1899/01/01
|
#
|
Arthur
|
18
|
COM%%%pton
|
5
|
110790
|
467
|
1921/05/03
|
#
|
JAMES
|
19
|
Chadwick
|
10
|
null
|
df_parquet =op.load.parquet("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.parquet")
df_parquet.table()
C:\Users\ARGENI~1\AppData\Local\Temp\tmpvhxr0yyy.parquet
id
1 (int)
nullable
|
firstName
2 (string)
nullable
|
lastName
3 (string)
nullable
|
billingId
4 (int)
nullable
|
product
5 (string)
nullable
|
price
6 (int)
nullable
|
birth
7 (string)
nullable
|
dummyCol
8 (string)
nullable
|
---|---|---|---|---|---|---|---|
1
|
Luis
|
Alvarez$$%!
|
123
|
Cake
|
10
|
1980/07/07
|
never
|
2
|
André
|
Ampère
|
423
|
piza
|
8
|
1950/07/08
|
gonna
|
3
|
NiELS
|
Böhr//((%%
|
551
|
pizza
|
8
|
1990/07/09
|
give
|
4
|
PAUL
|
dirac$
|
521
|
pizza
|
8
|
1954/07/10
|
you
|
5
|
Albert
|
Einstein
|
634
|
pizza
|
8
|
1990/07/11
|
up
|
6
|
Galileo
|
⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI
|
672
|
arepa
|
5
|
1930/08/12
|
never
|
7
|
CaRL
|
Ga%%%uss
|
323
|
taco
|
3
|
1970/07/13
|
gonna
|
8
|
David
|
H$$$ilbert
|
624
|
taaaccoo
|
3
|
1950/07/14
|
let
|
9
|
Johannes
|
KEPLER
|
735
|
taco
|
3
|
1920/04/22
|
you
|
10
|
JaMES
|
M$$ax%%well
|
875
|
taco
|
3
|
1923/03/12
|
down
|
11
|
Isaac
|
Newton
|
992
|
pasta
|
9
|
1999/02/15
|
never⸱
|
12
|
Emmy%%
|
Nöether$
|
234
|
pasta
|
9
|
1993/12/08
|
gonna
|
13
|
Max!!!
|
Planck!!!
|
111
|
hamburguer
|
4
|
1994/01/04
|
run⸱
|
14
|
Fred
|
Hoy&&&le
|
553
|
pizzza
|
8
|
1997/06/27
|
around
|
15
|
(((⸱⸱⸱Heinrich⸱)))))
|
Hertz
|
116
|
pizza
|
8
|
1956/11/30
|
and
|
16
|
William
|
Gilbert###
|
886
|
BEER
|
2
|
1958/03/26
|
desert
|
17
|
Marie
|
CURIE
|
912
|
Rice
|
1
|
2000/03/22
|
you
|
18
|
Arthur
|
COM%%%pton
|
812
|
110790
|
5
|
1899/01/01
|
#
|
19
|
JAMES
|
Chadwick
|
467
|
null
|
10
|
1921/05/03
|
#
|
df_avro =op.load.avro("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.avro", "avro") df_avro.table()
df_csv.save.csv("test.csv")
df_csv.save.json("test.json")
df_csv.save.parquet("test.parquet")
df_csv.save.avro("test.avro")