from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("solution-2").getOrCreate()
df = spark.read.csv("./cars.csv", header = True)
df.printSchema()
root |-- YEAR: string (nullable = true) |-- Make: string (nullable = true) |-- Model: string (nullable = true) |-- Size: string (nullable = true) |-- (kW): string (nullable = true)
df = spark.read.csv("./cars.csv", header = True, inferSchema = True)
df.printSchema()
root |-- YEAR: integer (nullable = true) |-- Make: string (nullable = true) |-- Model: string (nullable = true) |-- Size: string (nullable = true) |-- (kW): integer (nullable = true)
df.show(10)
+----+----------+--------------------+----------+----+ |YEAR| Make| Model| Size|(kW)| +----+----------+--------------------+----------+----+ |2012|MITSUBISHI| i-MiEV|SUBCOMPACT| 49| |2012| NISSAN| LEAF| MID-SIZE| 80| |2013| FORD| FOCUS ELECTRIC| COMPACT| 107| |2013|MITSUBISHI| i-MiEV|SUBCOMPACT| 49| |2013| NISSAN| LEAF| MID-SIZE| 80| |2013| SMART|FORTWO ELECTRIC D...|TWO-SEATER| 35| |2013| SMART|FORTWO ELECTRIC D...|TWO-SEATER| 35| |2013| TESLA|MODEL S (40 kWh b...| FULL-SIZE| 270| |2013| TESLA|MODEL S (60 kWh b...| FULL-SIZE| 270| |2013| TESLA|MODEL S (85 kWh b...| FULL-SIZE| 270| +----+----------+--------------------+----------+----+ only showing top 10 rows
df.filter(df['YEAR'] == 2015).show(10)
+----+----------+--------------------+--------------------+----+ |YEAR| Make| Model| Size|(kW)| +----+----------+--------------------+--------------------+----+ |2015| BMW| i3| SUBCOMPACT| 125| |2015| CHEVROLET| SPARK EV| SUBCOMPACT| 104| |2015| FORD| FOCUS ELECTRIC| COMPACT| 107| |2015| KIA| SOUL EV|STATION WAGON - S...| 81| |2015|MITSUBISHI| i-MiEV| SUBCOMPACT| 49| |2015| NISSAN| LEAF| MID-SIZE| 80| |2015| SMART|FORTWO ELECTRIC D...| TWO-SEATER| 35| |2015| SMART|FORTWO ELECTRIC D...| TWO-SEATER| 35| |2015| TESLA|MODEL S (60 kWh b...| FULL-SIZE| 283| |2015| TESLA|MODEL S (70 kWh b...| FULL-SIZE| 283| +----+----------+--------------------+--------------------+----+ only showing top 10 rows
df_tesla = df.filter(df['Make'] == 'TESLA').show(10)
+----+-----+--------------------+---------+----+ |YEAR| Make| Model| Size|(kW)| +----+-----+--------------------+---------+----+ |2013|TESLA|MODEL S (40 kWh b...|FULL-SIZE| 270| |2013|TESLA|MODEL S (60 kWh b...|FULL-SIZE| 270| |2013|TESLA|MODEL S (85 kWh b...|FULL-SIZE| 270| |2013|TESLA| MODEL S PERFORMANCE|FULL-SIZE| 310| |2014|TESLA|MODEL S (60 kWh b...|FULL-SIZE| 225| |2014|TESLA|MODEL S (85 kWh b...|FULL-SIZE| 270| |2014|TESLA| MODEL S PERFORMANCE|FULL-SIZE| 310| |2015|TESLA|MODEL S (60 kWh b...|FULL-SIZE| 283| |2015|TESLA|MODEL S (70 kWh b...|FULL-SIZE| 283| |2015|TESLA|MODEL S (85/90 kW...|FULL-SIZE| 283| +----+-----+--------------------+---------+----+ only showing top 10 rows
df.select(df['Make'], df['Model'], df['Size']).show(10)
+----------+--------------------+----------+ | Make| Model| Size| +----------+--------------------+----------+ |MITSUBISHI| i-MiEV|SUBCOMPACT| | NISSAN| LEAF| MID-SIZE| | FORD| FOCUS ELECTRIC| COMPACT| |MITSUBISHI| i-MiEV|SUBCOMPACT| | NISSAN| LEAF| MID-SIZE| | SMART|FORTWO ELECTRIC D...|TWO-SEATER| | SMART|FORTWO ELECTRIC D...|TWO-SEATER| | TESLA|MODEL S (40 kWh b...| FULL-SIZE| | TESLA|MODEL S (60 kWh b...| FULL-SIZE| | TESLA|MODEL S (85 kWh b...| FULL-SIZE| +----------+--------------------+----------+ only showing top 10 rows
df_manufacturer = df.groupBy("Make").count()
df_manufacturer.show()
+----------+-----+ | Make|count| +----------+-----+ | NISSAN| 6| | FORD| 4| | CHEVROLET| 3| | TESLA| 23| | BMW| 2| | KIA| 2| | SMART| 8| |MITSUBISHI| 5| +----------+-----+
df_manufacturer.sort("count", ascending=False).show()
+----------+-----+ | Make|count| +----------+-----+ | TESLA| 23| | SMART| 8| | NISSAN| 6| |MITSUBISHI| 5| | FORD| 4| | CHEVROLET| 3| | BMW| 2| | KIA| 2| +----------+-----+
df_year = df.groupBy("Year").count().sort("count", ascending=False)
df_year.show()
+----+-----+ |Year|count| +----+-----+ |2016| 19| |2015| 14| |2013| 9| |2014| 9| |2012| 2| +----+-----+
df_pd = df.toPandas()
df_pd.head(10)
YEAR | Make | Model | Size | (kW) | |
---|---|---|---|---|---|
0 | 2012 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 |
1 | 2012 | NISSAN | LEAF | MID-SIZE | 80 |
2 | 2013 | FORD | FOCUS ELECTRIC | COMPACT | 107 |
3 | 2013 | MITSUBISHI | i-MiEV | SUBCOMPACT | 49 |
4 | 2013 | NISSAN | LEAF | MID-SIZE | 80 |
5 | 2013 | SMART | FORTWO ELECTRIC DRIVE CABRIOLET | TWO-SEATER | 35 |
6 | 2013 | SMART | FORTWO ELECTRIC DRIVE COUPE | TWO-SEATER | 35 |
7 | 2013 | TESLA | MODEL S (40 kWh battery) | FULL-SIZE | 270 |
8 | 2013 | TESLA | MODEL S (60 kWh battery) | FULL-SIZE | 270 |
9 | 2013 | TESLA | MODEL S (85 kWh battery) | FULL-SIZE | 270 |
df_pd.describe()
YEAR | (kW) | |
---|---|---|
count | 53.000000 | 53.000000 |
mean | 2014.735849 | 190.622642 |
std | 1.227113 | 155.526429 |
min | 2012.000000 | 35.000000 |
25% | 2014.000000 | 80.000000 |
50% | 2015.000000 | 107.000000 |
75% | 2016.000000 | 283.000000 |
max | 2016.000000 | 568.000000 |
spark.stop()