Following instructions are need to be execture in my-sql server first-
CREATE DATABASE diamonds;
USE diamonds;
CREATE TABLE BigDiamonds;
create table BigDiamonds(
Unnamed int, carat float4, cut varchar(60), color character, clarity varchar(60), tabl float4, depth float4, cert varchar(60), measurements varchar(60), price float4, x float4, y float4, z float4, PRIMARY KEY (Unnamed) );
We have now defined the schema for the table.
After this quit mysql-server and execute following instruction on terminal to read large BigDiamonds csv file in mysql:-
mysqlimport --fields-optionally-enclosed-by='"' --fields-terminated-by=, --lines-terminated-by="\n" -u root -p diamonds --local --ignore-lines=1 /home/anshul/data-sets/BigDiamonds.csv
Ignore-lines skips the first line “Some Large Database.” The –fields-terminated by tell the utility that the commas separate the columns. The Database is the name of the database in which your table is stored. You must put the absolute path of the csv file for it to register with the utility. The “BigDiamonds.csv” has to match the name of the table in your mysql database._
Now we'll do analysis on this table through python using sqlalchemy.
import sqlalchemy
engine = sqlalchemy.create_engine('mysql://root:Jarvis@127.0.0.1:3306/diamonds')
from sqlalchemy import text
with engine.connect() as con:
rs = con.execute(text('SELECT * FROM BigDiamonds limit 10'))
print rs.keys()
print rs.fetchall()
[u'Unnamed', u'carat', u'cut', u'color', u'clarity', u'tabl', u'depth', u'cert', u'measurements', u'price', u'x', u'y', u'z'] [(1L, 0.25, 'V.Good', 'K', 'I1', 59.0, 63.7, 'GIA', '3.96 x 3.95 x 2.52', 0.0, 3.96, 3.95, 2.52), (2L, 0.23, 'Good', 'G', 'I1', 61.0, 58.1, 'GIA', '4.00 x 4.05 x 2.30', 0.0, 4.0, 4.05, 2.3), (3L, 0.34, 'Good', 'J', 'I2', 58.0, 58.7, 'GIA', '4.56 x 4.53 x 2.67', 0.0, 4.56, 4.53, 2.67), (4L, 0.21, 'V.Good', 'D', 'I1', 60.0, 60.6, 'GIA', '3.80 x 3.82 x 2.31', 0.0, 3.8, 3.82, 2.31), (5L, 0.31, 'V.Good', 'K', 'I1', 59.0, 62.2, 'EGL', '4.35 x 4.26 x 2.68', 0.0, 4.35, 4.26, 2.68), (6L, 0.2, 'Good', 'G', 'SI2', 60.0, 64.4, 'GIA', '3.74 x 3.67 x 2.38', 0.0, 3.74, 3.67, 2.38), (7L, 0.2, 'Good', 'G', 'SI2', 63.0, 62.6, 'GIA', '3.72 x 3.65 x 2.31', 0.0, 3.72, 3.65, 2.31), (8L, 0.22, 'V.Good', 'D', 'I1', 61.0, 59.2, 'GIA', '3.95 x 3.97 x 2.34', 0.0, 3.95, 3.97, 2.34), (9L, 0.23, 'V.Good', 'K', 'SI2', 57.5, 63.6, 'IGI', '3.87 x 3.90 x 2.47', 0.0, 3.87, 3.9, 2.47), (10L, 0.2, 'Good', 'F', 'SI1', 65.0, 54.9, 'GIA', '3.83 x 4.00 x 2.14', 0.0, 3.83, 4.0, 2.14)]
from sqlalchemy import inspect
insp=inspect(engine)
for row in insp.get_columns("BigDiamonds"):
print row["name"], row["type"]
Unnamed INTEGER(11) carat FLOAT cut VARCHAR(60) color CHAR(1) clarity VARCHAR(60) tabl FLOAT depth FLOAT cert VARCHAR(60) measurements VARCHAR(60) price FLOAT x FLOAT y FLOAT z FLOAT
from sqlalchemy import (MetaData, Table,select)
with engine.connect() as con:
meta = MetaData(engine)
diamonds = Table('BigDiamonds', meta, autoload=True)
stm = select([diamonds]).limit(10)
rs = con.execute(stm)
print rs.fetchall()
[(1L, 0.25, 'V.Good', 'K', 'I1', 59.0, 63.7, 'GIA', '3.96 x 3.95 x 2.52', 0.0, 3.96, 3.95, 2.52), (2L, 0.23, 'Good', 'G', 'I1', 61.0, 58.1, 'GIA', '4.00 x 4.05 x 2.30', 0.0, 4.0, 4.05, 2.3), (3L, 0.34, 'Good', 'J', 'I2', 58.0, 58.7, 'GIA', '4.56 x 4.53 x 2.67', 0.0, 4.56, 4.53, 2.67), (4L, 0.21, 'V.Good', 'D', 'I1', 60.0, 60.6, 'GIA', '3.80 x 3.82 x 2.31', 0.0, 3.8, 3.82, 2.31), (5L, 0.31, 'V.Good', 'K', 'I1', 59.0, 62.2, 'EGL', '4.35 x 4.26 x 2.68', 0.0, 4.35, 4.26, 2.68), (6L, 0.2, 'Good', 'G', 'SI2', 60.0, 64.4, 'GIA', '3.74 x 3.67 x 2.38', 0.0, 3.74, 3.67, 2.38), (7L, 0.2, 'Good', 'G', 'SI2', 63.0, 62.6, 'GIA', '3.72 x 3.65 x 2.31', 0.0, 3.72, 3.65, 2.31), (8L, 0.22, 'V.Good', 'D', 'I1', 61.0, 59.2, 'GIA', '3.95 x 3.97 x 2.34', 0.0, 3.95, 3.97, 2.34), (9L, 0.23, 'V.Good', 'K', 'SI2', 57.5, 63.6, 'IGI', '3.87 x 3.90 x 2.47', 0.0, 3.87, 3.9, 2.47), (10L, 0.2, 'Good', 'F', 'SI1', 65.0, 54.9, 'GIA', '3.83 x 4.00 x 2.14', 0.0, 3.83, 4.0, 2.14)]
rs = engine.execute(text('SELECT avg(price) FROM BigDiamonds'))
print rs.first()
(8742.582102056105,)
rs = engine.execute(text('select avg(carat) from BigDiamonds'))
print rs.fetchone()
(1.0712967522829258,)
The SQLAlchemy Expression Language represents relational database structures and expressions using Python constructs. The expression language improves the maintainability of the code by hiding the SQL language and thus allowing not to mix Python code and SQL code.
import pandas as pd
import numpy as np
from sqlalchemy import func
with engine.connect() as con:
meta = MetaData(engine)
diamonds = Table('BigDiamonds', meta, autoload=True)
stm = select([func.avg(diamonds.c.carat),diamonds.c.color]).group_by(diamonds.c.color)
rs = con.execute(stm)
print pd.DataFrame(np.array(rs.fetchall()),columns=rs.keys())
avg_1 color 0 0.826618225403 D 1 0.831882373286 E 2 0.941053186937 F 3 1.06384079506 G 4 1.20994065749 H 5 1.27128226011 I 6 1.34753987722 J 7 1.49506455673 K 8 1.3632705044 L
rs=engine.execute(text('select avg(carat),cut from BigDiamonds group by cut'))
print pd.DataFrame(np.array(rs.fetchall()),columns=rs.keys())
avg(carat) cut 0 0.900303114521 Good 1 1.12019355723 Ideal 2 1.0247596733 V.Good
rs = engine.execute(text('SELECT * FROM BigDiamonds'))
diamonds=pd.DataFrame(np.array(rs.fetchall()),columns=rs.keys())
diamonds.describe()
Unnamed | carat | cut | color | clarity | tabl | depth | cert | measurements | price | x | y | z | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 | 598024 |
unique | 598024 | 628 | 3 | 9 | 9 | 187 | 285 | 9 | 241453 | 40313 | 1031 | 912 | 962 |
top | 527446 | 0.3 | Ideal | G | SI1 | 57.0 | 62.4 | GIA | 0.00 x 0.00 x 0.00 | 740.0 | 4.3 | 4.32 | 4.0 |
freq | 1 | 32388 | 369448 | 96204 | 116631 | 122574 | 24303 | 463555 | 425 | 1702 | 3415 | 3218 | 5415 |