第3章 pandasでデータを処理しよう¶

3-5: さまざまなデータの読み込み¶

In [1]:

# リスト3.5.1：CSVファイルの読み込み
import os
import pandas as pd

base_url = (
    "https://raw.githubusercontent.com/practical-jupyter/sample-data/master/anime/"
)
anime_csv = os.path.join(base_url, "anime.csv")
df = pd.read_csv(anime_csv)
df.head()

Out[1]:

	anime_id	name	genre	type	episodes	rating	members
0	32281	Kimi no Na wa.	Drama, Romance, School, Supernatural	Movie	1	9.37	200630
1	5114	Fullmetal Alchemist: Brotherhood	Action, Adventure, Drama, Fantasy, Magic, Mili...	TV	64	9.26	793665
2	28977	Gintama°	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.25	114262
3	9253	Steins;Gate	Sci-Fi, Thriller	TV	24	9.17	673572
4	9969	Gintama'	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.16	151266

In [2]:

# リスト3.5.2：インデックス列を番号で指定
# インデックスにする列を番号で指定
df = pd.read_csv(anime_csv, index_col=0)
df.head()

Out[2]:

	name	genre	type	episodes	rating	members
anime_id
32281	Kimi no Na wa.	Drama, Romance, School, Supernatural	Movie	1	9.37	200630
5114	Fullmetal Alchemist: Brotherhood	Action, Adventure, Drama, Fantasy, Magic, Mili...	TV	64	9.26	793665
28977	Gintama°	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.25	114262
9253	Steins;Gate	Sci-Fi, Thriller	TV	24	9.17	673572
9969	Gintama'	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.16	151266

In [3]:

# リスト3.5.3：インデックス列を列名で指定
# インデックスにする列を列名で指定
df = pd.read_csv(anime_csv, index_col="anime_id")
df.head()

Out[3]:

	name	genre	type	episodes	rating	members
anime_id
32281	Kimi no Na wa.	Drama, Romance, School, Supernatural	Movie	1	9.37	200630
5114	Fullmetal Alchemist: Brotherhood	Action, Adventure, Drama, Fantasy, Magic, Mili...	TV	64	9.26	793665
28977	Gintama°	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.25	114262
9253	Steins;Gate	Sci-Fi, Thriller	TV	24	9.17	673572
9969	Gintama'	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.16	151266

In [4]:

# リスト3.5.4：型を指定
df = pd.read_csv(anime_csv, dtype={"members": float})
df.head()

Out[4]:

	anime_id	name	genre	type	episodes	rating	members
0	32281	Kimi no Na wa.	Drama, Romance, School, Supernatural	Movie	1	9.37	200630.0
1	5114	Fullmetal Alchemist: Brotherhood	Action, Adventure, Drama, Fantasy, Magic, Mili...	TV	64	9.26	793665.0
2	28977	Gintama°	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.25	114262.0
3	9253	Steins;Gate	Sci-Fi, Thriller	TV	24	9.17	673572.0
4	9969	Gintama'	Action, Comedy, Historical, Parody, Samurai, S...	TV	51	9.16	151266.0

In [5]:

# リスト3.5.5：datetime型の変換
anime_stock_price_csv = os.path.join(base_url, "anime_stock_price.csv")
df = pd.read_csv(anime_stock_price_csv, parse_dates=["Date"])
df.dtypes

Out[5]:

Date              datetime64[ns]
TOEI ANIMATION           float64
IG Port                  float64
dtype: object

In [6]:

# リスト3.5.7：区切り文字を指定
anime_tsv = os.path.join(base_url, "anime.tsv")
df = pd.read_csv(anime_tsv, sep="\t")

In [7]:

# リスト3.5.8：Excelファイルの読み込み
anime_xlsx = os.path.join(base_url, "anime.xlsx")
df = pd.read_excel(anime_xlsx)
df.head()

Out[7]:

	anime_id	name	genre	type	episodes	rating	members
0	32281	Kimi no Na wa.	Drama, Romance, School, Supernatural	Movie	1	9.37	200630
1	15335	Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...	Action, Comedy, Historical, Parody, Samurai, S...	Movie	1	9.10	72534
2	28851	Koe no Katachi	Drama, School, Shounen	Movie	1	9.05	102733
3	199	Sen to Chihiro no Kamikakushi	Adventure, Drama, Supernatural	Movie	1	8.93	466254
4	12355	Ookami Kodomo no Ame to Yuki	Fantasy, Slice of Life	Movie	1	8.84	226193

In [8]:

# リスト3.5.9：シート名を指定した読み込み
df = pd.read_excel(anime_xlsx, sheetname="Movie")

In [9]:

# リスト3.5.10：SQLiteの読み込み
from urllib.request import urlopen
import sqlite3

anime_db = os.path.join(base_url, "anime.db")
res = urlopen(anime_db)
with open("anime.db", "wb") as f:
    f.write(res.read())
    with sqlite3.connect(f.name) as conn:
        df = pd.read_sql("SELECT * FROM anime", conn)

In [10]:

# リスト3.5.11：HTMLファイルの読み込み
url = "https://docs.python.org/3/py-modindex.html"
tables = pd.read_html(url, index_col=1)
tables[0].loc[:, 1:].dropna().head(10)  # 1番目のDataFrameから空の列と欠損値を除外

Out[10]:

	2
1
__future__	Future statement definitions
__main__	The environment where the top-level script is ...
_dummy_thread	Drop-in replacement for the _thread module.
_thread	Low-level threading API.
abc	Abstract base classes according to PEP 3119.
aifc	Read and write audio files in AIFF or AIFC for...
argparse	Command-line option and argument parsing library.
array	Space efficient arrays of uniformly typed nume...
ast	Abstract Syntax Tree classes and manipulation.
asynchat	Support for asynchronous command/response prot...

In [ ]: