from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import requests
import StringIO
import re
import pandas as pd
import numpy as np
from collections import defaultdict
# Show which plotly version this notebook runs against.
print(__version__)
# Switch plotly into offline notebook mode before any iplot() call.
init_notebook_mode(connected=True)
1.12.6
# Load the movie dump; every later cell works from this frame.
df = pd.read_csv(filepath_or_buffer='kp_all_movies.csv')
# Preview the first five rows as the cell output.
df.head(5)
movie_id | name_rus | kp_rating | movie_duration | kp_rating_count | movie_year | imdb_rating | imdb_rating_count | genres | countries | budget | critics_rating | name_eng | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000 | Иезавель | 7.411 | 104 | 518.0 | 1938 | 7.6 | 8585.0 | [драма, мелодрама] | [США] | $1 250 000 | 94.0 | Jezebel |
1 | 100049 | 47 ронинов | 7.660 | 204 | 297.0 | 1962 | 7.8 | 1496.0 | [боевик, драма, история] | [Япония] | NaN | NaN | Chûshingura |
2 | 10005 | Живем один раз | 7.168 | 86 | 330.0 | 1937 | 7.4 | 3812.0 | [фильм-нуар, драма, криминал] | [США] | $575 000 | 100.0 | You Only Live Once |
3 | 100053 | Колдун 2 | 7.745 | 113 | 109.0 | 2003 | 6.5 | 393.0 | [фэнтези, боевик] | [Япония] | NaN | NaN | Onmyoji 2 |
4 | 100096 | Ильза, тигрица из Сибири | 4.286 | 85 | 151.0 | 1977 | 4.9 | 1211.0 | [ужасы] | [Канада] | CAD 250 000 | NaN | Ilsa the Tigress of Siberia |
# Count movies per release year; as_index=False keeps movie_year as a column.
count_year_df = df.groupby('movie_year', as_index=False)['movie_id'].count()

# Bar chart: release year on the x axis, number of movies on the y axis.
trace = go.Bar(x=count_year_df['movie_year'], y=count_year_df['movie_id'])
layout = go.Layout(title='Фильмы на Кинопоиске')
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)
# Average KinoPoisk and IMDb rating of the movies released in each year.
rating_year_df = (
    df.groupby('movie_year', as_index=False)[['kp_rating', 'imdb_rating']]
    .mean()
)

# One line per rating source, both plotted against the release year.
trace_kp = go.Scatter(
    x=rating_year_df['movie_year'],
    y=rating_year_df['kp_rating'],
    mode='lines',
    name=u'КиноПоиск',
)
trace_imdb = go.Scatter(
    x=rating_year_df['movie_year'],
    y=rating_year_df['imdb_rating'],
    mode='lines',
    name='IMDb',
)

layout = go.Layout(title='Оценки фильмов')
fig = go.Figure(data=[trace_kp, trace_imdb], layout=layout)
iplot(fig)
В первую очередь нужно распарсить поле genres в DataFrame: в исходных данных оно хранится одной строкой вида «[драма, мелодрама]».
# Movies without genres get the string '[]' so the field is always parseable.
df['genres'] = df['genres'].fillna(value='[]')
def parse_list(lst_str):
    """Split a bracketed, comma-separated string into a list of items.

    Square brackets are removed, the remainder is split on commas, and each
    piece is stripped of surrounding whitespace. Empty pieces are dropped,
    so '[]' yields an empty list.

    Always returns a real list (not a lazy iterator), because callers take
    len() of the result and iterate over it more than once.
    """
    # Remove every '[' and ']' before splitting on the separator.
    pieces = re.sub(r'[\[\]]', '', lst_str).split(',')
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece != '']
# Expand the frame to one row per (movie, genre) pair. Every row also gets
# weight = 1 / (number of genres of that movie).
genres_data = []
for record in df.to_dict(orient='records'):
    # Materialise the result so len() works even if parse_list hands back a
    # lazy iterable.
    genres_lst = list(parse_list(record['genres']))
    if not genres_lst:
        # No genres means no rows; this also avoids dividing by zero below.
        continue
    # The weight is the same for every genre of this movie, so compute it once.
    weight = 1. / len(genres_lst)
    for genre in genres_lst:
        genre_record = record.copy()
        genre_record['genre'] = genre
        genre_record['weight'] = weight
        genres_data.append(genre_record)
genres_df = pd.DataFrame(genres_data)
genres_df.head()
budget | countries | critics_rating | genre | genres | imdb_rating | imdb_rating_count | kp_rating | kp_rating_count | movie_duration | movie_id | movie_year | name_eng | name_rus | weight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | $1 250 000 | [США] | 94.0 | драма | [драма, мелодрама] | 7.6 | 8585.0 | 7.411 | 518.0 | 104 | 10000 | 1938 | Jezebel | Иезавель | 0.500000 |
1 | $1 250 000 | [США] | 94.0 | мелодрама | [драма, мелодрама] | 7.6 | 8585.0 | 7.411 | 518.0 | 104 | 10000 | 1938 | Jezebel | Иезавель | 0.500000 |
2 | NaN | [Япония] | NaN | боевик | [боевик, драма, история] | 7.8 | 1496.0 | 7.660 | 297.0 | 204 | 100049 | 1962 | Chûshingura | 47 ронинов | 0.333333 |
3 | NaN | [Япония] | NaN | драма | [боевик, драма, история] | 7.8 | 1496.0 | 7.660 | 297.0 | 204 | 100049 | 1962 | Chûshingura | 47 ронинов | 0.333333 |
4 | NaN | [Япония] | NaN | история | [боевик, драма, история] | 7.8 | 1496.0 | 7.660 | 297.0 | 204 | 100049 | 1962 | Chûshingura | 47 ронинов | 0.333333 |
# Build the top-10 genres by number of movies.
top_genres = genres_df.groupby('genre')[['movie_id']].count()\
    .sort_values('movie_id', ascending=False)\
    .head(10).index.values.tolist()
# Kept as an int: np.linspace needs an integer number of samples.
N = len(top_genres)
# Generate one colour per genre. endpoint=False leaves hue 360 out, because
# it is the same colour as hue 0 and would repeat the first genre's colour.
c = ['hsl(' + str(h) + ',50%' + ',50%)'
     for h in np.linspace(0, 360, N, endpoint=False)]
# One box per genre showing the distribution of its KinoPoisk ratings.
data = [{
    'y': genres_df[genres_df.genre == top_genre].kp_rating,
    'type': 'box',
    'marker': {'color': genre_color},
    'name': top_genre
} for top_genre, genre_color in zip(top_genres, c)]
layout = go.Layout(
    title='Оценки фильмов',
    yaxis={'title': 'Оценка КиноПоиска'}
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
# Co-occurrence counts: genres_coincidents[genre1][genre2] is incremented once
# for every ordered pair of genres taken from the same movie, including the
# pair of a genre with itself.
genres_coincidents = {}
for item in df.genres:
    # The parsed genres are walked once per outer genre, so they must be a
    # re-iterable list rather than a one-shot iterator.
    parsed_genres = list(parse_list(item))
    for genre1 in parsed_genres:
        if genre1 not in genres_coincidents:
            genres_coincidents[genre1] = defaultdict(int)
        for genre2 in parsed_genres:
            genres_coincidents[genre1][genre2] += 1
# Outer keys (genre1) become columns, inner keys (genre2) become the index.
genres_coincidents_df = pd.DataFrame.from_dict(genres_coincidents).fillna(0)
# Normalise the table by the number of movies of each genre. The counts do
# not depend on the row, so they are computed once instead of once per row.
genre_counts = genres_df.groupby('genre').movie_id.count()
# Each row is indexed by the column genres, so the division aligns on them.
genres_coincidents_df_norm = genres_coincidents_df\
    .apply(lambda x: x / genre_counts, axis=1)
# Rows of z follow the DataFrame index and columns of z follow its columns,
# so the columns label the x axis and the index labels the y axis.
heatmap = go.Heatmap(
    z=genres_coincidents_df_norm.values,
    x=genres_coincidents_df_norm.columns.values,
    y=genres_coincidents_df_norm.index.values
)
layout = go.Layout(
    title='Связанные жанры'
)
fig = go.Figure(data=[heatmap], layout=layout)
iplot(fig)
# Mean KinoPoisk and IMDb rating for every (release year, genre) pair.
genre_rating_year_df = (
    genres_df.groupby(['movie_year', 'genre'], as_index=False)
    [['kp_rating', 'imdb_rating']]
    .mean()
)
N = len(top_genres)
data = []
drop_menus = []
# Build all the lines of interest: two traces (KinoPoisk, IMDb) per top
# genre, appended in that order. Only the first genre starts out visible.
for i, genre in enumerate(top_genres):
    genre_df = genre_rating_year_df[genre_rating_year_df['genre'] == genre]
    is_first = (i == 0)
    trace_kp = go.Scatter(
        x=genre_df['movie_year'],
        y=genre_df['kp_rating'],
        mode='lines',
        name=genre + ' КиноПоиск',
        visible=is_first,
    )
    trace_imdb = go.Scatter(
        x=genre_df['movie_year'],
        y=genre_df['imdb_rating'],
        mode='lines',
        name=genre + ' IMDb',
        visible=is_first,
    )
    data.extend([trace_kp, trace_imdb])
# Create the drop-down menu entries: one button per genre. Each button
# restyles 'visible' so that only that genre's pair of traces is shown
# (traces 2*i and 2*i + 1 in the data list).
drop_menus.extend(
    {
        'args': ['visible',
                 [trace_no // 2 == genre_no for trace_no in range(2 * N)]],
        'label': top_genres[genre_no],
        'method': 'restyle',
    }
    for genre_no in range(N)
)
genre_menu = {
    'x': -0.1,
    'y': 1,
    'yanchor': 'top',
    'buttons': drop_menus,
}
layout = go.Layout(
    title='Фильмы по жанрам',
    updatemenus=[genre_menu],
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)