In [1]:
# Load the watermark extension and print the author, the last-updated date and
# the versions of the listed packages, so the run environment is recorded.
%load_ext watermark
%watermark -a "Romell D.Z." -u -d -p numpy,pandas,matplotlib,seaborn,statsmodels
Romell D.Z. 
last updated: 2018-11-24 

numpy 1.15.4
pandas 0.23.4
matplotlib 2.2.2
seaborn 0.9.0
statsmodels 0.10.0.dev0+3261eea
In [5]:
# Ignore every warning for the rest of the session; this is set before the
# plotting libraries are imported below.
import warnings
warnings.simplefilter('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import numpy as np
import pandas as pd
import seaborn as sns
# 'notebook' is passed as seaborn's first positional argument (the plotting context).
sns.set('notebook')
# Makes `/` true division on Python 2.
# NOTE(review): a __future__ import normally has to be the first statement of a
# module; it sits mid-cell here - confirm the cell still runs on the target kernel.
from __future__ import division
import statsmodels.api as sm
# Global matplotlib defaults: figure size plus font sizes for titles, axis
# labels and tick labels.
plt.rcParams['figure.figsize'] = (18,8)
plt.rcParams['axes.titlesize'] = 40
plt.rcParams['axes.labelsize'] = 25
plt.rcParams['ytick.labelsize'] = 15
plt.rcParams['xtick.labelsize'] = 15
# Ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format = 'retina'

Visitantes a hospedajes turísticos del Perú por meses y años

In [6]:
# Read the 'rptaniomes' sheet of the workbook: skip the first 3 rows, keep the
# next 12 data rows and use the first column as the row index.
data_set_1 = pd.read_excel(
    'rptaniomes_a.xls',
    sheet_name='rptaniomes',
    skiprows=3,
    nrows=12,
    index_col=0,
)
data_set_1
Out[6]:
2004 2005 2006 2007 2008 2009 2010 2011 Unnamed: 9 Unnamed: 10 2012 2013 2014 2015 2016 2017 2018
Enero 1420318 1549611 1453010 1640530 1908782 2157086 2481001 2875789 NaN NaN 3368321 3824017 3785834 3946743 4394706 4377928 4701689.0
Febrero 1378193 1506757 1416817 1618220 1893422 2095366 2287053 2776543 NaN NaN 3243486 3637264 3713502 3852850 4269580 4168647 4634612.0
Marzo 1452044 1647629 1508036 1761946 2020172 2123012 2378428 2950707 NaN NaN 3321767 3858708 3819804 3921180 4236281 4096350 4624401.0
Abril 1443234 1579655 1528134 1781544 1914748 2125960 2448616 2894167 NaN NaN 3322076 3627288 3734511 3733996 4021212 4129642 4410177.0
Mayo 1456655 1615637 1507493 1769547 2059807 2176022 2575576 2990604 NaN NaN 3420816 3876882 3840284 3967218 4190828 4272485 4590470.0
Junio 1453778 1608856 1491043 1795797 1967317 2087684 2510546 2954453 NaN NaN 3451280 3829325 3666169 3846446 3884905 4076724 4247453.0
Julio 1660090 1844513 1742885 1934685 2254533 2371392 2881280 3373290 NaN NaN 3726624 4178428 4026351 4246979 4535388 4726366 4985381.0
Agosto 1642581 1802806 1776761 1951739 2286921 2335697 2838218 3262290 NaN NaN 3783340 4156695 4112468 4159557 4420995 4577075 4877017.0
Septiembre 1480796 1660168 1624773 1798843 2075029 2233111 2602236 3063591 NaN NaN 3652406 3843260 3780242 3880323 4097249 4292631 4597647.0
Octubre 1614864 1784227 1741021 1890913 2231985 2439174 2877754 3306183 NaN NaN 3752538 4029520 3958797 4226737 4387730 4353795 NaN
Noviembre 1518051 1707205 1687323 1886121 2153152 2325433 2762187 3147613 NaN NaN 3644934 3865061 3993348 3992063 4126466 4196722 NaN
Diciembre 1531091 1741196 1621798 1826956 2044077 2244023 2768497 3163567 NaN NaN 3623744 3845475 3928390 4145905 3999337 4360261 NaN
In [7]:
def clean_dataframe(df):
    """Drop empty columns of ``df`` and fill its remaining gaps, in place.

    Steps:

    1. Every column that contains only NaN is removed.
    2. For each remaining column that still has missing values, a missing
       last row is seeded with that column's median (NaN values are skipped
       when computing it). This gives the interpolation an end point when
       the column ends in a run of NaN.
    3. The column is filled with ``interpolate()`` using pandas' default
       settings.

    A last-row value that is already present is kept as is; only a missing
    one is replaced by the median.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean. It is modified in place.

    Returns
    -------
    None
    """
    df.dropna(axis=1,how='all',inplace=True)
    columns_navalue = df.columns[df.isna().any()].tolist()
    for column in columns_navalue:
        # Work on a copy so the write below never targets a view of ``df``.
        values = df[column].copy()
        # Positional access (.iloc) replaces the removed ``.ix`` indexer and
        # does not depend on the type of the row labels.
        if values.isna().iloc[-1]:
            values.iloc[-1] = values.median()
        df[column] = values.interpolate()
In [8]:
# clean_dataframe modifies data_set_1 in place; display the cleaned table.
clean_dataframe(data_set_1)
data_set_1
Out[8]:
2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
Enero 1420318 1549611 1453010 1640530 1908782 2157086 2481001 2875789 3368321 3824017 3785834 3946743 4394706 4377928 4701689.0
Febrero 1378193 1506757 1416817 1618220 1893422 2095366 2287053 2776543 3243486 3637264 3713502 3852850 4269580 4168647 4634612.0
Marzo 1452044 1647629 1508036 1761946 2020172 2123012 2378428 2950707 3321767 3858708 3819804 3921180 4236281 4096350 4624401.0
Abril 1443234 1579655 1528134 1781544 1914748 2125960 2448616 2894167 3322076 3627288 3734511 3733996 4021212 4129642 4410177.0
Mayo 1456655 1615637 1507493 1769547 2059807 2176022 2575576 2990604 3420816 3876882 3840284 3967218 4190828 4272485 4590470.0
Junio 1453778 1608856 1491043 1795797 1967317 2087684 2510546 2954453 3451280 3829325 3666169 3846446 3884905 4076724 4247453.0
Julio 1660090 1844513 1742885 1934685 2254533 2371392 2881280 3373290 3726624 4178428 4026351 4246979 4535388 4726366 4985381.0
Agosto 1642581 1802806 1776761 1951739 2286921 2335697 2838218 3262290 3783340 4156695 4112468 4159557 4420995 4577075 4877017.0
Septiembre 1480796 1660168 1624773 1798843 2075029 2233111 2602236 3063591 3652406 3843260 3780242 3880323 4097249 4292631 4597647.0
Octubre 1614864 1784227 1741021 1890913 2231985 2439174 2877754 3306183 3752538 4029520 3958797 4226737 4387730 4353795 4606565.0
Noviembre 1518051 1707205 1687323 1886121 2153152 2325433 2762187 3147613 3644934 3865061 3993348 3992063 4126466 4196722 4615483.0
Diciembre 1531091 1741196 1621798 1826956 2044077 2244023 2768497 3163567 3623744 3845475 3928390 4145905 3999337 4360261 4624401.0
In [14]:
def plot_df(df,by_month = False,title ='Tourist'):
    """Draw a bar chart of the column totals of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are summed; one bar is drawn per column.
    by_month : bool, default False
        When equal to ``True`` the table is transposed first, so the bars
        correspond to the rows of the original table and the x axis is
        labelled 'Month' instead of 'Year'.
    title : str, default 'Tourist'
        Title of the figure.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    if by_month == True:
        col_name = 'Month'
        df = df.T
    else:
        col_name = 'Year'

    # One row per bar: the column label and the sum of that column.
    totals = df.sum()
    info = totals.to_frame().reset_index()
    info.columns = [col_name,'Tourist_count']

    ax = sns.barplot(data=info,x=col_name,y='Tourist_count')

    # Write each total, expressed in millions, slightly above its bar.
    for position, total in enumerate(info['Tourist_count'].values):
        label = '%.1fMM'%(total/1000000)
        ax.text(position-.2 , total+total*.01 , label, color='black', fontweight='bold')

    # Re-label the y ticks with thousands separators, leaving out the last tick.
    tick_values, _ = plt.yticks()
    tick_labels = ['{:,}'.format(tick) for tick in tick_values]
    plt.yticks(tick_values[:-1], tick_labels[:-1])

    plt.title(title)
    plt.show()
In [15]:
# One bar per column of data_set_1 (the year columns shown above), each the sum of that column.
plot_df(data_set_1,title='Arribos a establecimientos de hospedaje')
In [16]:
def show_descriptive_data(name_file,by_month = False,title ='Tourist'):
    """Load one workbook, clean it and plot its totals.

    Parameters
    ----------
    name_file : str
        Path of the Excel file; its 'rptaniomes' sheet is read, skipping the
        first 3 rows, keeping 12 rows and using the first column as index.
    by_month : bool, default False
        Forwarded to ``plot_df``.
    title : str, default 'Tourist'
        Forwarded to ``plot_df`` as the figure title.
    """
    read_options = dict(sheet_name='rptaniomes', nrows=12, skiprows=3, index_col=0)
    table = pd.read_excel(name_file, **read_options)
    clean_dataframe(table)
    plot_df(table, by_month, title)
In [17]:
# Per-column totals for the rptaniomes_b.xls workbook (titled as national visitors).
show_descriptive_data('rptaniomes_b.xls',title='Arribo de visitantes nacionales a establecimientos de hospedaje')
In [18]:
# Per-column totals for the rptaniomes_c.xls workbook (titled as foreign visitors).
show_descriptive_data('rptaniomes_c.xls',title='Arribo de visitantes extranjeros a establecimientos de hospedaje')
In [19]:
# by_month=True makes plot_df transpose the table, so there is one bar per index row of the sheet.
show_descriptive_data('rptaniomes_a.xls',by_month=True,title='Arribos mensual a establecimientos de hospedaje')
In [20]:
# Transposed (by_month=True) totals for rptaniomes_b.xls (titled as national visitors).
show_descriptive_data('rptaniomes_b.xls',by_month=True,title='Arribo mensual de visitantes nacionales a establecimientos de hospedaje')
In [21]:
# Transposed (by_month=True) totals for rptaniomes_c.xls (titled as foreign visitors).
show_descriptive_data('rptaniomes_c.xls',by_month=True,title='Arribo mensual visitantes extranjeros a establecimientos de hospedaje')