speakers = [{'name':'Mai Giménez', 'twitter': '@adahopper', 'weapons': ['Python', 'Bash', 'C++'], 'pyladies': True}, {'name':'Angela Rivera', 'twitter': '@ghilbrae ', 'weapons': ['Python', 'Django', 'C++'], 'pyladies': True}] for speaker in speakers: for k,v in speaker.items(): print("- {}: {}".format(k,v)) print() #print('\n'.join(["- {}: {}".format(k, v) for speaker in speakers for k,v in speaker.items()])) from IPython.display import Image Image(filename='pyladies.png') Image(filename='notebook.png') Image(filename='marvel_logo.jpg') from marvel.marvel import Marvel from marveldev import Developer developer = Developer() marvel = Marvel(*developer.get_marvel_credentials()) character_data_wrapper = marvel.get_characters(orderBy="-modified", limit="100") print(character_data_wrapper.status) for character in character_data_wrapper.data.results[:10]: print("* {character.name}: {character.modified_raw}".format(character=character)) ', '.join([attr for attr in dir(character) if not attr.startswith('_')]) from IPython.core.display import HTML HTML("".format(character_data_wrapper.data.results[2].wiki)) import json from os.path import join from os import listdir import socket MARVELOUSDB_PATH_A = "../marvelousdb-master/data/characters/" MARVELOUSDB_PATH_M = "../marvelousdb/data/characters/" MARVELOUSDB_PATH = MARVELOUSDB_PATH_M if 'alan' in socket.gethostname() else MARVELOUSDB_PATH_A json_db = [join(MARVELOUSDB_PATH, json_file) for json_file in listdir(MARVELOUSDB_PATH)] print("En MarvelousDB tenemos un backup de {} personajes".format(len(json_db))) import pandas as pd json_to_dataframe = [] for json_file in json_db: with open(json_file, 'r') as jf: json_character = json.loads(''.join(jf.readlines())) json_plain = pd.io.json.json_normalize(json_character) json_to_dataframe.append(json_plain) marvel_df = pd.concat(json_to_dataframe) df = pd.concat([pd.io.json.json_normalize(json.loads(''.join(open(json_file,'r').readlines()))) for json_file in json_db]) all(df == marvel_df) marvel_df.head() marvel_df.shape ', '.join(marvel_df.columns.values) marvel_df.dropna() #Sacamos la lista de creadores que hay en nuestros datos creators_serie = marvel_df['wiki.creators'].dropna() creators_serie.describe() #Renombramos la serie y el índice creators_serie.name = 'Creadores de personajes' creators_serie.index.name = 'creators' # Podemos usar head o como estamos sobre series también podemos coger una porción de la lista # creators_serie.head() creators_serie[:20] default_string = creators_serie != "this has not been updated yet" default_string.head() empty_string = creators_serie != "" empty_string[:10] default_string and empty_string creators_mask = default_string & empty_string creators_mask[:10] creators_serie[creators_mask].head() import re creators = [re.split('&|and|,', line) for line in creators_serie[creators_mask]] clean_creators = pd.Series([c.rstrip().lstrip() for creator in creators for c in creator]) clean_creators.head() clean_creators.value_counts() from IPython.display import Image Image(filename='stanvschris.png') marvel_df.dropna(subset=['wiki.current_members'])['name'] %timeit (~marvel_df['wiki.current_members'].isnull()) import numpy as np %timeit (np.invert(marvel_df['wiki.current_members'].isnull())) not_groups_mask = marvel_df['wiki.current_members'].isnull() not_groups_mask.head() marvel_df_characters = marvel_df[not_groups_mask] marvel_df_characters.head() marvel_df_characters.shape from IPython.display import Image Image(filename='oracle.jpg') marvel_df_characters['wiki.skin'].dropna() marvel_groups = marvel_df.dropna(subset=['wiki.current_members']) marvel_groups['wiki.leader'].dropna() # Agrupamos los datos para tener claro con que queremos trabajar # No hay nadie con 'ocupation' así que lo quitamos physical_data = {'wiki.hair':'hair', 'wiki.weight':'weight', 'wiki.height':'height', 'wiki.eyes':'eyes'} cultural_data = {'wiki.education':'education', 'wiki.citizenship':'citizenship', 'wiki.place_of_birth':'place_of_birth', 'wiki.occupation':'occupation'} personal_data = {'wiki.bio':'bio', 'wiki.bio_text':'bio', 'wiki.categories':'categories'} marvelesque_data = {'wiki.abilities':'abilities', 'wiki.weapons':'weapons', 'wiki.powers': 'powers'} data_keys = (list(physical_data.keys()) + list(cultural_data.keys()) + list(personal_data.keys()) + ['name','comics.available']) #+ marvelesque_data print(data_keys) clean_df = marvel_df_characters.dropna(subset = data_keys) clean_df = clean_df[data_keys].set_index('name') clean_df.shape clean_df[list(physical_data.keys())].head() clean_df[list(physical_data.keys())].describe() clean_df[list(cultural_data.keys())].head() clean_df[list(cultural_data.keys())].describe() clean_df.dtypes clean_df.describe() clean_df[clean_df['comics.available'] == 2575.000000] clean_df['wiki.categories'] women = clean_df['wiki.categories'].map(lambda x: 'Women' in x) clean_df['Women'] = women women[:5] # ~ Esto es una negación element-wise print("Women: #{}, men #{}".format(clean_df[women].shape[0],clean_df[~women].shape[0])) villain = clean_df['wiki.categories'].map(lambda x: 'Villains' in x) clean_df['Villain'] = villain men = ~women gender_data = {'Women':{'Heroes':0,'Villains':0},'Men':{'Heroes':0,'Villains':0}} # Women and villains gender_data['Women']['Villains'] = clean_df[villain & women].shape[0] # Women and heroes gender_data['Women']['Heroes'] = clean_df[~villain & women].shape[0] # Men and villains gender_data['Men']['Villains'] = clean_df[villain & men].shape[0] # Men and heroes gender_data['Men']['Heroes'] = clean_df[~villain & men].shape[0] gender_data %matplotlib inline import matplotlib.pyplot as plt n_groups = 2 men_data = (gender_data['Men']['Villains'], gender_data['Men']['Heroes']) women_data = (gender_data['Women']['Villains'], gender_data['Women']['Heroes']) fig, ax = plt.subplots() index = np.arange(n_groups) bar_width = 0.4 opacity = 0.5 rects1 = plt.bar(index, men_data, bar_width, alpha=opacity, color='b', label='Hombres') rects2 = plt.bar(index + bar_width, women_data, bar_width, alpha=opacity, color='r', label='Mujeres') plt.xlabel('Rol') plt.ylabel('Número de personajes') plt.title('Distribución por género y roles') plt.xticks(index + bar_width, ('Villanos', 'Héroes')) plt.legend(loc=0, borderaxespad=1.) plt.show() red_heads = clean_df['wiki.hair'].map(lambda x: 'Red' in x) clean_df['red_heads'] = red_heads red_heads[:5] print("Red heads: #{}, Non-red heads #{}".format(clean_df[red_heads].shape[0],clean_df[~red_heads].shape[0])) non_red = ~red_heads hair_data = {'Women':{'Red heads':0,'Non-red heads':0},'Men':{'Red heads':0,'Non-red heads':0}} # Red haired women hair_data['Women']['Red heads'] = clean_df[red_heads & women].shape[0] # Non-red haired women hair_data['Women']['Non-red heads'] = clean_df[~red_heads & women].shape[0] # Red haired men hair_data['Men']['Red heads'] = clean_df[red_heads & men].shape[0] # Non-red haired women hair_data['Men']['Non-red heads'] = clean_df[~red_heads & men].shape[0] hair_data #¿Qué es esto? redwomen = 30 / 199. redmen = 46 / 563. print('Women: {0:5.2f}%, Men: {1:5.2f}%'.format(redwomen * 100, redmen * 100)) import sys import matplotlib %matplotlib inline import sklearn print("Versión de Python: ", sys.version) print("Versión de Pandas: ", pd.version.short_version) print("Versión de Numpy: ", np.version.short_version) print("Versión de Matplotlib: ", matplotlib.__version__) print("Versión de Pandas: ", pd.version.short_version) print("Versión de scikit-learn: ", sklearn.__version__) clean_df['wiki.weight'].describe() physical = clean_df[clean_df['wiki.weight'] != "Unrevealed"] any(physical['wiki.height'] == "Unrevealed") physical_knn = physical[['wiki.weight', 'wiki.height', 'Women', 'Villain']] physical_knn.dtypes physical_knn physical_knn.applymap(str) physical_knn = physical_knn[physical_knn['wiki.weight'].str.contains("lbs.")] physical_knn = physical_knn[physical_knn['wiki.height'].str.contains('’|\'')] def get_weight(pandas_weight): """ Return first int parameter in a string """ for p in pandas_weight.split(): try: return int(p) except ValueError: pass physical_knn['wiki.weight'] = physical_knn['wiki.weight'].map(get_weight) FOOT = 30.48 INCH = 2.54 def get_height(pandas_height): """ Return first int parameter in a string """ height = None for p in pandas_height.split(): colon_split = p.split('\'') strange_colon_split = p.split('’') if len(colon_split) == 2 : height = colon_split elif len(colon_split) == 4 : height = colon_split[:2] height[1] += "\'" elif len(strange_colon_split) == 2 : height = strange_colon_split elif len((pandas_height.split()[-1]).split('\'')) == 2: height = pandas_height.split()[-1].split('\'') elif len((pandas_height.split()[-1]).split('’')) == 2: height = pandas_height.split()[-1].split('’') else: universe_split = ((pandas_height.split(';')[0]).split()[-1]).split('\'') if len(universe_split) == 2: height = universe_split else: space_split = (pandas_height.split(';')[0].split()[-2:]) if space_split[0][-1] == '\'' or space_split[0][-1] == '’': height = [space_split[0][:-1], space_split[1]] else: return None if height: try: foot_part = int(height[0]) inch_part = int(height[1][:-1]) if height[1][:-1].strip() else 0 return (foot_part*FOOT + inch_part*INCH) except ValueError: pass physical_knn['wiki.height'] = physical_knn['wiki.height'].map(get_height) physical_knn = physical_knn.dropna() physical_knn.dtypes physical_knn.shape from math import floor TRAIN_PERCENTAGE = 0.8 train_section = floor(physical_knn.shape[0]*TRAIN_PERCENTAGE) test_section = physical_knn.shape[0]-train_section print("Usaremos {} personajes para entrenar el clasificador y"\ " {} para probar el clasificador entrenado.".format(train_section, test_section)) train_rows = np.random.choice(physical_knn.index.values, train_section) test_rows = np.setdiff1d(physical_knn.index.values,train_rows) physical_knn.loc[train_rows[0]] X_train = physical_knn.loc[train_rows][['wiki.weight','wiki.height']] y_train = physical_knn.loc[train_rows]['Women'] X_test = physical_knn.loc[test_rows][['wiki.weight','wiki.height']] y_test = physical_knn.loc[test_rows]['Women'] for i, group in physical_knn.groupby(women): if not i: ax = group.plot(kind='scatter', x='wiki.height', y='wiki.weight', color='DarkBlue', label='Men'); else: print(i) group.plot(kind='scatter', x='wiki.height', y='wiki.weight', color='DarkGreen', label='Women', ax=ax) print(physical_knn.groupby(women).aggregate(np.mean)) from sklearn import neighbors classifier = neighbors.KNeighborsClassifier() classifier.fit(X_train, y_train) predict = classifier.predict(X_test) predict from sklearn import metrics accuracy = metrics.accuracy_score(y_test, predict) precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_test, predict) print("* Acierto: {:.2f}%".format(accuracy*100)) print("* Precisión: {}\n* Exhaustividad: {}.\n* F1-Score: {}".format(accuracy*100, precision, recall, f1)) %%latex \begin{align} accuray = \frac{\text{# True Positives}+\text{# True Negatives}} {\text{# True Positives}+\text{False Positives} + \text{False Negatives} + \text{True Negatives}} \end{align} %%latex \begin{align} precision = \frac{\text{# True Positives}} {\text{# True Positives}+\text{False Positives}} \end{align} from matplotlib.colors import ListedColormap cmap_light = ListedColormap(['#AAAAFF', '#AAFFAA']) cmap_bold = ListedColormap(['#0000FF', '#00FF00']) step = 2 x_min, x_max = X_test['wiki.height'].min() - 1, X_test['wiki.height'].max() + 1 y_min, y_max = X_test['wiki.weight'].min() - 1, X_test['wiki.weight'].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step)) prediction = classifier.predict(X_test) Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.figure() plt.pcolormesh(xx, yy, Z, cmap=cmap_light) plt.scatter( X_test['wiki.height'], X_test['wiki.weight'], c=y_test, cmap=cmap_bold) plt.xlim(xx.min(), xx.max()) plt.ylim(yy.min(), 400) import time for n in range(1,20, 2): classifier = neighbors.KNeighborsClassifier(n_neighbors=n) classifier.fit(X_train, y_train) predict = classifier.predict(X_test) accuracy = metrics.accuracy_score(y_test, predict) print("({}) Acierto: {:.2f}%".format(n, accuracy*100)) cultural_knn = clean_df[['wiki.education', 'wiki.citizenship', 'Women', 'Villain']] cultural_knn.dtypes usa = cultural_knn['wiki.citizenship'].map(lambda x: 'U.S.A.' in x) cultural_knn['USA'] = usa cultural_knn = cultural_knn.drop('wiki.citizenship',1) cultural_knn def delete_without_education(cultural_knn, not_education): for word in not_education: cultural_knn = cultural_knn[~cultural_knn['wiki.education'].str.contains(word)] return cultural_knn #Eliminar todo los que sean "Unreveal" cultural_knn = delete_without_education(cultural_knn, ["Unrevealed", "unrevealed", 'None', 'none', 'Not applicable', 'Unknown', 'unknown', 'Inapplicable', 'Limited']) cultural_knn = cultural_knn[cultural_knn['wiki.education'] != ''] # Crear los grupos de niveles educativos education = cultural_knn['wiki.education'] unfinished = education.map(lambda x: 'unfinished' in x or 'dropout' in x or 'incomplete' in x or 'drop-out' in x or 'No official schooling' in x or 'No formal education' in x or 'Unfinished' in x or 'Incomplete' in x) education[unfinished].tolist() education = education[~unfinished] phd = education.map(lambda x: 'Ph.D' in x or 'master' in x or 'Masters' in x or 'PhD' in x or 'doctorate' in x or 'Doctorate' in x or 'Ph.d.' in x or 'Doctoral' in x or 'NASA' in x or 'Journalism graduate' in x or 'scientist' in x or 'Geneticist' in x or 'residency' in x) education[phd].tolist() education = education[~phd] college = education.map(lambda x: 'College' in x or 'college' in x or 'University' in x or 'post-graduate' in x or 'B.A' in x or 'B.S.' in x or 'university' in x or 'Master' in x or 'Collage' in x or 'Degree' in x or 'degree' in x or 'Engineering' in x or 'engineer' in x or 'programming' in x or 'Programming' in x or 'Doctor' in x or 'Medical school' in x or 'higher education' in x) education[college].tolist() education = education[~college] militar = education.map(lambda x: 'Military' in x or 'Xandarian Nova Corps' in x or 'FBI' in x or 'S.H.I.E.L.D.' in x or 'military' in x or 'Nicholas Fury' in x or 'Warrior' in x or 'combat' in x or 'Combat' in x or 'Soldier' in x or 'spy academy' in x or 'Police' in x or 'warfare' in x or 'Public Eye' in x) education[militar].tolist() education = education[~militar] hs = education.map(lambda x: 'High school' in x or 'high school' in x or 'High-school' in x or 'High School' in x or 'high School' in x) education[hs].tolist() education = education[~hs] tutored = education.map(lambda x: 'Tutored' in x or 'tutors' in x or 'tutored' in x or 'Mentored' in x or 'Home schooled' in x or 'Private education' in x) education[tutored].tolist() education = education[~tutored] autodidacta = education.map(lambda x: 'Self-taught' in x or 'self-taught' in x or 'Little or no formal schooling' in x or 'Little formal schooling' in x or 'Some acting school' in x or 'through observation' in x) education[autodidacta].tolist() education = education[~autodidacta] special = education.map(lambda x: 'Sorcery' in x or 'cosmic experience' in x or 'magic' in x or 'Priests of Pama' in x or 'Xavier Institute' in x or 'Carlos Javier’s' in x or 'Self educated' in x or 'Shao-Lom' in x or 'Centuries of study and experience' in x or 'Askani' in x or 'Madame DuPont' in x or 'Titanian' in x or 'arcane arts' in x or 'Muir-MacTaggert' in x or 'Uploaded data' in x or 'Programmed' in x or 'Accelerated' in x or 'Inhumans' in x or 'Able to access knowledge' in x or 'lifetime' in x or 'Watchers\' homeworld' in x or 'Uranian Eternals' in x or 'Arcturus' in x or 'Oatridge School for Boys' in x) education[special].tolist() education = education[~special] basic = education.map(lambda x: 'Self-taught' in x or 'Homed schooled' in x or 'graduate school' in x or 'Elementary school' in x or 'Secondary school' in x or 'school graduate' in x or 'Boarding school' in x or 'Massachusetts Academy' in x or 'school graduate' in x) education[basic].tolist() education = education[~basic] educational_dict = {'autodidacta': autodidacta, 'unfinished': unfinished, 'superior': phd, 'college':college, 'militar': militar, 'high school':hs, 'tutored': tutored, 'special':special, 'basic': basic} numeric = {'autodidacta': 1, 'unfinished': 2, 'superior': 3, 'college':4, 'militar': 5, 'high school':6, 'tutored': 7, 'special':8, 'basic': 9} def clean_education_levels(educational_dict, cultural_knn): """ It will use our new categories in the wiki.education column""" for k, education in educational_dict.items(): index = education[education.loc[:]].index for character in index: cultural_knn.loc[character, 'wiki.education'] = numeric[k] clean_education_levels(educational_dict, cultural_knn) TRAIN_PERCENTAGE = 0.8 train_section = floor(cultural_knn.shape[0]*TRAIN_PERCENTAGE) test_section = cultural_knn.shape[0]-train_section print("Usaremos {} personajes para entrenar el clasificador y"\ " {} para probar el clasificador entrenado.\n".format(train_section, test_section)) train_rows = np.random.choice(cultural_knn.index.values, train_section) test_rows = np.setdiff1d(cultural_knn.index.values,train_rows) print(cultural_knn.loc[train_rows[0]]) for i, group in cultural_knn.groupby(women): print(group) for i, group in cultural_knn.groupby(women): if not i: area = (np.pi * (group.shape[0])**2)*.002 ax = group.plot(kind='scatter', x='wiki.education', y='USA', s=area, color='Cornflowerblue', label='Men', alpha=0.5); else: area = (np.pi * (group.shape[0])**2)*.002 group.plot(kind='scatter', x='wiki.education', y='USA', color='LightGreen', label='Women', ax=ax, s=area, alpha=0.5) ax.set_xticks(range(1,10)) ax.set_xticklabels(list(numeric.keys()), rotation='vertical') ax.set_yticks(range(0,2)) ax.set_yticklabels(['USA', 'non USA'], rotation='horizontal') ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=2, fancybox=True, shadow=True) X_train = cultural_knn.loc[train_rows][['wiki.education','USA']] y_train = cultural_knn.loc[train_rows]['Women'] X_test = cultural_knn.loc[test_rows][['wiki.education','USA']] y_test = cultural_knn.loc[test_rows]['Women'] classifier = neighbors.KNeighborsClassifier() classifier.fit(X_train, y_train) predict = classifier.predict(X_test) accuracy = metrics.accuracy_score(y_test, predict) precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_test, predict) print("* Acierto: {:.2f}%".format(accuracy*100)) print("* Precisión: {}\n* Exhaustividad: {}.\n* F1-Score: {}".format(accuracy*100, precision, recall, f1)) cmap_light = ListedColormap(['#AAAAFF', '#AAFFAA']) cmap_bold = ListedColormap(['#0000FF', '#00FF00']) step = 1 xx, yy = np.meshgrid(np.arange(1, 10, step), np.arange(0, 1, step)) prediction = classifier.predict(X_test) Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) plt.figure() plt.pcolormesh(xx, yy, Z, cmap=cmap_light) plt.scatter( X_test['wiki.education'], X_test['USA'], c=y_test, cmap=cmap_bold) plt.xlim(xx.min(), xx.max()) plt.yticks(range(0,2),['USA', 'non USA'], rotation='horizontal') plt.ylim(-0.5, 1.5) plt.xticks(range(1,11), list(numeric.keys()), rotation='vertical') plt.xlabel("Education") %pylab inline --no-import-all pd.set_option('display.mpl_style', 'default') figsize(15, 6) pd.set_option('display.line_width', 4000) pd.set_option('display.max_columns', 100) from matplotlib.pyplot import * marvel_df['modified'] = pd.to_datetime(marvel_df['modified']) plot(marvel_df['modified']) start = marvel_df.modified.min() end = marvel_df.modified.max() yearly_range = pd.date_range(start, end, freq='365D6H') marvel_df[['modified']].head() characters_per_year = marvel_df.groupby(marvel_df['modified'].map(lambda x: x.year)).size() characters_per_year characters_per_year.plot() marvel_df.sort('modified',ascending=False).head()