#!/usr/bin/env python
# coding: utf-8

# Working through chapter 9 of "機械学習のための特徴量エンジニアリング"
# (Feature Engineering for Machine Learning).
#
# Notes
#
# - Data: https://www.openacademic.ai/oag/
# - Only the v1 release of the dataset works with this code
# - Even 32 GB of memory is not enough for the full file

# In[1]:

import pandas as pd

# In[2]:

get_ipython().system('head ./data/mag_papers_0.txt -n 1')

# In[3]:

get_ipython().system('wc -l ./data/mag_papers_0.txt')

# In[4]:

# Work with the first 20,000 records only, to keep memory usage manageable.
get_ipython().system('head ./data/mag_papers_0.txt -n 20000 > ./data/mag_papers_0_head.txt')

# In[5]:

orig_df = pd.read_json('./data/mag_papers_0_head.txt', lines=True)
orig_df.shape

# In[6]:

orig_df.head()

# In[7]:

# Keep English papers, drop duplicate titles, and drop the columns the
# recommender does not use.
model_df = (orig_df.query('lang == "en"')
            .drop_duplicates(subset='title', keep='first')
            .drop(['doc_type', 'doi', 'id', 'issue', 'lang', 'n_citation',
                   'page_end', 'page_start', 'publisher', 'references',
                   'url', 'venue', 'volume'],
                  axis=1))
model_df.shape

# In[8]:

model_df.head()

# In[9]:

del orig_df

# In[10]:

def feature_array(x):
    """One-hot encode a Series of lists, one row per paper."""
    df_list = [pd.DataFrame([[1] * len(val)], columns=val, index=[index])
               if isinstance(val, list)
               else pd.DataFrame(index=[index])
               for val, index in zip(x.values, x.index)]
    # Stack the per-paper rows; axis=0 aligns identical field-of-study names
    # into a single column (axis=1 would create one column per occurrence,
    # so a term shared by two papers would never overlap).
    feature_df = pd.concat(df_list, axis=0, sort=True)
    return feature_df.fillna(0)

# In[11]:

year_features = pd.get_dummies(model_df['year'].astype('category'))
year_features.head(2)

# In[12]:

fos_features = feature_array(model_df['fos'])
first_features = fos_features.join(year_features).T

from sys import getsizeof
print('Size of first feature array: ', getsizeof(first_features))

# In[13]:

first_features.head()

# In[14]:

from scipy.spatial.distance import cosine

def item_collab_filter(features_df):
    # scipy's cosine() is a distance, so 1 - cosine() is a similarity.
    return pd.DataFrame([[1 - cosine(col_val1, col_val2)
                          for col_val1 in features_df.T.values]
                         for col_val2 in features_df.T.values],
                        index=features_df.columns,
                        columns=features_df.columns)

# In[15]:

item_collab_filter(first_features.loc[:, 0:5])

# In[ ]:

first_items = item_collab_filter(first_features.loc[:, 0:1000])
first_items.head()

# In[ ]:

import seaborn as sns
import numpy as np

sns.set()
ax = sns.heatmap(first_items.fillna(0),
                 vmin=0, vmax=1,
                 cmap='YlGnBu',
                 xticklabels=250, yticklabels=250)
ax.tick_params(labelsize=12)

# In[16]:

year_min = model_df['year'].min()
year_max = model_df['year'].max()
print('Year spread:', year_min, '-', year_max)
print('Quantile spread:\n', model_df['year'].quantile([0.25, 0.5, 0.75]))

# In[17]:

model_df['year'].hist(bins=year_max - year_min)

# In[18]:

# Roughly one bin per decade. Note the parentheses: divide first, then round.
bins = int(round((year_max - year_min) / 10))
bins

# In[19]:

temp_df = pd.DataFrame(index=model_df.index)
temp_df['yearBinned'] = pd.cut(model_df['year'].tolist(), bins, precision=0)

# In[20]:

X_yrs = pd.get_dummies(temp_df['yearBinned'])
X_yrs.columns.categories

# In[21]:

X_yrs.sum().plot(kind='bar')

# In[22]:

X_fos = fos_features.values

print('Our pandas Series, in bytes: ', getsizeof(fos_features))
print('Our hashed numpy array, in bytes: ', getsizeof(X_fos))

# In[23]:

X_fos.shape

# In[24]:

X_fos.nbytes

# In[43]:

second_features = np.append(X_fos, X_yrs, axis=1)
print('The power of feature engineering saves us, in bytes: ',
      getsizeof(first_features) - getsizeof(second_features))

from sklearn.metrics.pairwise import cosine_similarity

def piped_collab_filter(features_matrix, index, top_n):
    # cosine_similarity() already returns a similarity (unlike scipy's
    # cosine(), which is a distance), so it must not be subtracted from 1
    # before sorting in descending order.
    item_similarities = \
        cosine_similarity(features_matrix[index:index+1], features_matrix).flatten()
    related_indices = \
        [i for i in item_similarities.argsort()[::-1] if i != index]
    return [(i, item_similarities[i]) for i in related_indices][0:top_n]

# In[36]:

first_features.memory_usage().sum() / 1000 / 1000

# In[37]:

second_features.nbytes / 1000 / 1000

# In[35]:

# getsizeof() understates a DataFrame's footprint; compare the actual buffers.
print('The power of feature engineering saves us, in bytes: ',
      first_features.memory_usage().sum() - second_features.nbytes)
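# In[ ]:

# Editor's aside: a quick sanity check of piped_collab_filter on a tiny toy
# matrix (made-up data, not the OAG papers). Row 1 duplicates row 0 and row 2
# is orthogonal to it, so the top match for index 0 should be item 1 with a
# similarity of 1.0, followed by the partially overlapping item 3.

toy_matrix = np.array([[1, 0, 1, 0],
                       [1, 0, 1, 0],   # identical to row 0 -> similarity 1.0
                       [0, 1, 0, 1],   # orthogonal to row 0 -> similarity 0.0
                       [1, 1, 1, 1]])  # partial overlap -> similarity ~0.71
print(piped_collab_filter(toy_matrix, 0, 2))
# expected: [(1, 1.0), (3, 0.7071...)]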
# In[44]:

filled_df = model_df.fillna('None')

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_abstract = vectorizer.fit_transform(filled_df['abstract'])
third_features = np.append(second_features, X_abstract.toarray(), axis=1)

# In[45]:

authors_list = []

for row in filled_df.itertuples():
    # Create a dict object for each row: the keys are the first author's
    # attribute values (name, affiliation, ...), all mapped to an indicator
    # value of 1 so that DictVectorizer produces a binary one-hot encoding.
    # (Using row.Index as the value would make the encoding scale with the
    # row position.)
    if isinstance(row.authors, str):
        y = {'None': 1}
    elif isinstance(row.authors, list):
        # Add these keys and values to the dict object.
        y = dict.fromkeys(row.authors[0].values(), 1)
    authors_list.append(y)

authors_list[0:5]

# In[46]:

from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer(sparse=False)
D = authors_list
X_authors = v.fit_transform(D)
fourth_features = np.append(third_features, X_authors, axis=1)

# In[47]:

len(D)

# In[48]:

X_authors.shape

# In[49]:

del second_features
del third_features
del X_fos

# In[ ]:

def paper_recommender(items_df, paper_ix, top_n):
    if paper_ix in model_df.index:
        print('Based on the paper:')
        print('Paper index = ', model_df.loc[paper_ix].name)
        print('Title :', model_df.loc[paper_ix]['title'])
        print('FOS :', model_df.loc[paper_ix]['fos'])
        print('Year :', model_df.loc[paper_ix]['year'])
        print('Abstract :', model_df.loc[paper_ix]['abstract'])
        print('Authors :', model_df.loc[paper_ix]['authors'], '\n')
        # Translate the DataFrame label into a positional index into the
        # feature matrix, whose rows follow model_df's row order.
        array_ix = model_df.index.get_loc(paper_ix)

        top_results = piped_collab_filter(items_df, array_ix, top_n)

        print('\nTop', top_n, 'results: ')
        for order, (rec_ix, score) in enumerate(top_results, start=1):
            print(order, '. Paper index = ', model_df.iloc[rec_ix].name)
            print('Similarity score: ', score)
            print('Title :', model_df.iloc[rec_ix]['title'])
            print('FOS :', model_df.iloc[rec_ix]['fos'])
            print('Year :', model_df.iloc[rec_ix]['year'])
            print('Abstract :', model_df.iloc[rec_ix]['abstract'])
            print('Authors :', model_df.iloc[rec_ix]['authors'], '\n')
    else:
        print('Whoops! Choose another paper. Try something from here: \n',
              model_df.index[100:200])

paper_recommender(fourth_features, 2, 3)

# In[ ]:
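# Editor's aside: a hedged sketch of cutting the memory footprint further,
# given the note above that even 32 GB of RAM is not enough. fourth_features
# is almost entirely zeros (one-hot fields of study, years, and authors, plus
# TF-IDF terms), so a scipy CSR matrix stores it in a fraction of the dense
# size, and sklearn's cosine_similarity -- and therefore piped_collab_filter
# and paper_recommender -- accepts the sparse matrix unchanged.

from scipy.sparse import csr_matrix

sparse_features = csr_matrix(fourth_features)
sparse_bytes = (sparse_features.data.nbytes
                + sparse_features.indices.nbytes
                + sparse_features.indptr.nbytes)
print('dense, in bytes: ', fourth_features.nbytes)
print('sparse, in bytes:', sparse_bytes)

# The recommender runs on the sparse matrix as-is:
paper_recommender(sparse_features, 2, 3)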