#!/usr/bin/env python
# coding: utf-8

# Working through chapter 9 of "機械学習のための特徴量エンジニアリング"
# (Feature Engineering for Machine Learning).
#
# Notes
#
# - Data: https://www.openacademic.ai/oag/
# - Only the v1 release of the dataset works with this code
# - Even 32 GB of memory is not enough for the full file

# In[1]:

import pandas as pd

# In[2]:

get_ipython().system('head ./data/mag_papers_0.txt -n 1')

# In[3]:

get_ipython().system('wc -l ./data/mag_papers_0.txt')

# In[4]:

# Work with the first 20,000 records only, to keep memory usage manageable.
get_ipython().system('head ./data/mag_papers_0.txt -n 20000 > ./data/mag_papers_0_head.txt')

# In[5]:

orig_df = pd.read_json('./data/mag_papers_0_head.txt', lines=True)
orig_df.shape

# In[6]:

orig_df.head()

# In[7]:

# Keep English papers, drop duplicate titles, and drop the columns the
# recommender does not use.
model_df = (orig_df.query('lang == "en"')
            .drop_duplicates(subset='title', keep='first')
            .drop(['doc_type', 'doi', 'id', 'issue', 'lang', 'n_citation',
                   'page_end', 'page_start', 'publisher', 'references',
                   'url', 'venue', 'volume'],
                  axis=1))
model_df.shape

# In[8]:

model_df.head()

# In[9]:

del orig_df

# In[10]:

def feature_array(x):
    """One-hot encode a Series of lists, one row per paper."""
    df_list = [pd.DataFrame([[1] * len(val)], columns=val, index=[index])
               if isinstance(val, list)
               else pd.DataFrame(index=[index])
               for val, index in zip(x.values, x.index)]
    # Stack the per-paper rows; axis=0 aligns identical field-of-study names
    # into a single column (axis=1 would create one column per occurrence,
    # so a term shared by two papers would never overlap).
    feature_df = pd.concat(df_list, axis=0, sort=True)
    return feature_df.fillna(0)

# In[11]:

year_features = pd.get_dummies(model_df['year'].astype('category'))
year_features.head(2)

# In[12]:

fos_features = feature_array(model_df['fos'])
first_features = fos_features.join(year_features).T

from sys import getsizeof
print('Size of first feature array: ', getsizeof(first_features))

# In[13]:

first_features.head()

# In[14]:

from scipy.spatial.distance import cosine

def item_collab_filter(features_df):
    # scipy's cosine() is a distance, so 1 - cosine() is a similarity.
    return pd.DataFrame([[1 - cosine(col_val1, col_val2)
                          for col_val1 in features_df.T.values]
                         for col_val2 in features_df.T.values],
                        index=features_df.columns,
                        columns=features_df.columns)

# In[15]:

item_collab_filter(first_features.loc[:, 0:5])

# In[ ]:

first_items = item_collab_filter(first_features.loc[:, 0:1000])
first_items.head()

# In[ ]:

import seaborn as sns
import numpy as np

sns.set()
ax = sns.heatmap(first_items.fillna(0),
                 vmin=0, vmax=1,
                 cmap='YlGnBu',
                 xticklabels=250, yticklabels=250)
ax.tick_params(labelsize=12)

# In[16]:

year_min = model_df['year'].min()
year_max = model_df['year'].max()
print('Year spread:', year_min, '-', year_max)
print('Quantile spread:\n', model_df['year'].quantile([0.25, 0.5, 0.75]))

# In[17]:

model_df['year'].hist(bins=year_max - year_min)

# In[18]:

# Roughly one bin per decade. Note the parentheses: divide first, then round.
bins = int(round((year_max - year_min) / 10))
bins

# In[19]:

temp_df = pd.DataFrame(index=model_df.index)
temp_df['yearBinned'] = pd.cut(model_df['year'].tolist(), bins, precision=0)

# In[20]:

X_yrs = pd.get_dummies(temp_df['yearBinned'])
X_yrs.columns.categories

# In[21]:

X_yrs.sum().plot(kind='bar')

# In[22]:

X_fos = fos_features.values

print('Our pandas Series, in bytes: ', getsizeof(fos_features))
print('Our hashed numpy array, in bytes: ', getsizeof(X_fos))

# In[23]:

X_fos.shape

# In[24]:

X_fos.nbytes

# In[43]:

second_features = np.append(X_fos, X_yrs, axis=1)
print('The power of feature engineering saves us, in bytes: ',
      getsizeof(first_features) - getsizeof(second_features))

from sklearn.metrics.pairwise import cosine_similarity

def piped_collab_filter(features_matrix, index, top_n):
    # cosine_similarity() already returns a similarity (unlike scipy's
    # cosine(), which is a distance), so it must not be subtracted from 1
    # before sorting in descending order.
    item_similarities = \
        cosine_similarity(features_matrix[index:index+1], features_matrix).flatten()
    related_indices = \
        [i for i in item_similarities.argsort()[::-1] if i != index]
    return [(i, item_similarities[i]) for i in related_indices][0:top_n]

# In[36]:

first_features.memory_usage().sum() / 1000 / 1000

# In[37]:

second_features.nbytes / 1000 / 1000

# In[35]:

# getsizeof() understates a DataFrame's footprint; compare the actual buffers.
print('The power of feature engineering saves us, in bytes: ',
      first_features.memory_usage().sum() - second_features.nbytes)
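# In[ ]:

# Editor's aside: a quick sanity check of piped_collab_filter on a tiny toy
# matrix (made-up data, not the OAG papers). Row 1 duplicates row 0 and row 2
# is orthogonal to it, so the top match for index 0 should be item 1 with a
# similarity of 1.0, followed by the partially overlapping item 3.

toy_matrix = np.array([[1, 0, 1, 0],
                       [1, 0, 1, 0],   # identical to row 0 -> similarity 1.0
                       [0, 1, 0, 1],   # orthogonal to row 0 -> similarity 0.0
                       [1, 1, 1, 1]])  # partial overlap -> similarity ~0.71
print(piped_collab_filter(toy_matrix, 0, 2))
# expected: [(1, 1.0), (3, 0.7071...)]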
# In[44]:

filled_df = model_df.fillna('None')

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
X_abstract = vectorizer.fit_transform(filled_df['abstract'])
third_features = np.append(second_features, X_abstract.toarray(), axis=1)

# In[45]:

authors_list = []

for row in filled_df.itertuples():
    # Create a dict object for each row: the keys are the first author's
    # attribute values (name, affiliation, ...), all mapped to an indicator
    # value of 1 so that DictVectorizer produces a binary one-hot encoding.
    # (Using row.Index as the value would make the encoding scale with the
    # row position.)
    if isinstance(row.authors, str):
        y = {'None': 1}
    elif isinstance(row.authors, list):
        # Add these keys and values to the dict object.
        y = dict.fromkeys(row.authors[0].values(), 1)
    authors_list.append(y)

authors_list[0:5]

# In[46]:

from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer(sparse=False)
D = authors_list
X_authors = v.fit_transform(D)
fourth_features = np.append(third_features, X_authors, axis=1)

# In[47]:

len(D)

# In[48]:

X_authors.shape

# In[49]:

del second_features
del third_features
del X_fos

# In[ ]:

def paper_recommender(items_df, paper_ix, top_n):
    if paper_ix in model_df.index:
        print('Based on the paper:')
        print('Paper index = ', model_df.loc[paper_ix].name)
        print('Title :', model_df.loc[paper_ix]['title'])
        print('FOS :', model_df.loc[paper_ix]['fos'])
        print('Year :', model_df.loc[paper_ix]['year'])
        print('Abstract :', model_df.loc[paper_ix]['abstract'])
        print('Authors :', model_df.loc[paper_ix]['authors'], '\n')
        # Translate the DataFrame label into a positional index into the
        # feature matrix, whose rows follow model_df's row order.
        array_ix = model_df.index.get_loc(paper_ix)

        top_results = piped_collab_filter(items_df, array_ix, top_n)

        print('\nTop', top_n, 'results: ')
        for order, (rec_ix, score) in enumerate(top_results, start=1):
            print(order, '. Paper index = ', model_df.iloc[rec_ix].name)
            print('Similarity score: ', score)
            print('Title :', model_df.iloc[rec_ix]['title'])
            print('FOS :', model_df.iloc[rec_ix]['fos'])
            print('Year :', model_df.iloc[rec_ix]['year'])
            print('Abstract :', model_df.iloc[rec_ix]['abstract'])
            print('Authors :', model_df.iloc[rec_ix]['authors'], '\n')
    else:
        print('Whoops! Choose another paper. Try something from here: \n',
              model_df.index[100:200])

paper_recommender(fourth_features, 2, 3)

# In[ ]:
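# Editor's aside: a hedged sketch of cutting the memory footprint further,
# given the note above that even 32 GB of RAM is not enough. fourth_features
# is almost entirely zeros (one-hot fields of study, years, and authors, plus
# TF-IDF terms), so a scipy CSR matrix stores it in a fraction of the dense
# size, and sklearn's cosine_similarity -- and therefore piped_collab_filter
# and paper_recommender -- accepts the sparse matrix unchanged.

from scipy.sparse import csr_matrix

sparse_features = csr_matrix(fourth_features)
sparse_bytes = (sparse_features.data.nbytes
                + sparse_features.indices.nbytes
                + sparse_features.indptr.nbytes)
print('dense, in bytes: ', fourth_features.nbytes)
print('sparse, in bytes:', sparse_bytes)

# The recommender runs on the sparse matrix as-is:
paper_recommender(sparse_features, 2, 3)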