#!/usr/bin/env python
# coding: utf-8

# # Tutorial for using Gensim's API for downloading corpuses/models
# Let's start by importing the api module.

# In[1]:


import logging
import gensim.downloader as api

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Now, lets download the text8 corpus and load it to memory (automatically)

# In[2]:


corpus = api.load('text8')


# As the corpus has been downloaded and loaded, let's create a word2vec model of our corpus.

# In[3]:


from gensim.models.word2vec import Word2Vec

model = Word2Vec(corpus)


# Now that we have our word2vec model, let's find words that are similar to 'tree'

# In[4]:


model.most_similar('tree')


# You can use the API to download many corpora and models. You can get the list of all the models and corpora that are provided, by using the code below:

# In[5]:


import json
data_list = api.info()
print(json.dumps(data_list, indent=4))


# If you want to get detailed information about the model/corpus, use:

# In[6]:


fake_news_info = api.info('fake-news')
print(json.dumps(fake_news_info, indent=4))


# Sometimes, you do not want to load the model to memory. You would just want to get the path to the model. For that, use :

# In[7]:


print(api.load('glove-wiki-gigaword-50', return_path=True))


# If you want to load the model to memory, then:

# In[8]:


model = api.load("glove-wiki-gigaword-50")
model.most_similar("glass")


# In corpora, the corpus is never loaded to memory, all corpuses wrapped to special class `Dataset` and provide `__iter__` method