!pip install -q tensorflow-gpu==2.0.0-beta1
# !pip install -q tensorflow-gpu==1.15

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

tf.__version__

# Train and save the word2vec model. This step can be skipped once the trained
# model has been uploaded to GitHub.
# !wget https://github.com/raqueeb/datasets/raw/master/bnwiki-texts.zip
# !unzip bnwiki-texts.zip

# preprocessed_text_file_path = 'bnwiki-texts-preprocessed.txt'

# lines_from_file = []
# with open(preprocessed_text_file_path, encoding='utf8') as text_file:
#     for line in text_file:
#         lines_from_file.append(line)

# tokenized_lines = []
# for single_line in lines_from_file:
#     tokenized_lines.append(single_line.split())

# Note: size=200 here does not match the '300' in the output filename below;
# use size=300 if a 300-dimensional embedding is intended.
# model = Word2Vec(tokenized_lines, size=200, window=5, min_count=10)
# model.wv.most_similar('ছেলে', topn=5)
# model.wv.save_word2vec_format('bn-wiki-word2vec-300.txt', binary=False)

!wget http://119.81.77.70:8090/bn-wiki-word2vec-300.txt
!ls

!wget https://raw.githubusercontent.com/tensorflow/hub/master/examples/text_embeddings_v2/export_v2.py
# !wget https://raw.githubusercontent.com/tensorflow/hub/master/examples/text_embeddings/export.py

!python export_v2.py --embedding_file=/content/bn-wiki-word2vec-300.txt --export_path=text_module --num_lines_to_ignore=1
# !python export.py --embedding_file=/content/bn-wiki-word2vec-300.txt --export_path=text_module --num_lines_to_ignore=1 --preprocess_text=True

module_path = "text_module"
embedding_layer = hub.KerasLayer(module_path, trainable=False)

# The layer takes one batch of strings and returns one fixed-size embedding
# vector per sentence.
embedding_layer(['বাস বাস আমার ', 'আমার']).shape

!wget http://119.81.77.70:8090/bangla-sentiment.neg
!wget http://119.81.77.70:8090/bangla-sentiment.pos

all_sentences = []
with open('bangla-sentiment.pos', encoding='utf8') as f:
    all_sentences.extend([(line.strip(), 'positive') for line in f])
with open('bangla-sentiment.neg', encoding='utf8') as f:
    all_sentences.extend([(line.strip(), 'negative') for line in f])

pos_count = 0
neg_count = 0
for sentence, label in all_sentences:
    if label == 'positive':
        pos_count += 1
    else:
        neg_count += 1
print(pos_count)
print(neg_count)

import random

def generator():
    # Note: reshuffling on every pass means take()/skip() below do not give a
    # strictly disjoint train/validation split across epochs.
    random.shuffle(all_sentences)
    for sentence, label in all_sentences:
        if label == 'positive':
            label = tf.keras.utils.to_categorical(1, num_classes=2)
        else:
            label = tf.keras.utils.to_categorical(0, num_classes=2)
        sentence_tensor = tf.constant(sentence, dtype=tf.dtypes.string)
        yield sentence_tensor, label

def make_dataset(train_fraction):
    data = tf.data.Dataset.from_generator(generator=generator,
                                          output_types=(tf.string, tf.float32))
    # Use the given fraction of the corpus for training, the rest for validation.
    train_size = int(len(all_sentences) * train_fraction)
    train_data = data.take(train_size)
    validation_data = data.skip(train_size)
    return train_data, validation_data

train_data, validation_data = make_dataset(0.80)

# Get a single batch of 2 elements from train_data.
next(iter(train_data.batch(2)))

sentences_in_a_single_batch, labels_in_a_single_batch = next(iter(train_data.batch(2)))
sentences_in_a_single_batch
sentences_in_a_single_batch.shape
labels_in_a_single_batch.shape
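# Optional sanity-check sketch: iterate both pipelines once and confirm the
# take()/skip() split adds up to the full corpus. Uses only names defined
# above; iterating consumes one full pass of the generator.
train_count = sum(1 for _ in train_data)
validation_count = sum(1 for _ in validation_data)
print(train_count, validation_count, len(all_sentences))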
sentence, label = next(iter(train_data.take(1)))
# numpy() returns the string as bytes; we need to decode it to read it.
sentence.numpy().decode('utf8')

# The label after conversion by to_categorical().
label.numpy()

def create_model():
    model = tf.keras.Sequential()
    model.add(embedding_layer)
    # model.add(tf.keras.layers.Flatten())
    # model.add(tf.keras.layers.SpatialDropout1D(0.2))
    # model.add(tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    # model.add(tf.keras.layers.Dense(13, activation='softmax'))
    model.add(tf.keras.layers.Dense(256, activation="relu"))
    model.add(tf.keras.layers.Dense(128, activation="relu"))
    model.add(tf.keras.layers.Dense(2, activation="softmax"))
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=['acc'])
    return model

model = create_model()

# Create an early-stopping callback.
# early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3)

batch_size = 256
history = model.fit(train_data.batch(batch_size),
                    validation_data=validation_data.batch(batch_size),
                    epochs=10)

model.summary()

tf.saved_model.save(model, export_dir="my_model")

sents = ['আমরা খুবি খুশি অফারটির জন্য',
         'বই পড়তে পছন্দ করি',
         'বই পড়তে পছন্দ করি না',
         'আমার ভালো লাগছে না',
         'আমার কষ্ট লাগছে',
         'এই বইটা বেশ ভালো লাগছে',
         'একটা দুর্ঘটনা ঘটে গেল',
         'জিপি আমার প্রিয় নেটওয়ার্ক',
         'মোবাইল অপারেটর বেশ টাকা কাটে',
         'আমাদের প্রতিদিনের সমস্যা নিয়ে ঝামেলায় আছি',
         'ঢাকা-সিলেটসহ আশপাশের সড়কের যানবাহন চলাচল বন্ধ হয়ে যায়']

pred_dataset = tf.data.Dataset.from_tensor_slices(sents)
prediction = model.predict(pred_dataset.batch(len(sents)))
for sentence, pred_sentiment in zip(sents, prediction.argmax(axis=1)):
    print("Sentence: {} - predicted: {}".format(sentence, pred_sentiment))
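# Optional sketch: reload the exported SavedModel and check that it reproduces
# the predictions above. tf.keras.models.load_model() reads the SavedModel
# format in TF 2.x; depending on the exact version pinned at the top of this
# notebook, tf.saved_model.load() may be needed instead.
reloaded_model = tf.keras.models.load_model("my_model")
reloaded_prediction = reloaded_model.predict(np.array(sents))
print(np.allclose(prediction, reloaded_prediction))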