!pip install -q tensorflow-gpu==2.0.0-beta1
# !pip install -q tensorflow-gpu==1.15

import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

tf.__version__

# Train and save the word2vec model. This step can be skipped once the trained
# model has been uploaded to GitHub.
# !wget https://github.com/raqueeb/datasets/raw/master/bnwiki-texts.zip
# !unzip bnwiki-texts.zip

# preprocessed_text_file_path = 'bnwiki-texts-preprocessed.txt'

# lines_from_file = []
# with open(preprocessed_text_file_path, encoding='utf8') as text_file:
#     for line in text_file:
#         lines_from_file.append(line)

# tokenized_lines = []
# for single_line in lines_from_file:
#     tokenized_lines.append(single_line.split())

# Note: size=200 here does not match the '300' in the output filename below;
# use size=300 if a 300-dimensional embedding is intended.
# model = Word2Vec(tokenized_lines, size=200, window=5, min_count=10)
# model.wv.most_similar('ছেলে', topn=5)
# model.wv.save_word2vec_format('bn-wiki-word2vec-300.txt', binary=False)

!wget http://119.81.77.70:8090/bn-wiki-word2vec-300.txt
!ls

!wget https://raw.githubusercontent.com/tensorflow/hub/master/examples/text_embeddings_v2/export_v2.py
# !wget https://raw.githubusercontent.com/tensorflow/hub/master/examples/text_embeddings/export.py

!python export_v2.py --embedding_file=/content/bn-wiki-word2vec-300.txt --export_path=text_module --num_lines_to_ignore=1
# !python export.py --embedding_file=/content/bn-wiki-word2vec-300.txt --export_path=text_module --num_lines_to_ignore=1 --preprocess_text=True

module_path = "text_module"
embedding_layer = hub.KerasLayer(module_path, trainable=False)

# The layer takes one batch of strings and returns one fixed-size embedding
# vector per sentence.
embedding_layer(['বাস বাস আমার ', 'আমার']).shape

!wget http://119.81.77.70:8090/bangla-sentiment.neg
!wget http://119.81.77.70:8090/bangla-sentiment.pos

all_sentences = []
with open('bangla-sentiment.pos', encoding='utf8') as f:
    all_sentences.extend([(line.strip(), 'positive') for line in f])
with open('bangla-sentiment.neg', encoding='utf8') as f:
    all_sentences.extend([(line.strip(), 'negative') for line in f])

pos_count = 0
neg_count = 0
for sentence, label in all_sentences:
    if label == 'positive':
        pos_count += 1
    else:
        neg_count += 1
print(pos_count)
print(neg_count)

import random

def generator():
    # Note: reshuffling on every pass means take()/skip() below do not give a
    # strictly disjoint train/validation split across epochs.
    random.shuffle(all_sentences)
    for sentence, label in all_sentences:
        if label == 'positive':
            label = tf.keras.utils.to_categorical(1, num_classes=2)
        else:
            label = tf.keras.utils.to_categorical(0, num_classes=2)
        sentence_tensor = tf.constant(sentence, dtype=tf.dtypes.string)
        yield sentence_tensor, label

def make_dataset(train_fraction):
    data = tf.data.Dataset.from_generator(generator=generator,
                                          output_types=(tf.string, tf.float32))
    # Use the given fraction of the corpus for training, the rest for validation.
    train_size = int(len(all_sentences) * train_fraction)
    train_data = data.take(train_size)
    validation_data = data.skip(train_size)
    return train_data, validation_data

train_data, validation_data = make_dataset(0.80)

# Get a single batch of 2 elements from train_data.
next(iter(train_data.batch(2)))

sentences_in_a_single_batch, labels_in_a_single_batch = next(iter(train_data.batch(2)))
sentences_in_a_single_batch
sentences_in_a_single_batch.shape
labels_in_a_single_batch.shape
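# Optional sanity-check sketch: iterate both pipelines once and confirm the
# take()/skip() split adds up to the full corpus. Uses only names defined
# above; iterating consumes one full pass of the generator.
train_count = sum(1 for _ in train_data)
validation_count = sum(1 for _ in validation_data)
print(train_count, validation_count, len(all_sentences))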
sentence, label = next(iter(train_data.take(1)))
# numpy() returns the string as bytes; we need to decode it to read it.
sentence.numpy().decode('utf8')

# The label after conversion by to_categorical().
label.numpy()

def create_model():
    model = tf.keras.Sequential()
    model.add(embedding_layer)
    # model.add(tf.keras.layers.Flatten())
    # model.add(tf.keras.layers.SpatialDropout1D(0.2))
    # model.add(tf.keras.layers.LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    # model.add(tf.keras.layers.Dense(13, activation='softmax'))
    model.add(tf.keras.layers.Dense(256, activation="relu"))
    model.add(tf.keras.layers.Dense(128, activation="relu"))
    model.add(tf.keras.layers.Dense(2, activation="softmax"))
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=['acc'])
    return model

model = create_model()

# Create an early-stopping callback.
# early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=3)

batch_size = 256
history = model.fit(train_data.batch(batch_size),
                    validation_data=validation_data.batch(batch_size),
                    epochs=10)

model.summary()

tf.saved_model.save(model, export_dir="my_model")

sents = ['আমরা খুবি খুশি অফারটির জন্য',
         'বই পড়তে পছন্দ করি',
         'বই পড়তে পছন্দ করি না',
         'আমার ভালো লাগছে না',
         'আমার কষ্ট লাগছে',
         'এই বইটা বেশ ভালো লাগছে',
         'একটা দুর্ঘটনা ঘটে গেল',
         'জিপি আমার প্রিয় নেটওয়ার্ক',
         'মোবাইল অপারেটর বেশ টাকা কাটে',
         'আমাদের প্রতিদিনের সমস্যা নিয়ে ঝামেলায় আছি',
         'ঢাকা-সিলেটসহ আশপাশের সড়কের যানবাহন চলাচল বন্ধ হয়ে যায়']

pred_dataset = tf.data.Dataset.from_tensor_slices(sents)
prediction = model.predict(pred_dataset.batch(len(sents)))
for sentence, pred_sentiment in zip(sents, prediction.argmax(axis=1)):
    print("Sentence: {} - predicted: {}".format(sentence, pred_sentiment))
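# Optional sketch: reload the exported SavedModel and check that it reproduces
# the predictions above. tf.keras.models.load_model() reads the SavedModel
# format in TF 2.x; depending on the exact version pinned at the top of this
# notebook, tf.saved_model.load() may be needed instead.
reloaded_model = tf.keras.models.load_model("my_model")
reloaded_prediction = reloaded_model.predict(np.array(sents))
print(np.allclose(prediction, reloaded_prediction))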