#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Notebook conveniences: auto-reload edited modules and render plots inline.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

import os

# Pin CUDA device enumeration to PCI bus order and expose only GPU 0,
# so training runs on a single, predictable device.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# # Building an Arabic Sentiment Analyzer With BERT
#
# In this notebook, we will build a simple, fast, and accurate Arabic-language
# text classification model with minimal effort. More specifically, we will
# build a model that classifies Arabic hotel reviews as either positive or
# negative.
#
# The dataset can be downloaded from Ashraf Elnagar's GitHub repository
# (https://github.com/elnagara/HARD-Arabic-Dataset).
#
# Each entry in the dataset includes a review in Arabic and a rating between
# 1 and 5. We will convert this to a binary classification dataset by
# assigning reviews with a rating of 3 or above a positive label and
# assigning reviews with a rating of less than 3 a negative label.
#
# (**Disclaimer:** I don't speak Arabic. Please forgive mistakes.)


# In[3]:

# Convert ratings to a binary format: pos=positive, neg=negative.
import pandas as pd

# The HARD "balanced-reviews" file is tab-separated and UTF-16 encoded.
df = pd.read_csv('data/arabic_hotel_reviews/balanced-reviews.txt',
                 delimiter='\t', encoding='utf-16')
df = df[['rating', 'review']]
# Binarize the 1-5 rating: below 3 -> 'neg'; 3 and above -> 'pos'.
df['rating'] = df['rating'].apply(lambda x: 'neg' if x < 3 else 'pos')
df.head()


# Let's split out a training and validation set.

# In[4]:

# 85/15 train/validation split; fixed seed for reproducibility.
df_train = df.sample(frac=0.85, random_state=42)
df_test = df.drop(df_train.index)
len(df_train), len(df_test)


# With the [Transformer API in *ktrain*](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/tutorials/tutorial-A3-hugging_face_transformers.ipynb),
# we can select any Hugging Face `transformers` model appropriate for our data.
# Since we are dealing with Arabic, we will use
# [AraBERT](https://huggingface.co/aubmindlab/bert-base-arabert) by the
# AUB MIND Lab instead of multilingual BERT (which is normally used by
# *ktrain* for non-English datasets in the alternative
# [text_classifier API in *ktrain*](https://github.com/amaiya/ktrain/blob/master/examples/text/ArabicHotelReviews-BERT.ipynb)).
# As you can see below, with only 1 epoch, we obtain a **96.37** accuracy
# on the validation set.

# In[7]:

import ktrain
from ktrain import text

# Any Hugging Face model name works here; AraBERT is pretrained on Arabic.
MODEL_NAME = 'aubmindlab/bert-base-arabertv01'
t = text.Transformer(MODEL_NAME, maxlen=128)

# Tokenize/encode the raw review strings and string labels for the model.
trn = t.preprocess_train(df_train.review.values, df_train.rating.values)
val = t.preprocess_test(df_test.review.values, df_test.rating.values)

model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)

# One epoch with the 1cycle policy at a peak LR of 5e-5 (a typical
# fine-tuning rate for BERT-style models).
learner.fit_onecycle(5e-5, 1)


# ### Making Predictions on New Data

# In[8]:

# Bundle the trained model with its preprocessor for inference on raw text.
p = ktrain.get_predictor(learner.model, t)


# Predicting label for the text
# > "*The room was clean, the food excellent, and I loved the view from my room.*"

# In[9]:

p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")


# Predicting label for:
# > "*This hotel was too expensive and the staff is rude.*"

# In[10]:

p.predict('كان هذا الفندق باهظ الثمن والموظفين غير مهذبين.')


# ### Save our Predictor for Later Deployment

# In[11]:

# Save model for later use.
p.save('/tmp/arabic_predictor')


# In[12]:

# Reload from disk.
p = ktrain.load_predictor('/tmp/arabic_predictor')


# In[13]:

# Still works as expected after reloading from disk.
p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")


# In[ ]: