#!/usr/bin/env python
# coding: utf-8

# In[1]:

# Notebook conveniences: auto-reload edited modules and render plots inline.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')

import os

# Pin CUDA device enumeration to PCI bus order and expose only GPU 0,
# so training runs on a single, predictable device.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# # Building an Arabic Sentiment Analyzer With BERT
#
# In this notebook, we will build a simple, fast, and accurate Arabic-language
# text classification model with minimal effort. More specifically, we will
# build a model that classifies Arabic hotel reviews as either positive or
# negative.
#
# The dataset can be downloaded from Ashraf Elnagar's GitHub repository
# (https://github.com/elnagara/HARD-Arabic-Dataset).
#
# Each entry in the dataset includes a review in Arabic and a rating between
# 1 and 5. We will convert this to a binary classification dataset by
# assigning reviews with a rating of 3 or above a positive label and
# assigning reviews with a rating of less than 3 a negative label.
#
# (**Disclaimer:** I don't speak Arabic. Please forgive mistakes.)


# In[3]:

# Convert ratings to a binary format: pos=positive, neg=negative.
import pandas as pd

# The HARD "balanced-reviews" file is tab-separated and UTF-16 encoded.
df = pd.read_csv('data/arabic_hotel_reviews/balanced-reviews.txt',
                 delimiter='\t', encoding='utf-16')
df = df[['rating', 'review']]
# Binarize the 1-5 rating: below 3 -> 'neg'; 3 and above -> 'pos'.
df['rating'] = df['rating'].apply(lambda x: 'neg' if x < 3 else 'pos')
df.head()


# Let's split out a training and validation set.

# In[4]:

# 85/15 train/validation split; fixed seed for reproducibility.
df_train = df.sample(frac=0.85, random_state=42)
df_test = df.drop(df_train.index)
len(df_train), len(df_test)


# With the [Transformer API in *ktrain*](https://nbviewer.jupyter.org/github/amaiya/ktrain/blob/master/tutorials/tutorial-A3-hugging_face_transformers.ipynb),
# we can select any Hugging Face `transformers` model appropriate for our data.
# Since we are dealing with Arabic, we will use
# [AraBERT](https://huggingface.co/aubmindlab/bert-base-arabert) by the
# AUB MIND Lab instead of multilingual BERT (which is normally used by
# *ktrain* for non-English datasets in the alternative
# [text_classifier API in *ktrain*](https://github.com/amaiya/ktrain/blob/master/examples/text/ArabicHotelReviews-BERT.ipynb)).
# As you can see below, with only 1 epoch, we obtain a **96.37** accuracy
# on the validation set.

# In[7]:

import ktrain
from ktrain import text

# Any Hugging Face model name works here; AraBERT is pretrained on Arabic.
MODEL_NAME = 'aubmindlab/bert-base-arabertv01'
t = text.Transformer(MODEL_NAME, maxlen=128)

# Tokenize/encode the raw review strings and string labels for the model.
trn = t.preprocess_train(df_train.review.values, df_train.rating.values)
val = t.preprocess_test(df_test.review.values, df_test.rating.values)

model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)

# One epoch with the 1cycle policy at a peak LR of 5e-5 (a typical
# fine-tuning rate for BERT-style models).
learner.fit_onecycle(5e-5, 1)


# ### Making Predictions on New Data

# In[8]:

# Bundle the trained model with its preprocessor for inference on raw text.
p = ktrain.get_predictor(learner.model, t)


# Predicting label for the text
# > "*The room was clean, the food excellent, and I loved the view from my room.*"

# In[9]:

p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")


# Predicting label for:
# > "*This hotel was too expensive and the staff is rude.*"

# In[10]:

p.predict('كان هذا الفندق باهظ الثمن والموظفين غير مهذبين.')


# ### Save our Predictor for Later Deployment

# In[11]:

# Save model for later use.
p.save('/tmp/arabic_predictor')


# In[12]:

# Reload from disk.
p = ktrain.load_predictor('/tmp/arabic_predictor')


# In[13]:

# Still works as expected after reloading from disk.
p.predict("الغرفة كانت نظيفة ، الطعام ممتاز ، وأنا أحب المنظر من غرفتي.")


# In[ ]: