from IPython.display import Image
# Build an IPython Image object for the local file text-analysis.jpg; as the last
# expression of a notebook cell it is rendered inline as the cell's output.
Image(filename='text-analysis.jpg')
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Load only the three columns used below, selected by position; per the recorded
# preview they are SHRT_PART_NO, ORIG_CUST_CNTN_TXT and TARGET.
data_set = pd.read_csv(
    filepath_or_buffer='/home/pybokeh/Downloads/FnS_temp.csv',
    usecols=[4, 7, 8],
)
# Preview the first 15 rows.
data_set.head(n=15)
SHRT_PART_NO | ORIG_CUST_CNTN_TXT | TARGET | |
---|---|---|---|
0 | 01473 | DURING LAST SERVICE INSPECTION TECH FOUND LEFT... | 0 |
1 | 04712 | CLIENT STATES THERE IS DAMAGE UNDER FRONT BUMP... | 0 |
2 | 04712 | CLIENT STATES THERE IS DAMAGE UNDER FRONT BUMP... | 1 |
3 | 04715 | CUSTOMER STATES REAR RIGHT BUMPER IS OUT OF AL... | 1 |
4 | 04715 | PER DPSM SUBLET LR QUARTER TO BODY SHOP | 1 |
5 | 04814 | CLIENT STATES RATTLE FROM PASSENGER B-PILLAR A... | 0 |
6 | 04816 | AUTH 119B - GENERAL MANAGER STATES PASSENGER S... | 0 |
7 | 15400 | GOODWILL CAR RAMPS FOR CLIENT PER ACURA | 0 |
8 | 15400 | LA PREP LOWER SEAT COVERS PER BEN ARIAS | 0 |
9 | 15400 | AUTH`D BY DPSM AND TECH LINE REF #TLC4000852. ... | 1 |
10 | 15420 | PLEASE PERFORM FULL NSX INSPECTION | 1 |
11 | 15610 | OBSERVED AT PDI/TQI; OIL RESIDUE ON LOWER ENGI... | 0 |
12 | 17011 | CUSTOMER STATES THE CHECK ENGINE LIGHT AND PAR... | 1 |
13 | 17013 | CUST STATES THE FUEL TANK DOES NOT GO ABOVE 3/4 | 0 |
14 | 17045 | C/S: WHILE DRIVING ALL WARNING LIGHTS CAME ON ... | 0 |
# Pull each column out of the DataFrame as a plain NumPy array:
# part numbers and complaint texts are the features, TARGET is the class label.
partno = data_set['SHRT_PART_NO'].values
complaints = data_set['ORIG_CUST_CNTN_TXT'].values
target = data_set['TARGET'].values
# `partno` is a categorical variable consisting of string values, so it needs to be label-encoded
enc_label = LabelEncoder()
# Fit the label encoder on the training part numbers and map each part number
# to an integer class id in one step.
X_train_partno_labelencoded = enc_label.fit_transform(partno)
# Display the encoded labels.
X_train_partno_labelencoded
array([ 0, 1, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 17, 18, 18, 18, 18, 18, 19, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 24, 25, 25, 26, 27, 28, 28, 28, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 37, 38, 38, 39, 39, 40, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46, 46, 46, 47, 47, 48, 48, 48, 48, 48, 49, 50, 51, 52, 53, 54, 54, 54, 54, 54, 54, 55, 56, 57, 57, 57, 58, 59, 60, 61, 61, 62, 62, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 71, 71, 71, 71, 71, 72, 73, 74, 75, 76, 77, 78, 79, 79, 80, 80, 81, 81, 82, 82, 83, 84, 85, 85, 85, 86, 86, 86, 86, 86, 87, 87, 87, 87, 88, 88, 88, 88, 88, 88, 89, 89, 89, 89, 89, 89, 90, 90, 90, 90, 90, 91, 91, 91, 92, 92, 92, 92, 92, 92, 92, 92, 92, 93, 93, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 103, 103, 104, 105, 106, 107, 107, 108, 109, 110, 111, 112, 113, 113, 114, 115, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 124, 125, 126, 127, 128, 128, 129, 129, 130, 131, 132, 133, 134, 135, 135, 136, 136, 137, 138, 139, 140, 141, 141, 142, 142, 142, 143, 143, 144, 145, 146, 147, 148, 148, 148, 148, 148, 149, 149, 150, 151, 151, 151, 151, 152])
# Shape check: the encoded labels are a 1-D vector with one entry per training row.
X_train_partno_labelencoded.shape
(301,)
# One-hot encode the integer part-number labels.
# OneHotEncoder expects a 2-D array of shape (n_samples, n_features). The labels are a
# 1-D vector, so reshape(-1, 1) turns them into a single-feature column. Passing the
# 1-D array directly is deprecated since scikit-learn 0.17 and raises ValueError from
# 0.19 on, so the encoder is fitted once, on the correctly shaped input.
enc_onehot = OneHotEncoder()
X_train_partno_onehot = enc_onehot.fit_transform(
    X_train_partno_labelencoded.reshape(-1, 1)
)
# Display the resulting sparse matrix.
X_train_partno_onehot
<301x153 sparse matrix of type '<class 'numpy.float64'>' with 301 stored elements in Compressed Sparse Row format>
# Shape check: (training rows, one column per distinct part-number label).
X_train_partno_onehot.shape
(301, 153)
# Vectorize the complaints training data with CountVectorizer
count_vect = CountVectorizer()
# Learn the vocabulary from the training complaints and build the term-count matrix.
X_train_complaint_counts = count_vect.fit_transform(complaints)
# Shape check: (complaints, vocabulary size).
X_train_complaint_counts.shape
(301, 1141)
# Transform the complaints training data with tf-idf
tfidf_transformer = TfidfTransformer()
# Fit the tf-idf weighting on the training term counts and re-weight them in one step.
X_train_complaint_tfidf = tfidf_transformer.fit_transform(X_train_complaint_counts)
# Shape check: tf-idf re-weights the counts, so the shape matches the count matrix.
X_train_complaint_tfidf.shape
(301, 1141)
# Confirm the tf-idf result is a SciPy sparse matrix, so it can be stacked with the
# sparse one-hot block via sparse.hstack.
type(X_train_complaint_tfidf)
scipy.sparse.csr.csr_matrix
# Place the part-number one-hot columns and the complaint tf-idf columns side by side
# in a single CSR feature matrix (one-hot columns first, tf-idf columns after).
X_train_combined_tfidf = sparse.hstack(
    [X_train_partno_onehot, X_train_complaint_tfidf],
    format='csr',
)
# Display the combined matrix.
X_train_combined_tfidf
<301x1294 sparse matrix of type '<class 'numpy.float64'>' with 4451 stored elements in Compressed Sparse Row format>
# Train a multinomial naive Bayes classifier on the combined
# (part-number one-hot + complaint tf-idf) features against the TARGET labels.
clf = MultinomialNB().fit(X=X_train_combined_tfidf, y=target)
# Predict with test part # and complaint
part_test = np.array(['19301'])
complaint_test = np.array(['CLIENT STATES THE CHECK EMISSIONS LIGHT IS ON'])

# Encode the test sample with the encoders that were fitted on the TRAINING data:
# call transform(), not fit_transform(). Re-fitting on the single test part number
# makes the encoders learn exactly one class, so the one-hot block collapses to one
# column and the combined row no longer matches the column count the classifier was
# trained on, which makes clf.predict fail with "ValueError: dimension mismatch".
# NOTE: LabelEncoder.transform raises ValueError for a part number that was not
# present in the training data.
X_new_part_labelencoded = enc_label.transform(part_test)
X_new_part_onehot = enc_onehot.transform(X_new_part_labelencoded.reshape(-1, 1))

# The text features already reuse the fitted vocabulary and idf weights.
X_new_complaint_counts = count_vect.transform(complaint_test)
X_new_complaint_tfidf = tfidf_transformer.transform(X_new_complaint_counts)

# Same column layout as training: part-number one-hot block first, tf-idf block second.
X_new_combined_tfidf = sparse.hstack((X_new_part_onehot, X_new_complaint_tfidf), format='csr')
predicted = clf.predict(X_new_combined_tfidf)
# Shape diagnostics for the test-sample feature blocks. The values recorded below were
# captured while the encoders had been re-fitted on the single test part number with
# fit_transform(): the one-hot block then has only 1 column, so the combined row is
# 1 + 1141 = 1142 columns wide. The classifier was trained on 153 + 1141 = 1294
# columns, so predict() can only work when the one-hot block is (1, 153) and the
# combined row is (1, 1294).
X_new_part_onehot.shape
(1, 1)
# The tf-idf block already has the training vocabulary width.
X_new_complaint_tfidf.shape
(1, 1141)
# Width of the stacked row that is handed to clf.predict.
X_new_combined_tfidf.shape
(1, 1142)