Yelp NLP



IMPORTS

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

%matplotlib inline

READ IN DATA

# Load the Yelp reviews dataset (the path is relative to the working directory)
yelp = pd.read_csv('../Yelp-reviews/yelp.csv')

EXAMINE DATA

# Preview the first five rows
yelp.head()
business_id date review_id stars text type user_id cool useful funny
0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5 My wife took me here on my birthday for breakf... review rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0
1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5 I have no idea why some people give bad review... review 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0
2 6oRAC4uyJCsJl1X0WZpVSA 2012-06-14 IESLBzqUCLdSzSqm0eCSxQ 4 love the gyro plate. Rice is so good and I als... review 0hT2KtfLiobPvh6cDC8JQg 0 1 0
3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review uZetl9T0NcROGOyFfughhg 1 2 0
4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5 General Manager Scott Petello is a good egg!!!... review vYmM4KTsC8ZfQBg-j5MWkw 0 0 0
# Column dtypes, non-null counts and memory usage
yelp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id    10000 non-null object
date           10000 non-null object
review_id      10000 non-null object
stars          10000 non-null int64
text           10000 non-null object
type           10000 non-null object
user_id        10000 non-null object
cool           10000 non-null int64
useful         10000 non-null int64
funny          10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.3+ KB
# Summary statistics for the numeric columns
yelp.describe()
stars cool useful funny
count 10000.000000 10000.000000 10000.000000 10000.000000
mean 3.777500 0.876800 1.409300 0.701300
std 1.214636 2.067861 2.336647 1.907942
min 1.000000 0.000000 0.000000 0.000000
25% 3.000000 0.000000 0.000000 0.000000
50% 4.000000 0.000000 1.000000 0.000000
75% 5.000000 1.000000 2.000000 1.000000
max 5.000000 77.000000 76.000000 57.000000
# Add column 'text length' = number of characters in the 'text' column
# (len() of a string counts characters, not words)
yelp['text length'] = yelp['text'].apply(len)

EDA

# Confirm the new 'text length' column is present
yelp.head()
business_id date review_id stars text type user_id cool useful funny text length
0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5 My wife took me here on my birthday for breakf... review rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0 889
1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5 I have no idea why some people give bad review... review 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0 1345
2 6oRAC4uyJCsJl1X0WZpVSA 2012-06-14 IESLBzqUCLdSzSqm0eCSxQ 4 love the gyro plate. Rice is so good and I als... review 0hT2KtfLiobPvh6cDC8JQg 0 1 0 76
3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review uZetl9T0NcROGOyFfughhg 1 2 0 419
4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5 General Manager Scott Petello is a good egg!!!... review vYmM4KTsC8ZfQBg-j5MWkw 0 0 0 469
# One histogram of review text length per star rating, laid out side by side
g = sns.FacetGrid(data=yelp, col='stars')
g.map(plt.hist, 'text length')
<seaborn.axisgrid.FacetGrid at 0x1a0b7fb7f0>

png

# Box plot of review text length within each star rating
sns.boxplot(data=yelp, x='stars', y='text length', palette='rainbow')
<matplotlib.axes._subplots.AxesSubplot at 0x1a0c28e4e0>

png

# Number of reviews for each star rating
sns.countplot(data=yelp, x='stars', palette='rainbow')
<matplotlib.axes._subplots.AxesSubplot at 0x1a0c510e10>

png

# Mean of each numeric column, grouped by star rating.
# numeric_only=True restricts the aggregation to the numeric columns; without
# it, pandas >= 2.0 raises a TypeError on the string columns (business_id,
# text, ...) instead of silently dropping them as older versions did.
stars = yelp.groupby('stars').mean(numeric_only=True)
stars
cool useful funny text length
stars
1 0.576769 1.604806 1.056075 826.515354
2 0.719525 1.563107 0.875944 842.256742
3 0.788501 1.306639 0.694730 758.498289
4 0.954623 1.395916 0.670448 712.923142
5 0.944261 1.381780 0.608631 624.999101
# Pairwise correlations between the per-rating mean values.
# NOTE: these are computed over the 5 rows of group means in `stars`,
# not over the individual reviews.
stars.corr()
cool useful funny text length
cool 1.000000 -0.743329 -0.944939 -0.857664
useful -0.743329 1.000000 0.894506 0.699881
funny -0.944939 0.894506 1.000000 0.843461
text length -0.857664 0.699881 0.843461 1.000000
# Heatmap of the correlations between the per-rating means, with values annotated
sns.heatmap(data=stars.corr(), annot=True, cmap='coolwarm')
<matplotlib.axes._subplots.AxesSubplot at 0x1a0c736940>

png

NLP CLASSIFICATION

# Keep only the 1-star and 5-star reviews for a two-class problem
yelp_class = yelp[yelp['stars'].isin([1, 5])]
yelp_class.head()
business_id date review_id stars text type user_id cool useful funny text length
0 9yKzy9PApeiPPOUJEtnvkg 2011-01-26 fWKvX83p0-ka4JS3dc6E5A 5 My wife took me here on my birthday for breakf... review rLtl8ZkDX5vH5nAx9C3q5Q 2 5 0 889
1 ZRJwVLyzEJq1VAihDhYiow 2011-07-27 IjZ33sJrzXqU-0X6U8NwyA 5 I have no idea why some people give bad review... review 0a2KyEL0d3Yb1V6aivbIuQ 0 0 0 1345
3 _1QQZuf4zZOyFCvXc0o6Vg 2010-05-27 G-WvGaISbqqaMHlNnByodA 5 Rosie, Dakota, and I LOVE Chaparral Dog Park!!... review uZetl9T0NcROGOyFfughhg 1 2 0 419
4 6ozycU1RpktNG2-1BroVtw 2012-01-05 1uJFq2r5QfJG_6ExMRCaGw 5 General Manager Scott Petello is a good egg!!!... review vYmM4KTsC8ZfQBg-j5MWkw 0 0 0 469
6 zp713qNhx8d9KCJJnrw1xA 2010-02-12 riFQ3vxNpP4rWLk_CSri2A 5 Drop what you're doing and drive here. After I... review wFweIWhv2fREZV_dYkz_1g 7 7 4 1565
# Sanity check: only the ratings 1 and 5 should remain
yelp_class['stars'].unique()
array([5, 1])
# Target: star rating (1 or 5). Features: bag-of-words counts of the review text.
y = yelp_class['stars']
cv = CountVectorizer()
# NOTE(review): the vocabulary is learned from every review here, before the
# train/test split below, so words that occur only in test reviews are
# included in the vocabulary.
X = cv.fit_transform(yelp_class['text'])

TRAIN TEST SPLIT

# Hold out 30% of the reviews for evaluation; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=101, test_size=0.3
)

MODEL

# Fit a multinomial Naive Bayes classifier on the bag-of-words training counts
nb = MultinomialNB()
nb.fit(X_train,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Predictions

# Predict star ratings for the held-out reviews
predictions = nb.predict(X_test)
# Rows are the true labels and columns the predicted labels, in label order (1, 5)
confusion_matrix(y_test,predictions)
array([[159,  69],
       [ 22, 976]])
# Per-class precision, recall and F1 score on the test set
print(classification_report(y_test, predictions))
             precision    recall  f1-score   support

          1       0.88      0.70      0.78       228
          5       0.93      0.98      0.96       998

avg / total       0.92      0.93      0.92      1226

NOW USING A TEXT-PROCESSING PIPELINE (TF-IDF)

# Text-classification pipeline, applied in order:
#   bow        - raw strings -> token integer counts
#   tfidf      - integer counts -> weighted TF-IDF scores
#   classifier - Naive Bayes classifier trained on the TF-IDF vectors
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB()),
])

# The pipeline does its own vectorising, so split the raw review text
# (not the count matrix) using the same test size and seed as before.
X, y = yelp_class['text'], yelp_class['stars']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=101, test_size=0.3
)
pipeline.fit(X_train, y_train)
Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
# Predict star ratings for the held-out raw review text with the fitted pipeline
predictions = pipeline.predict(X_test)
# Rows are the true labels and columns the predicted labels, in label order (1, 5)
confusion_matrix(y_test,predictions)
array([[  0, 228],
       [  0, 998]])
# Per-class precision, recall and F1 score for the pipeline's predictions
print(classification_report(y_test, predictions))
             precision    recall  f1-score   support

          1       0.00      0.00      0.00       228
          5       0.81      1.00      0.90       998

avg / total       0.66      0.81      0.73      1226



/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
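# Using TF-IDF made things worse: the pipeline now predicts 5 stars for every
# test review, so no 1-star review is identified (recall 0.00 for class 1).
# Will play around with the pipeline to see if anything changes.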
Written on October 16, 2018