Yelp NLP
IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('white')
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline
READ IN DATA
# Load the Yelp reviews CSV into a DataFrame (path is relative to the working directory)
yelp = pd.read_csv('../Yelp-reviews/yelp.csv')
EXAMINE DATA
yelp.head()
business_id | date | review_id | stars | text | type | user_id | cool | useful | funny | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 9yKzy9PApeiPPOUJEtnvkg | 2011-01-26 | fWKvX83p0-ka4JS3dc6E5A | 5 | My wife took me here on my birthday for breakf... | review | rLtl8ZkDX5vH5nAx9C3q5Q | 2 | 5 | 0 |
1 | ZRJwVLyzEJq1VAihDhYiow | 2011-07-27 | IjZ33sJrzXqU-0X6U8NwyA | 5 | I have no idea why some people give bad review... | review | 0a2KyEL0d3Yb1V6aivbIuQ | 0 | 0 | 0 |
2 | 6oRAC4uyJCsJl1X0WZpVSA | 2012-06-14 | IESLBzqUCLdSzSqm0eCSxQ | 4 | love the gyro plate. Rice is so good and I als... | review | 0hT2KtfLiobPvh6cDC8JQg | 0 | 1 | 0 |
3 | _1QQZuf4zZOyFCvXc0o6Vg | 2010-05-27 | G-WvGaISbqqaMHlNnByodA | 5 | Rosie, Dakota, and I LOVE Chaparral Dog Park!!... | review | uZetl9T0NcROGOyFfughhg | 1 | 2 | 0 |
4 | 6ozycU1RpktNG2-1BroVtw | 2012-01-05 | 1uJFq2r5QfJG_6ExMRCaGw | 5 | General Manager Scott Petello is a good egg!!!... | review | vYmM4KTsC8ZfQBg-j5MWkw | 0 | 0 | 0 |
yelp.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
business_id 10000 non-null object
date 10000 non-null object
review_id 10000 non-null object
stars 10000 non-null int64
text 10000 non-null object
type 10000 non-null object
user_id 10000 non-null object
cool 10000 non-null int64
useful 10000 non-null int64
funny 10000 non-null int64
dtypes: int64(4), object(6)
memory usage: 781.3+ KB
yelp.describe()
stars | cool | useful | funny | |
---|---|---|---|---|
count | 10000.000000 | 10000.000000 | 10000.000000 | 10000.000000 |
mean | 3.777500 | 0.876800 | 1.409300 | 0.701300 |
std | 1.214636 | 2.067861 | 2.336647 | 1.907942 |
min | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 4.000000 | 0.000000 | 1.000000 | 0.000000 |
75% | 5.000000 | 1.000000 | 2.000000 | 1.000000 |
max | 5.000000 | 77.000000 | 76.000000 | 57.000000 |
# Adding column 'text length' = number of characters in each review's text
# (len() of the string, so this is a character count, not a word count)
yelp['text length'] = yelp['text'].apply(len)
EDA
yelp.head()
business_id | date | review_id | stars | text | type | user_id | cool | useful | funny | text length | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9yKzy9PApeiPPOUJEtnvkg | 2011-01-26 | fWKvX83p0-ka4JS3dc6E5A | 5 | My wife took me here on my birthday for breakf... | review | rLtl8ZkDX5vH5nAx9C3q5Q | 2 | 5 | 0 | 889 |
1 | ZRJwVLyzEJq1VAihDhYiow | 2011-07-27 | IjZ33sJrzXqU-0X6U8NwyA | 5 | I have no idea why some people give bad review... | review | 0a2KyEL0d3Yb1V6aivbIuQ | 0 | 0 | 0 | 1345 |
2 | 6oRAC4uyJCsJl1X0WZpVSA | 2012-06-14 | IESLBzqUCLdSzSqm0eCSxQ | 4 | love the gyro plate. Rice is so good and I als... | review | 0hT2KtfLiobPvh6cDC8JQg | 0 | 1 | 0 | 76 |
3 | _1QQZuf4zZOyFCvXc0o6Vg | 2010-05-27 | G-WvGaISbqqaMHlNnByodA | 5 | Rosie, Dakota, and I LOVE Chaparral Dog Park!!... | review | uZetl9T0NcROGOyFfughhg | 1 | 2 | 0 | 419 |
4 | 6ozycU1RpktNG2-1BroVtw | 2012-01-05 | 1uJFq2r5QfJG_6ExMRCaGw | 5 | General Manager Scott Petello is a good egg!!!... | review | vYmM4KTsC8ZfQBg-j5MWkw | 0 | 0 | 0 | 469 |
# Histograms of text length, one panel per star rating
g = sns.FacetGrid(yelp, col='stars')
g.map(plt.hist, 'text length')
<seaborn.axisgrid.FacetGrid at 0x1a0b7fb7f0>
# Box plot of text length for each star category
sns.boxplot(x = 'stars', y = 'text length', data=yelp, palette='rainbow')
<matplotlib.axes._subplots.AxesSubplot at 0x1a0c28e4e0>
# Count plot: number of reviews for each star rating
sns.countplot(x = 'stars', data = yelp, palette = 'rainbow')
<matplotlib.axes._subplots.AxesSubplot at 0x1a0c510e10>
# Mean of each numeric column, grouped by star rating.
# numeric_only=True limits the aggregation to the numeric columns; without it,
# pandas >= 2.0 raises a TypeError when it reaches the text/ID (object) columns.
stars = yelp.groupby('stars').mean(numeric_only=True)
stars
cool | useful | funny | text length | |
---|---|---|---|---|
stars | ||||
1 | 0.576769 | 1.604806 | 1.056075 | 826.515354 |
2 | 0.719525 | 1.563107 | 0.875944 | 842.256742 |
3 | 0.788501 | 1.306639 | 0.694730 | 758.498289 |
4 | 0.954623 | 1.395916 | 0.670448 | 712.923142 |
5 | 0.944261 | 1.381780 | 0.608631 | 624.999101 |
# Pairwise correlations between the per-rating mean columns.
# Note: these are computed across the five rows of `stars` (one row per star rating),
# not across individual reviews.
stars.corr()
cool | useful | funny | text length | |
---|---|---|---|---|
cool | 1.000000 | -0.743329 | -0.944939 | -0.857664 |
useful | -0.743329 | 1.000000 | 0.894506 | 0.699881 |
funny | -0.944939 | 0.894506 | 1.000000 | 0.843461 |
text length | -0.857664 | 0.699881 | 0.843461 | 1.000000 |
# Heatmap of the same correlation matrix, with each cell annotated by its value
sns.heatmap(stars.corr(), cmap = 'coolwarm', annot = True)
<matplotlib.axes._subplots.AxesSubplot at 0x1a0c736940>
NLP CLASSIFICATION
# Keep only the 1-star and 5-star reviews so the task becomes binary classification
yelp_class = yelp[yelp['stars'].isin([1, 5])]
yelp_class.head()
business_id | date | review_id | stars | text | type | user_id | cool | useful | funny | text length | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9yKzy9PApeiPPOUJEtnvkg | 2011-01-26 | fWKvX83p0-ka4JS3dc6E5A | 5 | My wife took me here on my birthday for breakf... | review | rLtl8ZkDX5vH5nAx9C3q5Q | 2 | 5 | 0 | 889 |
1 | ZRJwVLyzEJq1VAihDhYiow | 2011-07-27 | IjZ33sJrzXqU-0X6U8NwyA | 5 | I have no idea why some people give bad review... | review | 0a2KyEL0d3Yb1V6aivbIuQ | 0 | 0 | 0 | 1345 |
3 | _1QQZuf4zZOyFCvXc0o6Vg | 2010-05-27 | G-WvGaISbqqaMHlNnByodA | 5 | Rosie, Dakota, and I LOVE Chaparral Dog Park!!... | review | uZetl9T0NcROGOyFfughhg | 1 | 2 | 0 | 419 |
4 | 6ozycU1RpktNG2-1BroVtw | 2012-01-05 | 1uJFq2r5QfJG_6ExMRCaGw | 5 | General Manager Scott Petello is a good egg!!!... | review | vYmM4KTsC8ZfQBg-j5MWkw | 0 | 0 | 0 | 469 |
6 | zp713qNhx8d9KCJJnrw1xA | 2010-02-12 | riFQ3vxNpP4rWLk_CSri2A | 5 | Drop what you're doing and drive here. After I... | review | wFweIWhv2fREZV_dYkz_1g | 7 | 7 | 4 | 1565 |
yelp_class['stars'].unique()
array([5, 1])
# Creating X (review text) and y (star rating labels)
X = yelp_class['text']
y = yelp_class['stars']
# Convert each review into a vector of token counts (bag of words).
# NOTE(review): the vectorizer is fitted on ALL selected reviews here, before the
# train/test split, so the vocabulary also contains words seen only in the
# test reviews — consider fitting on the training split only (e.g. via a Pipeline).
cv = CountVectorizer()
X = cv.fit_transform(X)
TRAIN TEST SPLIT
# Hold out 30% of the rows for testing; random_state fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.3,
random_state=101)
MODEL
# Multinomial Naive Bayes with default settings, trained on the token-count features
nb = MultinomialNB()
nb.fit(X_train,y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
Predictions
# Predict the star rating of each held-out review
predictions = nb.predict(X_test)
# Rows are the true labels and columns the predicted labels, in sorted label order (1, then 5)
confusion_matrix(y_test,predictions)
array([[159, 69],
[ 22, 976]])
print(classification_report(y_test, predictions))
precision recall f1-score support
1 0.88 0.70 0.78 228
5 0.93 0.98 0.96 998
avg / total 0.92 0.93 0.92 1226
NOW WILL USE TEXT PROCESSING
# Chain vectorizing, TF-IDF weighting and classification into a single estimator
pipeline = Pipeline([
('bow', CountVectorizer()), # Strings to token integer counts
('tfidf', TfidfTransformer()), # Integer counts to weighted TF-IDF scores
('classifier', MultinomialNB()) # Train on TF-IDF vector w/ Naive Bayes Classifier
])
# Start again from the raw review text: the pipeline does its own vectorizing,
# so its vocabulary and TF-IDF weights are fitted on the training split only
X = yelp_class['text']
y = yelp_class['stars']
# Same test fraction and random seed as the earlier split
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.3,
random_state=101)
# Fit all three pipeline steps on the training reviews
pipeline.fit(X_train, y_train)
Pipeline(memory=None,
steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), preprocessor=None, stop_words=None,
strip_...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
# Predict on the held-out raw text; the pipeline applies the fitted transforms first
predictions = pipeline.predict(X_test)
# Rows are the true labels and columns the predicted labels, in sorted label order (1, then 5)
confusion_matrix(y_test,predictions)
array([[ 0, 228],
[ 0, 998]])
print(classification_report(y_test, predictions))
precision recall f1-score support
1 0.00 0.00 0.00 228
5 0.81 1.00 0.90 998
avg / total 0.66 0.81 0.73 1226
/anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
'precision', 'predicted', average, warn_for)
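# Looks like adding TF-IDF made things worse: the classifier now predicts 5 stars for
# every test review (the confusion matrix shows no 1-star predictions at all), so
# 1-star recall drops from 0.70 to 0.00.
# Will play around with the pipeline to see if anything changes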
Written on October 16, 2018