In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
data = pd.read_csv("C:/Users/Rahul/Desktop/Capstone papers/Reviews.csv")
In [3]:
data.shape
Out[3]:
(568454, 10)
In [4]:
data.head()
Out[4]:
Id ProductId UserId ProfileName HelpfulnessNumerator HelpfulnessDenominator Score Time Summary Text
0 1 B001E4KFG0 A3SGXH7AUHU8GW delmartian 1 1 5 1303862400 Good Quality Dog Food I have bought several of the Vitality canned d...
1 2 B00813GRG4 A1D87F6ZCVE5NK dll pa 0 0 1 1346976000 Not as Advertised Product arrived labeled as Jumbo Salted Peanut...
2 3 B000LQOCH0 ABXLMWJIXXAIN Natalia Corres "Natalia Corres" 1 1 4 1219017600 "Delight" says it all This is a confection that has been around a fe...
3 4 B000UA0QIQ A395BORC6FGVXV Karl 3 3 2 1307923200 Cough Medicine If you are looking for the secret ingredient i...
4 5 B006K2ZZ7K A1UQRSCLF8GW1T Michael D. Bigham "M. Wassir" 0 0 5 1350777600 Great taffy Great taffy at a great price. There was a wid...
In [6]:
df_count_prcnt = data.Score.value_counts()
print(df_count_prcnt)
5    363122
4     80655
1     52268
3     42640
2     29769
Name: Score, dtype: int64
In [7]:
print(df_count_prcnt).sum()
568454
In [8]:
def compute_percentage(x):
    pct = float(x/df_count_prcnt.sum()) * 100
    return round(pct, 2)
In [9]:
score_prnct = compute_percentage(df_count_prnct)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-3956bcb3bba6> in <module>()
----> 1 score_prnct = compute_percentage(df_count_prnct)

NameError: name 'df_count_prnct' is not defined
In [15]:
df_count_prcnt = data.Score.value_counts()
def compute_percentage(x):
    pct = (x/df_count_prcnt.sum()) * 100
    return pct
score_prcnt = compute_percentage(df_count_prcnt)
print(score_prcnt)
score_prcnt.plot(kind="bar", colormap='jet')
5    63.878871
4    14.188483
1     9.194763
3     7.501047
2     5.236835
Name: Score, dtype: float64
Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a45e320>
In [17]:
data['datetime'] = pd.to_datetime(data["Time"], unit='s')
In [18]:
data_grp = data.groupby([data.datetime.dt.year, data.datetime.dt.month, data.Score]).count()['ProductId'].unstack().fillna(0)
In [19]:
data_grp.plot(figsize=(20,10), rot=45, colormap='jet')
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a45ef28>
In [20]:
data_grp.plot(kind="bar",figsize=(30,10), stacked=True, colormap='jet')
Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x19f59e10>