# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import datetime
import xgboost as xgb
from scipy import stats
import plotly.express as px
import pandas_profiling as pp
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
# Load the training split of the Steam game price dataset from the Kaggle input mount.
df_train = pd.read_csv('/kaggle/input/logical-rythm-2k20-game-price-prediction/steam_train.csv')
df_train.describe()  # notebook display: summary statistics of the numeric columns
df_train.columns  # notebook display: quick look at all available column names
#pp.ProfileReport(df_train)
Studying the profile report for the various variables, we can comment on many of them:
appid: A unique field used only for identification; it does not contribute to price.
name, developer and publisher: High-cardinality fields that can't readily be used for any kind of feature extraction that might be useful for price prediction.
median_playtime: Has a very high correlation with average_playtime, so one of the two can safely be eliminated.
platforms, categories, genres and steamspy_tags: These columns need to be split into the individual values they contain, using ';' as the separator.
# Report the number of missing values in every column.
for col in df_train.columns:
    missing = df_train[col].isnull().sum()
    print(col + " \t: " + str(missing))
Since there are no null values in the dataset, we do not need to do any kind of data cleaning. Thus we will move on to feature engineering.
column = 'appid'
name = 'App-ID'
# Build a human-readable hover label ("App-ID : <id>") for the plots below.
# Vectorized string concatenation replaces the original Python-level row loop
# with chained indexing (df_train[column][i]); the resulting values are identical.
df_train[column + '_visual'] = name + ' : ' + df_train[column].astype(str)
# Box plot of the sale price, colored by the owners bracket.
# Hovering a point shows the app id and name.
fig = px.box(
    data_frame=df_train.reset_index(),
    y='price',
    color='owners',
    hover_name='appid_visual',
    hover_data=['name'],
    labels={'price': 'Sale Price in "$"',
            "owners": "Total Downloads",
            'name': 'Name'},
    title='Box plot of the sale price(Hover for details)',
    width=1000,
    height=800,
)
fig.show()
As we can see from the box plot there are a lot of outliers, so we set a threshold for the price.
Here I have taken it to be $30 (a purely arbitrary choice).
# Drop price outliers above the threshold.
# The original dropped rows one at a time inside a loop (O(n^2) — each
# DataFrame.drop copies the frame); a single boolean-mask filter does the
# same thing in one pass and keeps the same count of removed rows.
removed = 0
threshold = 30  # price cap in $ (arbitrary choice, see note above)
outlier_mask = df_train['price'] > threshold
removed = int(outlier_mask.sum())
df_train = df_train[~outlier_mask]
print('Total number of data points removed till now are: ' + str(removed))
Since we do not want to remove too many data points, we will keep track of how many data points have been removed. Here we have removed less than 1% of the total dataset — the points that look like outliers.
# Re-draw the price box plot after outlier removal to check the new scale.
fig = px.box(
    data_frame=df_train.reset_index(),
    y='price',
    color='owners',
    hover_name='appid_visual',
    hover_data=['name'],
    labels={'price': 'Sale Price in "$"',
            "owners": "Total Downloads",
            'name': 'Name'},
    title='Box plot of the sale price(Hover for details)',
    width=1000,
    height=800,
)
fig.show()
As you can see the resulting plot looks much more well scaled over the dataset.
There are mainly 6 categorical columns that we really care about so we do preprocessing on those only.
The four columns platforms, categories, genres and steamspy_tags need to be separated for the values given in these columns.
Owners: For this column we can take the number of owners as the average of the upper and lower limits of the class. Although this will not be completely accurate, it will definitely give us more flexibility to work with the column.
Release_date: For this column we can take the difference in days from today, which gives us the number of days the app has been on the store.
# One-hot encode the ';'-separated multi-label columns.
# The original built each indicator column with a hand-rolled O(rows x labels)
# double loop and iterated a set (nondeterministic column order across runs);
# Series.str.get_dummies does the same split-and-indicate in one deterministic
# C-level pass, and the label count is computed once instead of twice.
encoding_columns = ['platforms', 'categories', 'genres', 'steamspy_tags']
unique_sets = []
for column in encoding_columns:
    # Indicator matrix: one 0/1 column per distinct ';'-separated label.
    dummies = df_train[column].str.get_dummies(sep=';')
    unique_set = []
    for label in dummies.columns:
        count = dummies[label].sum()
        # Keep only labels that are neither too rare (<=250 rows) nor
        # near-universal (>=24500 rows) — both extremes carry little signal.
        if 250 < count < 24500:
            new_col = column + "_" + label
            unique_set.append(new_col)
            df_train[new_col] = dummies[label]
    unique_sets.append(unique_set)
# Convert the 'owners' range string (e.g. "20000-50000") into a single
# numeric estimate: the midpoint of the bracket's bounds.
midpoints = []
for bracket in df_train['owners']:
    bounds = bracket.split('-')
    midpoints.append(sum(int(b) for b in bounds) / len(bounds))
df_train['Owners'] = midpoints
# Convert release_date ("YYYY-MM-DD") into the number of days the game has
# been on the store. Improvements over the original: today's date is computed
# once instead of on every iteration, and the manual split/int parsing is
# replaced by date.fromisoformat, which parses exactly this format.
days_encoding = []
today = datetime.date.today()  # hoisted loop invariant
for date_str in df_train['release_date']:
    released = datetime.date.fromisoformat(date_str)
    days_encoding.append((today - released).days)
df_train['days_in_store'] = days_encoding
df_train  # notebook display of the augmented frame
We have removed any fields where more than 99% of the values are the same, as these are not useful for the model.
df_train.columns  # notebook display: columns after feature engineering
useful_classes = set()  # classes that show significant correlation with price
# NOTE(review): the original comment claimed a 0.05 cutoff, but the value is 0,
# so every class passes the |corr| >= threshold filter below — TODO confirm intent.
correlation_threshold = 0
height = 550
title = '<b>Correlation Matrix for the dataset:</b>'
colors = 'Viridis'
#-------------------------------------------------------------------------#
classes = ['required_age', 'achievements', 'days_in_store', 'positive_ratings',
           'negative_ratings','average_playtime','Owners','price']
#-------------------------------------------------------------------------#
# Compute the correlation matrix once (the original called .corr() twice)
# and removed the no-op assignment `df_train = df_train`.
correlation = df_train[classes].corr()
correlation_mat = correlation.to_numpy()
# Truncate (floor-divide, not round) to 4 decimals for hover text
# and 2 decimals for the cell annotations.
correlation_mat = (correlation_mat // 0.0001) / 10000
correlation_mat_norm = (correlation_mat // 0.01) / 100
fig = ff.create_annotated_heatmap(correlation_mat, x=classes, y=classes,
                                  annotation_text=correlation_mat_norm,
                                  colorscale=colors, text=correlation_mat,
                                  hovertemplate='Column: %{x}<br>'+
                                                'Row: %{y}<br>'+
                                                'Correlation: %{text}<extra></extra>')
# 1.618 ~ golden ratio for a pleasant aspect; y-axis reversed so the matrix
# reads top-to-bottom like the class list.
fig.update_layout(title_text=title, width=(height*(1.618))//1, height=height,
                  xaxis={'title': 'Columns'},
                  yaxis={'title': 'Rows', 'autorange': 'reversed'})
fig.update_traces(showscale=True)
fig.show()
Now we can see that only two columns show a significant correlation with price which are required_age and days_in_store. The reason that these show correlation are:
required_age: The reason behind this correlation is that people who are older and earning are more likely to spend on an app, and thus the game companies price such apps higher.
days_in_store: At first it might seem counter-intuitive for this column to show a positive correlation considering inflation but it seems obvious considering the rise of free-to-play games which according to many studies make more money through in-app-purchases compared to pay-to-play games.
The columns positive_ratings and negative_ratings do not seem to have any significant correlation with price, but they have a huge correlation with the number of owners, so it seems reasonable to normalise these columns by dividing them by the owners.
We can also introduce a new variable likeliness defined as the difference of the positive and negative ratings and normalised likeliness when likeliness is divided by the number of owners
# Ratings normalised by the (approximate) owner count, plus a derived
# "likeliness" score and a further-normalised variant.
# NOTE(review): as coded, 'likeliness' is already divided by Owners, so
# 'likeliness_norm' is (pos - neg) / Owners^2 — confirm this matches the
# prose definition above.
owners = df_train['Owners']
df_train['positive_ratings_norm'] = df_train['positive_ratings'] / owners
df_train['negative_ratings_norm'] = df_train['negative_ratings'] / owners
df_train['likeliness'] = (df_train['positive_ratings'] - df_train['negative_ratings']) / owners
df_train['likeliness_norm'] = df_train['likeliness'] / owners
# Second correlation heatmap, now including the derived/normalised columns.
height = 700
title = '<b>Correlation Matrix for the dataset:</b>'
colors = 'Viridis'
#-------------------------------------------------------------------------#
classes = ['required_age', 'achievements', 'days_in_store', 'positive_ratings',
           'negative_ratings','positive_ratings_norm','negative_ratings_norm',
           'average_playtime','Owners','likeliness','likeliness_norm','price']
#-------------------------------------------------------------------------#
# Compute the correlation matrix once (the original called .corr() twice)
# and removed the no-op assignment `df_train = df_train`.
correlation = df_train[classes].corr()
correlation_mat = correlation.to_numpy()
# Truncate (floor-divide, not round) to 4 decimals for hover text
# and 2 decimals for the cell annotations.
correlation_mat = (correlation_mat // 0.0001) / 10000
correlation_mat_norm = (correlation_mat // 0.01) / 100
fig = ff.create_annotated_heatmap(correlation_mat, x=classes, y=classes,
                                  annotation_text=correlation_mat_norm,
                                  colorscale=colors, text=correlation_mat,
                                  hovertemplate='Column: %{x}<br>'+
                                                'Row: %{y}<br>'+
                                                'Correlation: %{text}<extra></extra>')
fig.update_layout(title_text=title, width=(height*(1.618))//1, height=height,
                  xaxis={'title': 'Columns'},
                  yaxis={'title': 'Rows', 'autorange': 'reversed'})
fig.update_traces(showscale=True)
fig.show()
# Collect every class whose absolute correlation with price clears the threshold.
useful_classes.update(
    label for label in classes
    if abs(correlation[label]['price']) >= correlation_threshold
)
We see 4 more columns that show a significant correlation with price — positive_ratings_norm, negative_ratings_norm, likeliness and likeliness_norm — of which the correlation for normalised positive ratings was expected. For normalised negative ratings it is positive because games that are popular receive more of both positive and negative ratings.
The positive correlation for normalised likeliness I think has more to do with the derived nature of the variable than its significance as a whole but the positive correlation for likeliness is something new as neither positive_ratings nor negative_ratings have any significant correlation with price directly.
# Map each Owners midpoint to its 1-based rank among the sorted unique values,
# used as a discrete marker color below. The original called list.index inside
# the row loop (O(n) per row); a precomputed dict makes each lookup O(1) while
# producing exactly the same ranks.
values = sorted(df_train['Owners'].unique())
rank_of = {owner: rank for rank, owner in enumerate(values, start=1)}
df_train['owners_color'] = [rank_of[owner] for owner in df_train['Owners']]
# Interactive scatter of price vs. each remaining feature: one hidden trace per
# feature, revealed one at a time by a slider.
fig = go.Figure()
classes = [c for c in classes if c != 'price']

def _pretty(label):
    # "days_in_store" -> "Days In Store" for hover text and slider labels.
    return " ".join(part.capitalize() for part in label.split("_"))

for feature in classes:
    hover = ('<b>%{text}</b><br>Sale Price in "$": %{y}<br>'
             + _pretty(feature) + ': %{x}<extra></extra>')
    fig.add_trace(go.Scattergl(
        visible=False,
        mode='markers',
        marker={'color': df_train['owners_color']},
        text=df_train[['appid_visual']],
        x=df_train[feature],
        y=df_train['price'],
        hovertemplate=hover,
    ))

# Only the first trace starts visible; the slider toggles the rest.
fig.data[0].visible = True

n_traces = len(fig.data)
steps = []
for idx, feature in enumerate(classes):
    visibility = [False] * n_traces
    visibility[idx] = True  # show exactly one trace per slider position
    steps.append(dict(
        method="update",
        label=_pretty(feature),
        args=[{"visible": visibility},
              {"title": "Slider switched to step: " + str(feature)}],
    ))

sliders = [dict(
    active=0,
    currentvalue={"prefix": "Scatter plot for Price vs "},
    pad={"t": 50},
    steps=steps,
)]
fig.update_layout(
    sliders=sliders
)
fig.show()