Mobile Games' User Behavior (User Retention): A/B Testing


Resources

First of all, thanks to Aurelia Sui for the dataset they shared on Kaggle. The dataset is about an A/B test with a mobile game, Cookie Cats.

1. Project background

This original data is based on a project from Datacamp.

Cookie Cats is a hugely popular mobile puzzle game developed by Tactile Entertainment. It's a classic "connect three"-style puzzle game where the player must connect tiles of the same color to clear the board and win the level. It also features singing cats.

As players progress through the levels of the game, they will occasionally encounter gates that force them to wait a non-trivial amount of time or make an in-app purchase to progress. In addition to driving in-app purchases, these gates serve the important purpose of giving players an enforced break from playing the game, hopefully resulting in the player's enjoyment of the game being increased and prolonged.

1.2 AB Testing Process

Here are the main steps of the A/B testing process in this project:

  1. Understand the business problem & data
  2. Detect and resolve problems in the data (missing values, outliers, unexpected values)
  3. Look at summary stats and plots
  4. Approach 1: Apply hypothesis testing and check assumptions
    • Check normality & homogeneity
    • Apply tests (Shapiro, Levene test, t-test, Welch test, Mann-Whitney U test)
  5. Approach 2: Apply a computational test (bootstrap)
  6. Evaluate the results
  7. Make inferences
  8. Recommend a business decision to your customer/director/CEO, etc.

2. About the data

2.1 Data description

(Data description from Aurelia Sui). The data is from 90,189 players that installed the game while the AB-test was running.

The variables are:

  • userid: a unique number that identifies each player.
  • version: whether the player was put in the control group (gate_30, a gate at level 30) or the test group (gate_40, a gate at level 40).
  • sum_gamerounds: the number of game rounds played by the player after install.
  • retention_1: did the player come back and play 1 day after installing?
  • retention_7: did the player come back and play 7 days after installing?

When a player installed the game, he or she was randomly assigned to either gate_30 or gate_40.
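As a quick sanity check on the randomisation (a minimal sketch, run after the data is loaded in section 3.2 below; the assignment is assumed to be stored in the version column):

```python
# Players assigned to each gate group; the two counts should be roughly equal
print(df["version"].value_counts())
```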

3. Analyzing Player Behavior

3.1 Import packages for the project
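A minimal sketch of the imports this analysis relies on (assuming pandas, NumPy, Plotly Express, and SciPy are the libraries behind the plots and tests below):

```python
# Core data handling
import pandas as pd
import numpy as np

# Interactive plots (Plotly Express is used for the histograms and faceting below)
import plotly.express as px

# Statistical tests: Shapiro, Levene, t-test/Welch, Mann-Whitney U, normal CDF
from scipy import stats
```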

3.2 Read and check the data
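A sketch of the loading step, assuming the Kaggle file is named cookie_cats.csv (the exact path may differ):

```python
# Load the A/B test data (file name assumed from the Kaggle dataset)
df = pd.read_csv("cookie_cats.csv")

# Quick structural checks: shape, types, missing values, duplicated players
print(df.shape)
print(df.dtypes)
print(df.isna().sum())
print(df["userid"].duplicated().sum())
df.head()
```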

3.3 Check player behavior data statistics

Note: for subplots, Plotly Express does not work with the plotly.subplots module; instead, it supports faceting by a given data dimension via its 'facet_col' and 'facet_row' parameters.
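For example, a histogram of game rounds split into one panel per A/B group can be drawn directly with Plotly Express (a sketch, assuming the columns are named sum_gamerounds and version):

```python
# One histogram panel per group (gate_30 / gate_40) using facet_col
fig = px.histogram(
    df,
    x="sum_gamerounds",
    facet_col="version",  # faceting replaces plotly.subplots here
    nbins=100,
)
fig.show()
```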

3.4 Check the outlier

3.5 Remove the outlier
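A sketch of the outlier check and removal, assuming the single extreme value sits in sum_gamerounds:

```python
# Inspect the largest game-round counts to spot the extreme record
print(df["sum_gamerounds"].describe())
print(df.sort_values("sum_gamerounds", ascending=False).head())

# Drop the single maximum value and keep everything else
df = df[df["sum_gamerounds"] < df["sum_gamerounds"].max()]
```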

3.6 Check data after removing the outlier

3.7 Further details on player behavior

3.7.1 Users installed the game, but 3,994 players never played it.

  1. Players just installed the game, but were too busy to play or forgot about it afterwards.
  2. Players briefly opened or played the game, but didn't like it and didn't proceed any further.
  3. Players were distracted and played other games instead.
  4. Players were not really interested in games, and decided to spend time on social media, watching videos, etc.

3.7.2 In both A/B groups, many players didn't even reach game round 30 or 40, where the gate was set.

While testing whether the gate makes a difference in the user retention rate, note that there were users who didn't reach the gated game rounds at all.

Plot the cumulative histogram of the number of players along game rounds:
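A sketch of that plot with Plotly Express (assuming the cleaned data frame df and the sum_gamerounds column):

```python
# Cumulative count of players along game rounds; hovering shows how many
# players stopped at or below a given round
fig = px.histogram(
    df,
    x="sum_gamerounds",
    cumulative=True,
    nbins=500,
)
fig.show()
```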

  1. 70% of players (63.301K when hovering over the plot) didn't reach game round 40.
  2. About 64% of players (57.562K) didn't reach round 30.
  3. Only 32.624K users played beyond game round 30, and 26.885K users played beyond round 40.
  4. Note that for players who didn't reach the gate (below round 30 or 40), the data can't capture/measure the user experience of the gate setting.
  5. Most users played the game only at a very early stage and didn't play any further: the data shows 50% of users only reached round 16. More research needs to be conducted on user churn and the reasons behind it.
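These figures can be checked directly from the data (a sketch using the cleaned data frame df):

```python
# Share of players who never reached the gated rounds, and the median round reached
print((df["sum_gamerounds"] < 40).mean())  # ~70% never reached round 40
print((df["sum_gamerounds"] < 30).mean())  # ~64% never reached round 30
print(df["sum_gamerounds"].median())       # ~round 16
```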

4. Comparing 1-day retention

4.1 Basics on retention data

  1. retention_1: did the player come back and play 1 day after installing?
  2. retention_7: did the player come back and play 7 days after installing?

The retention rates of players who came back 1 day and 7 days after installing the app are 44.5% and 18.6%, respectively.
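These overall rates can be reproduced with a simple mean over the boolean retention flags (a sketch assuming the columns retention_1 and retention_7):

```python
# Overall share of players returning 1 day and 7 days after install
print(df[["retention_1", "retention_7"]].mean())

# The same rates split by A/B group
print(df.groupby("version")[["retention_1", "retention_7"]].mean())
```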

4.2 AB testing

1-day retention is a common metric for measuring how fun and engaging a game is.

4.2.1 Hypothesis

Null hypothesis: the retention rate of group A is equal to that of group B.

$$H_{0}: (\text{Retention}_{1})_{a} - (\text{Retention}_{1})_{b} = 0$$

Alternative hypothesis: the retention rate of group A is statistically significantly larger than that of group B.

$$H_{1}: (\text{Retention}_{1})_{a} - (\text{Retention}_{1})_{b} > 0$$

Note:

We can only test whether the H0 hypothesis is true; the H1 hypothesis cannot be tested directly. If we reject H0, then we accept H1.

4.2.2 Parametric model-based A/B testing

Unpaired (independent) t-test

Note:

a. An unpaired test is a much harder criterion than a paired test. You can't track individual samples; you have to compare the two groups as wholes and make very strong statements about whether or not the distributions are truly different.

b. A paired test is much more powerful, because it can look at how each individual sample in the original distribution shifts to the new one.

  1. Calculate the difference between the two sample means, $(\bar P_{1} - \bar P_{2})$
  2. Under the $H_{0}$ hypothesis, $$ \mu_{1} - \mu_{2}=0 $$
  3. Calculate the pooled standard deviation $$S_{pool} =\sqrt{ \frac{(n_{1}-1)S_{1}^2+(n_{2}-1)S_{2}^2}{n_{1}+n_{2} - 2}}$$
  4. Calculate the standard error of the difference between the means $$SE(\hat P_{1}- \hat P_{2})=S_{pool}\sqrt{\frac{1}{n_{1}}+\frac{1}{n_{2}}}$$
  5. Calculate the T value: $$T=\frac{(\bar P_{1} - \bar P_{2}) - 0}{SE(\hat P_{1}- \hat P_{2})}$$ which follows a t-distribution with $(n_{1}+n_{2}-2)$ degrees of freedom
  6. Note: for the unpaired t-test to be valid, the samples should be roughly normally distributed and should have approximately equal variances. If the variances are clearly unequal, we must use (Welch's correction): $$SE(\hat P_{1} - \hat P_{2})=\sqrt{\frac{S_{1}^2}{n_{1}}+\frac{S_{2}^2}{n_{2}}}$$
  7. If $n_{1},n_{2}$ are reasonably large, the statistic approximately follows a standard normal distribution: $$\frac{(\bar P_{1}- \bar P_{2})}{SE(\hat P_{1} - \hat P_{2})} \sim N(0,1)$$
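A minimal sketch of how the assumption checks and tests listed in the process (Shapiro, Levene, t-test/Welch, Mann-Whitney U) might be run with SciPy, assuming df holds the cleaned data with columns version and retention_1:

```python
# Split 1-day retention (0/1) by A/B group
a = df.loc[df["version"] == "gate_30", "retention_1"].astype(int)
b = df.loc[df["version"] == "gate_40", "retention_1"].astype(int)

# Normality per group (Shapiro-Wilk) and homogeneity of variances (Levene)
print(stats.shapiro(a.sample(5000, random_state=42)))  # Shapiro is unreliable for very large n
print(stats.shapiro(b.sample(5000, random_state=42)))
print(stats.levene(a, b))

# Parametric: Student's t-test (equal_var=True) or Welch's test (equal_var=False)
print(stats.ttest_ind(a, b, equal_var=False))

# Non-parametric alternative if the normality assumption fails
print(stats.mannwhitneyu(a, b, alternative="two-sided"))
```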
For the retention rates treated as proportions, the corresponding two-proportion z-test can be set up as follows:

  1. Hypothesis:

$H_{0}: P_{1}-P_{2}=0$

$H_{1}: P_{1}-P_{2}>0$

  2. Set the significance level $\alpha = 0.05$

  3. Calculate the variance of the sample statistic $(\hat{P_{1}}-\hat{P_{2}})$.

Note: in order to estimate the variance analytically, we assume the probability metrics follow a binomial distribution.

Variance of group A: $$\sigma^{2}_\hat{{P}_{1}} = \frac{\hat{{P}_1}(1-\hat{{P}_1})}{n_{1}}$$

Variance of group B: $$\sigma^{2}_\hat{{P}_{2}}= \frac{\hat{{P}_2}(1-\hat{{P}_2})}{n_{2}}$$

$$\sigma^{2}_{\hat{P}_{2}-\hat{P}_{1}} =\sigma^{2}_\hat{{P}_{1}}+\sigma^{2}_\hat{{P}_{2}} =\frac{\hat{{P}_1}(1-\hat{{P}_1})}{n_{1}}+\frac{\hat{{P}_2}(1-\hat{{P}_2})}{n_{2}}$$

Under the $H_{0}$ hypothesis, assume groups A and B follow binomial distributions with the same mean $\mu$ and variance $\sigma^{2}$.

$$\mu= P_{pool} = \frac{n_{1}\hat{P_{1}} + n_{2}\hat{P_{2}}}{n_{1}+n_{2}}$$

$$\sigma^{2}_{\hat{P}_{2}-\hat{P}_{1}} =\sigma^{2}_\hat{{P}_{1}}+\sigma^{2}_\hat{{P}_{2}} =P_{pool}(1-P_{pool})\left(\frac{1}{n_{1}}+\frac{1}{n_{2}}\right)$$
  4. For the statistic $(P_{1}-P_{2})$, if $n_{1},n_{2}$ are reasonably large, given the condition:
    • np>5, and n(1-p)>5
    • or np>10, and n(1-p)>10

we can approximate the binomial distribution by a normal distribution, with a mean of $(\bar p_{1} - \bar p_{2})$ and a standard deviation of $$\sqrt{\sigma^{2}_\hat{{P}_{1}}+\sigma^{2}_\hat{{P}_{2}}}=\sqrt{P_{pool}(1-P_{pool})\left(\frac{1}{n_{1}}+\frac{1}{n_{2}}\right)}$$

This normal approximation can be written as $$\frac{(\bar P_{1}- \bar P_{2})}{SD(P_{1} - P_{2})} \sim N(0,1)$$ with $$Z=\frac{(\bar P_{1}- \bar P_{2})-(\mu_{1}- \mu_{2})}{\sigma}=\frac{(\bar P_{1}- \bar P_{2})-0}{\sqrt{P_{pool}(1-P_{pool})\left(\frac{1}{n_{1}}+\frac{1}{n_{2}}\right)}}$$

  5. Calculate the critical Z value and compare it with the observed Z statistic:
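A sketch of that pooled Z statistic computed directly from the formulas above (reusing the group splits a and b from the t-test sketch):

```python
# Observed 1-day retention rates and group sizes
p1, p2 = a.mean(), b.mean()
n1, n2 = len(a), len(b)

# Pooled proportion and standard error under H0
p_pool = (n1 * p1 + n2 * p2) / (n1 + n2)
se = np.sqrt(p_pool * (1 - p_pool) * (1 / n1 + 1 / n2))

# Z statistic and one-sided p-value for H1: p1 - p2 > 0
z = (p1 - p2) / se
p_value = 1 - stats.norm.cdf(z)
print(z, p_value)
```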

4.2.3 Computational A/B testing (bootstrapping)
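A minimal bootstrap sketch for the difference in 1-day retention between the two groups (resampling players with replacement; the number of iterations is an arbitrary choice):

```python
# Bootstrap the difference in 1-day retention rate between gate_30 and gate_40
rng = np.random.default_rng(42)
n_iter = 5000
diffs = []
for _ in range(n_iter):
    # Resample each group with replacement and record the difference in means
    boot_a = rng.choice(a.to_numpy(), size=len(a), replace=True)
    boot_b = rng.choice(b.to_numpy(), size=len(b), replace=True)
    diffs.append(boot_a.mean() - boot_b.mean())
diffs = np.array(diffs)

# 95% bootstrap confidence interval for the difference; if it covers 0,
# there is no evidence of a retention difference between the two gates
print(np.percentile(diffs, [2.5, 97.5]))
```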

5. Conclusion

Come back to the business problem!

In this project, an A/B test is conducted to analyze players' behavior when the gate is moved from level 30 to level 40; in particular, the 1-day retention rate is used as the evaluation metric.

Firstly, we performed data sanitisation. There was no missing data, but one outlier was removed from the dataset. We then conducted exploratory analysis, checking the data summary table and plots to understand the data structure.

Before A/B testing, insights on player behavior were generated regarding game rounds. I pointed out the risks and drawbacks of A/B sampling in this test, since 64% of players didn't reach level 30 and 70% of players didn't reach level 40. Including such players in the A/B test might not correctly indicate the impact of the gate, as they never reached the gate at all.

Then, using the 1-day retention rate as the evaluation metric, both a theoretical parameter-based solution and a computation-based solution (bootstrap) were applied. The computation-based solution is recommended: it is easy to implement, can be applied to any dataset, and has no prerequisites on the data distribution.

Neither solution can reject the null hypothesis: no statistically significant difference in the 1-day retention rate was detected between the two groups when the first gate was moved from level 30 to level 40.