A/B testing: A step-by-step guide in Python

From experimental design to hypothesis testing

1. Designing our experiment

Formulating a hypothesis
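Since we don't know in advance whether the new design will perform better or worse than the current one, the two-sided z-test used later in the analysis corresponds to a two-tailed hypothesis:

H0: p = p0
Ha: p ≠ p0

where p and p0 stand for the conversion rate of the new and old design, respectively. We also work at a 95% confidence level, i.e. the alpha = 0.05 that appears in both the power calculation and the z-test below.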

Choosing the variables

Choosing a sample size

# Package imports
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.stats.api as sms
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
%matplotlib inline

# Some plot styling preferences
plt.style.use('seaborn-whitegrid')
font = {'family': 'Helvetica',
        'weight': 'bold',
        'size': 14}
mpl.rc('font', **font)

effect_size = sms.proportion_effectsize(0.13, 0.15)    # Calculating effect size based on our expected rates

required_n = sms.NormalIndPower().solve_power(
    effect_size,
    power=0.8,
    alpha=0.05,
    ratio=1
    )                                                   # Calculating sample size needed

required_n = ceil(required_n)                           # Rounding up to next whole number

print(required_n)
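As a rough sanity check on what solve_power is doing, the same number can be reproduced with the textbook normal-approximation formula for two equally sized groups, n per group = 2 * ((z_(1-alpha/2) + z_(power)) / h)^2, where h is the Cohen's h value returned by proportion_effectsize. The snippet below is only an illustration of that formula, not part of the original analysis:

# Sanity check (illustrative only): recompute the sample size by hand
z_alpha = stats.norm.ppf(1 - 0.05 / 2)   # critical value for a two-sided test at alpha = 0.05
z_power = stats.norm.ppf(0.8)            # z-score corresponding to 80% power
n_manual = 2 * ((z_alpha + z_power) / abs(effect_size)) ** 2
print(ceil(n_manual))                    # should land very close to required_n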

2. Collecting and preparing the data

df = pd.read_csv('ab_data.csv')
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   user_id       294478 non-null  int64
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64
dtypes: int64(2), object(3)
memory usage: 11.2+ MB

# To make sure all the control group are seeing the old page and vice versa
pd.crosstab(df['group'], df['landing_page'])
session_counts = df['user_id'].value_counts(ascending=False)
multi_users = session_counts[session_counts > 1].count()
print(f'There are {multi_users} users that appear multiple times in the dataset')
users_to_drop = session_counts[session_counts > 1].index
df = df[~df['user_id'].isin(users_to_drop)]
print(f'The updated dataset now has {df.shape[0]} entries')

Sampling

control_sample = df[df['group'] == 'control'].sample(n=required_n, random_state=22)
treatment_sample = df[df['group'] == 'treatment'].sample(n=required_n, random_state=22)
ab_test = pd.concat([control_sample, treatment_sample], axis=0)
ab_test.reset_index(drop=True, inplace=True)
ab_test
ab_test.info()
ab_test['group'].value_counts()

3. Visualising the results

conversion_rates = ab_test.groupby('group')['converted']

std_p = lambda x: np.std(x, ddof=0)      # Std. deviation of the proportion
se_p = lambda x: stats.sem(x, ddof=0)    # Std. error of the proportion (std / sqrt(n))
conversion_rates = conversion_rates.agg([np.mean, std_p, se_p])
conversion_rates.columns = ['conversion_rate', 'std_deviation', 'std_error']
conversion_rates.style.format('{:.3f}')
plt.figure(figsize=(8,6))

sns.barplot(x=ab_test['group'], y=ab_test['converted'], ci=False)

plt.ylim(0, 0.17)
plt.title('Conversion rate by group', pad=20)
plt.xlabel('Group', labelpad=15)
plt.ylabel('Converted (proportion)', labelpad=15);

4. Testing the hypothesis

from statsmodels.stats.proportion import proportions_ztest, proportion_confint

control_results = ab_test[ab_test['group'] == 'control']['converted']
treatment_results = ab_test[ab_test['group'] == 'treatment']['converted']

n_con = control_results.count()
n_treat = treatment_results.count()
successes = [control_results.sum(), treatment_results.sum()]
nobs = [n_con, n_treat]
# Two-sided z-test on the two conversion proportions
z_stat, pval = proportions_ztest(successes, nobs=nobs)

# 95% confidence interval for each group's conversion rate
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(successes, nobs=nobs, alpha=0.05)
print(f'z statistic: {z_stat:.2f}')
print(f'p-value: {pval:.3f}')
print(f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]')
print(f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]')

5. Drawing conclusions
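With the p-value and the confidence intervals printed above, the call comes down to comparing pval with the alpha = 0.05 we fixed when designing the experiment. A minimal sketch of that decision rule, reusing the pval variable from the previous step:

alpha = 0.05   # significance level chosen during the design phase

if pval < alpha:
    print("Reject the null hypothesis: the difference in conversion rates is statistically significant")
else:
    print("Fail to reject the null hypothesis: no evidence that the new design converts differently")

It is also worth checking whether the 95% confidence interval for the treatment group includes the 13% baseline rate and reaches the 15% target we used when sizing the experiment.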
