Statistics overview

!pip install -q -r requirements.txt
^C
import numpy as np
import pandas as pd
# Colab form fields: sizes of the two synthetic samples.
number1 = 319 #@param {type:"slider", min:3, max:1000, step:1}
number2 = 321 #@param {type:"slider", min:3, max:1000, step:1}
#@markdown ---
# Colab form fields: means of the two normal distributions.
mu1 = 0 #@param {type:"slider", min:-10, max:10, step:0.01}
mu2 = 0.14 #@param {type:"slider", min:-10, max:10, step:0.01}
#@title
# Standard deviation used when drawing the normal samples.
sigma = 0.1

# Sample 1: `number1` draws from N(mu1, sigma), stored in column 'Area'
# and tagged with the group label '1'.
s1 = np.random.normal(loc=mu1, scale=sigma, size=number1)
d1 = pd.DataFrame({'Area': s1}).assign(Label='1')
d1.head()
Area Label
0 0.036129 1
1 -0.064217 1
2 -0.085285 1
3 -0.100871 1
4 0.078571 1
# Sample 2: `number2` draws from N(mu2, sigma), stored in column 'Area'
# and tagged with the group label '2'.
s2 = np.random.normal(loc=mu2, scale=sigma, size=number2)
d2 = pd.DataFrame({'Area': s2}).assign(Label='2')
d2.head()
Area Label
0 0.272011 2
1 0.077220 2
2 -0.007429 2
3 0.097255 2
4 0.124592 2
# Stack both samples into one long-format table ('Area' values + 'Label'
# group). ignore_index=True renumbers the rows 0..N-1; without it the
# combined frame would carry each input's own 0-based index and therefore
# contain duplicate index labels.
result = pd.concat([d1, d2], ignore_index=True)
result.head()
Area Label
0 0.036129 1
1 -0.064217 1
2 -0.085285 1
3 -0.100871 1
4 0.078571 1
import bokeh.plotting
import bokeh.io

# NOTE: importing bokeh_catplot emits a DeprecationWarning recommending
# iqplot instead. Uncomment the next line to (re)install bokeh-catplot.
#!pip install --upgrade bokeh-catplot
import bokeh_catplot

# Route Bokeh output to the notebook so figures render inline.
bokeh.io.output_notebook()
C:\Users\schatzm\Anaconda3\envs\julab\lib\site-packages\bokeh_catplot\__init__.py:13: DeprecationWarning: bokeh-catplot is deprecated. Use iqplot instead.
  warnings.warn("bokeh-catplot is deprecated. Use iqplot instead.", DeprecationWarning)
Loading BokehJS ...
# Histogram of the 'Area' values, split by the 'Label' category.
p = bokeh_catplot.histogram(data=result, cats='Label', val='Area')

bokeh.io.show(p)
# First layer: horizontal strip plot with jittered, semi-transparent markers.
p = bokeh_catplot.strip(
    data=result,
    cats='Label',
    val='Area',
    horizontal=True,
    jitter=True,
    height=250,
    marker_kwargs={'alpha': 0.5},
)

# Second layer: an unfilled black box plot; the figure created above is
# handed back in through `p=p`.
p = bokeh_catplot.box(
    data=result,
    cats='Label',
    val='Area',
    horizontal=True,
    whisker_caps=True,
    display_points=False,
    outlier_marker='diamond',
    box_kwargs={'fill_color': None, 'line_color': 'black'},
    median_kwargs={'line_color': 'black', 'line_width': 2},
    whisker_kwargs={'line_color': 'black', 'line_width': 2},
    p=p,
)

bokeh.io.show(p)
from scipy.stats import normaltest

# Significance level: reject normality only when p falls below this value.
alpha = 1e-3
# normaltest cannot be computed on fewer than 8 observations, while the
# sample-size sliders go down to 3, so tiny samples are skipped instead of
# letting the cell fail.
min_normaltest_samples = 8

# Run the same test and report for each sample.
for name, values in (('Data1', d1['Area']), ('Data2', d2['Area'])):
    if len(values) < min_normaltest_samples:
        print('{}: normality test skipped (needs at least {} samples, got {})'
              .format(name, min_normaltest_samples, len(values)))
        continue
    # k2 is the test statistic, p the two-sided p-value.
    k2, p = normaltest(values)
    print("p = {:g}".format(p))
    print('null hypothesis: {} comes from a normal distribution'.format(name))
    if p < alpha:
        print("The null hypothesis can be rejected")
    else:
        print("The null hypothesis cannot be rejected")
p = 0.343844
null hypothesis: Data1 comes from a normal distribution
The null hypothesis cannot be rejected
p = 0.402473
null hypothesis: Data2  from a normal distribution
The null hypothesis cannot be rejected
# t-test
from numpy.random import seed
from numpy.random import randn
from scipy.stats import ttest_ind
from random import sample

# Compare equally sized, reproducible subsamples (fixed random_state).
# The size is capped at the number of available rows: the sample-size
# sliders allow fewer than 100 points, and Series.sample raises when asked
# for more rows than exist (sampling is without replacement).
sample_size = min(100, len(d1), len(d2))
data1 = d1['Area'].sample(n=sample_size, random_state=1)
data2 = d2['Area'].sample(n=sample_size, random_state=1)
print('null hypothesis: data sets are from the same distribution')
# Independent two-sample t-test. Strictly, it compares the sample means.
stat, p = ttest_ind(data1, data2)
print('Statistics=%.3f, p=%.16f' % (stat, p))
# Interpret the p-value at the 5 % significance level.
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')
null hypothesis: data sets are from the same distribution
Statistics=-9.765, p=0.0000000000000000
Different distribution (reject H0)
# Mann-Whitney U test
from numpy.random import seed
from numpy.random import randn
from scipy.stats import mannwhitneyu
from random import sample

# Compare equally sized, reproducible subsamples (fixed random_state).
# The size is capped at the number of available rows: the sample-size
# sliders allow fewer than 100 points, and Series.sample raises when asked
# for more rows than exist (sampling is without replacement).
sample_size = min(100, len(d1), len(d2))
data1 = d1['Area'].sample(n=sample_size, random_state=1)
data2 = d2['Area'].sample(n=sample_size, random_state=1)
print('null hypothesis: data sets are from the same distribution')
# Rank-based test; unlike the t-test it does not assume normal data.
stat, p = mannwhitneyu(data1, data2)
print('Statistics=%.3f, p=%.16f' % (stat, p))
# Interpret the p-value at the 5 % significance level.
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')
null hypothesis: data sets are from the same distribution
Statistics=1682.000, p=0.0000000000000005
Different distribution (reject H0)
from watermark import watermark

# Versions of the modules imported into this notebook's namespace.
# watermark() returns the report as a string, so it must be printed;
# the bare call previously discarded it.
print(watermark(iversions=True, globals_=globals()))
# Default report (no arguments).
print(watermark())
# Versions of the explicitly listed packages.
print(watermark(packages="watermark,numpy,scipy,pandas,matplotlib,bokeh,statannotations"))
Last updated: 2023-01-05T13:43:49.111960+01:00

Python implementation: CPython
Python version       : 3.9.15
IPython version      : 8.8.0

Compiler    : MSC v.1929 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 85 Stepping 7, GenuineIntel
CPU cores   : 40
Architecture: 64bit

watermark      : 2.3.1
numpy          : 1.23.5
scipy          : 1.10.0
pandas         : 1.5.2
matplotlib     : 3.6.2
bokeh          : 3.0.3
statannotations: 0.5.0