Big Data definition

!pip install -q -r requirements.txt

^C

import numpy as np

import pandas as pd

import bokeh.plotting
import bokeh.io

!pip install --upgrade bokeh-catplot
import bokeh_catplot

# Send Bokeh output to the notebook so that figures passed to
# bokeh.io.show() are rendered inline in the output cells.
bokeh.io.output_notebook()

Collecting bokeh-catplot
  Using cached bokeh_catplot-0.1.9-py2.py3-none-any.whl (16 kB)
Collecting xarray
  Using cached xarray-2022.12.0-py3-none-any.whl (969 kB)
Collecting numba
  Using cached numba-0.56.4-cp39-cp39-win_amd64.whl (2.5 MB)
Requirement already satisfied: bokeh in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh-catplot) (3.0.3)
Collecting colorcet
  Using cached colorcet-3.0.1-py2.py3-none-any.whl (1.7 MB)
Requirement already satisfied: numpy in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh-catplot) (1.23.5)
Requirement already satisfied: pandas in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh-catplot) (1.5.2)
Requirement already satisfied: xyzservices>=2021.09.1 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh->bokeh-catplot) (2022.9.0)
Requirement already satisfied: PyYAML>=3.10 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh->bokeh-catplot) (6.0)
Requirement already satisfied: Jinja2>=2.9 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh->bokeh-catplot) (3.1.2)
Requirement already satisfied: pillow>=7.1.0 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh->bokeh-catplot) (9.4.0)
Requirement already satisfied: packaging>=16.8 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh->bokeh-catplot) (22.0)
Requirement already satisfied: tornado>=5.1 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh->bokeh-catplot) (6.2)
Requirement already satisfied: contourpy>=1 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from bokeh->bokeh-catplot) (1.0.6)
Requirement already satisfied: pytz>=2020.1 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from pandas->bokeh-catplot) (2022.7)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from pandas->bokeh-catplot) (2.8.2)
Collecting pyct>=0.4.4
  Using cached pyct-0.4.8-py2.py3-none-any.whl (15 kB)
Requirement already satisfied: setuptools in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from numba->bokeh-catplot) (65.6.3)
Collecting llvmlite<0.40,>=0.39.0dev0
  Using cached llvmlite-0.39.1-cp39-cp39-win_amd64.whl (23.2 MB)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from Jinja2>=2.9->bokeh->bokeh-catplot) (2.1.1)
Requirement already satisfied: param>=1.7.0 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from pyct>=0.4.4->colorcet->bokeh-catplot) (1.12.3)
Requirement already satisfied: six>=1.5 in c:\users\schatzm\anaconda3\envs\julab\lib\site-packages (from python-dateutil>=2.8.1->pandas->bokeh-catplot) (1.16.0)
Installing collected packages: pyct, llvmlite, numba, colorcet, xarray, bokeh-catplot
Successfully installed bokeh-catplot-0.1.9 colorcet-3.0.1 llvmlite-0.39.1 numba-0.56.4 pyct-0.4.8 xarray-2022.12.0

C:\Users\schatzm\Anaconda3\envs\julab\lib\site-packages\bokeh_catplot\__init__.py:13: DeprecationWarning: bokeh-catplot is deprecated. Use iqplot instead.
  warnings.warn("bokeh-catplot is deprecated. Use iqplot instead.", DeprecationWarning)

Loading BokehJS ...

# Location of the first measurement table. The trailing `#@param` marker is
# presumably a notebook form-field annotation (TODO confirm) and has to stay
# on the assignment line.
data1Path = "Dataset/Results_MM.csv" #@param {type:"string"}

# Read the whole CSV into memory as a DataFrame.
data1=pd.read_csv(data1Path)  

# Preview the first rows to check how the columns were parsed.
data1.head()

		Label	Area	Mean	Mode	Min	Max	X	Y	...	%Area	RawIntDen	Slice	FeretX	FeretY	FeretAngle	MinFeret	AR	Round	Solidity
0	1	Data1	2541	56.0	56	56	56	598.54486	170.83176	...	100	142296.0	1	593	142	98.53077	55.75403	1.06228	0.94137	0.97319
1	2	Data1	2419	96.0	96	96	96	2718.95432	624.50827	...	100	232224.0	1	2711	653	72.47443	52.97387	1.10986	0.90102	0.97032
2	3	Data1	1855	4.0	4	4	4	237.42237	661.82938	...	100	7420.0	1	224	683	56.30993	48.00000	1.03663	0.96466	0.96867
3	4	Data1	2596	65.0	65	65	65	293.91371	820.11710	...	100	168740.0	1	266	807	163.66396	56.02472	1.05496	0.94790	0.97320
4	5	Data1	1409	31.0	31	31	31	2998.58375	1046.93364	...	100	43679.0	1	2989	1068	68.42869	39.00000	1.15185	0.86817	0.96739

5 rows × 36 columns

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of data1.
# NOTE(review): the RuntimeWarning emitted here ("invalid value encountered in
# subtract") hints at non-finite values in at least one column — TODO confirm.
data1.describe()

C:\Users\schatzm\Anaconda3\envs\julab\lib\site-packages\numpy\lib\function_base.py:4527: RuntimeWarning: invalid value encountered in subtract
  diff_b_a = subtract(b, a)

		Area	Mean	StdDev	Mode	Min	Max	X	Y	XM	...	%Area	RawIntDen	Slice	FeretX	FeretY	FeretAngle	MinFeret	AR	Round	Solidity
count	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	...	1615323.0	1.615323e+06	1615323.0	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06	1.615323e+06
mean	8.076620e+05	1.331084e+02	6.671007e+03	2.810688e+02	6.488793e+03	6.444779e+03	7.109614e+03	1.440901e+03	1.394038e+03	1.440899e+03	...	100.0	4.633171e+05	1.0	1.436129e+03	1.392875e+03	1.049987e+02	1.030235e+01	1.249056e+00	8.442305e-01	8.899525e-01
std	4.663037e+05	2.516350e+02	6.086050e+03	1.091096e+03	6.101919e+03	6.101029e+03	6.399855e+03	8.383394e+02	8.243418e+02	8.383402e+02	...	0.0	7.302585e+05	0.0	8.382947e+02	8.243501e+02	4.822601e+01	6.097962e+00	3.688458e-01	1.536807e-01	5.326279e-02
min	1.000000e+00	1.200000e+01	1.000000e+00	0.000000e+00	1.000000e+00	1.000000e+00	1.000000e+00	3.409090e+00	3.166670e+00	3.409090e+00	...	100.0	7.000000e+01	1.0	1.000000e+00	1.000000e+00	1.507440e+00	3.000000e+00	1.000000e+00	1.111700e-01	3.585100e-01
25%	4.038315e+05	3.700000e+01	1.956000e+03	0.000000e+00	1.768000e+03	1.723000e+03	2.069000e+03	7.161416e+02	6.896124e+02	7.161596e+02	...	100.0	1.616980e+05	1.0	7.120000e+02	6.880000e+02	5.446232e+01	7.000000e+00	1.061510e+00	8.218400e-01	8.709700e-01
50%	8.076620e+05	5.800000e+01	4.868000e+03	0.000000e+00	4.595000e+03	4.534000e+03	5.249000e+03	1.434662e+03	1.368470e+03	1.434676e+03	...	100.0	3.169700e+05	1.0	1.430000e+03	1.367000e+03	1.255377e+02	8.000000e+00	1.109590e+00	9.012400e-01	9.014100e-01
75%	1.211492e+06	1.160000e+02	9.709000e+03	0.000000e+00	9.492000e+03	9.442000e+03	1.045600e+04	2.148138e+03	2.046322e+03	2.148138e+03	...	100.0	5.530990e+05	1.0	2.143000e+03	2.045000e+03	1.444623e+02	1.100000e+01	1.216780e+00	9.420500e-01	9.230800e-01
max	1.615323e+06	2.469600e+04	3.233200e+04	1.428012e+04	3.233200e+04	3.233200e+04	3.233200e+04	3.189167e+03	3.226500e+03	3.189167e+03	...	100.0	1.135496e+08	1.0	3.187000e+03	3.228000e+03	1.791449e+02	1.612912e+02	8.994840e+00	1.000000e+00	1.000000e+00

8 rows × 35 columns

# Location of the second measurement table. The trailing `#@param` marker is
# presumably a notebook form-field annotation (TODO confirm) and has to stay
# on the assignment line.
data2Path = "Dataset/Results_aptamil.csv" #@param {type:"string"}

# Read the whole CSV into memory as a DataFrame.
data2=pd.read_csv(data2Path)  

# Preview the first rows to check how the columns were parsed.
data2.head()

		Label	Area	Mean	Mode	Min	Max	X	Y	...	%Area	RawIntDen	Slice	FeretX	FeretY	FeretAngle	MinFeret	AR	Round	Solidity
0	1	Data2	2776	15.0	15	15	15	2574.66967	80.45245	...	100	41640.0	1	2510	79	0.84876	29.09975	4.99331	0.20027	0.94857
1	2	Data2	2628	14.0	14	14	14	1015.16629	128.37938	...	100	36792.0	1	1013	92	95.04245	53.00000	1.28544	0.77795	0.97010
2	3	Data2	337	13.0	13	13	13	2602.27448	103.22404	...	100	4381.0	1	2598	93	105.25512	20.78831	1.07470	0.93050	0.92837
3	4	Data2	491	16.0	16	16	16	2038.76069	290.29837	...	100	7856.0	1	2030	280	127.69424	24.00000	1.05842	0.94480	0.96464
4	5	Data2	509	5.0	5	5	5	2717.95972	401.13654	...	100	2545.0	1	2706	408	33.11134	24.00000	1.09402	0.91406	0.95140

5 rows × 36 columns

# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of data2.
# NOTE(review): the RuntimeWarning emitted here ("invalid value encountered in
# subtract") hints at non-finite values in at least one column — TODO confirm.
data2.describe()

C:\Users\schatzm\Anaconda3\envs\julab\lib\site-packages\numpy\lib\function_base.py:4527: RuntimeWarning: invalid value encountered in subtract
  diff_b_a = subtract(b, a)

		Area	Mean	StdDev	Mode	Min	Max	X	Y	XM	...	%Area	RawIntDen	Slice	FeretX	FeretY	FeretAngle	MinFeret	AR	Round	Solidity
count	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	...	666214.0	6.662140e+05	666214.0	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000	666214.000000
mean	333107.500000	107.186941	4778.728803	107.668110	4758.339600	4671.752471	4898.814561	1462.707549	1378.698306	1462.708680	...	100.0	2.964580e+05	1.0	1458.731747	1377.911724	97.782395	8.929439	1.235981	0.834769	0.890030
std	192319.560456	303.942842	4594.148721	620.511333	4627.082933	4580.314488	4698.678348	820.313937	786.322358	820.315810	...	0.0	6.119494e+05	0.0	820.405865	786.316653	47.704262	5.969685	0.278721	0.120508	0.043218
min	1.000000	11.000000	1.000000	0.000000	1.000000	1.000000	1.000000	3.425930	3.029410	3.425930	...	100.0	2.900000e+01	1.0	1.000000	1.000000	0.848760	3.000000	1.000000	0.078880	0.371820
25%	166554.250000	31.000000	873.000000	0.000000	848.000000	830.000000	882.000000	765.134703	708.711415	765.137985	...	100.0	4.565525e+04	1.0	761.000000	708.000000	45.000000	6.000000	1.091640	0.791510	0.869570
50%	333107.500000	46.000000	3257.365065	0.000000	3173.000000	3072.000000	3337.000000	1470.000000	1378.719670	1470.022270	...	100.0	1.744200e+05	1.0	1466.000000	1378.000000	116.565050	7.000000	1.158650	0.863080	0.894740
75%	499660.750000	75.000000	7718.000000	0.000000	7704.000000	7545.000000	7952.000000	2163.673910	2044.148023	2163.689600	...	100.0	3.975745e+05	1.0	2160.000000	2044.000000	135.000000	9.000000	1.263408	0.916050	0.918600
max	666214.000000	58667.000000	21305.000000	10052.812470	21305.000000	21305.000000	21305.000000	3139.107140	3086.326090	3139.107140	...	100.0	2.731261e+08	1.0	3136.000000	3089.000000	179.292680	359.467960	12.677090	1.000000	1.000000

8 rows × 35 columns

# Keep only the columns compared between the two samples, and drop rows in
# which all four of them are missing. DataFrame.dropna returns a new frame
# instead of modifying its receiver, so the result has to be assigned back;
# calling it bare leaves d1Area / d2Area uncleaned.
d1Area = data1[['Label', 'Area', 'Feret', 'AR']].dropna(how='all')
d2Area = data2[['Label', 'Area', 'Feret', 'AR']].dropna(how='all')

# Show the cleaned second table as the cell output.
d2Area

	Label	Area	Feret	AR
0	Data2	2776	135.01481	4.99331
1	Data2	2628	68.26419	1.28544
2	Data2	337	22.80351	1.07470
3	Data2	491	27.80288	1.05842
4	Data2	509	27.45906	1.09402
...	...	...	...	...
666209	Data2	1787	62.64982	1.59116
666210	Data2	180	18.78829	1.38055
666211	Data2	45	8.94427	1.16188
666212	Data2	75	12.36932	1.28449
666213	Data2	62	15.52417	2.93601

666214 rows × 4 columns

# Swap the generic dataset tags for the sample names that the plots use as
# category labels.
d1Area = d1Area.replace(to_replace='Data1', value='MM')
d2Area = d2Area.replace(to_replace='Data2', value='Aptamil')

# Confirm the new label on the first rows.
d1Area.head()

	Label	Area	Feret	AR
0	MM	2541	60.67125	1.06228
1	MM	2419	59.77458	1.10986
2	MM	1855	50.47772	1.03663
3	MM	2596	60.44005	1.05496
4	MM	1409	46.23851	1.15185

# Summary statistics of the Area column for the MM sample.
d1Area['Area'].describe() 

count    1.615323e+06
mean     1.331084e+02
std      2.516350e+02
min      1.200000e+01
25%      3.700000e+01
50%      5.800000e+01
75%      1.160000e+02
max      2.469600e+04
Name: Area, dtype: float64

# Median of the Area column for the MM sample.
# NOTE: the name df_median is reused for the second sample further down, so it
# only holds this value until that cell runs.
df_median = d1Area['Area'].median()

# Take a look
df_median

58.0

# Summary statistics of the Area column for the Aptamil sample.
d2Area['Area'].describe() 

count    666214.000000
mean        107.186941
std         303.942842
min          11.000000
25%          31.000000
50%          46.000000
75%          75.000000
max       58667.000000
Name: Area, dtype: float64

# Median of the Area column for the Aptamil sample.
# NOTE: this overwrites the df_median value computed for the MM sample.
df_median = d2Area['Area'].median()

# Take a look
df_median

46.0

# Stack both samples into one long table; the Label column tells them apart.
# ignore_index=True renumbers the rows so every row gets a unique index label
# instead of each sample repeating 0..n-1, which would make label-based
# lookups on `result` ambiguous.
result = pd.concat([d1Area, d2Area], ignore_index=True)

# d1Area and d2Area are deliberately kept alive: the normality and
# Mann-Whitney cells further down read them directly.

result.head()

	Label	Area	Feret	AR
0	MM	2541	60.67125	1.06228
1	MM	2419	59.77458	1.10986
2	MM	1855	50.47772	1.03663
3	MM	2596	60.44005	1.05496
4	MM	1409	46.23851	1.15185

# Histogram of Area per Label.
# The positional mask drops every third row (positions 0, 3, 6, ...),
# presumably to lighten the plot — TODO confirm the intent.
# The masked frame is copied before it is handed over: bokeh_catplot adds a
# helper column to the frame it receives, and doing that on a slice of
# `result` triggers pandas' SettingWithCopyWarning.
p = bokeh_catplot.histogram(
    data=result[np.mod(np.arange(result.index.size), 3) != 0].copy(),
    cats='Label',
    val='Area',
)

bokeh.io.show(p)

C:\Users\schatzm\Anaconda3\envs\julab\lib\site-packages\bokeh_catplot\dist.py:452: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  _, df["__label"] = utils._source_and_labels_from_cats(df, cats)

# Jittered strip plot of Area per Label on the same thinned subset used for
# the histogram (every third row dropped). The subset is copied so the
# plotting library can add columns to it without touching a slice of `result`.
# NOTE(review): this figure is never displayed — `p` is rebound to the box
# plot below before bokeh.io.show() runs, and the `p=p` overlay is commented
# out. Either show it separately or pass it to box(); left as-is pending a
# decision.
p = bokeh_catplot.strip(
    data=result[np.mod(np.arange(result.index.size), 3) != 0].copy(),
    cats='Label',
    val='Area',
    horizontal=True,
    jitter=True,
    height=250,
)

# Horizontal box plot of Area per Label on the full table; outliers are drawn
# as diamonds and the individual points are hidden.
p = bokeh_catplot.box(
    data=result,
    cats='Label',
    val='Area',
    horizontal=True,
    whisker_caps=True,
    display_points=False,
    outlier_marker='diamond',
    #box_kwargs=dict(fill_color=None, line_color='gray'),
    #median_kwargs=dict(line_color='gray'),
    #whisker_kwargs=dict(line_color='gray'),
    # p=p,
)

bokeh.io.show(p)

from scipy.stats import normaltest

def _report_normality(values, null_hypothesis, alpha):
    """Run scipy's normaltest on *values* and print the verdict.

    Prints the p-value, the null-hypothesis text, and whether the null
    hypothesis can be rejected at significance level *alpha*.
    Returns the (statistic, p-value) pair from normaltest.
    """
    statistic, p_value = normaltest(values)
    print(f"p = {p_value:g}")
    print(null_hypothesis)
    if p_value < alpha:
        print("The null hypothesis can be rejected")
    else:
        print("The null hypothesis cannot be rejected")
    return statistic, p_value


# Significance level shared by both normality checks.
alpha = 1e-3

# First sample (MM).
k2, p = _report_normality(
    d1Area['Area'],
    'null hypothesis: Data1 (MM) comes from a normal distribution',
    alpha,
)

# Second sample (Aptamil).
k2, p = _report_normality(
    d2Area['Area'],
    'null hypothesis: Data2 (Aptamil) from a normal distribution',
    alpha,
)

p = 0
null hypothesis: Data1 (MM) comes from a normal distribution
The null hypothesis can be rejected
p = 0
null hypothesis: Data2 (Aptamil) from a normal distribution
The null hypothesis can be rejected

# Mann-Whitney U test
from numpy.random import seed
from numpy.random import randn
from scipy.stats import mannwhitneyu
# import random  
from random import sample 
# Draw a reproducible 100-value subsample of Area from each sample
# (random_state pins the draw). The subsamples get their own names: binding
# them to data1 / data2 would overwrite the full DataFrames loaded at the top
# of the notebook and break the column-selection cell if it is run again.
area_sample_mm = d1Area['Area'].sample(n=100, random_state=1)
area_sample_aptamil = d2Area['Area'].sample(n=100, random_state=1)
print('null hypothesis: data sets are from the same distribution')
# compare samples
stat, p = mannwhitneyu(area_sample_mm, area_sample_aptamil)
print('Statistics=%.3f, p=%.16f' % (stat, p))
# interpret: H0 is rejected when the p-value does not exceed alpha
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0)')

null hypothesis: data sets are from the same distribution
Statistics=6272.500, p=0.0018810635846991
Different distribution (reject H0)

from watermark import watermark

# The watermark() function returns its report as a string rather than
# printing it, so every call has to be wrapped in print() to appear in the
# output.

# Versions of the modules imported into this notebook's global namespace.
print(watermark(iversions=True, globals_=globals()))
# Timestamp, Python / IPython versions and machine information.
print(watermark())
# Versions of an explicit list of packages.
print(watermark(packages="watermark,numpy,scipy,pandas,matplotlib,bokeh,statannotations"))

Last updated: 2023-01-05T13:42:29.700464+01:00

Python implementation: CPython
Python version       : 3.9.15
IPython version      : 8.8.0

Compiler    : MSC v.1929 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 85 Stepping 7, GenuineIntel
CPU cores   : 40
Architecture: 64bit

watermark      : 2.3.1
numpy          : 1.23.5
scipy          : 1.10.0
pandas         : 1.5.2
matplotlib     : 3.6.2
bokeh          : 3.0.3
statannotations: 0.5.0

MB100T01 Advanced Image Analysis Course

Contents

Big Data definition#

Example#

Solution#

Practical example#

Goal#

Expected results from 3D measurements#

MB100T01 Advanced Image Analysis Course

Big Data definition

Contents

Big Data definition#

Example#

Solution#

Practical example#

MICROSTRUCTURE OF INFANT FORMULA RELATED TO ITS FUNCTION in 2D#

Goal#

Expected results from 3D measurements#