Small diabetes study
Contents
42.18. Small diabetes study#
In this assignment, we will work with a small dataset of diabetes patients taken from here.
42.18.1. Introduction to probability and statistics#
import pandas as pd
import numpy as np
import matplotlib
import pytest
import ipytest
import unittest
# Configure ipytest so the `%%ipytest` cells below can run tests inside the notebook.
ipytest.autoconfig()
# Load the tab-separated diabetes data file into a DataFrame.
df = pd.read_csv("../../assets/data/diabetes.tsv",sep='\t')
# Preview the first rows of the loaded data.
df.head()
In this dataset, the columns are as follows:
Age and sex are self-explanatory
BMI is body mass index
BP is average blood pressure
S1 through S6 are different blood measurements
Y is the quantitative measure of disease progression over one year
Let’s study this dataset using methods of probability and statistics.
42.18.1.1. Task 1: Compute mean values and variance for all values#
def get_df_mean(df):
    """Return the mean value of every column of ``df``.

    Exercise: replace the ``____`` placeholder on the return line with the
    pandas call that computes the per-column mean.

    Args:
        df: the DataFrame to summarise.

    Returns:
        The per-column means (indexable by column name).

    Raises:
        Exception: if ``df`` is None.
    """
    if df is None:
        raise Exception('df cannot be None.')
    return df____
def get_df_std(df):
    """Return the standard deviation of every column of ``df``.

    Exercise: replace the ``____`` placeholder on the return line with the
    pandas call that computes the per-column standard deviation.

    Args:
        df: the DataFrame to summarise.

    Returns:
        The per-column standard deviations (indexable by column name).

    Raises:
        Exception: if ``df`` is None.
    """
    if df is None:
        raise Exception('df cannot be None.')
    return df____
# Apply both helpers to the loaded dataset and print the two results.
df_mean = get_df_mean(df)
df_std = get_df_std(df)
print(df_mean, df_std)
Check result by executing below... 📝
%%ipytest -qq
def create_test_df():
    """Build the small two-column DataFrame used as a fixture by the tests."""
    return pd.DataFrame(
        {
            'c1': [1, 2, 3, 4, 5],
            'c2': [6, 7, 8, 9, 10],
        }
    )
class TestGetDFMean(unittest.TestCase):
    """Unit tests for ``get_df_mean``."""

    def test_get_df_mean_happy_case(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = get_df_mean(test_df)

        # assert
        assert actual_result['c1'] == 3
        assert actual_result['c2'] == 8

    def test_get_df_mean_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            get_df_mean(None)

    def test_get_df_mean_with_empty_df(self):
        # act
        actual_result = get_df_mean(pd.DataFrame())

        # assert
        assert actual_result.equals(pd.Series(dtype="float64"))
class TestGetDFStd(unittest.TestCase):
    """Unit tests for ``get_df_std``."""

    def test_get_df_std_happy_case(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = get_df_std(test_df)

        # assert: compare floats with a tolerance rather than bit-for-bit.
        assert actual_result['c1'] == pytest.approx(1.5811388300841898)
        assert actual_result['c2'] == pytest.approx(1.5811388300841898)

    def test_get_df_std_with_none_df(self):
        # act & assert
        # Call the function under test by its real name; an undefined name
        # would raise NameError and make this test pass for the wrong reason.
        with pytest.raises(Exception):
            get_df_std(None)

    def test_get_df_std_with_empty_df(self):
        # act
        actual_result = get_df_std(pd.DataFrame())

        # assert
        assert actual_result.equals(pd.Series(dtype="float64"))
👩‍💻 Hint
Consider using pandas.DataFrame.mean and pandas.DataFrame.std.
42.18.1.2. Task 2: Plot boxplots for BMI, BP and Y depending on gender#
42.18.1.2.1. Part 1: get the data ready#
# Columns passed to the plotting helpers later in this notebook.
columns_to_plot = ['BMI', 'BP', 'Y']


def filter_by(df, column_name, column_value):
    """Return the rows of ``df`` whose ``column_name`` equals ``column_value``.

    Exercise: replace the two ``____`` placeholders with the column to compare
    and the value to compare it against.

    Args:
        df: the DataFrame to filter.
        column_name: name of the column to test.
        column_value: value a row must hold in that column to be kept.

    Returns:
        The filtered DataFrame; the original row index is preserved.
    """
    return df[df[____] == ____]


# filter the df by 'SEX' == 1
df_sex_1 = filter_by(____, ____, ____)

# filter the df by 'SEX' == 2
df_sex_2 = filter_by(____, ____, ____)
Check result by executing below... 📝
%%ipytest -qq
def create_test_df():
    """Build the small name/gender DataFrame used as a fixture by the tests."""
    return pd.DataFrame(
        {
            'name': ["Lucy", "Tom", "Lily", "Anny", "Mike"],
            'gender': ["f", "m", "f", "f", "m"],
        }
    )
class TestFilterBy(unittest.TestCase):
    """Unit tests for ``filter_by``."""

    def test_filter_by_happy_case(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = filter_by(test_df, 'gender', "f")

        # assert
        assert actual_result.equals(
            pd.DataFrame(
                {
                    'name': ["Lucy", "Lily", "Anny"],
                    'gender': ["f", "f", "f"],
                },
                index=pd.Index([0, 2, 3]),
            )
        )

    def test_filter_by_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            filter_by(None, 'gender', "f")

    def test_filter_by_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            filter_by(pd.DataFrame(), 'gender', "f")

    def test_filter_by_invalid_column_name(self):
        # assign
        # Build the fixture here so the exception comes from the unknown
        # column, not from an undefined local variable.
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            filter_by(test_df, 'invalid_column_name', "f")

    def test_filter_by_invalid_column_value(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = filter_by(test_df, 'gender', "invalid_column_value")

        # assert
        assert actual_result.equals(
            pd.DataFrame(
                columns=['name', 'gender']
            )
        )
👩‍💻 Hint
Refer to indexing and selecting data in pandas.DataFrame.
42.18.1.2.2. Part 2: plot the data#
def df_boxplot(df, column):
    """Draw a box plot of ``column`` from ``df``.

    Does nothing when ``df`` is None or empty.

    Exercise: replace the ``____`` placeholders with the DataFrame plotting
    method and the value to pass as its ``column`` argument.

    Args:
        df: the DataFrame to plot.
        column: the column name (or list of names) to include in the plot.
    """
    if df is not None and not df.empty:
        df.____(column=____)
# Plot the columns listed in `columns_to_plot` for each of the two SEX-filtered frames.
df_boxplot(df_sex_1, columns_to_plot)
df_boxplot(df_sex_2, columns_to_plot)
Check result by executing below... 📝
%%ipytest -qq
from unittest.mock import Mock, patch
class TestDFBoxPlot(unittest.TestCase):
    """Unit tests for ``df_boxplot``."""

    def test_df_boxplot_happy_case(self):
        # assign
        test_df = Mock(return_value=pd.DataFrame(
            {
                'c1': [1, 2, 3, 4, 5],
            }
        ))
        test_df.empty = False

        with patch.object(test_df, 'boxplot') as mock_df_boxplot:
            # act: df_boxplot requires both the frame and the column(s).
            df_boxplot(test_df, ['c1'])

            # assert
            mock_df_boxplot.assert_called_once()

    def test_df_boxplot_with_empty_df(self):
        # assign
        test_df = Mock(return_value=pd.DataFrame())
        # Mark the stand-in as empty explicitly instead of relying on an
        # auto-created Mock attribute happening to be truthy.
        test_df.empty = True

        with patch.object(test_df, 'boxplot') as mock_df_boxplot:
            # act
            df_boxplot(test_df, ['c1'])

            # assert
            mock_df_boxplot.assert_not_called()

    def test_df_boxplot_with_none_df(self):
        # act: a real None must be accepted without raising.
        actual_result = df_boxplot(None, ['c1'])

        # assert
        assert actual_result is None
👩‍💻 Hint
Consider using pandas.DataFrame.boxplot.
42.18.1.3. Task 3: What is the distribution of Age, Sex, BMI and Y variables?#
def df_plot(df, column):
    """Plot ``column`` of ``df``.

    Does nothing when ``df`` is None or empty.

    Exercise: replace the ``____`` placeholders with the plotting call and the
    value to pass as its ``column`` argument.

    Args:
        df: the pandas object to plot.
        column: the column name (or list of names) to plot.
    """
    if df is not None and not df.empty:
        df____(column=____)
# Plot the distribution of each variable of interest.
# NOTE(review): every call passes a single selected column together with
# `columns_to_plot`; confirm that is the intended `column` argument.
# NOTE(review): `df_plot` as written has no return statement, so these names
# are bound to None unless the exercise solution adds one.
age_distribution = df_plot(df['AGE'], columns_to_plot)
sex_distribution = df_plot(df['SEX'], columns_to_plot)
bmi_distribution = df_plot(df['BMI'], columns_to_plot)
y_distribution = df_plot(df['Y'], columns_to_plot)
Check result by executing below... 📝
%%ipytest -qq
from unittest.mock import Mock, patch
class TestDFPlot(unittest.TestCase):
    """Unit tests for ``df_plot``."""

    def test_df_plot_happy_case(self):
        # assign
        test_df = Mock(return_value=pd.DataFrame(
            {
                'c1': [1, 2, 3, 4, 5],
            }
        ))
        test_df.empty = False

        with patch.object(test_df, 'plot') as mock_df_plot:
            # act: df_plot requires both the frame and the column(s).
            df_plot(test_df, ['c1'])

            # assert
            mock_df_plot.assert_called_once()

    def test_df_plot_with_empty_df(self):
        # assign
        test_df = Mock(return_value=pd.DataFrame())
        # Mark the stand-in as empty explicitly instead of relying on an
        # auto-created Mock attribute happening to be truthy.
        test_df.empty = True

        with patch.object(test_df, 'plot') as mock_df_plot:
            # act
            df_plot(test_df, ['c1'])

            # assert
            mock_df_plot.assert_not_called()

    def test_df_plot_with_none_df(self):
        # act: a real None must be accepted without raising.
        actual_result = df_plot(None, ['c1'])

        # assert
        assert actual_result is None
👩‍💻 Hint
Consider using pandas.DataFrame.plot.
42.18.1.4. Task 4: Test the correlation between different variables and disease progression (Y)#
Hint: A correlation matrix gives you the most useful information on which values are dependent.
def get_df_corr_with(df, with_column):
    """Return the correlation of every column of ``df`` with ``with_column``.

    Exercise: replace the ``____`` placeholders with the DataFrame method that
    builds the correlation matrix and the column to select from it.

    Args:
        df: the DataFrame to analyse.
        with_column: name of the column to correlate the others against.

    Returns:
        The selected column of the correlation matrix, or None when ``df`` is
        None or empty.
    """
    if df is not None and not df.empty:
        return df____()[____]
    # Make the "nothing to correlate" result explicit.
    return None
# Fill in the DataFrame and the column name to correlate against.
df_corr_y = get_df_corr_with(____, ____)
# Bare expression so the notebook displays the result as the cell output.
df_corr_y
Check result by executing below... 📝
%%ipytest -qq
def create_test_df():
    """Build the small two-column DataFrame used as a fixture by the tests."""
    return pd.DataFrame(
        {
            'c1': [1, 2, 3, 4, 5],
            'c2': [6, 7, 8, 9, 10],
        }
    )
class TestGetDfCorrWith(unittest.TestCase):
    """Unit tests for ``get_df_corr_with``."""

    def test_get_df_corr_with_happy_case(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = get_df_corr_with(test_df, 'c2')

        # assert: correlations are floats, so compare with a tolerance and
        # ignore the Series name rather than demanding bit-for-bit equality.
        pd.testing.assert_series_equal(
            actual_result,
            pd.Series([1.0, 1.0], index=['c1', 'c2']),
            check_names=False,
        )

    def test_get_df_corr_with_with_none_df(self):
        # act
        actual_result = get_df_corr_with(None, 'any_column')

        # assert
        assert not actual_result

    def test_get_df_corr_with_with_empty_df(self):
        # act
        actual_result = get_df_corr_with(pd.DataFrame(), 'any_column')

        # assert
        assert not actual_result

    def test_get_df_corr_with_with_invalid_column_name(self):
        # act & assert
        with pytest.raises(Exception):
            get_df_corr_with(create_test_df(), 'invalid_column')
👩‍💻 Hint
Consider using pandas.DataFrame.corr.
42.18.1.5. Task 5: Test the hypothesis that the degree of diabetes progression is different between men and women#
# get the correlation between 'SEX' and 'Y'
# Fill in the DataFrame, the column to correlate against, and the label to pick from the result.
df_corr_sex_with_y = get_df_corr_with(____, ____)[____]
# Bare expression so the notebook displays the value as the cell output.
df_corr_sex_with_y
# plot the scatterplot between 'SEX' and 'Y'
def df_scatterplot(df, c1, c2):
    """Draw a scatter plot of column ``c1`` against column ``c2`` of ``df``.

    Does nothing when ``df`` is None or empty.

    Exercise: replace the placeholders with the DataFrame scatter-plot call
    and its two column arguments.

    Args:
        df: the DataFrame to plot.
        c1: name of the first column.
        c2: name of the second column.
    """
    if df is not None and not df.empty:
        df.____(____, _____)
# Scatter plot of 'SEX' against 'Y' for the full dataset.
df_scatterplot(df, 'SEX', 'Y')
# fill in True or False
diabetes_progression_correlated_with_sex = ____
# Report the conclusion recorded in the flag above.
print(f"The degree of diabetes progression is {'different' if diabetes_progression_correlated_with_sex else 'not different'} between men and women.")
Check result by executing below... 📝
%%ipytest -qq
from unittest.mock import Mock, patch
class TestDFScatterPlot(unittest.TestCase):
    """Unit tests for ``df_scatterplot``."""

    def test_df_scatterplot_happy_case(self):
        # assign
        test_df = Mock(
            plot=Mock(scatter=Mock()),
            empty=False,
            return_value=pd.DataFrame(
                {
                    'c1': [1, 2, 3, 4, 5],
                    'c2': [6, 7, 8, 9, 10],
                }
            )
        )

        # act
        df_scatterplot(test_df, 'c1', 'c2')

        # assert
        test_df.plot.scatter.assert_called_once_with('c1', 'c2')

    def test_df_scatterplot_with_empty_df(self):
        # assign
        test_df = Mock(plot=Mock(scatter=Mock()), empty=True, return_value=pd.DataFrame())

        # act
        df_scatterplot(test_df, 'c1', 'c2')

        # assert
        test_df.plot.scatter.assert_not_called()

    def test_df_scatterplot_with_none_df(self):
        # act: pass a real None (a Mock is never None) and expect a quiet no-op.
        actual_result = df_scatterplot(None, 'c1', 'c2')

        # assert
        assert actual_result is None
# Checks the True/False answer filled in above: the assertion passes only when it is falsy.
assert not diabetes_progression_correlated_with_sex
👩‍💻 Hint
Consider using pandas.DataFrame.corrwith to get the correlation, and pandas.DataFrame.plot.scatter to plot the scatterplots.
42.18.2. Rubric#
| Exemplary | Adequate | Needs Improvement |
| --- | --- | --- |
| All required tasks are complete, graphically illustrated and explained | Most of the tasks are complete, explanations or takeaways from graphs and/or obtained values are missing | Only basic tasks such as computation of mean/variance and basic plots are complete, no conclusions are made from the data |
42.18.3. Acknowledgments#
Thanks to Microsoft for creating the open-source course Data Science for Beginners. It inspires the majority of the content in this chapter.