42.18. Small diabetes study#

In this assignment, we will work with a small dataset of diabetes patients taken from here.

42.18.1. Introduction to probability and statistics#

import pandas as pd
import numpy as np
import matplotlib
import pytest
import ipytest
import unittest

# Configure ipytest for this notebook before any %%ipytest cell is run.
ipytest.autoconfig()

# Load the tab-separated diabetes dataset (path is relative to this notebook)
# and preview its first rows.
df = pd.read_csv("../../assets/data/diabetes.tsv",sep='\t')
df.head()

The columns in this dataset are as follows:

  • Age and sex are self-explanatory

  • BMI is body mass index

  • BP is average blood pressure

  • S1 through S6 are different blood measurements

  • Y is a quantitative measure of disease progression over one year

Let’s study this dataset using methods of probability and statistics.

42.18.1.1. Task 1: Compute mean values and standard deviation for all columns#

def get_df_mean(df):
    """Return the mean of each column of ``df`` (exercise template).

    Exercise: replace the ``____`` placeholder with the pandas call that
    computes the per-column means.

    Raises:
        Exception: if ``df`` is None.
    """
    if df is None:
        raise Exception('df cannot be None.')
    return df____

def get_df_std(df):
    """Return the standard deviation of each column of ``df`` (exercise template).

    Exercise: replace the ``____`` placeholder with the pandas call that
    computes the per-column standard deviations.

    Raises:
        Exception: if ``df`` is None.
    """
    if df is None:
        raise Exception('df cannot be None.')
    return df____

# Compute both statistics for the full diabetes dataset and print them.
df_mean = get_df_mean(df)
df_std = get_df_std(df)

print(df_mean, df_std)
Check the result by executing the cell below... 📝
%%ipytest -qq

def create_test_df():
    """Build the two-column integer fixture: c1 = 1..5 and c2 = 6..10."""
    first_column = list(range(1, 6))
    second_column = list(range(6, 11))
    return pd.DataFrame({'c1': first_column, 'c2': second_column})

class TestGetDFMean(unittest.TestCase):
    """Checks for ``get_df_mean``: typical, ``None`` and empty inputs."""

    def test_get_df_mean_happy_case(self):
        # arrange + act: column means of the small integer fixture
        means = get_df_mean(create_test_df())

        # assert
        assert means['c1'] == 3
        assert means['c2'] == 8

    def test_get_df_mean_with_none_df(self):
        # a missing frame must be rejected with an exception
        with pytest.raises(Exception):
            get_df_mean(None)

    def test_get_df_mean_with_empty_df(self):
        # an empty frame is expected to give an empty float64 series
        empty_frame = pd.DataFrame()
        means = get_df_mean(empty_frame)

        expected = pd.Series(dtype="float64")
        assert means.equals(expected)

class TestGetDFStd(unittest.TestCase):
    """Checks for ``get_df_std``: typical, ``None`` and empty inputs."""

    def test_get_df_std_happy_case(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = get_df_std(test_df)

        # assert: compare floats with a tolerance instead of bit-for-bit
        # equality. sqrt(2.5) is the sample standard deviation (ddof=1) of
        # five consecutive integers.
        expected_std = 1.5811388300841898
        assert actual_result['c1'] == pytest.approx(expected_std)
        assert actual_result['c2'] == pytest.approx(expected_std)

    def test_get_df_std_with_none_df(self):
        # act & assert
        # Call the function under test by its real name: a misspelled name
        # raises NameError, which ``pytest.raises(Exception)`` would accept
        # without ever exercising the None check.
        with pytest.raises(Exception):
            get_df_std(None)

    def test_get_df_std_with_empty_df(self):
        # act
        actual_result = get_df_std(pd.DataFrame())

        # assert
        assert actual_result.equals(pd.Series(dtype="float64"))
πŸ‘©β€πŸ’» Hint

You can consider to use pandas.DataFrame.mean and pandas.DataFrame.std.

42.18.1.2. Task 2: Plot boxplots for BMI, BP and Y depending on gender#

42.18.1.2.1. Part 1: get the data ready#

# Column names passed to the plotting helper calls in this notebook.
columns_to_plot = ['BMI', 'BP', 'Y']

def filter_by(df, column_name, column_value):
    """Return the rows of ``df`` whose ``column_name`` equals ``column_value``.

    Exercise: replace the ``____`` placeholders with the column to compare
    and the value to match. Rows are selected with a boolean mask, so the
    surviving rows keep their original index labels.
    """
    return df[df[____] == ____]

# filter the df by 'SEX' == 1 (fill in the DataFrame, column name and value)
df_sex_1 = filter_by(____, ____, ____)

# filter the df by 'SEX' == 2 (fill in the DataFrame, column name and value)
df_sex_2 = filter_by(____, ____, ____)
Check the result by executing the cell below... 📝
%%ipytest -qq

def create_test_df():
    """Build the five-row name/gender fixture."""
    names = ["Lucy", "Tom", "Lily", "Anny", "Mike"]
    genders = ["f", "m", "f", "f", "m"]
    return pd.DataFrame({'name': names, 'gender': genders})

class TestFilterBy(unittest.TestCase):
    """Checks for ``filter_by``: matching rows, bad inputs and no-match values."""

    def test_filter_by_happy_case(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = filter_by(test_df, 'gender', "f")

        # assert: matching rows keep their original index labels
        assert actual_result.equals(
            pd.DataFrame(
                {
                    'name': ["Lucy", "Lily", "Anny"],
                    'gender': ["f", "f", "f"],
                },
                index=pd.Index([0, 2, 3]),
            )
        )

    def test_filter_by_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            filter_by(None, 'gender', "f")

    def test_filter_by_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            filter_by(pd.DataFrame(), 'gender', "f")

    def test_filter_by_invalid_column_name(self):
        # assign
        # Build the fixture here: without it the lookup of ``test_df`` itself
        # raises NameError, which a broad ``pytest.raises(Exception)`` would
        # accept without ever calling ``filter_by`` on a real frame.
        test_df = create_test_df()

        # act & assert: selecting a missing column raises KeyError in pandas
        with pytest.raises(KeyError):
            filter_by(test_df, 'invalid_column_name', "f")

    def test_filter_by_invalid_column_value(self):
        # assign
        test_df = create_test_df()

        # act
        actual_result = filter_by(test_df, 'gender', "invalid_column_value")

        # assert: no row matches, so only the column layout remains
        assert actual_result.equals(
            pd.DataFrame(
                columns=['name', 'gender']
            )
        )
πŸ‘©β€πŸ’» Hint

Refer to indexin and selecting data on pandas.DataFrame.

42.18.1.2.2. Part 2: plot the data#

def df_boxplot(df, column):
    """Draw a box plot of ``column`` from ``df``; do nothing if ``df`` is None or empty.

    Exercise: replace the ``____`` placeholders with the plotting method
    and the value for its ``column`` argument.
    """
    if df is not None and not df.empty:
        df.____(column=____)
# Draw the box plots once for each of the two 'SEX' groups.
df_boxplot(df_sex_1, columns_to_plot)
df_boxplot(df_sex_2, columns_to_plot)
Check the result by executing the cell below... 📝
%%ipytest -qq

from unittest.mock import Mock, patch

class TestDFBoxPlot(unittest.TestCase):
    """Checks that ``df_boxplot`` plots only for a non-empty DataFrame."""

    def test_df_boxplot_happy_case(self):
        # assign: a stand-in frame that reports itself as non-empty
        test_df = Mock(empty=False)

        with patch.object(test_df, 'boxplot') as mock_df_boxplot:
            # act: pass both required arguments; omitting ``column`` raises
            # TypeError before the function body runs
            df_boxplot(test_df, ['c1'])

            # assert
            mock_df_boxplot.assert_called_once()

    def test_df_boxplot_with_empty_df(self):
        # assign: mark the stand-in as empty explicitly rather than relying
        # on an auto-created Mock attribute that merely happens to be truthy
        test_df = Mock(empty=True)

        with patch.object(test_df, 'boxplot') as mock_df_boxplot:
            # act
            df_boxplot(test_df, ['c1'])

            # assert
            mock_df_boxplot.assert_not_called()

    def test_df_boxplot_with_none_df(self):
        # act: pass a real None so the ``df is not None`` guard is exercised;
        # a Mock is never None and would skip that branch
        actual_result = df_boxplot(None, ['c1'])

        # assert: nothing is plotted and nothing is returned
        assert actual_result is None
πŸ‘©β€πŸ’» Hint

You can consider to use pandas.DataFrame.boxplot.

42.18.1.3. Task 3: What is the distribution of Age, Sex, BMI and Y variables?#

def df_plot(df, column):
    """Plot ``column`` of ``df``; do nothing if ``df`` is None or empty.

    Exercise: replace the ``____`` placeholders with the plotting call
    and the value for its ``column`` argument.
    """
    if df is not None and not df.empty:
        df____(column=____)
# Plot the distribution of each variable of interest.
# NOTE(review): each call passes a single selected column together with
# columns_to_plot (['BMI', 'BP', 'Y']) -- confirm this is the intended
# `column` argument.
# NOTE(review): df_plot as templated has no return statement, so these names
# would be bound to None -- confirm whether a return value is expected.
age_distribution = df_plot(df['AGE'], columns_to_plot)
sex_distribution = df_plot(df['SEX'], columns_to_plot)
bmi_distribution = df_plot(df['BMI'], columns_to_plot)
y_distribution = df_plot(df['Y'], columns_to_plot)
Check the result by executing the cell below... 📝
%%ipytest -qq

from unittest.mock import Mock, patch

class TestDFPlot(unittest.TestCase):
    """Checks that ``df_plot`` plots only for a non-empty DataFrame."""

    def test_df_plot_happy_case(self):
        # assign: a stand-in frame that reports itself as non-empty
        test_df = Mock(empty=False)

        with patch.object(test_df, 'plot') as mock_df_plot:
            # act: pass both required arguments; omitting ``column`` raises
            # TypeError before the function body runs
            df_plot(test_df, ['c1'])

            # assert
            mock_df_plot.assert_called_once()

    def test_df_plot_with_empty_df(self):
        # assign: mark the stand-in as empty explicitly rather than relying
        # on an auto-created Mock attribute that merely happens to be truthy
        test_df = Mock(empty=True)

        with patch.object(test_df, 'plot') as mock_df_plot:
            # act
            df_plot(test_df, ['c1'])

            # assert
            mock_df_plot.assert_not_called()

    def test_df_plot_with_none_df(self):
        # act: pass a real None so the ``df is not None`` guard is exercised;
        # a Mock is never None and would skip that branch
        actual_result = df_plot(None, ['c1'])

        # assert: nothing is plotted and nothing is returned
        assert actual_result is None
πŸ‘©β€πŸ’» Hint

You can consider to use pandas.DataFrame.plot.

42.18.1.4. Task 4: Test the correlation between different variables and disease progression (Y)#

Hint: A correlation matrix gives you the most useful information about which variables depend on each other.

def get_df_corr_with(df, with_column):
    """Return the correlation of each column of ``df`` with ``with_column``.

    Exercise: replace the ``____`` placeholders with the correlation call
    and the column to select from its result.

    Returns None implicitly when ``df`` is None or empty.
    """
    if df is not None and not df.empty:
        return df____()[____]

# Correlation of the variables with disease progression
# (fill in the DataFrame and the column name).
df_corr_y = get_df_corr_with(____, ____)
df_corr_y
Check the result by executing the cell below... 📝
%%ipytest -qq

def create_test_df():
    """Build a fixture whose two integer columns rise in lockstep (1..5 and 6..10)."""
    columns = {
        'c1': list(range(1, 6)),
        'c2': list(range(6, 11)),
    }
    return pd.DataFrame(columns)

class TestGetDfCorrWith(unittest.TestCase):
    """Checks for ``get_df_corr_with``: typical, ``None``, empty and bad-column inputs."""

    def test_get_df_corr_with_happy_case(self):
        # the fixture columns rise in lockstep, so each correlation is 1.0
        expected = pd.Series([1.0, 1.0], index=['c1', 'c2'])

        correlations = get_df_corr_with(create_test_df(), 'c2')

        assert correlations.equals(expected)

    def test_get_df_corr_with_with_none_df(self):
        # a missing frame produces a falsy result
        outcome = get_df_corr_with(None, 'any_column')
        assert not outcome

    def test_get_df_corr_with_with_empty_df(self):
        # an empty frame produces a falsy result as well
        empty_frame = pd.DataFrame()
        outcome = get_df_corr_with(empty_frame, 'any_column')
        assert not outcome

    def test_get_df_corr_with_with_invalid_column_name(self):
        # selecting a column that does not exist must raise
        with pytest.raises(Exception):
            get_df_corr_with(create_test_df(), 'invalid_column')
πŸ‘©β€πŸ’» Hint

You can consider to use pandas.DataFrame.corr.

42.18.1.5. Task 5: Test the hypothesis that the degree of diabetes progression is different between men and women#

# get the correlation between 'SEX' and 'Y'
# (fill in the DataFrame, the column to correlate with, and the entry to select)
df_corr_sex_with_y = get_df_corr_with(____, ____)[____]
df_corr_sex_with_y
# plot the scatterplot between 'SEX' and 'Y'

def df_scatterplot(df, c1, c2):
    """Draw a scatter plot of columns ``c1`` and ``c2``; skip a None or empty ``df``.

    Exercise: replace the placeholders with the plotting call and its two
    column arguments.
    """
    if df is not None and not df.empty:
        df.____(____, _____)

# Scatter plot of 'SEX' and 'Y' for the whole dataset.
df_scatterplot(df, 'SEX', 'Y')
# fill in True or False, based on the correlation value and the plot above
diabetes_progression_correlated_with_sex = ____
print(f"The degree of diabetes progression is {'different' if diabetes_progression_correlated_with_sex else 'not different'} between men and women.")
Check the result by executing the cell below... 📝
%%ipytest -qq

from unittest.mock import Mock, patch

class TestDFScatterPlot(unittest.TestCase):
    """Checks that ``df_scatterplot`` plots only for a non-empty DataFrame."""

    def test_df_scatterplot_happy_case(self):
        # assign: a stand-in frame that reports itself as non-empty and
        # exposes a ``plot.scatter`` method to record the call
        test_df = Mock(plot=Mock(scatter=Mock()), empty=False)

        # act
        df_scatterplot(test_df, 'c1', 'c2')

        # assert
        test_df.plot.scatter.assert_called_once_with('c1', 'c2')

    def test_df_scatterplot_with_empty_df(self):
        # assign
        test_df = Mock(plot=Mock(scatter=Mock()), empty=True)

        # act
        df_scatterplot(test_df, 'c1', 'c2')

        # assert
        test_df.plot.scatter.assert_not_called()

    def test_df_scatterplot_with_none_df(self):
        # act: pass a real None so the ``df is not None`` guard is exercised;
        # a Mock with ``empty=True`` only repeats the empty-frame test
        actual_result = df_scatterplot(None, 'c1', 'c2')

        # assert: nothing is plotted and nothing is returned
        assert actual_result is None

# The conclusion flag filled in for this task is expected to be False.
assert not diabetes_progression_correlated_with_sex
πŸ‘©β€πŸ’» Hint

You can consider to use pandas.DataFrame.corrwith to get the correlation, and pandas.DataFrame.plot.scatter to plot the scatterplots.

42.18.2. Rubric#

Exemplary

Adequate

Needs Improvement

All required tasks are complete, graphically illustrated and explained

Most of the tasks are complete, explanations or takeaways from graphs and/or obtained values are missing

Only basic tasks such as computation of mean/variance and basic plots are complete, no conclusions are made from the data

42.18.3. Acknowledgments#

Thanks to Microsoft for creating the open-source course Data Science for Beginners. It inspires the majority of the content in this chapter.