LICENSE

Copyright 2015 Donne Martin

Licensed under the Apache License, Version 2.0 (the “License”); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

42.29. Matplotlib applied#

  • Applying Matplotlib Visualizations to Kaggle: Titanic

  • Bar Plots, Histograms, subplot2grid

  • Normalized Plots

  • Scatter Plots, subplots

  • Kernel Density Estimation Plots

42.30. Challenge#

  • This is an assignment to learn about data cleaning, visualization, and plotting

42.30.1. Applying Matplotlib Visualizations to Kaggle: Titanic#

Prepare the titanic data to plot:

import matplotlib
import pandas as pd
import numpy as np
import pylab as plt
import seaborn
import pytest
import ipytest
import unittest


# Configure ipytest so the %%ipytest cells below can run the test classes
ipytest.autoconfig()
# Set the global default size of matplotlib figures
plt.rc("figure", figsize=(10, 5))
# Set seaborn aesthetic parameters to defaults
# NOTE(review): seaborn.set() also writes matplotlib rcParams, so it may
# override the figure size configured on the previous line - confirm.
seaborn.set()
# Load the Titanic training set; the path is relative to the notebook location
df_train = pd.read_csv("../../assets/data/titanic_train.csv")
def label_encode(df, column_name, encoded_column_name):
    """Label encode one column of a DataFrame.

    Exercise template: every ``____`` in the body is a blank left for the
    student to fill in. The result is written into ``df``; nothing is returned.

    Args:
        df (DataFrame): a data structure
        column_name (string): the column name to encode
        encoded_column_name (string): the new column name for the encoded result

    Raises:
        Exception: if ``df`` is None.
    """
    if df is None:
        raise Exception("df cannot be None.")

    # Blank: apply a NumPy function to the distinct values of the source column
    column = np.____(df[____].unique())

    # Generate a mapping of column from a string to a number representation;
    # the values in ``column`` are combined with the integers 0..len(column)-1
    column_value_mapping = ____(____(column, range(0, len(column))))

    # Transform column from a string to a number representation
    df[____] = df[column_name].map(column_value_mapping).astype(int)
Check the result by executing the cell below... 📝
%%ipytest -qq

def create_test_df():
    """Build the five-row fixture with a single ``c1`` column of gender labels."""
    genders = ["male", "female", "female", "male", "male"]
    return pd.DataFrame({"c1": genders})


class TestLabelEncode(unittest.TestCase):
    """Checks for ``label_encode``, which writes the encoded column into the frame."""

    def test_label_encode_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_result = pd.DataFrame({"c1": [1, 0, 0, 1, 1]}, dtype=int)

        # act (the function has no return value; the result lands in test_df)
        label_encode(test_df, "c1", "encoded_c1")

        # assert
        assert test_df["encoded_c1"].equals(expected_result["c1"])

    def test_label_encode_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            label_encode(None, "c1", "encoded_c1")

    def test_label_encode_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            label_encode(pd.DataFrame(), "c1", "encoded_c1")

    def test_label_encode_invalid_column_name(self):
        # assign: build the frame here so the expected error comes from the
        # missing source column, not from an undefined test variable
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            label_encode(test_df, "invalid_column_name", "encoded_c1")

    def test_label_encode_invalid_encoded_column_name(self):
        # assign
        test_df = create_test_df()

        # act: the target name is only used as the key of a new column, so an
        # arbitrary string is accepted rather than rejected
        label_encode(test_df, "c1", "invalid_column_name")

        # assert
        assert "invalid_column_name" in test_df.columns
πŸ‘©β€πŸ’» Hint

You can consider to use numpy.sort.
Refer to indexin and selecting data on pandas.DataFrame.
Refer to function dict() on python.
Refer to function zip() on python.

def one_hot_encode(df, column_name, encoded_column_name_prefix):
    """Transforms a column in a DataFrame from a string to dummy variables.

    Exercise template: every ``____`` in the body is a blank left for the
    student to fill in.

    Args:
        df (DataFrame): a data structure
        column_name (string): the name of the column to be encoded
        encoded_column_name_prefix (string): the prefix to be added to the names of the encoded columns

    Returns:
        the result of combining ``df`` with the int64 dummy columns along axis=1
    """
    # Blanks: an outer pandas function receives [df, <dummy columns>] and
    # axis=1; an inner pandas function builds the prefixed dummy columns
    return pd.____(
        [df, pd.____(df[____], prefix=____, dtype="int64")], axis=1
    )
Check the result by executing the cell below... 📝
%%ipytest -qq

from pandas.testing import assert_frame_equal

def create_test_df():
    """Build the five-row fixture with a single ``Embarked`` column of port codes."""
    ports = ["S", "C", "Q", "S", "C"]
    return pd.DataFrame({"Embarked": ports})


class TestOneHotEncode(unittest.TestCase):
    """Checks for ``one_hot_encode``, which returns the frame plus dummy columns."""

    def test_one_hot_encode_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_result = pd.DataFrame(
            {
                "Embarked": ["S", "C", "Q", "S", "C"],
                "Embarked_Val_C": [0, 1, 0, 0, 1],
                "Embarked_Val_Q": [0, 0, 1, 0, 0],
                "Embarked_Val_S": [1, 0, 0, 1, 0],
            }
        )

        # act
        actual_result = one_hot_encode(test_df, "Embarked", "Embarked_Val")

        # assert (compare against the frame built above)
        assert_frame_equal(actual_result, expected_result)

    def test_one_hot_encode_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            one_hot_encode(None, "Embarked", "Embarked_Val")

    def test_one_hot_encode_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            one_hot_encode(pd.DataFrame(), "Embarked", "Embarked_Val")

    def test_one_hot_encode_invalid_column_name(self):
        # assign: build the frame here so the expected error comes from the
        # missing source column, not from an undefined test variable
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            one_hot_encode(test_df, "invalid_column_name", "Embarked_Val")
πŸ‘©β€πŸ’» Hint

Refer to indexin and selecting data on pandas.DataFrame.
You can consider to use pandas.concat and pandas.get_dummies.

def impute_with_mean(df, column_name, imputed_column_name):
    """Impute the gaps with the mean.

    Exercise template: every ``____`` in the body is a blank left for the
    student to fill in. The frame is modified in place; nothing is returned.

    Args:
        df (DataFrame): a data structure
        column_name (string): the column name to impute
        imputed_column_name (string): the new column name for the imputed result
    """
    # Only act when at least one row of the inspected column is null
    if len(df[df[____].isnull()]) > 0:
        # NOTE(review): this rebinds the ``imputed_column_name`` parameter to
        # the computed fill value, so the name passed by the caller is never read
        imputed_column_name = df[____].____()
        # Replaces None across the whole frame, not only in one column
        df.replace({None: imputed_column_name}, inplace=True)
Check the result by executing the cell below... 📝
%%ipytest -qq

def create_test_df():
    """Build the five-row fixture: three known prices, two gaps, and an empty target column."""
    prices = [9, 8, 1, None, None]
    empty_target = [None, None, None, None, None]
    return pd.DataFrame({"price": prices, "price_add_average": empty_target})


class TestCleanFare(unittest.TestCase):
    """Checks for ``impute_with_mean`` (class name kept as in the original cell)."""

    def test_impute_with_mean_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_result = pd.DataFrame({"result": [6, 6, 6.0, 6.0, 6.0]}, dtype=float)

        # act
        impute_with_mean(test_df, "price", "price_add_average")

        # assert
        assert test_df["price_add_average"].equals(expected_result["result"])

    def test_impute_with_mean_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_mean(None, "price", "price_add_average")

    def test_impute_with_mean_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_mean(pd.DataFrame(), "price", "price_add_average")

    def test_impute_with_mean_invalid_column_name(self):
        # assign: build the frame here so the expected error comes from the
        # missing source column, not from an undefined test variable
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            impute_with_mean(test_df, "invalid_column_name", "price_add_average")
πŸ‘©β€πŸ’» Hint

You can consider to use pandas.DataFrame.mean and refer to indexin and selecting data on pandas.DataFrame.

def impute_with_median(df, column_name, column_value_fill, column_value):
    """Impute the missing ages with the median.

    Exercise template: every ``____`` in the body is a blank left for the
    student to fill in. The frame is modified in place; nothing is returned,
    and ``df`` must contain a "Pclass" column because it is used for grouping.

    Args:
        df (DataFrame): a data structure
        column_name (string): the column name to impute
        column_value_fill (string): the new column name for the imputed result
        column_value (string): the column name used, together with "Pclass", to group the rows
    """
    # First write a column into the target name, then overwrite it below
    df[column_value_fill] = df[____]
    # Group by the blank column plus Pclass and apply the per-group transform;
    # group_keys=False is passed so the group keys are not added to the index
    df[column_value_fill] = (
        df[____]
        .groupby([df[____], df["Pclass"]], group_keys=False)
        .apply(lambda x: x.____(x.____()))
    )
Check the result by executing the cell below... 📝
%%ipytest -qq

from pandas.testing import assert_frame_equal

def create_test_df():
    """Build the 18-row fixture used by the median-imputation tests.

    Rows cycle through Pclass 1, 2, 3. The first nine rows have Sex_Val 1 and
    the last nine have Sex_Val 0; in each half the final three ages are
    missing. ``Age_median_impute`` starts out entirely empty.
    """
    row_count = 18
    gap = [None] * 3
    ages = [1, 2, 3, 4, 5, 6] + gap + [7, 8, 9, 10, 11, 12] + gap
    return pd.DataFrame(
        {
            "Age": ages,
            "Age_median_impute": [None] * row_count,
            "Pclass": [1, 2, 3] * (row_count // 3),
            "Sex_Val": [1] * 9 + [0] * 9,
        }
    )


class TestCleanFare(unittest.TestCase):
    """Checks for ``impute_with_median`` (class name kept as in the original cell)."""

    def test_impute_with_median_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_df = pd.DataFrame(
            {
                "Age": [
                    1, 2, 3, 4, 5, 6, None, None, None,
                    7, 8, 9, 10, 11, 12, None, None, None,
                ],
                # Gaps are filled with the median of each (Sex_Val, Pclass)
                # group, e.g. 2.5 for the group holding ages 1 and 4
                "Age_median_impute": [
                    1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.5, 3.5, 4.5,
                    7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 8.5, 9.5, 10.5,
                ],
                "Pclass": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
                "Sex_Val": [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            }
        )

        # act
        impute_with_median(test_df, "Age", "Age_median_impute", "Sex_Val")

        # assert
        assert_frame_equal(test_df, expected_df)

    def test_impute_with_median_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_median(None, "Age", "Age_median_impute", "Sex_Val")

    def test_impute_with_median_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_median(pd.DataFrame(), "Age", "Age_median_impute", "Sex_Val")

    def test_impute_with_median_invalid_column_name(self):
        # assign: build the frame here so the expected error comes from the
        # missing source column, not from an undefined test variable
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            impute_with_median(test_df, "invalid_column_name", "Age_median_impute", "Sex_Val")
πŸ‘©β€πŸ’» Hint

You can consider to use pandas.DataFrame.fillna and pandas.DataFrame.median
Refer to indexin and selecting data on pandas.DataFrame.

def clean_data(df):
    """Run the cleaning steps that prepare the Titanic frame for plotting.

    In order: fill missing ``Embarked`` with "S", label encode ``Sex`` and
    ``Embarked``, one-hot encode ``Embarked``, impute ``Fare`` and ``Age``,
    and add ``FamilySize``.

    Args:
        df (DataFrame): passenger data; its ``Embarked`` column is overwritten
            in place before the one-hot encoding step.

    Returns:
        the value returned by ``one_hot_encode``, with the later columns added
    """
    # Most passengers embarked at 'S', so that port stands in for missing values
    df["Embarked"] = df["Embarked"].fillna("S")

    # Numeric codes for the two categorical columns
    label_encode(df, "Sex", "Sex_Val")
    label_encode(df, "Embarked", "Embarked_Val")

    # Dummy variables for Embarked; every later step works on this result
    cleaned = one_hot_encode(df, "Embarked", "Embarked_Val")

    # Missing Fare values are handled with the average Fare
    impute_with_mean(cleaned, "Fare", "Fare_add_average")

    # Age stays untouched; AgeFill receives the ages with gaps filled per
    # passenger class and Sex_Val. The median is used rather than the mean
    # because the Age histogram seems to be right skewed.
    impute_with_median(cleaned, "Age", "AgeFill", "Sex_Val")

    # FamilySize = Parch (parents or children on board)
    #            + SibSp (siblings or spouses on board)
    cleaned["FamilySize"] = cleaned["Parch"] + cleaned["SibSp"]

    return cleaned


df_train = clean_data(df_train)

42.31. Bar Plots, Histograms, subplot2grid#

# Size of matplotlib figures that contain subplots
figsize_with_subplots = (10, 10)

# Set up a grid of plots: one figure, laid out as 3 rows by 2 columns
fig = plt.figure(figsize=figsize_with_subplots)
fig_dims = (3, 2)

# Plot death and survival counts: select grid cell (0, 0) for the next plot
plt.subplot2grid(fig_dims, (0, 0))


def create_sub_plot_2_grid(
    df, column_name, plot_title, plot_kind, plot_color="b", plot_align="center"
):
    """Draw one plot from a column of ``df`` using the given plot options.

    Exercise template: every ``____`` in the body is a blank left for the
    student to fill in.

    Args:
        df (DataFrame): a data structure
        column_name (string): presumably the column to plot - it is not referenced until the blanks are filled
        plot_title (string): passed as ``title``
        plot_kind (string): passed as ``kind``
        plot_color (string): passed as ``color``; defaults to "b"
        plot_align (string): passed as ``align``; defaults to "center"
    """
    df[____].____().____(
        kind=plot_kind, title=plot_title, color=plot_color, align=plot_align
    )


# Drawn into the grid cell selected just before this call
create_sub_plot_2_grid(
    df_train, "Survived", "Death and Survival Counts", "bar", "r", "center"
)

# Plot Pclass counts in grid cell (0, 1)
plt.subplot2grid(fig_dims, (0, 1))

create_sub_plot_2_grid(df_train, "Pclass", "Passenger Class Counts", "bar")
# Plot Sex counts in grid cell (1, 0)
plt.subplot2grid(fig_dims, (1, 0))

create_sub_plot_2_grid(df_train, "Sex", "Gender Counts", "bar")
# Keep the tick labels of this plot horizontal
plt.xticks(rotation=0)

# Plot Embarked counts in grid cell (1, 1)
plt.subplot2grid(fig_dims, (1, 1))

create_sub_plot_2_grid(df_train, "Embarked", "Ports of Embarkation Counts", "bar")

# Plot the Age histogram in grid cell (2, 0); this uses the original Age
# column, not the imputed AgeFill column
plt.subplot2grid(fig_dims, (2, 0))
df_train["Age"].hist()
plt.title("Age Histogram")
πŸ‘©β€πŸ’» Hint

Refer to indexin and selecting data on pandas.DataFrame.
You can consider to use pandas.Series.value_counts and pandas.DataFrame.plot.

# Get the unique values of FamilySize and its maximum
family_sizes = np.____(df_train["FamilySize"].unique())
family_size_max = max(family_sizes)

# FamilySize of passengers with Survived == 0 (df1) and Survived == 1 (df2)
df1 = df_train[df_train["Survived"] == 0]["FamilySize"]
df2 = df_train[df_train["Survived"] == 1]["FamilySize"]
# Stacked plot of both groups; the legend order below matches [df1, df2]
plt.____([df1, df2], bins=family_size_max + 1, range=(0, family_size_max), stacked=True)
plt.legend(("Died", "Survived"), loc="best")
plt.title("Survivors by Family Size")
πŸ‘©β€πŸ’» Hint You can consider to use numpy.sort and use matplotlib.pyplot.hist to plot.

42.32. Normalized Plots#

# Cross-tabulate passenger class (rows) against survival (columns)
pclass_xt = pd.crosstab(df_train["Pclass"], df_train["Survived"])

# Normalize the cross tab to sum to 1:
# the row totals pclass_xt.sum(1) are passed to the blank method with axis=0
pclass_xt_pct = pclass_xt.____(pclass_xt.sum(1).astype(float), axis=0)

# Stacked bar chart of the normalized table
pclass_xt_pct.____(kind="bar", stacked=True, title="Survival Rate by Passenger Classes")
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")
def get_survival_rate_by_gender(gender):
    """Computes the survival rate for a given gender.

    Exercise template: the ``____`` in the body is a blank left for the
    student to fill in. Reads the module-level ``df_train`` frame.

    Args:
        gender (string): value of the "Sex" column to select, e.g. "female" or "male"

    Returns:
        a DataFrame containing the survival rate of passengers of the specified gender in each passenger class
    """
    # Keep only the rows whose Sex equals the requested gender
    gender_df = df_train[df_train["Sex"] == gender]
    # NOTE(review): the Survived series comes from the unfiltered df_train
    # while Pclass comes from gender_df; presumably crosstab aligns the two on
    # their index so only the filtered rows are counted - confirm.
    gender_xt = pd.crosstab(gender_df["Pclass"], df_train["Survived"])
    # The row totals gender_xt.sum(1) are passed to the blank method with axis=0
    gender_xt_pct = gender_xt.____(gender_xt.sum(1).astype(float), axis=0)
    return gender_xt_pct


# Plot female survival rate by passenger class
gender_xt_pct = get_survival_rate_by_gender("female")


gender_xt_pct.____(
    kind="bar", stacked=True, title="Female Survival Rate by Passenger Class"
)
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")
# Plot male survival rate by passenger class
gender_xt_pct = get_survival_rate_by_gender("male")
gender_xt_pct.____(
    kind="bar", stacked=True, title="Male Survival Rate by Passenger Class"
)
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")
πŸ‘©β€πŸ’» Hint You can consider to use pandas.DataFrame.div and pandas.DataFrame.plot.

42.33. Scatter Plots, subplots#

# Set up a grid of plots: two rows, one column; axes[0] and axes[1] are used below
fig, axes = plt.subplots(2, 1, figsize=figsize_with_subplots)


def get_age_by_survived(df, survived):
    """Select the ages of passengers with a given survival status.

    Args:
        df (DataFrame): a data structure with "Survived" and "Age" columns
        survived (int): the value of "Survived" to match

    Returns:
        a Pandas Series containing the age of all passengers with the specified survival status
    """
    matches_status = df["Survived"] == survived
    return df.loc[matches_status, "Age"]


# Ages of passengers with Survived == 0 (df1) and Survived == 1 (df2);
# both come from the original Age column
df1 = get_age_by_survived(df_train, 0)
df2 = get_age_by_survived(df_train, 1)
# The upper bound is taken from the imputed AgeFill column
max_age = int(max(df_train["AgeFill"]))

# Lower subplot: stacked plot with one bin per ten years of max_age;
# the legend order below matches [df1, df2]
axes[1].____([df1, df2], bins=int(max_age / 10), range=(1, max_age), stacked=True)
axes[1].legend(("Died", "Survived"), loc="best")
axes[1].set_title("Survivors by Age Groups Histogram")
axes[1].set_xlabel("Age")
axes[1].set_ylabel("Count")

# Scatter plot Survived and AgeFill (upper subplot)
axes[0].____(df_train["Survived"], df_train["AgeFill"])
axes[0].set_title("Survivors by Age Plot")
axes[0].set_xlabel("Survived")
axes[0].set_ylabel("Age")
πŸ‘©β€πŸ’» Hint You can consider to use matplotlib.axes.Axes.hist and matplotlib.axes.Axes.scatter.

42.34. Kernel Density Estimation Plots#

# Get the unique values of Pclass and draw one density curve per class:
def get_the_unique_values_of_Pclass(df):
    """Plot the column 'AgeFill' for each unique value of pclass in the input DataFrame.

    Exercise template: every ``____`` in the body is a blank left for the
    student to fill in. Despite its name, the function returns nothing; it
    only draws.

    Args:
        df (DataFrame): a data structure with "Pclass" and "AgeFill" columns
    """
    # Blank: apply a NumPy function to the distinct Pclass values
    passenger_classes = np.____(df["Pclass"].unique())
    for pclass in passenger_classes:
        # AgeFill values of the current class, drawn with kind="kde"
        df["AgeFill"][df["Pclass"] == pclass].____(kind="kde")


# Draw the per-class curves, then title and label the resulting plot
get_the_unique_values_of_Pclass(df_train)
plt.title("Age Density Plot by Passenger Class")
plt.xlabel("Age")
# NOTE(review): the labels assume exactly three classes drawn in the order
# 1, 2, 3 - confirm once the blank above is filled in.
plt.legend(("1st Class", "2nd Class", "3rd Class"), loc="best")
πŸ‘©β€πŸ’» Hint You can consider to use numpy.sort and pandas.DataFrame.plot.

42.34.1. Acknowledgments#

Thanks to Donne Martin for creating the open-source project data-science-ipython-notebooks, which inspires the majority of the content in this chapter.