Matplotlib applied
Contents
LICENSE
Copyright 2015 Donne Martin
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
42.29. Matplotlib applied#
Applying Matplotlib Visualizations to Kaggle: Titanic
Bar Plots, Histograms, subplot2grid
Normalized Plots
Scatter Plots, subplots
Kernel Density Estimation Plots
42.30. Challenge#
This is an assignment to learn about data cleaning, visualization, and plotting.
42.30.1. Applying Matplotlib Visualizations to Kaggle: Titanic#
Prepare the titanic data to plot:
import matplotlib
import pandas as pd
import numpy as np
import pylab as plt
import seaborn
import pytest
import ipytest
import unittest
# Apply ipytest's automatic configuration before any `%%ipytest` test cell runs
ipytest.autoconfig()
# Set the global default size of matplotlib figures
plt.rc("figure", figsize=(10, 5))
# Set seaborn aesthetic parameters to defaults
seaborn.set()
# Load the Titanic training data; the relative path is resolved against the
# notebook's working directory
df_train = pd.read_csv("../../assets/data/titanic_train.csv")
def label_encode(df, column_name, encoded_column_name):
    """Label encode one column of a DataFrame.

    The encoded values are assigned into ``df`` itself; nothing is returned.
    The ``____`` placeholders are exercise blanks to be filled in.

    Args:
        df (DataFrame): a data structure
        column_name (string): the column name to encode
        encoded_column_name (string): the new column name for the encoded result

    Raises:
        Exception: if ``df`` is None.
    """
    if df is None:
        raise Exception("df cannot be None.")
    column = np.____(df[____].unique())
    # Generate a mapping of column from a string to a number representation;
    # the numbers are 0 .. len(column) - 1, paired with the values in `column`
    column_value_mapping = ____(____(column, range(0, len(column))))
    # Transform column from a string to a number representation
    df[____] = df[column_name].map(column_value_mapping).astype(int)
Check the result by executing the cell below.
%%ipytest -qq
def create_test_df():
    """Build the one-column frame of gender strings used by the label-encode tests."""
    genders = ["male", "female", "female", "male", "male"]
    return pd.DataFrame({"c1": genders})
class TestLabelEncode(unittest.TestCase):
    """Tests for label_encode, which writes its result into the frame it is given."""

    def test_label_encode_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_result = pd.DataFrame({"c1": [1, 0, 0, 1, 1]}, dtype=int)

        # act: label_encode returns nothing, the new column lands in test_df
        label_encode(test_df, "c1", "encoded_c1")

        # assert
        assert test_df["encoded_c1"].equals(expected_result["c1"])

    def test_label_encode_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            label_encode(None, "c1", "encoded_c1")

    def test_label_encode_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            label_encode(pd.DataFrame(), "c1", "encoded_c1")

    def test_label_encode_invalid_column_name(self):
        # assign: build the frame here so the exception comes from
        # label_encode and not from an undefined local name
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            label_encode(test_df, "invalid_column_name", "encoded_c1")

    def test_label_encode_invalid_encoded_column_name(self):
        # NOTE(review): `test_df` is not defined in this method, so the
        # exception caught below is the NameError raised while evaluating the
        # call arguments, not an error from label_encode. Left unchanged
        # because it is unclear which encoded column names should be
        # rejected; decide on the intended contract before fixing.
        # act & assert
        with pytest.raises(Exception):
            label_encode(test_df, "c1", "invalid_column_name")
👩‍💻 Hint
You can consider using numpy.sort.
Refer to indexing and selecting data on pandas.DataFrame.
Refer to the Python built-in function dict().
Refer to the Python built-in function zip().
def one_hot_encode(df, column_name, encoded_column_name_prefix):
    """Transforms a column in a DataFrame from a string to dummy variables.

    ``df`` is not assigned to here; the combined result is returned instead.
    The ``____`` placeholders are exercise blanks to be filled in.

    Args:
        df (DataFrame): a data structure
        column_name (string): the name of the column to be encoded
        encoded_column_name_prefix (string): the prefix to be added to the names of the encoded columns

    Returns:
        the result of the outer ``pd.____`` call, which receives ``df`` and
        the ``int64`` encoded columns and is invoked with ``axis=1``
    """
    return pd.____(
        [df, pd.____(df[____], prefix=____, dtype="int64")], axis=1
    )
Check the result by executing the cell below.
%%ipytest -qq
from pandas.testing import assert_frame_equal
def create_test_df():
    """Build the one-column frame of embarkation codes used by the one-hot tests."""
    ports = ["S", "C", "Q", "S", "C"]
    return pd.DataFrame({"Embarked": ports})
class TestOneHotEncode(unittest.TestCase):
    """Tests for one_hot_encode, which returns the frame with the encoded columns appended."""

    def test_one_hot_encode_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_result = pd.DataFrame(
            {
                "Embarked": ["S", "C", "Q", "S", "C"],
                "Embarked_Val_C": [0, 1, 0, 0, 1],
                "Embarked_Val_Q": [0, 0, 1, 0, 0],
                "Embarked_Val_S": [1, 0, 0, 1, 0],
            }
        )

        # act
        actual_result = one_hot_encode(test_df, "Embarked", "Embarked_Val")

        # assert: compare against the frame built above (the previous name
        # `expected_df` was never defined and raised NameError)
        assert_frame_equal(actual_result, expected_result)

    def test_one_hot_encode_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            one_hot_encode(None, "Embarked", "Embarked_Val")

    def test_one_hot_encode_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            one_hot_encode(pd.DataFrame(), "Embarked", "Embarked_Val")

    def test_one_hot_encode_invalid_column_name(self):
        # assign: build the frame here so the exception comes from
        # one_hot_encode and not from an undefined local name
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            one_hot_encode(test_df, "invalid_column_name", "Embarked_Val")
👩‍💻 Hint
Refer to indexing and selecting data on pandas.DataFrame.
You can consider using pandas.concat and pandas.get_dummies.
def impute_with_mean(df, column_name, imputed_column_name):
    """Impute the gaps with the mean.

    ``df`` is modified in place; nothing is returned. Nothing happens when
    the null check below finds no rows. The ``____`` placeholders are
    exercise blanks to be filled in.

    Args:
        df (DataFrame): a data structure
        column_name (string): the column name to impute
        imputed_column_name (string): the new column name for the imputed result

    NOTE(review): the value passed as ``imputed_column_name`` is never read;
    the name is rebound to the computed fill value inside the function.
    """
    if len(df[df[____].isnull()]) > 0:
        # The parameter name is reused to hold the computed fill value
        imputed_column_name = df[____].____()
        # No column is selected here: the replacement is applied to the
        # whole DataFrame, substituting the fill value for None
        df.replace({None: imputed_column_name}, inplace=True)
Check the result by executing the cell below.
%%ipytest -qq
def create_test_df():
    """Build the price frame used by the mean-imputation tests.

    "price" holds three values followed by two gaps; "price_add_average"
    holds only gaps.
    """
    prices = [9, 8, 1, None, None]
    placeholders = [None] * 5
    return pd.DataFrame({"price": prices, "price_add_average": placeholders})
class TestCleanFare(unittest.TestCase):
    """Tests for impute_with_mean, which modifies the frame it is given."""

    def test_impute_with_mean_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_result = pd.DataFrame({"result": [6, 6, 6.0, 6.0, 6.0]}, dtype=float)

        # act
        impute_with_mean(test_df, "price", "price_add_average")

        # assert
        assert test_df["price_add_average"].equals(expected_result["result"])

    def test_impute_with_mean_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_mean(None, "price", "price_add_average")

    def test_impute_with_mean_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_mean(pd.DataFrame(), "price", "price_add_average")

    def test_impute_with_mean_invalid_column_name(self):
        # assign: build the frame here so the exception comes from
        # impute_with_mean and not from an undefined local name
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            impute_with_mean(test_df, "invalid_column_name", "price_add_average")
👩‍💻 Hint
You can consider using pandas.DataFrame.mean, and refer to indexing and selecting data on pandas.DataFrame.
def impute_with_median(df, column_name, column_value_fill, column_value):
    """Impute the missing ages with the median.

    ``df`` is modified in place; nothing is returned. The grouping always
    includes the hard-coded "Pclass" column. The ``____`` placeholders are
    exercise blanks to be filled in.

    Args:
        df (DataFrame): a data structure
        column_name (string): the column name to impute
        column_value_fill (string): the new column name for the imputed result
        column_value (string): presumably the column grouped together with
            "Pclass" when the fill value is computed (the tests pass
            "Sex_Val") - TODO confirm once the blanks are filled in
    """
    # First create (or overwrite) the fill column from an existing column
    df[column_value_fill] = df[____]
    # Then recompute it group by group; group_keys=False is passed so the
    # group labels are not added to the index of the result
    df[column_value_fill] = (
        df[____]
        .groupby([df[____], df["Pclass"]], group_keys=False)
        .apply(lambda x: x.____(x.____()))
    )
Check the result by executing the cell below.
%%ipytest -qq
from pandas.testing import assert_frame_equal
def create_test_df():
    """Build the 18-row frame used by the median-imputation tests.

    Each Sex_Val value covers nine rows: six known ages followed by three
    gaps, with Pclass cycling through 1, 2, 3. "Age_median_impute" starts
    out with gaps only.
    """
    gap = [None] * 3
    ages = [1, 2, 3, 4, 5, 6] + gap + [7, 8, 9, 10, 11, 12] + gap
    return pd.DataFrame(
        {
            "Age": ages,
            "Age_median_impute": [None] * 18,
            "Pclass": [1, 2, 3] * 6,
            "Sex_Val": [1] * 9 + [0] * 9,
        }
    )
class TestCleanFare(unittest.TestCase):
    """Tests for impute_with_median, which modifies the frame it is given."""

    def test_impute_with_median_happy_case(self):
        # assign
        test_df = create_test_df()
        expected_df = pd.DataFrame(
            {
                "Age": [
                    1, 2, 3, 4, 5, 6, None, None, None,
                    7, 8, 9, 10, 11, 12, None, None, None,
                ],
                # Each gap is filled with the median of the known ages that
                # share its Sex_Val and Pclass (for example 1 and 4 -> 2.5)
                "Age_median_impute": [
                    1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.5, 3.5, 4.5,
                    7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 8.5, 9.5, 10.5,
                ],
                "Pclass": [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
                "Sex_Val": [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            }
        )

        # act
        impute_with_median(test_df, "Age", "Age_median_impute", "Sex_Val")

        # assert
        assert_frame_equal(test_df, expected_df)

    def test_impute_with_median_with_none_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_median(None, "Age", "Age_median_impute", "Sex_Val")

    def test_impute_with_median_with_empty_df(self):
        # act & assert
        with pytest.raises(Exception):
            impute_with_median(pd.DataFrame(), "Age", "Age_median_impute", "Sex_Val")

    def test_impute_with_median_invalid_column_name(self):
        # assign: build the frame here so the exception comes from
        # impute_with_median and not from an undefined local name
        test_df = create_test_df()

        # act & assert
        with pytest.raises(Exception):
            impute_with_median(test_df, "invalid_column_name", "Age_median_impute", "Sex_Val")
👩‍💻 Hint
You can consider using pandas.DataFrame.fillna and pandas.DataFrame.median.
Refer to indexing and selecting data on pandas.DataFrame.
def clean_data(df):
    """Clean the Titanic data and add the derived columns used for plotting.

    The steps before one_hot_encode assign into the passed DataFrame; from
    that call on, the work continues on the DataFrame one_hot_encode
    returns, which is also the return value.

    Args:
        df (DataFrame): the data to clean; this function names its
            "Embarked", "Sex", "Fare", "Age", "Parch" and "SibSp" columns

    Returns:
        the DataFrame returned by one_hot_encode, extended by the later steps
    """
    # Fill in missing values of Embarked.
    # Since the vast majority of passengers embarked at 'S',
    # we assign the missing values in Embarked to 'S':
    df["Embarked"] = df["Embarked"].fillna("S")
    # Label encode Sex into Sex_Val
    label_encode(df, "Sex", "Sex_Val")
    # Label encode Embarked into Embarked_Val
    label_encode(df, "Embarked", "Embarked_Val")
    # Transform Embarked from a string to dummy variables; the same
    # "Embarked_Val" text is passed as the prefix for the new columns
    df = one_hot_encode(df, "Embarked", "Embarked_Val")
    # Fill in missing values of Fare with the average Fare
    impute_with_mean(df, "Fare", "Fare_add_average")
    # To keep Age intact, make a copy of it called AgeFill
    # that we will use to fill in the missing ages:
    # Determine the Age typical for each passenger class by Sex_Val.
    # We'll use the median instead of the mean because the Age
    # histogram seems to be right skewed.
    impute_with_median(df, "Age", "AgeFill", "Sex_Val")
    # Define a new feature FamilySize that is the sum of
    # Parch (number of parents or children on board) and
    # SibSp (number of siblings or spouses):
    df["FamilySize"] = df["Parch"] + df["SibSp"]
    return df


# Rebind df_train to the cleaned frame that clean_data returns
df_train = clean_data(df_train)
42.31. Bar Plots, Histograms, subplot2grid#
# Size of matplotlib figures that contain subplots
figsize_with_subplots = (10, 10)
# Set up a grid of plots
fig = plt.figure(figsize=figsize_with_subplots)
# Grid shape handed to every plt.subplot2grid call below; the cells used
# are (0, 0), (0, 1), (1, 0), (1, 1) and (2, 0)
fig_dims = (3, 2)
# Plot death and survival counts: select cell (0, 0) now, the plot itself is
# drawn by the first create_sub_plot_2_grid call
plt.subplot2grid(fig_dims, (0, 0))
def create_sub_plot_2_grid(
    df, column_name, plot_title, plot_kind, plot_color="b", plot_align="center"
):
    """Plot one column of ``df`` with the given kind, title, color and alignment.

    Nothing is returned. The function does not select a grid cell itself;
    the callers in this notebook run plt.subplot2grid right before calling
    it. The ``____`` placeholders are exercise blanks to be filled in.

    Args:
        df (DataFrame): a data structure
        column_name (string): the column to plot
        plot_title (string): passed on as ``title``
        plot_kind (string): passed on as ``kind`` (the callers use "bar")
        plot_color (string): passed on as ``color``; defaults to "b"
        plot_align (string): passed on as ``align``; defaults to "center"
    """
    df[____].____().____(
        kind=plot_kind, title=plot_title, color=plot_color, align=plot_align
    )
# Death and survival counts, drawn into the cell selected before the helper
# was defined; this is the only panel that overrides the default color
create_sub_plot_2_grid(
    df_train, "Survived", "Death and Survival Counts", "bar", "r", "center"
)
# Plot Pclass counts
plt.subplot2grid(fig_dims, (0, 1))
create_sub_plot_2_grid(df_train, "Pclass", "Passenger Class Counts", "bar")
# Plot Sex counts
plt.subplot2grid(fig_dims, (1, 0))
create_sub_plot_2_grid(df_train, "Sex", "Gender Counts", "bar")
# Set the x tick label rotation to 0 for this panel
plt.xticks(rotation=0)
# Plot Embarked counts
plt.subplot2grid(fig_dims, (1, 1))
create_sub_plot_2_grid(df_train, "Embarked", "Ports of Embarkation Counts", "bar")
# Plot the Age histogram; this uses the raw Age column, not AgeFill
plt.subplot2grid(fig_dims, (2, 0))
df_train["Age"].hist()
plt.title("Age Histogram")
👩‍💻 Hint
Refer to indexing and selecting data on pandas.DataFrame.
You can consider using pandas.Series.value_counts and pandas.DataFrame.plot.
# Get the unique values of FamilySize and its maximum
family_sizes = np.____(df_train["FamilySize"].unique())
family_size_max = max(family_sizes)
# Split FamilySize by outcome: df1 holds Survived == 0, df2 holds Survived == 1
df1 = df_train[df_train["Survived"] == 0]["FamilySize"]
df2 = df_train[df_train["Survived"] == 1]["FamilySize"]
# Both groups go into one stacked plot; the [df1, df2] order matches the
# ("Died", "Survived") legend below
plt.____([df1, df2], bins=family_size_max + 1, range=(0, family_size_max), stacked=True)
plt.legend(("Died", "Survived"), loc="best")
plt.title("Survivors by Family Size")
👩‍💻 Hint
You can consider using numpy.sort, and use matplotlib.pyplot.hist to plot.
42.32. Normalized Plots#
# Cross-tabulate Pclass against Survived
pclass_xt = pd.crosstab(df_train["Pclass"], df_train["Survived"])
# Normalize the cross tab to sum to 1; the operand is the per-row total
# (sum over axis 1) converted to float:
pclass_xt_pct = pclass_xt.____(pclass_xt.sum(1).astype(float), axis=0)
pclass_xt_pct.____(kind="bar", stacked=True, title="Survival Rate by Passenger Classes")
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")
def get_survival_rate_by_gender(gender):
    """Computes the survival rate for a given gender.

    Reads the module-level ``df_train``. The ``____`` placeholder is an
    exercise blank to be filled in.

    Args:
        gender (string): value of the "Sex" column to select

    Returns:
        a DataFrame containing the survival rate of passengers of the specified gender in each passenger class
    """
    gender_df = df_train[df_train["Sex"] == gender]
    # Take both columns from the filtered frame so the rows and columns
    # describe the same passengers without relying on crosstab aligning a
    # filtered Series with an unfiltered one
    gender_xt = pd.crosstab(gender_df["Pclass"], gender_df["Survived"])
    # The operand is the per-row total (sum over axis 1) converted to float
    gender_xt_pct = gender_xt.____(gender_xt.sum(1).astype(float), axis=0)
    return gender_xt_pct
# Plot the female survival rate by passenger class
gender_xt_pct = get_survival_rate_by_gender("female")
gender_xt_pct.____(
    kind="bar", stacked=True, title="Female Survival Rate by Passenger Class"
)
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")
# Plot the male survival rate by passenger class; gender_xt_pct is reused
# for the second result
gender_xt_pct = get_survival_rate_by_gender("male")
gender_xt_pct.____(
    kind="bar", stacked=True, title="Male Survival Rate by Passenger Class"
)
plt.xlabel("Passenger Class")
plt.ylabel("Survival Rate")
👩‍💻 Hint
You can consider using pandas.DataFrame.div and pandas.DataFrame.plot.
42.33. Scatter Plots, subplots#
# Set up a grid of plots; axes[0] and axes[1] are filled in by the plotting
# code further down
fig, axes = plt.subplots(2, 1, figsize=figsize_with_subplots)
def get_age_by_survived(df, survived):
    """Select the ages of the passengers with a given survival status.

    Args:
        df (DataFrame): data with "Survived" and "Age" columns
        survived (int): the "Survived" value to keep

    Returns:
        a pandas Series with the "Age" values of the rows whose "Survived"
        equals ``survived``
    """
    has_status = df["Survived"] == survived
    matching_rows = df[has_status]
    return matching_rows["Age"]
# Ages split by outcome: df1 holds Survived == 0, df2 holds Survived == 1.
# Both come from the raw Age column, while the upper bound below is taken
# from AgeFill.
df1 = get_age_by_survived(df_train, 0)
df2 = get_age_by_survived(df_train, 1)
max_age = int(max(df_train["AgeFill"]))
# One bin per ten years of max_age; the range starts at 1, not 0. The
# [df1, df2] order matches the ("Died", "Survived") legend.
axes[1].____([df1, df2], bins=int(max_age / 10), range=(1, max_age), stacked=True)
axes[1].legend(("Died", "Survived"), loc="best")
axes[1].set_title("Survivors by Age Groups Histogram")
axes[1].set_xlabel("Age")
axes[1].set_ylabel("Count")
# Scatter plot Survived and AgeFill
axes[0].____(df_train["Survived"], df_train["AgeFill"])
axes[0].set_title("Survivors by Age Plot")
axes[0].set_xlabel("Survived")
axes[0].set_ylabel("Age")
👩‍💻 Hint
You can consider using matplotlib.axes.Axes.hist and matplotlib.axes.Axes.scatter.
42.34. Kernel Density Estimation Plots#
# Get the unique values of Pclass:
def get_the_unique_values_of_Pclass(df):
    """Plot the column 'AgeFill' for each unique value of pclass in the input DataFrame.

    One "kde" plot is drawn per unique "Pclass" value; nothing is returned.
    The ``____`` placeholders are exercise blanks to be filled in.

    Args:
        df (DataFrame): a data structure with "Pclass" and "AgeFill" columns
    """
    passenger_classes = np.____(df["Pclass"].unique())
    for pclass in passenger_classes:
        # Select the AgeFill values of the rows that belong to this class
        df["AgeFill"][df["Pclass"] == pclass].____(kind="kde")
# Draw one density curve per passenger class
get_the_unique_values_of_Pclass(df_train)
plt.title("Age Density Plot by Passenger Class")
plt.xlabel("Age")
# NOTE(review): the three fixed labels assume exactly three Pclass values,
# plotted in 1st, 2nd, 3rd order - confirm against the data
plt.legend(("1st Class", "2nd Class", "3rd Class"), loc="best")
👩‍💻 Hint
You can consider using numpy.sort and pandas.DataFrame.plot.
42.34.1. Acknowledgments#
Thanks to Donne Martin for creating the open-source project data-science-ipython-notebooks, which inspires the majority of the content in this chapter.