# Install the necessary dependencies

import os
import sys
!{sys.executable} -m pip install --quiet pandas scikit-learn numpy matplotlib jupyterlab_myst ipython seaborn pywaffle

43.7. Data visualization#

43.7.1. 1. What’s data visualization#

Visualizing data is one of the most important tasks of a data scientist. A picture is worth a thousand words, and a visualization can help you identify all kinds of interesting features of your data, such as:

  • spikes,

  • outliers,

  • groupings,

  • tendencies,

  • etc.

43.7.2. 2. Visualizing quantities#

An excellent library to create both simple and sophisticated plots and charts of various kinds is Matplotlib.

Use the best chart to suit your data’s structure and the story you want to tell.

  • To analyze trends over time: line

  • To compare values: bar, column, pie, scatterplot

  • To show how parts relate to a whole: pie

  • To show distribution of data: scatterplot, bar

  • To show trends: line, column

  • To show relationships between values: line, scatterplot, bubble

Build a line plot about bird wingspan values#

import pandas as pd
import matplotlib.pyplot as plt
birds = pd.read_csv('https://static-1300131294.cos.accelerate.myqcloud.com/data/birds.csv')
Name ScientificName Category Order Family Genus ConservationStatus MinLength MaxLength MinBodyMass MaxBodyMass MinWingspan MaxWingspan
0 Black-bellied whistling-duck Dendrocygna autumnalis Ducks/Geese/Waterfowl Anseriformes Anatidae Dendrocygna LC 47.0 56.0 652.0 1020.0 76.0 94.0
1 Fulvous whistling-duck Dendrocygna bicolor Ducks/Geese/Waterfowl Anseriformes Anatidae Dendrocygna LC 45.0 53.0 712.0 1050.0 85.0 93.0
2 Snow goose Anser caerulescens Ducks/Geese/Waterfowl Anseriformes Anatidae Anser LC 64.0 79.0 2050.0 4050.0 135.0 165.0
3 Ross's goose Anser rossii Ducks/Geese/Waterfowl Anseriformes Anatidae Anser LC 57.3 64.0 1066.0 1567.0 113.0 116.0
4 Greater white-fronted goose Anser albifrons Ducks/Geese/Waterfowl Anseriformes Anatidae Anser LC 64.0 81.0 1930.0 3310.0 130.0 165.0
# Select the MaxWingspan column of the birds table.
# NOTE(review): `wingspan` is not referenced again in this snippet; `y` below
# holds the same column.
wingspan = birds['MaxWingspan'] 
plt.title('Max Wingspan in Centimeters')
plt.ylabel('Wingspan (CM)')
# Line plot with bird names along the x-axis and MaxWingspan on the y-axis.
x = birds['Name'] 
y = birds['MaxWingspan']

plt.plot(x, y)

../../_images/data-visualization_10_0.png

Explore bar charts#

    title='Birds of Minnesota'
# Count how many birds fall into each Category, most frequent first.
# Series.value_counts is the documented way to tally a single column; the
# `subset` argument of DataFrame.value_counts is meant to receive column
# labels, not the column's values.
category_count = birds['Category'].value_counts(sort=True)
# Default size (width, height in inches) for figures created from here on.
plt.rcParams['figure.figsize'] = [6, 12]
../../_images/data-visualization_13_1.png

Comparing data#

# Horizontal bars: bar position taken from Category, bar length from MaxLength.
maxlength = birds['MaxLength']
plt.barh(y=birds['Category'], width=maxlength)
# Default size (width, height in inches) for figures created after this point.
plt.rcParams['figure.figsize'] = [6, 12]
# Superimpose minimum and maximum lengths on the same category axis.
minLength = birds['MinLength']
maxLength = birds['MaxLength']
category = birds['Category']

# MaxLength bars are drawn first and MinLength bars second, so the MinLength
# bars are painted on top of the MaxLength bars.
plt.barh(category, maxLength)
plt.barh(category, minLength)


43.7.3. 3. Visualizing distributions#

Another way to dig into data is by looking at its distribution, or how the data is organized along an axis.

Explore the birds dataset#

# Scatter plot with each bird's MaxLength on the x-axis and its Order on the
# y-axis, drawn on a 12x8 inch figure.
birds.plot(kind='scatter', x='MaxLength', y='Order', figsize=(12, 8))

plt.title('Max Length per Order')
plt.xlabel('Max Length')

../../_images/data-visualization_19_0.png

Working with histograms#

# Histogram of MaxBodyMass with 10 bins, then again with 30 bins.
birds['MaxBodyMass'].plot(kind='hist', bins=10, figsize=(12, 12))
birds['MaxBodyMass'].plot(kind='hist', bins=30, figsize=(12, 12))
# Keep only the birds whose MaxBodyMass lies strictly between 1 and 60, and
# plot that subset with 40 bins.
filteredBirds = birds[(birds['MaxBodyMass'] > 1) & (birds['MaxBodyMass'] < 60)]
filteredBirds['MaxBodyMass'].plot(kind='hist', bins=40, figsize=(12, 12))
# 2D histogram of the filtered birds: MaxBodyMass on x, MaxLength on y.
x = filteredBirds['MaxBodyMass']
y = filteredBirds['MaxLength']

fig, ax = plt.subplots(tight_layout=True)
hist = ax.hist2d(x, y)
../../_images/data-visualization_24_0.png

Explore the dataset for distributions using text data#

# Overlay one semi-transparent histogram of MinWingspan per conservation
# status. Each entry maps a ConservationStatus code to (bar colour, legend
# label); dict order is the drawing order.
status_styles = {
    'EX': ('red', 'Extinct'),
    'CR': ('orange', 'Critically Endangered'),
    'EN': ('yellow', 'Endangered'),
    'NT': ('green', 'Near Threatened'),
    'VU': ('blue', 'Vulnerable'),
    'LC': ('gray', 'Least Concern'),
}

# Shared histogram settings: alpha=0.5 keeps overlapping bars visible.
kwargs = dict(alpha=0.5, bins=20)

for status, (color, label) in status_styles.items():
    wingspans = filteredBirds.loc[
        filteredBirds.ConservationStatus == status, 'MinWingspan'
    ]
    plt.hist(wingspans, **kwargs, color=color, label=label)

# plt.hist places the data values on the x-axis and the bin counts on the
# y-axis, so 'Min Wingspan' labels x and the y-axis is a count.
plt.gca().set(title='Conservation Status', xlabel='Min Wingspan', ylabel='Count')
# Without a legend the label= values passed above are never displayed.
plt.legend()
../../_images/data-visualization_26_0.png

Density plots#

import seaborn as sns

# Filled density curves of MaxBodyMass, one per Order. common_norm=False
# normalises each Order's curve on its own rather than across all birds.
sns.kdeplot(
    data=filteredBirds, x="MaxBodyMass", hue="Order",
    fill=True, common_norm=False, palette="crest",
    alpha=.5, linewidth=0,
)
# Two-dimensional density of MinLength against MaxLength, coloured by
# conservation status.
# NOTE(review): the function name was missing from this snippet; kdeplot is
# assumed because the arguments sit in the density-plot section -- confirm.
sns.kdeplot(
    data=filteredBirds, x="MinLength", y="MaxLength", hue="ConservationStatus"
)
43.7.4. 4. Visualizing proportions#

We will use a given dataset about mushrooms to experiment with tasty visualizations such as:

  • Pie charts 🥧

  • Donut charts 🍩

  • Waffle charts 🧇

Get to know your mushrooms 🍄#

mushrooms = pd.read_csv('../data/mushrooms.csv')
class cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing gill-size gill-color ... stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population habitat
0 Poisonous Convex Smooth Brown Bruises Pungent Free Close Narrow Black ... Smooth White White Partial White One Pendant Black Scattered Urban
1 Edible Convex Smooth Yellow Bruises Almond Free Close Broad Black ... Smooth White White Partial White One Pendant Brown Numerous Grasses
2 Edible Bell Smooth White Bruises Anise Free Close Broad Brown ... Smooth White White Partial White One Pendant Brown Numerous Meadows
3 Poisonous Convex Scaly White Bruises Pungent Free Close Narrow Brown ... Smooth White White Partial White One Pendant Black Scattered Urban
4 Edible Convex Smooth Green No Bruises None Free Crowded Broad Black ... Smooth White White Partial White One Evanescent Brown Abundant Grasses

5 rows × 23 columns

# Convert every object-dtype column of the mushrooms table to the pandas
# 'category' dtype.
cols = mushrooms.select_dtypes(["object"]).columns
mushrooms[cols] = mushrooms[cols].astype('category')
# For each value of `class`, count the non-null entries in every other column.
edibleclass = mushrooms.groupby(['class']).count()
cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing gill-size gill-color stalk-shape ... stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population habitat
Edible 4208 4208 4208 4208 4208 4208 4208 4208 4208 4208 ... 4208 4208 4208 4208 4208 4208 4208 4208 4208 4208
Poisonous 3916 3916 3916 3916 3916 3916 3916 3916 3916 3916 ... 3916 3916 3916 3916 3916 3916 3916 3916 3916 3916

2 rows × 22 columns

Pie!#

# Take the slice labels from the grouped index so they always line up with
# the counts being plotted, instead of relying on a hand-written list that
# must be kept in the same order as the groupby result.
labels = list(edibleclass.index)
# One slice per class, sized by the per-class count in the `population`
# column; autopct prints each share with one decimal place.
plt.pie(edibleclass['population'], labels=labels, autopct='%.1f %%')
../../_images/data-visualization_37_0.png

Donuts!#

habitat = mushrooms.groupby(['habitat']).count()
class cap-shape cap-surface cap-color bruises odor gill-attachment gill-spacing gill-size gill-color ... stalk-surface-above-ring stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population
Grasses 2148 2148 2148 2148 2148 2148 2148 2148 2148 2148 ... 2148 2148 2148 2148 2148 2148 2148 2148 2148 2148
Leaves 832 832 832 832 832 832 832 832 832 832 ... 832 832 832 832 832 832 832 832 832 832
Meadows 292 292 292 292 292 292 292 292 292 292 ... 292 292 292 292 292 292 292 292 292 292
Paths 1144 1144 1144 1144 1144 1144 1144 1144 1144 1144 ... 1144 1144 1144 1144 1144 1144 1144 1144 1144 1144
Urban 368 368 368 368 368 368 368 368 368 368 ... 368 368 368 368 368 368 368 368 368 368
Waste 192 192 192 192 192 192 192 192 192 192 ... 192 192 192 192 192 192 192 192 192 192
Wood 3148 3148 3148 3148 3148 3148 3148 3148 3148 3148 ... 3148 3148 3148 3148 3148 3148 3148 3148 3148 3148

7 rows × 22 columns

# Slice labels come from the grouped index so they match the counts
# row-for-row.
labels = list(habitat.index)

# One slice per habitat, sized by the per-habitat count in the `class` column.
# pctdistance=0.85 places the percentage text at 0.85 of the radius, outside
# the 0.40-radius centre disc drawn below.
plt.pie(
    habitat['class'], labels=labels,
    autopct='%1.1f%%', pctdistance=0.85
)

# A white disc over the centre of the pie turns it into a donut; the disc has
# to be added to the axes explicitly or it is never drawn.
center_circle = plt.Circle((0, 0), 0.40, fc='white')
fig = plt.gcf()
fig.gca().add_artist(center_circle)

plt.title('Mushroom Habitats')

../../_images/data-visualization_40_0.png

Waffles!#

capcolor = mushrooms.groupby(['cap-color']).count()
class cap-shape cap-surface bruises odor gill-attachment gill-spacing gill-size gill-color stalk-shape ... stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring veil-type veil-color ring-number ring-type spore-print-color population habitat
Brown 2284 2284 2284 2284 2284 2284 2284 2284 2284 2284 ... 2284 2284 2284 2284 2284 2284 2284 2284 2284 2284
Buff 168 168 168 168 168 168 168 168 168 168 ... 168 168 168 168 168 168 168 168 168 168
Cinnamon 44 44 44 44 44 44 44 44 44 44 ... 44 44 44 44 44 44 44 44 44 44
Green 1856 1856 1856 1856 1856 1856 1856 1856 1856 1856 ... 1856 1856 1856 1856 1856 1856 1856 1856 1856 1856
Pink 144 144 144 144 144 144 144 144 144 144 ... 144 144 144 144 144 144 144 144 144 144
Purple 16 16 16 16 16 16 16 16 16 16 ... 16 16 16 16 16 16 16 16 16 16
Red 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 ... 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500
White 1040 1040 1040 1040 1040 1040 1040 1040 1040 1040 ... 1040 1040 1040 1040 1040 1040 1040 1040 1040 1040
Yellow 1072 1072 1072 1072 1072 1072 1072 1072 1072 1072 ... 1072 1072 1072 1072 1072 1072 1072 1072 1072 1072

9 rows × 22 columns

from pywaffle import Waffle

# Pair each cap colour name with the number of mushrooms that have it.
data = {
    'color': ['brown', 'buff', 'cinnamon', 'green', 'pink', 'purple', 'red', 'white', 'yellow'],
    'amount': capcolor['class'],
}

df = pd.DataFrame(data)

# Waffle chart: block counts come from `amount`, legend entries from `color`.
# The colour list is positional, one matplotlib colour per row of `df`.
# NOTE(review): the FigureClass/rows/values/labels arguments were missing from
# this snippet and are reconstructed; PyWaffle needs rows or columns, and
# rows=100 is an assumption -- confirm against the intended figure.
fig = plt.figure(
    FigureClass=Waffle,
    rows=100,
    values=df.amount,
    labels=list(df.color),
    figsize=(30, 30),
    colors=[
        "brown", "tan", "maroon", "green", "pink",
        "purple", "red", "whitesmoke", "yellow"
    ],
)

43.7.5. 5. Visualizing relationships: all about honey#

Seaborn, which we have used before, is a good library for visualizing relationships between variables.

Scatterplots#

honey = pd.read_csv('../data/honey.csv')
state numcol yieldpercol totalprod stocks priceperlb prodvalue year
0 AL 16000.0 71 1136000.0 159000.0 0.72 818000.0 1998
1 AZ 55000.0 60 3300000.0 1485000.0 0.64 2112000.0 1998
2 AR 53000.0 65 3445000.0 1688000.0 0.59 2033000.0 1998
3 CA 450000.0 83 37350000.0 12326000.0 0.62 23157000.0 1998
4 CO 27000.0 72 1944000.0 1594000.0 0.70 1361000.0 1998
# Price per pound for every state, one row of points per state.
sns.relplot(x="priceperlb", y="state", data=honey, height=15, aspect=.5)
# The same scatterplot with points coloured by year on the sequential YlOrBr
# palette, so the change in price over time shows up as a colour gradient.
sns.relplot(
    x="priceperlb", y="state", hue="year",
    palette="YlOrBr", data=honey, height=15, aspect=.5
)
../../_images/data-visualization_48_0.png

Line charts#

# Line chart of priceperlb against year.
sns.relplot(x="year", y="priceperlb", kind="line", data=honey)
# Line chart of totalprod against year.
# NOTE(review): the trailing semicolon presumably suppresses the returned
# object's repr in notebook output -- confirm.
sns.relplot(x="year", y="totalprod", kind="line", data=honey);
../../_images/data-visualization_51_0.png

Facet grids#

    x="yieldpercol", y="numcol",
../../_images/data-visualization_53_1.png

Dual-line plots#

# Two line plots that share the year axis but have independent y-axes:
# number of colonies on the left, yield per colony on the right.
fig, ax = plt.subplots(figsize=(12, 6))
# Each lineplot is bound to its axes explicitly so the result does not depend
# on which axes pyplot currently treats as "current".
lineplot = sns.lineplot(
    x="year", y="numcol", data=honey, ax=ax,
    label='Number of bee colonies', legend=False
)
ax.set_ylabel('# colonies')
ax.set_title('Honey Production Year over Year')

# twinx() creates a second axes sharing the x-axis, with its own y scale
# drawn on the right-hand side.
ax2 = ax.twinx()
lineplot2 = sns.lineplot(
    x="year", y="yieldpercol", data=honey, ax=ax2, color="r",
    label='Yield per colony', legend=False
)
ax2.set_ylabel('colony yield')

# Per-axes legends are disabled above; a single figure-level legend collects
# the labelled lines from both axes.
ax.figure.legend()

