#!pip install joypy pywaffle calmap scipy squarify scikit-learn statsmodels seaborn
# !pip install ipykernel
Below is a compendium of visualizations created in Python, though I continue to much prefer R for visualizations: it is simpler and probably a bit better than what you can do in Python. This is changing somewhat as Python works to match the capabilities of ggplot.
Recently, Posit has released a Python module, great_tables, that attempts to reproduce the capabilities of R's gt package.
Below, plots and tables are illustrated. I did this primarily to learn everything I could about creating effective visualizations in Python. While I am glad to have done this, I think I'll be sticking with R for some time to come. There is just no good reason to leave R: it still holds an edge in capabilities, and for me, R is far easier to use and requires significantly fewer lines of code than Python.
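As a minimal sketch of what great_tables looks like, here is a tiny, hypothetical two-row DataFrame passed through the gt-style chained API (the full helper used throughout this page is defined below):

import pandas as pd
from great_tables import GT

tiny = pd.DataFrame({"model": ["Mazda RX4", "Fiat 128"], "mpg": [21.0, 32.4]})

# build a styled table gt-style: start from GT() and chain layers onto it
(GT(tiny)
    .tab_header(title="great_tables demo", subtitle="R's gt, in Python")
    .fmt_number(columns="mpg", decimals=1))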
import joypy
from pywaffle import Waffle
import calmap
import random
import os
import numpy as np
import pandas as pd
from pandas.plotting import andrews_curves
from pandas.plotting import parallel_coordinates
from sklearn.cluster import AgglomerativeClustering
import seaborn as sns
#from great_tables import GT
import great_tables
from great_tables import GT, loc, style
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.path import Path
from matplotlib.patches import PathPatch
from matplotlib.patches import Patch
import matplotlib.patches as patches
from scipy.spatial import ConvexHull
from scipy.signal import find_peaks
from scipy.stats import sem
import scipy.cluster.hierarchy as shc
import squarify
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.tsa.stattools as stattools
from statsmodels.tsa.seasonal import seasonal_decompose
from dateutil.parser import parse
from IPython.display import Image
def truncate_long_text(text: str, max_length: int = 80) -> str:
    """
    Truncate text to a maximum length, adding an ellipsis if truncated.

    Args:
        text (str): Input text to potentially truncate
        max_length (int): Maximum allowed length

    Returns:
        str: Truncated text with ellipsis if longer than max_length
    """
    # Only truncate if text is a string and longer than max_length
    if isinstance(text, str) and len(text) > max_length:
        return text[:max_length] + '...'
    return text
def truncate_long_columns(df: pd.DataFrame, max_length: int = 80) -> pd.DataFrame:
    """
    Truncate text in columns that have strings longer than max_length.

    Args:
        df (pd.DataFrame): Input DataFrame
        max_length (int): Maximum allowed character length

    Returns:
        pd.DataFrame: DataFrame with long text truncated
    """
    # Create a copy of the DataFrame to avoid modifying the original
    truncated_df = df.copy()

    # Iterate through all columns
    for col in truncated_df.columns:
        # Check if column contains string-like data
        if truncated_df[col].dtype == 'object':
            truncated_df[col] = truncated_df[col].apply(lambda x: truncate_long_text(x, max_length))

    return truncated_df
def process_csv_from_data_folder(file_name: str, dataframe: pd.DataFrame = None):
    """
    Processes a CSV file or DataFrame by selecting 10 random records,
    limiting the GT table to 10 columns, appending the file name to the title,
    and adding a footnote if there are extra columns.

    Args:
        file_name (str): The name of the CSV file in the 'data' folder, or a name for the DataFrame.
        dataframe (pd.DataFrame, optional): DataFrame to process instead of a CSV file.

    Returns:
        GT: A styled GT table.
    """
    # Check if a DataFrame is provided
    if dataframe is not None:
        df = dataframe
    else:
        # Define the path to the 'data' folder
        data_folder = "data"
        file_path = os.path.join(data_folder, file_name)

        # Check if the file exists
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File '{file_name}' not found in the 'data' folder.")

        # Load the CSV file into a DataFrame
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            raise ValueError(f"Error reading the CSV file: {e}")

    # Select 10 random records
    if len(df) < 10:
        random_sample = df  # Use the entire DataFrame if fewer than 10 records
    else:
        random_sample = df.sample(n=10, random_state=42)

    # Limit to 10 columns or fewer
    all_columns = df.columns.tolist()
    displayed_columns = all_columns[:10]
    extra_columns = all_columns[10:]

    # Create a copy for formatting
    limited_sample = random_sample[displayed_columns].copy()

    # Format numeric columns for better readability
    formatted_sample = limited_sample.copy()
    for col in formatted_sample.select_dtypes(include=['float', 'int']).columns:
        formatted_sample[col] = formatted_sample[col].map(lambda x: f"{x:,.2f}")

    # Truncate text in columns longer than 80 characters
    formatted_sample = truncate_long_columns(formatted_sample)

    # Create the GT table using the formatted data
    gt_table = GT(data=formatted_sample)

    # Enhanced styling for better engagement
    gt_table = (gt_table
        # Title and subtitle with more dynamic styling
        .tab_header(
            title=f"Random Sample of 10 Records from '{file_name}'",
            subtitle="Exploring Data"
        )
        # Optional: rename columns to be more user-friendly
        .cols_label(
            **{col: col.replace('_', ' ').title() for col in displayed_columns}
        )
        # Header styling with a modern, clean look
        .opt_stylize(style=6, color='blue')
    )

    # Conditionally add numeric formatting
    numeric_columns = formatted_sample.select_dtypes(include=['float', 'int']).columns
    if len(numeric_columns) > 0:
        gt_table = gt_table.fmt_number(columns=numeric_columns)

    # Add a footnote if there are extra columns
    if extra_columns:
        gt_table = gt_table.tab_source_note(
            source_note=f"💡 Additional columns not displayed: {', '.join(extra_columns)}"
        )

    # Optional: add source information
    gt_table = gt_table.tab_source_note(
        source_note=f"🔍 Data Exploration: {file_name} | Sample Size: 10 Records"
    )

    return gt_table

# Example usage:
# process_csv_from_data_folder("My Test DF", dataframe=df)
A scatter plot (aka scatter chart, scatter graph) uses dots to represent values for two different numeric variables. The position of each dot on the horizontal and vertical axis indicates values for an individual data point. Scatter plots are used to observe relationships between variables.
This dataset contains demographic and socioeconomic information for Midwest counties (the sample below includes Illinois, Wisconsin, and Michigan). Key characteristics include:

- Geographic information: county name and state, area (likely in square miles), and metropolitan status (inmetro)
- Population statistics: total population (poptotal), population density (popdensity), racial composition (popwhite, popblack, popamerindian, popasian, popother), and the percentage of each racial group
- Socioeconomic indicators: adult population (popadults), education levels (perchsd, percollege, percprof), and poverty statistics (percbelowpoverty, percchildbelowpovert, percadultpoverty, percelderlypoverty)
- Additional features: a unique identifier for each county (PID), a categorical classification (category), and a dot size column (possibly for visualization purposes)

The data provide a comprehensive overview of Midwest counties, allowing for analysis of population distribution, racial demographics, education levels, and poverty rates across different regions.
df = pd.read_csv('data/midwest_filter.csv')
process_csv_from_data_folder("Plot Data Raw", dataframe=df)
Random Sample of 10 Records from 'Plot Data Raw' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Pid | County | State | Area | Poptotal | Popdensity | Popwhite | Popblack | Popamerindian | Popasian |
589.00 | FULTON | IL | 0.05 | 38,080.00 | 732.31 | 37,117.00 | 668.00 | 83.00 | 105.00 |
3,033.00 | RICHLAND | WI | 0.03 | 17,521.00 | 515.32 | 17,411.00 | 12.00 | 34.00 | 38.00 |
649.00 | STEPHENSON | IL | 0.03 | 48,052.00 | 1,456.12 | 44,524.00 | 3,081.00 | 58.00 | 304.00 |
1,244.00 | LUCE | MI | 0.06 | 5,763.00 | 104.78 | 5,418.00 | 2.00 | 331.00 | 6.00 |
629.00 | MORGAN | IL | 0.03 | 36,397.00 | 1,102.94 | 34,561.00 | 1,510.00 | 48.00 | 130.00 |
1,206.00 | BENZIE | MI | 0.02 | 12,200.00 | 610.00 | 11,863.00 | 30.00 | 237.00 | 35.00 |
2,986.00 | BUFFALO | WI | 0.04 | 13,584.00 | 339.60 | 13,521.00 | 5.00 | 22.00 | 29.00 |
1,224.00 | GRAND TRAVERSE | MI | 0.03 | 64,273.00 | 2,142.43 | 63,019.00 | 259.00 | 555.00 | 318.00 |
1,264.00 | OSCODA | MI | 0.03 | 7,842.00 | 237.64 | 7,781.00 | 2.00 | 41.00 | 5.00 |
1,247.00 | MANISTEE | MI | 0.03 | 21,265.00 | 664.53 | 20,851.00 | 54.00 | 189.00 | 54.00 |
💡 Additional columns not displayed: popother, percwhite, percblack, percamerindan, percasian, percother, popadults, perchsd, percollege, percprof, poppovertyknown, percpovertyknown, percbelowpoverty, percchildbelowpovert, percadultpoverty, percelderlypoverty, inmetro, category, dot_size | |||||||||
🔍 Data Exploration: Plot Data Raw | Sample Size: 10 Records |
# Select the specified columns
df = df[['county', 'state', 'area', 'poptotal', 'popwhite', 'popblack', 'popamerindian', 'popasian', 'category']]
process_csv_from_data_folder("Plot Data Selected", dataframe=df)
Random Sample of 10 Records from 'Plot Data Selected' | ||||||||
---|---|---|---|---|---|---|---|---|
Exploring Data | ||||||||
County | State | Area | Poptotal | Popwhite | Popblack | Popamerindian | Popasian | Category |
FULTON | IL | 0.05 | 38,080.00 | 37,117.00 | 668.00 | 83.00 | 105.00 | AAR |
RICHLAND | WI | 0.03 | 17,521.00 | 17,411.00 | 12.00 | 34.00 | 38.00 | AAR |
STEPHENSON | IL | 0.03 | 48,052.00 | 44,524.00 | 3,081.00 | 58.00 | 304.00 | AAR |
LUCE | MI | 0.06 | 5,763.00 | 5,418.00 | 2.00 | 331.00 | 6.00 | AHR |
MORGAN | IL | 0.03 | 36,397.00 | 34,561.00 | 1,510.00 | 48.00 | 130.00 | AAR |
BENZIE | MI | 0.02 | 12,200.00 | 11,863.00 | 30.00 | 237.00 | 35.00 | AAR |
BUFFALO | WI | 0.04 | 13,584.00 | 13,521.00 | 5.00 | 22.00 | 29.00 | AAR |
GRAND TRAVERSE | MI | 0.03 | 64,273.00 | 63,019.00 | 259.00 | 555.00 | 318.00 | HAR |
OSCODA | MI | 0.03 | 7,842.00 | 7,781.00 | 2.00 | 41.00 | 5.00 | LHR |
MANISTEE | MI | 0.03 | 21,265.00 | 20,851.00 | 54.00 | 189.00 | 54.00 | AAR |
🔍 Data Exploration: Plot Data Selected | Sample Size: 10 Records |
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(1, 1, 1)

# iterate over each state
for cat in sorted(list(df["state"].unique())):
    # filter x and y for each category
    ar = df[df["state"] == cat]["area"]
    pop = df[df["state"] == cat]["poptotal"]
    wht = df[df["state"] == cat]["popwhite"]

    # plot poptotal vs area, with symbol size scaled by popwhite
    ax.scatter(ar, pop, label=cat, s=wht/200)

ax.spines["top"].set_color("None")
ax.spines["right"].set_color("None")

# set a specific label for each axis
ax.set_xlabel("Area")
ax.set_ylabel("Population")

ax.set_xlim(-0.01)
ax.set_title("Scatter plot of population vs area: Symbol size = White population")
ax.legend(loc="upper left", fontsize=10)
plt.grid()
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(1, 1, 1)

# prepare the data for plotting
size_total = df["poptotal"].sum()
# we want every group to have a different marker
markers = [".", ",", "o", "v", "^", "<", ">", "1", "2", "3", "4", "8", "s", "p", "P", "*", "h", "H", "+", "x", "X", "D", "d"]

# iterate over each category and plot the data
for cat, marker in zip(sorted(list(df["category"].unique())), markers):
    # filter x and y for each category
    ar = df[df["category"] == cat]["area"]
    pop = df[df["category"] == cat]["poptotal"]

    # this will allow us to set a specific size for each group
    size = pop/size_total

    # plot the data
    ax.scatter(ar, pop, label=cat, s=size*10000, marker=marker)

# ----------------------------------------------------------------------------------------------------
# create an encircling polygon, based on this solution:
# https://stackoverflow.com/questions/44575681/how-do-i-encircle-different-data-sets-in-scatter-plot
# steps to take:
# filter a specific group, selecting state OH
encircle_data = df[df["state"] == "OH"]

# separate x and y
encircle_x = encircle_data["area"]
encircle_y = encircle_data["poptotal"]

p = np.c_[encircle_x, encircle_y]

# using ConvexHull (imported above) to calculate the limits of the polygon
hull = ConvexHull(p)

# create the polygon with a specific color based on the vertices of our data/hull
poly = plt.Polygon(p[hull.vertices, :], ec="orange", fc="none")

# add the patch to the axes/plot
ax.add_patch(poly)

ax.spines["top"].set_color("None")
ax.spines["right"].set_color("None")

# set a specific label for each axis
ax.set_xlabel("Area")
ax.set_ylabel("Population")

ax.set_xlim(-0.01)
ax.set_title("Bubble plot with encircling")
ax.legend(loc="upper left", fontsize=10)
plt.grid()
"mpg_ggplot2.csv") process_csv_from_data_folder(
Random Sample of 10 Records from 'mpg_ggplot2.csv' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Manufacturer | Model | Displ | Year | Cyl | Trans | Drv | Cty | Hwy | Fl |
dodge | ram 1500 pickup 4wd | 4.70 | 2,008.00 | 8.00 | manual(m6) | 4 | 9.00 | 12.00 | e |
toyota | toyota tacoma 4wd | 4.00 | 2,008.00 | 6.00 | auto(l5) | 4 | 16.00 | 20.00 | r |
toyota | camry | 2.20 | 1,999.00 | 4.00 | auto(l4) | f | 21.00 | 27.00 | r |
audi | a4 quattro | 2.00 | 2,008.00 | 4.00 | manual(m6) | 4 | 20.00 | 28.00 | p |
jeep | grand cherokee 4wd | 4.70 | 2,008.00 | 8.00 | auto(l5) | 4 | 14.00 | 19.00 | r |
hyundai | sonata | 2.40 | 1,999.00 | 4.00 | manual(m5) | f | 18.00 | 27.00 | r |
toyota | corolla | 1.80 | 2,008.00 | 4.00 | manual(m5) | f | 28.00 | 37.00 | r |
ford | mustang | 4.00 | 2,008.00 | 6.00 | auto(l5) | r | 16.00 | 24.00 | r |
volkswagen | jetta | 2.00 | 1,999.00 | 4.00 | manual(m5) | f | 21.00 | 29.00 | r |
audi | a6 quattro | 2.80 | 1,999.00 | 6.00 | auto(l5) | 4 | 15.00 | 24.00 | p |
💡 Additional columns not displayed: class | |||||||||
🔍 Data Exploration: mpg_ggplot2.csv | Sample Size: 10 Records |
There are two functions in seaborn to create a scatter plot with a regression line: regplot and lmplot. Note that lmplot requires the data argument with a pandas DataFrame as input.
# get the data
df = pd.read_csv('data/mpg_ggplot2.csv')
process_csv_from_data_folder("Plot Data Raw", dataframe=df)
Random Sample of 10 Records from 'Plot Data Raw' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Manufacturer | Model | Displ | Year | Cyl | Trans | Drv | Cty | Hwy | Fl |
dodge | ram 1500 pickup 4wd | 4.70 | 2,008.00 | 8.00 | manual(m6) | 4 | 9.00 | 12.00 | e |
toyota | toyota tacoma 4wd | 4.00 | 2,008.00 | 6.00 | auto(l5) | 4 | 16.00 | 20.00 | r |
toyota | camry | 2.20 | 1,999.00 | 4.00 | auto(l4) | f | 21.00 | 27.00 | r |
audi | a4 quattro | 2.00 | 2,008.00 | 4.00 | manual(m6) | 4 | 20.00 | 28.00 | p |
jeep | grand cherokee 4wd | 4.70 | 2,008.00 | 8.00 | auto(l5) | 4 | 14.00 | 19.00 | r |
hyundai | sonata | 2.40 | 1,999.00 | 4.00 | manual(m5) | f | 18.00 | 27.00 | r |
toyota | corolla | 1.80 | 2,008.00 | 4.00 | manual(m5) | f | 28.00 | 37.00 | r |
ford | mustang | 4.00 | 2,008.00 | 6.00 | auto(l5) | r | 16.00 | 24.00 | r |
volkswagen | jetta | 2.00 | 1,999.00 | 4.00 | manual(m5) | f | 21.00 | 29.00 | r |
audi | a6 quattro | 2.80 | 1,999.00 | 6.00 | auto(l5) | 4 | 15.00 | 24.00 | p |
💡 Additional columns not displayed: class | |||||||||
🔍 Data Exploration: Plot Data Raw | Sample Size: 10 Records |
# filter only 2 classes
df = df[df["cyl"].isin([4, 8])]

# plot the data using seaborn
sns.lmplot(x='displ', y='hwy', data=df, hue="cyl")
plt.grid()
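The lmplot call above is figure-level. regplot is the axes-level counterpart; a minimal sketch on the same data (no hue, since regplot fits a single regression):

# regplot draws the scatter and the fitted line onto a single matplotlib axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.regplot(x='displ', y='hwy', data=df, ax=ax)
ax.set_title("Scatter plot with regression line using regplot")
plt.grid()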
The seaborn.stripplot function draws a categorical scatterplot using jitter to reduce overplotting. A jitter plot is a variant of the strip plot with a better view of overlapping data points, used to visualize the distribution of many individual 1D values.
Using the same data from the previous plot.
=df, x="cty", y="hwy")
sns.stripplot(dataTrue)
plt.grid( plt.show()
A counts plot is another way to handle overlapping points: each unique (x, y) combination is drawn once, with the size of the dot proportional to how many observations share that value.
Using the same raw data from the previous plot.
= df.groupby(["cty", "hwy"]).size().reset_index(name = "counts")
gb_df
# sort the values
"cty", "hwy", "counts"], ascending = True, inplace = True)
gb_df.sort_values([
# create a color for each group.
= {i:np.random.random(3,) for i in sorted(list(gb_df["cty"].unique()))}
colors
= plt.figure(figsize = (10, 5))
fig = fig.add_subplot()
ax
# ----------------------------------------------------------------------------------------------------
# iterate over each category and plot the data. This way, every group has it's own color and sizwe.
for x in sorted(list(gb_df["cty"].unique())):
# get x and y values for each group
= gb_df[gb_df["cty"] == x]["cty"]
x_values = gb_df[gb_df["cty"] == x]["hwy"]
y_values
# extract the size of each group to plot
= gb_df[gb_df["cty"] == x]["counts"]
size
# extract the color for each group and covert it from rgb to hex
= matplotlib.colors.rgb2hex(colors[x])
color
# plot the data
= size*10, c = color)
ax.scatter(x_values, y_values, s
"Counts plot");
ax.set_title( plt.grid()
= df.groupby(["cty", "hwy"]).size().reset_index(name = "counts")
gb_df
# sort the values
"cty", "hwy", "counts"], ascending = True, inplace = True)
gb_df.sort_values([
# create a color for each group.
= {i:np.random.random(3,) for i in sorted(list(gb_df["cty"].unique()))}
colors
= plt.figure(figsize = (10, 5))
fig = fig.add_subplot()
ax
# ----------------------------------------------------------------------------------------------------
# iterate over each category and plot the data. This way, every group has it's own color and size.
for x in sorted(list(gb_df["cty"].unique())):
# get x and y values for each group
= gb_df[gb_df["cty"] == x]["cty"]
x_values = gb_df[gb_df["cty"] == x]["hwy"]
y_values
# extract the size of each group to plot
= gb_df[gb_df["cty"] == x]["counts"]
size
# extract the color for each group and covert it from rgb to hex
= matplotlib.colors.rgb2hex(colors[x])
color
# plot the data
= size*10, c = color)
ax.scatter(x_values, y_values, s
"Counts plot");
ax.set_title( plt.grid()
Marginal histograms are histograms added to the margins of each axis of a scatter plot, for analyzing the distribution of each measure. Below, a scatter plot with marginal histograms is created.
Using the same data from the previous plot.
# separate x and y
x = df["displ"]
y = df["hwy"]

fig = plt.figure(figsize=(10, 5))
# in this case we use gridspec;
# check the basics section of this kernel if you need help
gs = fig.add_gridspec(5, 5)
ax1 = fig.add_subplot(gs[:4, :-1])

# main axis: scatter plot
# c = df.manufacturer.astype('category').cat.codes is a very nice trick,
# since it basically generates a color code for each category
ax1.scatter(x, y, c=df.manufacturer.astype('category').cat.codes)

# set the labels for x and y
ax1.set_xlabel("Displ")
ax1.set_ylabel("Hwy")

# set the title for the main plot
ax1.set_title("Scatter plot with marginal histograms")

ax1.spines["right"].set_color("None")
ax1.spines["top"].set_color("None")

# bottom plot: histogram of x
ax2 = fig.add_subplot(gs[4:, :-1])
ax2.hist(x, 40, orientation='vertical', color="pink")
ax2.invert_yaxis()
ax2.set_xticks([])
ax2.set_yticks([])
ax2.axison = False

# right plot: histogram of y
ax3 = fig.add_subplot(gs[:4, -1])
ax3.hist(y, 40, orientation="horizontal", color="pink")
ax3.set_xticks([])
ax3.set_yticks([])
ax3.axison = False

fig.tight_layout()
A marginal boxplot serves a similar purpose to a marginal histogram; in addition, the boxplot helps to pinpoint the median and the 25th and 75th percentiles of X and Y.
Using the same raw data from the previous plot.
= df["displ"]
x = df["hwy"]
y
# in this plot we create the colors separatly
= df["manufacturer"].astype("category").cat.codes
colors
= plt.figure(figsize = (10, 5))
fig
= fig.add_gridspec(6, 6)
gs = fig.add_subplot(gs[:4, :-1])
ax1
# main axis: scatter plot
= df.manufacturer.astype('category').cat.codes)
ax1.scatter(x, y, c
# set the labels for x and y
"Dist")
ax1.set_xlabel("Hwy")
ax1.set_ylabel(
# set the title for the main plot
"Scatter plot with marginal boxplots")
ax1.set_title(
"right"].set_color("None")
ax1.spines["top"].set_color("None")
ax1.spines[
= fig.add_subplot(gs[4:, :-1])
ax2 = False,
ax2.boxplot(x, vert = 0.75 # make the boxplot lines shorter
whis
)
ax2.set_xticks([])
ax2.set_yticks([])
# left plot
= fig.add_subplot(gs[:4, -1])
ax3 = 0.75 )
ax3.boxplot(y, whis
ax3.set_xticks([])
ax3.set_yticks([])
fig.tight_layout()
Seaborn offers simple utilities for creating correlation heatmaps. The heatmap displays a matrix with colors that indicate the degree of correlation between the variables.
df = pd.read_csv('data/mtcars.csv')
process_csv_from_data_folder("Plot Data Raw", dataframe=df)
Random Sample of 10 Records from 'Plot Data Raw' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Model | Mpg | Cyl | Disp | Hp | Drat | Wt | Qsec | Vs | Am |
Ferrari Dino | 19.70 | 6.00 | 145.00 | 175.00 | 3.62 | 2.77 | 15.50 | 0.00 | 1.00 |
Lincoln Continental | 10.40 | 8.00 | 460.00 | 215.00 | 3.00 | 5.42 | 17.82 | 0.00 | 0.00 |
Pontiac Firebird | 19.20 | 8.00 | 400.00 | 175.00 | 3.08 | 3.85 | 17.05 | 0.00 | 0.00 |
Fiat 128 | 32.40 | 4.00 | 78.70 | 66.00 | 4.08 | 2.20 | 19.47 | 1.00 | 1.00 |
Merc 230 | 22.80 | 4.00 | 140.80 | 95.00 | 3.92 | 3.15 | 22.90 | 1.00 | 0.00 |
Merc 280 | 19.20 | 6.00 | 167.60 | 123.00 | 3.92 | 3.44 | 18.30 | 1.00 | 0.00 |
Maserati Bora | 15.00 | 8.00 | 301.00 | 335.00 | 3.54 | 3.57 | 14.60 | 0.00 | 1.00 |
Fiat X1-9 | 27.30 | 4.00 | 79.00 | 66.00 | 4.08 | 1.94 | 18.90 | 1.00 | 1.00 |
Merc 450SL | 17.30 | 8.00 | 275.80 | 180.00 | 3.07 | 3.73 | 17.60 | 0.00 | 0.00 |
Mazda RX4 | 21.00 | 6.00 | 160.00 | 110.00 | 3.90 | 2.62 | 16.46 | 0.00 | 1.00 |
💡 Additional columns not displayed: gear, carb | |||||||||
🔍 Data Exploration: Plot Data Raw | Sample Size: 10 Records |
df1 = df[['mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec']]
process_csv_from_data_folder("Plot Data Selected", dataframe=df1)
Random Sample of 10 Records from 'Plot Data Selected' | ||||||
---|---|---|---|---|---|---|
Exploring Data | ||||||
Mpg | Cyl | Disp | Hp | Drat | Wt | Qsec |
19.70 | 6.00 | 145.00 | 175.00 | 3.62 | 2.77 | 15.50 |
10.40 | 8.00 | 460.00 | 215.00 | 3.00 | 5.42 | 17.82 |
19.20 | 8.00 | 400.00 | 175.00 | 3.08 | 3.85 | 17.05 |
32.40 | 4.00 | 78.70 | 66.00 | 4.08 | 2.20 | 19.47 |
22.80 | 4.00 | 140.80 | 95.00 | 3.92 | 3.15 | 22.90 |
19.20 | 6.00 | 167.60 | 123.00 | 3.92 | 3.44 | 18.30 |
15.00 | 8.00 | 301.00 | 335.00 | 3.54 | 3.57 | 14.60 |
27.30 | 4.00 | 79.00 | 66.00 | 4.08 | 1.94 | 18.90 |
17.30 | 8.00 | 275.80 | 180.00 | 3.07 | 3.73 | 17.60 |
21.00 | 6.00 | 160.00 | 110.00 | 3.90 | 2.62 | 16.46 |
🔍 Data Exploration: Plot Data Selected | Sample Size: 10 Records |
# calculate the correlation between all variables
corr = df1.corr()

# mask the upper triangle so each pair appears only once
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True

fig = plt.figure(figsize=(10, 5))

# plot the data using seaborn
ax = sns.heatmap(corr,
                 mask=mask,
                 vmax=0.3,
                 square=True,
                 cmap="viridis")

# set the title for the figure
ax.set_title("Heatmap using seaborn")
plt.grid()
If you want to see how items vary based on a single metric, and to visualize the order and amount of this variance, diverging bars are a great tool.
Diverging Bar Charts are used to ease the comparison of multiple groups. Its design allows us to compare numerical values in various groups. It also helps us to quickly visualize the favorable and unfavorable or positive and negative responses.
Using the same raw data as previous plot.
df = pd.read_csv('data/mtcars.csv')
process_csv_from_data_folder("Plot Data Raw", dataframe=df)
Random Sample of 10 Records from 'Plot Data Raw' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Model | Mpg | Cyl | Disp | Hp | Drat | Wt | Qsec | Vs | Am |
Ferrari Dino | 19.70 | 6.00 | 145.00 | 175.00 | 3.62 | 2.77 | 15.50 | 0.00 | 1.00 |
Lincoln Continental | 10.40 | 8.00 | 460.00 | 215.00 | 3.00 | 5.42 | 17.82 | 0.00 | 0.00 |
Pontiac Firebird | 19.20 | 8.00 | 400.00 | 175.00 | 3.08 | 3.85 | 17.05 | 0.00 | 0.00 |
Fiat 128 | 32.40 | 4.00 | 78.70 | 66.00 | 4.08 | 2.20 | 19.47 | 1.00 | 1.00 |
Merc 230 | 22.80 | 4.00 | 140.80 | 95.00 | 3.92 | 3.15 | 22.90 | 1.00 | 0.00 |
Merc 280 | 19.20 | 6.00 | 167.60 | 123.00 | 3.92 | 3.44 | 18.30 | 1.00 | 0.00 |
Maserati Bora | 15.00 | 8.00 | 301.00 | 335.00 | 3.54 | 3.57 | 14.60 | 0.00 | 1.00 |
Fiat X1-9 | 27.30 | 4.00 | 79.00 | 66.00 | 4.08 | 1.94 | 18.90 | 1.00 | 1.00 |
Merc 450SL | 17.30 | 8.00 | 275.80 | 180.00 | 3.07 | 3.73 | 17.60 | 0.00 | 0.00 |
Mazda RX4 | 21.00 | 6.00 | 160.00 | 110.00 | 3.90 | 2.62 | 16.46 | 0.00 | 1.00 |
💡 Additional columns not displayed: gear, carb | |||||||||
🔍 Data Exploration: Plot Data Raw | Sample Size: 10 Records |
"x_plot"] = (df["mpg"] - df["mpg"].mean())/df["mpg"].std()
df[
"x_plot", inplace = True)
df.sort_values(= True)
df.reset_index(inplace
= ["red" if x < 0 else "green" for x in df["x_plot"]]
colors
= plt.figure(figsize = (10, 8))
fig = fig.add_subplot()
ax # plot using horizontal lines and make it look like a column by changing the linewidth
= df.index, xmin = 0 , xmax = df["x_plot"], color = colors, linewidth = 5)
ax.hlines(y
"Mileage")
ax.set_xlabel("Car Name")
ax.set_ylabel(
# set a title
"Diverging plot in matplotlib")
ax.set_title(
='--', alpha=0.5)
ax.grid(linestyle
ax.set_yticks(df.index); ax.set_yticklabels(df.model)
Divergent lines refer to a set of lines that originate from a common point and gradually spread or move apart from each other as they extend further. Using the same data as the previous plot.
# https://statisticsbyjim.com/glossary/standardization/
df["x_plot"] = (df["mpg"] - df["mpg"].mean())/df["mpg"].std()

# sort values and reset the index
df.sort_values("x_plot", inplace=True)
df.reset_index(inplace=True)

# create a color list: green if the value is >= 0, otherwise red
colors = ["red" if x < 0 else "green" for x in df["x_plot"]]

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot()

ax.hlines(y=df.index, xmin=0, xmax=df["x_plot"], color=colors, linewidth=1)

# iterate over x and y
for x, y in zip(df["x_plot"], df.index):
    # annotate text
    ax.text(x - 0.1 if x < 0 else x + 0.1,
            y,
            round(x, 2),
            color="red" if x < 0 else "green",
            horizontalalignment='right' if x < 0 else 'left',
            size=10)

    ax.scatter(x,
               y,
               color="red" if x < 0 else "green",
               alpha=0.5)

# set title
ax.set_title("Diverging plot in matplotlib")
# change x lim
ax.set_xlim(-3, 3)

# set labels
ax.set_xlabel("Mileage")
ax.set_ylabel("Car Name")

ax.grid(linestyle='--', alpha=0.5)

ax.set_yticks(df.index)
ax.set_yticklabels(df.model)
ax.spines["top"].set_color("None")
ax.spines["left"].set_color("None")
ax.spines['right'].set_position(('data', 0))
ax.spines['right'].set_color('black')
A diverging dot plot is useful in plotting variance.
Same raw data as the previous plot.
# https://statisticsbyjim.com/glossary/standardization/
df["x_plot"] = (df["mpg"] - df["mpg"].mean())/df["mpg"].std()

# sort values and reset the index
df.sort_values("x_plot", inplace=True)
df.reset_index(drop=True, inplace=True)

# create a color list: green if the value is >= 0, otherwise red
colors = ["red" if x < 0 else "green" for x in df["x_plot"]]

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot()

# iterate over x and y, annotate text, and plot the data
for x, y in zip(df["x_plot"], df.index):
    # make a horizontal line from the y axis to the x value
    # (this doesn't appear in the original 50 plot challenge)
    ax.hlines(y=y,
              xmin=-3,
              xmax=x,
              linewidth=0.5,
              alpha=0.3,
              color="red" if x < 0 else "green")

    # annotate text
    ax.text(x,
            y,
            round(x, 2),
            color="black",
            horizontalalignment='center',
            verticalalignment='center',
            size=8)

    # plot the points
    ax.scatter(x,
               y,
               color="red" if x < 0 else "green",
               s=300,
               alpha=0.5)

# set title
ax.set_title("Diverging plot in matplotlib")

# change x lim
ax.set_xlim(-3, 3)

# set labels
ax.set_xlabel("Mileage")
ax.set_ylabel("Car Name")

ax.set_yticks(df.index)
ax.set_yticklabels(df.model)

ax.spines["top"].set_color("None")
ax.spines["left"].set_color("None")

ax.spines['right'].set_position(('data', 0))
ax.spines['right'].set_color('grey')
A diverging lollipop chart is a useful tool for comparing data that falls into two categories, usually indicated by different colors.
Using the same raw data as previous plot.
# https://statisticsbyjim.com/glossary/standardization/
df["x_plot"] = (df["mpg"] - df["mpg"].mean())/df["mpg"].std()

# sort values and reset the index
df.sort_values("x_plot", inplace=True)
df.reset_index(drop=True, inplace=True)

# highlight one model in orange
df["color"] = df["model"].apply(lambda car_name: "orange" if car_name == "Fiat X1-9" else "black")

fig = plt.figure(figsize=(8, 12))
ax = fig.add_subplot()

ax.hlines(y=df.index,
          xmin=0,
          xmax=df["x_plot"],
          color=df["color"],
          alpha=0.6)

# plot the dots
ax.scatter(x=df["x_plot"],
           y=df.index,
           s=100,
           color=df["color"],
           alpha=0.6)

def add_patch(verts, ax, color):
    '''
    Takes the vertices and the axes as arguments and adds the patch to our plot.
    '''
    codes = [
        Path.MOVETO,
        Path.LINETO,
        Path.LINETO,
        Path.LINETO,
        Path.CLOSEPOLY,
    ]
    path = Path(verts, codes)
    pathpatch = PathPatch(path, facecolor=color, lw=2, alpha=0.3)
    ax.add_patch(pathpatch)

# coordinates for the bottom shape
verts_bottom = [
    (-2.5, -0.5),  # left, bottom
    (-2.5, 2),     # left, top
    (-1.5, 2),     # right, top
    (-1.5, -0.5),  # right, bottom
    (0., 0.),      # ignored
]

# coordinates for the upper shape
verts_upper = [
    (1.5, 27),  # left, bottom
    (1.5, 33),  # left, top
    (2.5, 33),  # right, top
    (2.5, 27),  # right, bottom
    (0., 0.),   # ignored
]

# use the function to add them to the existing plot
add_patch(verts_bottom, ax, color="red")
add_patch(verts_upper, ax, color="green")

# annotate text
ax.annotate('Mercedes Models',
            xy=(0.0, 11.0),
            xytext=(1.5, 11),
            xycoords='data',
            fontsize=10,
            ha='center',
            va='center',
            bbox=dict(boxstyle='square', fc='blue', alpha=0.1),
            arrowprops=dict(arrowstyle='-[, widthB=2.0, lengthB=1.5', lw=2.0, color='grey'),
            color='black')

# set title
ax.set_title("Diverging Lollipop of Car Mileage")

# autoscale
ax.autoscale_view()

# change x lim
ax.set_xlim(-3, 3)

# set labels
ax.set_xlabel("Mileage")
ax.set_ylabel("Car Name")

ax.set_yticks(df.index)
ax.set_yticklabels(df.model)

ax.spines["right"].set_color("None")
ax.spines["top"].set_color("None")

ax.grid(linestyle='--', alpha=0.5)
seaborn.pairplot(): to plot multiple pairwise bivariate distributions in a dataset, you can use the pairplot() function. The diagonal shows the univariate plots, and the off-diagonal panels display the relationship for each (n, 2) combination of variables in a DataFrame as a matrix of plots.
df = sns.load_dataset('iris')
process_csv_from_data_folder("Plot Data Raw", dataframe=df)
Random Sample of 10 Records from 'Plot Data Raw' | ||||
---|---|---|---|---|
Exploring Data | ||||
Sepal Length | Sepal Width | Petal Length | Petal Width | Species |
6.10 | 2.80 | 4.70 | 1.20 | versicolor |
5.70 | 3.80 | 1.70 | 0.30 | setosa |
7.70 | 2.60 | 6.90 | 2.30 | virginica |
6.00 | 2.90 | 4.50 | 1.50 | versicolor |
6.80 | 2.80 | 4.80 | 1.40 | versicolor |
5.40 | 3.40 | 1.50 | 0.40 | setosa |
5.60 | 2.90 | 3.60 | 1.30 | versicolor |
6.90 | 3.10 | 5.10 | 2.30 | virginica |
6.20 | 2.20 | 4.50 | 1.50 | versicolor |
5.80 | 2.70 | 3.90 | 1.20 | versicolor |
🔍 Data Exploration: Plot Data Raw | Sample Size: 10 Records |
# plot the data using seaborn
sns.pairplot(df, hue="species")
An area chart is really similar to a line chart, except that the area between the x axis and the line is filled in with color or shading. It represents the evolution of a numeric variable.
df = pd.DataFrame({
    'sales': [6, 4, 9, 7, 13, 10],
    'signups': [9, 13, 15, 12, 20, 26],
    'visits': [20, 42, 28, 62, 81, 50],
}, index=pd.date_range(start='2024/05/01', end='2024/11/01', freq='ME'))

ax = df.plot.area()
A more complex area chart is shown below, using time-series data.
df = pd.read_csv('data/economics.csv')
process_csv_from_data_folder("economics.csv", dataframe=df)
Random Sample of 10 Records from 'economics.csv' | |||||
---|---|---|---|---|---|
Exploring Data | |||||
Date | Pce | Pop | Psavert | Uempmed | Unemploy |
2010-05-01 | 10,140.20 | 309,376.00 | 6.00 | 22.30 | 14,849.00 |
1973-05-01 | 843.10 | 211,577.00 | 12.80 | 4.90 | 4,329.00 |
1978-06-01 | 1,429.80 | 222,379.00 | 9.50 | 6.00 | 6,028.00 |
2002-09-01 | 7,426.10 | 288,618.00 | 4.90 | 9.50 | 8,251.00 |
2012-12-01 | 11,245.20 | 315,532.00 | 10.50 | 17.60 | 12,272.00 |
1994-04-01 | 4,690.70 | 262,631.00 | 5.80 | 9.10 | 8,331.00 |
1983-03-01 | 2,208.60 | 233,613.00 | 10.00 | 10.40 | 11,408.00 |
1969-12-01 | 623.70 | 203,675.00 | 11.70 | 4.60 | 2,884.00 |
1974-04-01 | 912.70 | 213,361.00 | 12.70 | 5.00 | 4,618.00 |
1993-05-01 | 4,441.30 | 259,680.00 | 7.70 | 8.10 | 9,149.00 |
🔍 Data Exploration: economics.csv | Sample Size: 10 Records |
"pce_monthly_change"] = (df["psavert"] - df["psavert"].shift(1))/df["psavert"].shift(1)
df[
# convert todatetime
"date_converted"] = pd.to_datetime(df["date"])
df[
# filter our df for a specific date
= df[df["date_converted"] < np.datetime64("1975-01-01")]
df
# separate x and y
= df["date_converted"]
x = df["pce_monthly_change"]
y
# calculate the max values to annotate on the plot
= y.max()
y_max
# find the index of the max value
= np.where(y == y_max)
x_ind
# find the x based on the index of max
= x.iloc[x_ind]
x_max
= plt.figure(figsize = (15, 10))
fig = fig.add_subplot()
ax
= "black")
ax.plot(x, y, color = 300, color = "green", alpha = 0.3)
ax.scatter(x_max, y_max, s
# annotate the text of the Max value
r'Max value',
ax.annotate(= (x_max, y_max),
xy = (-90, -50),
xytext = 'offset points',
textcoords = 16,
fontsize = dict(arrowstyle = "->", connectionstyle = "arc3,rad=.2")
arrowprops
)
0, y, where = 0 > y, facecolor='red', interpolate = True, alpha = 0.3)
ax.fill_between(x, 0, y, where = 0 <= y, facecolor='green', interpolate = True, alpha = 0.3)
ax.fill_between(x,
min() * 1.1, y.max() * 1.1)
ax.set_ylim(y.
= [str(m)[:3].upper() + "-" + str(y) for y,m in zip(df.date_converted.dt.year, df.date_converted.dt.month_name())]
xtickvals
# this way we can set the ticks to be every 6 months.
6])
ax.set_xticks(x[::
6], rotation=45, fontdict={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline'})
ax.set_xticklabels(xtickvals[::
# add a grid
= 0.3)
ax.grid(alpha
# set the title
"Monthly variation return %"); ax.set_title(
Bars in a Matplotlib bar chart can be sorted in increasing or decreasing order. The ordered bar chart shows comparisons among discrete categories.
Reusing data that we have seen before.
df = pd.read_csv('data/mpg_ggplot2.csv')
process_csv_from_data_folder("mpg_ggplot2.csv", dataframe=df)
Random Sample of 10 Records from 'mpg_ggplot2.csv' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Manufacturer | Model | Displ | Year | Cyl | Trans | Drv | Cty | Hwy | Fl |
dodge | ram 1500 pickup 4wd | 4.70 | 2,008.00 | 8.00 | manual(m6) | 4 | 9.00 | 12.00 | e |
toyota | toyota tacoma 4wd | 4.00 | 2,008.00 | 6.00 | auto(l5) | 4 | 16.00 | 20.00 | r |
toyota | camry | 2.20 | 1,999.00 | 4.00 | auto(l4) | f | 21.00 | 27.00 | r |
audi | a4 quattro | 2.00 | 2,008.00 | 4.00 | manual(m6) | 4 | 20.00 | 28.00 | p |
jeep | grand cherokee 4wd | 4.70 | 2,008.00 | 8.00 | auto(l5) | 4 | 14.00 | 19.00 | r |
hyundai | sonata | 2.40 | 1,999.00 | 4.00 | manual(m5) | f | 18.00 | 27.00 | r |
toyota | corolla | 1.80 | 2,008.00 | 4.00 | manual(m5) | f | 28.00 | 37.00 | r |
ford | mustang | 4.00 | 2,008.00 | 6.00 | auto(l5) | r | 16.00 | 24.00 | r |
volkswagen | jetta | 2.00 | 1,999.00 | 4.00 | manual(m5) | f | 21.00 | 29.00 | r |
audi | a6 quattro | 2.80 | 1,999.00 | 6.00 | auto(l5) | 4 | 15.00 | 24.00 | p |
💡 Additional columns not displayed: class | |||||||||
🔍 Data Exploration: mpg_ggplot2.csv | Sample Size: 10 Records |
# groupby and create the target x and y
gb_df = df.groupby(["manufacturer"])[["cyl", "displ", "cty"]].mean()
gb_df.sort_values("cty", inplace=True)

x = gb_df.index
y = gb_df["cty"]

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot()

for x_, y_ in zip(x, y):
    # this is very cool, since we can pass a condition to matplotlib
    # and it will plot the color based on the result of the evaluation
    ax.bar(x_, y_, color="red" if y_ < y.mean() else "green", alpha=0.3)

    # add some text
    ax.text(x_, y_ + 0.3, round(y_, 1), horizontalalignment='center')

# red background patch
p2 = patches.Rectangle((.124, -0.005), width=.360, height=.13, alpha=.1, facecolor='red', transform=fig.transFigure)
fig.add_artist(p2)

# green one
p1 = patches.Rectangle((.124 + .360, -0.005), width=.42, height=.13, alpha=.1, facecolor='green', transform=fig.transFigure)
fig.add_artist(p1)

# rotate the x ticks 90 degrees
# before setting tick labels, set the tick locations
ax.set_xticks(range(len(x)))  # use range of x-axis length
ax.set_xticklabels(x, rotation=90)

# add a y label
ax.set_ylabel("Average Miles per Gallon by Manufacturer")

# set a title
ax.set_title("Bar Chart for Highway Mileage")
plt.grid()
Lollipop charts are a variation of the bar chart in which the thick bar is replaced with a line and an o-shaped dot at the end.
Same data as previous plot.
= df.groupby(["manufacturer"])[["cyl", "displ", "cty"]].mean()
gb_df "cty", inplace = True)
gb_df.sort_values(
= gb_df.index
x = gb_df["cty"]
y
= plt.figure(figsize = (10, 8))
fig = fig.add_subplot()
ax
for x_, y_ in zip(x, y):
# make a scatter plot
= "red" if y_ < y.mean() else "green", alpha = 0.3, s = 100)
ax.scatter(x_, y_, color
= 0, ymax = y_, color = "red" if y_ < y.mean() else "green", alpha = 0.3)
ax.vlines(x_, ymin
# add text with the data
+ 0.5, round(y_, 1), horizontalalignment='center')
ax.text(x_, y_
0, 30)
ax.set_ylim(
# rotate the x ticks 90 degrees
# Before setting tick labels, set the tick locations
range(len(x))) # Use range of x-axis length
ax.set_xticks(=90)
ax.set_xticklabels(x, rotation
"Average Miles per Gallon by Manufacturer")
ax.set_ylabel(
# set a title
"Lollipop Chart for Highway Mileage");
ax.set_title( plt.grid()
The dot plot conveys the rank order of the items. This is a simple graph that uses solid circles, or dots, to show the frequency of each unique data value.
Same data used as in the plot above.
= df.groupby(["manufacturer"])[["cyl", "displ", "cty"]].mean()
gb_df "cty", inplace = True)
gb_df.sort_values(
= gb_df.index
x = gb_df["cty"]
y
= plt.figure(figsize = (10, 8))
fig = fig.add_subplot()
ax
for x_, y_ in zip(x, y):
= "red" if y_ < y.mean() else "green", alpha = 0.3, s = 100)
ax.scatter(y_, x_, color
8, 27)
ax.set_xlim(
"Average Miles per Gallon by Manufacturer")
ax.set_xlabel(
# set the title
"Dot Plot for Highway Mileage")
ax.set_title(
= 'major', axis = 'y', linestyle = '--'); ax.grid(which
A slope chart is a graphical representation used to display changes in values between two or more data points or categories.
df = pd.read_csv('data/gdppercap.csv')
process_csv_from_data_folder("gdppercap.csv", dataframe=df)
Random Sample of 10 Records from 'gdppercap.csv' | ||
---|---|---|
Exploring Data | ||
Continent | 1952 | 1957 |
Africa | 1,252.57 | 1,385.24 |
Americas | 4,079.06 | 4,616.04 |
Asia | 5,195.48 | 4,003.13 |
Europe | 5,661.06 | 6,963.01 |
Oceania | 10,298.09 | 11,598.52 |
🔍 Data Exploration: gdppercap.csv | Sample Size: 10 Records |
"color"] = df.apply(lambda row: "green" if row["1957"] >= row["1952"] else "red", axis = 1)
df[= plt.figure(figsize = (8, 12))
fig = fig.add_subplot()
ax for cont in df["continent"]:
# prepare the data for plotting
# extract each point and the color
= df.columns[1]
x_start = df.columns[2]
x_finish = df[df["continent"] == cont]["1952"]
y_start = df[df["continent"] == cont]["1957"]
y_finish = df[df["continent"] == cont]["color"]
color
= color, s = 200)
ax.scatter(x_start, y_start, color = color, s = 200*(y_finish/y_start))
ax.scatter(x_finish, y_finish, color
float(y_start.iloc[0]), float(y_finish.iloc[0])], linestyle = "-", color = color.values[0])
ax.plot([x_start, x_finish], [
# annotate the value for each continent
0] - 0.05, y_start.iloc[0], r'{}:{}k'.format(cont, int(y_start.iloc[0])/1000), \
ax.text(ax.get_xlim()[= 'right', verticalalignment = 'center', fontdict = {'size':8})
horizontalalignment 1] + 0.05, y_finish.iloc[0], r'{}:{}k'.format(cont, int(y_finish.iloc[0])/1000), \
ax.text(ax.get_xlim()[= 'left', verticalalignment = 'center', fontdict = {'size':8}) horizontalalignment
The dumbbell plot (aka connected dot plot) is great for displaying changes between two points in time, two conditions or differences between two groups.
df = pd.read_csv('data/health.csv')
process_csv_from_data_folder("health.csv", dataframe=df)
Random Sample of 10 Records from 'health.csv' | ||
---|---|---|
Exploring Data | ||
Area | Pct 2014 | Pct 2013 |
Phoenix | 0.13 | 0.17 |
Portland | 0.09 | 0.13 |
Houston | 0.19 | 0.22 |
Minneapolis | 0.06 | 0.08 |
All Metro Areas | 0.11 | 0.14 |
Charlotte | 0.13 | 0.15 |
New York | 0.10 | 0.12 |
Miami | 0.19 | 0.24 |
Pittsburgh | 0.06 | 0.07 |
Los Angeles | 0.14 | 0.20 |
🔍 Data Exploration: health.csv | Sample Size: 10 Records |
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot()

for i, area in zip(df.index, df["Area"]):
    start_data = df[df["Area"] == area]["pct_2013"].values[0]
    finish_data = df[df["Area"] == area]["pct_2014"].values[0]

    ax.scatter(start_data, i, c="blue", alpha=.8)
    ax.scatter(finish_data, i, c="blue", alpha=.2)

    ax.hlines(i, start_data, finish_data, color="blue", alpha=.2)

# set x and y label
ax.set_xlabel("Pct change")
ax.set_ylabel("Area")

# set the title
ax.set_title("Dumbbell Chart: Pct Change - 2013 vs 2014")
ax.grid(axis="x")

x_lim = ax.get_xlim()
ax.set_xlim(x_lim[0]*.5, x_lim[1]*1.1)
x_ticks = ax.get_xticks()

# set the ticks before setting the labels
ax.set_xticks(x_ticks)
ax.set_xticklabels(["{:.0f}%".format(round(tick*100, 0)) for tick in x_ticks])

ax.set_yticks(df.index)
plt.grid()

# More info:
# https://www.amcharts.com/demos/dumbbell-plot/
The histogram is one of the most useful graphical tools for understanding the distribution of a continuous variable. A stacked histogram is two or more histograms displayed on the same scale and used to compare variables.
Reusing the mpg_ggplot2.csv
data.
df = pd.read_csv('data/mpg_ggplot2.csv')
process_csv_from_data_folder("mpg_ggplot2.csv", dataframe=df)
Random Sample of 10 Records from 'mpg_ggplot2.csv' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Manufacturer | Model | Displ | Year | Cyl | Trans | Drv | Cty | Hwy | Fl |
dodge | ram 1500 pickup 4wd | 4.70 | 2,008.00 | 8.00 | manual(m6) | 4 | 9.00 | 12.00 | e |
toyota | toyota tacoma 4wd | 4.00 | 2,008.00 | 6.00 | auto(l5) | 4 | 16.00 | 20.00 | r |
toyota | camry | 2.20 | 1,999.00 | 4.00 | auto(l4) | f | 21.00 | 27.00 | r |
audi | a4 quattro | 2.00 | 2,008.00 | 4.00 | manual(m6) | 4 | 20.00 | 28.00 | p |
jeep | grand cherokee 4wd | 4.70 | 2,008.00 | 8.00 | auto(l5) | 4 | 14.00 | 19.00 | r |
hyundai | sonata | 2.40 | 1,999.00 | 4.00 | manual(m5) | f | 18.00 | 27.00 | r |
toyota | corolla | 1.80 | 2,008.00 | 4.00 | manual(m5) | f | 28.00 | 37.00 | r |
ford | mustang | 4.00 | 2,008.00 | 6.00 | auto(l5) | r | 16.00 | 24.00 | r |
volkswagen | jetta | 2.00 | 1,999.00 | 4.00 | manual(m5) | f | 21.00 | 29.00 | r |
audi | a6 quattro | 2.80 | 1,999.00 | 6.00 | auto(l5) | 4 | 15.00 | 24.00 | p |
💡 Additional columns not displayed: class | |||||||||
🔍 Data Exploration: mpg_ggplot2.csv | Sample Size: 10 Records |
= df[["class", "displ"]].groupby("class")
gb_df = []
lx = []
ln
= ["#543005", "#8c510a", "#bf812d", "#80cdc1", "#35978f", "#01665e", "#003c30"]
colors
for _, df_ in gb_df:
"displ"].values.tolist())
lx.append(df_[list(set(df_["class"].values.tolist()))[0])
ln.append(
= plt.figure(figsize = (8, 8))
fig = fig.add_subplot()
ax
= ax.hist(lx, bins = 30, stacked = True, density = False, color = colors)
n, bins, patches
# change x lim
0, 25)
ax.set_ylim(# set the xticks to reflect every third value
3])
ax.set_xticks(bins[::
# set a title
"Stacked Histogram of displ colored by class")
ax.set_title(
for class_, color in zip(ln, colors)})
ax.legend({class_:color
# set the y label
"Frequency");
ax.set_ylabel( plt.grid()
The stacked histogram of categorical variables compares frequency distributions of these variables as a grouped and stacked bar plot.
Using the same data as the plot above.
= df[["class", "manufacturer"]].groupby("class")
gb_df = []
lx = []
ln
= ["#543005", "#8c510a", "#bf812d", "#80cdc1", "#35978f", "#01665e", "#003c30"]
colors
for _, df_ in gb_df:
"manufacturer"].values.tolist())
lx.append(df_[list(set(df_["class"].values.tolist()))[0])
ln.append(
= plt.figure(figsize = (8, 8))
fig = fig.add_subplot()
ax
= ax.hist(lx, bins = 30, stacked = True, density = False, color = colors)
n, bins, patches
= 'x', labelrotation = 90)
ax.tick_params(axis
for class_, color in zip(ln, colors)})
ax.legend({class_:color
# add a title
"Stacked histogram of manufacturer colored by class")
ax.set_title(
# set an y label
"Frequency");
ax.set_ylabel( plt.grid()
A density plot is a representation of the distribution of a numeric variable. It uses a kernel density estimate to show the probability density function of the variable.
Using the same data as the plot above.
fig = plt.figure(figsize=(10, 8))

for cyl_ in df["cyl"].unique():
    # extract the data
    x = df[df["cyl"] == cyl_]["cty"]
    # plot the data using seaborn
    sns.kdeplot(x, fill=True, label="{} cyl".format(cyl_))

# set the title of the plot
plt.title("Density Plot of City Mileage by n_cylinders")

plt.legend()
plt.grid()

# More info:
# https://www.data-to-viz.com/graph/density.html
Add a density curve to a histogram by creating the histogram on a density scale and adding the curve as another layer. In seaborn, histplot(kde=True) does both in one call; a manual sketch of the layered approach follows the example below.
Using the same data as above.
fig = plt.figure(figsize=(10, 8))

for class_ in ["compact", "suv", "minivan"]:
    # extract the data
    x = df[df["class"] == class_]["cty"]
    # plot the data using seaborn
    sns.histplot(x, kde=True, label="{} class".format(class_))

# set the title of the plot
plt.title("Density Plot of City Mileage by vehicle type")

plt.legend()
plt.grid()

# More info:
# https://www.data-to-viz.com/graph/density.html
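For completeness, here is a minimal sketch of the manual layering described above, assuming scipy's gaussian_kde for the curve (scipy is already a dependency of this notebook):

from scipy.stats import gaussian_kde

x = df[df["class"] == "compact"]["cty"]

fig, ax = plt.subplots(figsize=(8, 5))

# histogram drawn on a density scale
ax.hist(x, bins=15, density=True, alpha=0.4)

# density curve computed separately and added as another layer
kde = gaussian_kde(x)
grid = np.linspace(x.min(), x.max(), 200)
ax.plot(grid, kde(grid), color="black")

ax.set_title("Histogram with a manually layered density curve")
plt.show()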
Joyplots are stacked, partially overlapping density plots. The code for JoyPy borrows from the code for KDEs in pandas.plotting, and uses a couple of utility functions.
Reusing the same data as the plot above.
# plot the data using joypy (joyplot creates its own figure)
fig, axes = joypy.joyplot(df,
                          column=['hwy', 'cty'],  # columns to be plotted
                          by="model",  # separate the data by this value; creates a separate distribution for each one
                          ylim='own',
                          figsize=(14, 10)
                          )

# add a title
plt.title('Joy Plot of City and Highway Mileage by Model', fontsize=18)
plt.grid()
This is a type of flow diagram that visualizes the transfer of quantities between different stages or categories.
import urllib.request
import json
import plotly.graph_objects as go

# Load data
url = 'https://raw.githubusercontent.com/plotly/plotly.js/master/test/image/mocks/sankey_energy.json'
response = urllib.request.urlopen(url)
data = json.loads(response.read())

# Create df from json data to present a gt table
# Extract the sankey data
sankey_data = data['data'][0]

# Separate node and link data
node_data = sankey_data['node']
link_data = sankey_data['link']

# Create a DataFrame from link data
df = pd.DataFrame({
    'source_index': link_data['source'],
    'target_index': link_data['target'],
    'value': link_data['value'],
    'link_label': link_data['label'],
    'link_color': link_data['color']
})

# Add corresponding labels and colors for source and target from node_data
df['source_label'] = [node_data['label'][idx] for idx in df['source_index']]
df['target_label'] = [node_data['label'][idx] for idx in df['target_index']]
df['source_color'] = [node_data['color'][idx] for idx in df['source_index']]
df['target_color'] = [node_data['color'][idx] for idx in df['target_index']]

process_csv_from_data_folder("Plotly Data", dataframe=df)
Random Sample of 10 Records from 'Plotly Data' | ||||||||
---|---|---|---|---|---|---|---|---|
Exploring Data | ||||||||
Source Index | Target Index | Value | Link Label | Link Color | Source Label | Target Label | Source Color | Target Color |
15.00 | 21.00 | 90.01 | rgba(0,0,96,0.2) | Electricity grid | Lighting & appliances - commercial | rgba(140, 86, 75, 0.8) | rgba(255, 127, 14, 0.8) | |
0.00 | 1.00 | 124.73 | stream 1 | rgba(0,0,96,0.2) | Agricultural 'waste' | Bio-conversion | rgba(31, 119, 180, 0.8) | rgba(255, 127, 14, 0.8) |
38.00 | 37.00 | 107.70 | rgba(0,0,96,0.2) | Oil reserves | Oil | rgba(188, 189, 34, 0.8) | rgba(127, 127, 127, 0.8) | |
1.00 | 5.00 | 81.14 | stream 1 | rgba(0,0,96,0.2) | Bio-conversion | Gas | rgba(255, 127, 14, 0.8) | rgba(140, 86, 75, 0.8) |
41.00 | 15.00 | 59.90 | rgba(0,0,96,0.2) | Solar PV | Electricity grid | rgba(255, 127, 14, 0.8) | rgba(140, 86, 75, 0.8) | |
15.00 | 19.00 | 4.41 | rgba(0,0,96,0.2) | Electricity grid | Agriculture | rgba(140, 86, 75, 0.8) | rgba(23, 190, 207, 0.8) | |
11.00 | 12.00 | 10.64 | rgba(0,0,96,0.2) | District heating | Industry | rgba(255, 127, 14, 0.8) | rgba(44, 160, 44, 0.8) | |
17.00 | 3.00 | 6.24 | rgba(0,0,96,0.2) | H2 conversion | Losses | rgba(127, 127, 127, 0.8) | rgba(214, 39, 40, 0.8) | |
35.00 | 26.00 | 500.00 | Old generation plant (made-up) | rgba(33,102,172,0.35) | Nuclear | Thermal generation | magenta | rgba(227, 119, 194, 0.8) |
11.00 | 14.00 | 46.18 | rgba(0,0,96,0.2) | District heating | Heating and cooling - homes | rgba(255, 127, 14, 0.8) | rgba(148, 103, 189, 0.8) | |
🔍 Data Exploration: Plotly Data | Sample Size: 10 Records |
# Extract node/link data for convenience
sankey_data = data['data'][0]
node_data = sankey_data['node']
link_data = sankey_data['link']

# Replace "magenta" with RGBA and apply opacity
node_data['color'] = [
    'rgba(255,0,255,0.8)' if c == "magenta" else c
    for c in node_data['color']
]

# Use node source colors for links, with lower opacity
link_data['color'] = [
    node_data['color'][src].replace("0.8", "0.4")
    for src in link_data['source']
]

fig = go.Figure(data=[go.Sankey(
    valueformat=".0f",
    valuesuffix="TWh",
    node=dict(
        pad=15,
        thickness=15,
        line=dict(color="black", width=0.5),
        label=node_data['label'],
        color=node_data['color']
    ),
    link=dict(
        source=link_data['source'],
        target=link_data['target'],
        value=link_data['value'],
        label=link_data['label'],
        color=link_data['color']
    )
)])

fig.update_layout(
    title_text=(
        "Energy forecast for 2050<br>"
        "Source: Department of Energy & Climate Change, Tom Counsell via "
        "<a href='https://bost.ocks.org/mike/sankey/'>Mike Bostock</a>"
    ),
    font_size=12,
    autosize=False,
    width=1000,
    height=800
)
fig.show()
A Dot Distribution Plot visualizes the data distribution across multiple categories by plotting dots along an axis. Each dot can represent a single data point or a count.
Reusing data that has been used before.
df = pd.read_csv('data/mpg_ggplot2.csv')
process_csv_from_data_folder("mpg_ggplot2.csv", dataframe=df)
Random Sample of 10 Records from 'mpg_ggplot2.csv' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Manufacturer | Model | Displ | Year | Cyl | Trans | Drv | Cty | Hwy | Fl |
dodge | ram 1500 pickup 4wd | 4.70 | 2,008.00 | 8.00 | manual(m6) | 4 | 9.00 | 12.00 | e |
toyota | toyota tacoma 4wd | 4.00 | 2,008.00 | 6.00 | auto(l5) | 4 | 16.00 | 20.00 | r |
toyota | camry | 2.20 | 1,999.00 | 4.00 | auto(l4) | f | 21.00 | 27.00 | r |
audi | a4 quattro | 2.00 | 2,008.00 | 4.00 | manual(m6) | 4 | 20.00 | 28.00 | p |
jeep | grand cherokee 4wd | 4.70 | 2,008.00 | 8.00 | auto(l5) | 4 | 14.00 | 19.00 | r |
hyundai | sonata | 2.40 | 1,999.00 | 4.00 | manual(m5) | f | 18.00 | 27.00 | r |
toyota | corolla | 1.80 | 2,008.00 | 4.00 | manual(m5) | f | 28.00 | 37.00 | r |
ford | mustang | 4.00 | 2,008.00 | 6.00 | auto(l5) | r | 16.00 | 24.00 | r |
volkswagen | jetta | 2.00 | 1,999.00 | 4.00 | manual(m5) | f | 21.00 | 29.00 | r |
audi | a6 quattro | 2.80 | 1,999.00 | 6.00 | auto(l5) | 4 | 15.00 | 24.00 | p |
💡 Additional columns not displayed: class | |||||||||
🔍 Data Exploration: mpg_ggplot2.csv | Sample Size: 10 Records |
"model", "cty"], inplace = True)
df.sort_values([= []
lc
= plt.figure(figsize = (12, 12))
fig = fig.add_subplot()
ax
# iterate over each car manufacturer
for i, car in enumerate(df["model"].unique()):
# prepare the data for plotting
# get x and y
= df[df["model"] == car]["cty"]
x = [car for i_ in range(len(x))]
y
# calculate the median value
= np.median(x)
x_median
# plot the data
= "white", edgecolor = "black", s = 30)
ax.scatter(x, y, c = "red", edgecolor = "black", s = 80)
ax.scatter(x_median, i, c
0, 40, linewidth = .1)
ax.hlines(i,
lc.append(car)
5, 40)
ax.set_xlim(-2, 38)
ax.set_ylim(
= "y", labelsize = 12)
ax.tick_params(axis
# set a title
"Distribution of City Mileage by Model", fontsize = 16)
ax.set_title(
= plt.plot([],[], marker = "o", ms = 10, ls = "", mec = None, color = 'firebrick', label = "Median")
red_patch
= red_patch, loc = 7, fontsize = 12)
plt.legend(handles
"right"].set_color("None")
ax.spines["left"].set_color("None")
ax.spines["top"].set_color("None");
ax.spines[
# More info:
# https://www.statisticshowto.com/what-is-a-dot-plot/
A boxplot is a standardized way of displaying the Interquartile Range (IQR) of a data set based on its five-number summary of data points: the “minimum,” first quartile, median, third quartile, and maximum. Boxplots are used to show distributions of numeric data values, especially when you want to compare them between multiple groups.
Reusing the same data as the plot above.
plt.figure(figsize=(12, 10), dpi=80)

ax = sns.boxplot(x="manufacturer", y="cty", data=df)

ax.tick_params(axis='x', labelrotation=90, labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# set the x and y labels
ax.set_xlabel("Manufacturer", fontsize=14)
ax.set_ylabel("CTY", fontsize=14)

# set a title
ax.set_title("Boxplot CTY vs Manufacturer", fontsize=14)

plt.grid()

# More info:
# https://en.wikipedia.org/wiki/Box_plot
A dot & box plot conveys similar information to a boxplot split into groups.
= pd.DataFrame({ "A":np.random.normal(0.8,0.2,20),
df "B":np.random.normal(0.8,0.1,20),
"C":np.random.normal(0.9,0.1,20)} )
"Made Up Data", dataframe=df) process_csv_from_data_folder(
Random Sample of 10 Records from 'Made Up Data' | ||
---|---|---|
Exploring Data | ||
A | B | C |
0.95 | 0.94 | 1.00 |
0.95 | 0.80 | 1.05 |
0.99 | 0.95 | 1.09 |
0.85 | 0.93 | 0.98 |
0.81 | 0.85 | 0.84 |
0.84 | 0.76 | 0.94 |
0.40 | 0.65 | 0.85 |
0.88 | 0.81 | 1.00 |
0.83 | 0.77 | 0.76 |
0.62 | 0.72 | 0.92 |
🔍 Data Exploration: Made Up Data | Sample Size: 10 Records |
# Create a boxplot
df.boxplot()

# Overlay points on the boxplot
for i, d in enumerate(df):
    y = df[d]
    x = np.random.normal(i + 1, 0.04, len(y))
    plt.plot(x, y, mfc=["orange", "blue", "yellow"][i], mec='k', ms=7, marker="o", linestyle="None")

# Add a horizontal line at y=1
plt.hlines(1, 0, 4, linestyle="--")

# Show the plot
plt.show()
Another example using data we have used before: mpg_ggplot2
df = pd.read_csv('data/mpg_ggplot2.csv')
process_csv_from_data_folder("mpg_ggplot2.csv", dataframe=df)
Random Sample of 10 Records from 'mpg_ggplot2.csv' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Manufacturer | Model | Displ | Year | Cyl | Trans | Drv | Cty | Hwy | Fl |
dodge | ram 1500 pickup 4wd | 4.70 | 2,008.00 | 8.00 | manual(m6) | 4 | 9.00 | 12.00 | e |
toyota | toyota tacoma 4wd | 4.00 | 2,008.00 | 6.00 | auto(l5) | 4 | 16.00 | 20.00 | r |
toyota | camry | 2.20 | 1,999.00 | 4.00 | auto(l4) | f | 21.00 | 27.00 | r |
audi | a4 quattro | 2.00 | 2,008.00 | 4.00 | manual(m6) | 4 | 20.00 | 28.00 | p |
jeep | grand cherokee 4wd | 4.70 | 2,008.00 | 8.00 | auto(l5) | 4 | 14.00 | 19.00 | r |
hyundai | sonata | 2.40 | 1,999.00 | 4.00 | manual(m5) | f | 18.00 | 27.00 | r |
toyota | corolla | 1.80 | 2,008.00 | 4.00 | manual(m5) | f | 28.00 | 37.00 | r |
ford | mustang | 4.00 | 2,008.00 | 6.00 | auto(l5) | r | 16.00 | 24.00 | r |
volkswagen | jetta | 2.00 | 1,999.00 | 4.00 | manual(m5) | f | 21.00 | 29.00 | r |
audi | a6 quattro | 2.80 | 1,999.00 | 6.00 | auto(l5) | 4 | 15.00 | 24.00 | p |
💡 Additional columns not displayed: class | |||||||||
🔍 Data Exploration: mpg_ggplot2.csv | Sample Size: 10 Records |
plt.figure(figsize=(12, 8), dpi=80)

# plot the data using seaborn;
# since we don't create a separate axes,
# everything is rendered on the same one
sns.boxplot(x="class", y="cty", data=df, hue="cyl")
sns.stripplot(x='class', y='cty', data=df, color='black', size=3, jitter=1)

ax = plt.gca()
# get the xticks to iterate over
xticks = ax.get_xticks()

for tick in xticks:
    ax.vlines(tick + 0.5, 0, np.max(df["cty"]), color="grey", alpha=.1)

# rotate the x and y ticks
ax.tick_params(axis='x', labelrotation=45, labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# add x and y labels
ax.set_xlabel("Class", fontsize=14)
ax.set_ylabel("CTY", fontsize=14)

# add a title and put the legend in a specific location
ax.set_title("Boxplot and stripplot on the same figure", fontsize=14)
ax.legend(loc="lower left", fontsize=14)

plt.grid()

# More info:
# https://en.wikipedia.org/wiki/Box_plot
# https://en.wikipedia.org/wiki/Dot_plot_(statistics)
A violin plot depicts distributions of numeric data for one or more groups using density curves. The width of each curve corresponds with the approximate frequency of data points in each region.
Using the same data as the plot above.
plt.figure(figsize = (12, 8), dpi = 80)

sns.violinplot(x = "manufacturer",
               y = "hwy",
               data = df,
               density_norm = 'width',
               inner = 'quartile'
               )

ax = plt.gca()

# get the xticks to iterate over
xticks = ax.get_xticks()

for tick in xticks:
    ax.vlines(tick + 0.5, 0, np.max(df["hwy"]), color = "grey", alpha = .1)

# rotate the x and y ticks
ax.tick_params(axis = 'x', labelrotation = 45, labelsize = 14)
ax.tick_params(axis = 'y', labelsize = 14)

# add x and y labels
ax.set_xlabel("Manufacturer", fontsize = 14)
ax.set_ylabel("HWY", fontsize = 14)

# set title
ax.set_title("Violin plot HWY vs manufacturer", fontsize = 18);

plt.grid()
# More info:
# https://en.wikipedia.org/wiki/Violin_plot
Population pyramids are graphical representations of the age-sex structure of a country or an area.
df = pd.DataFrame({'Age': ['0-4','5-9','10-14','15-19','20-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59','60-64','65-69','70-74','75-79','80-84','85-89','90-94','95-99','100+'],
                   'Male': [-49228000, -61283000, -64391000, -52437000, -42955000, -44667000, -31570000, -23887000, -22390000, -20971000, -17685000, -15450000, -13932000, -11020000, -7611000, -4653000, -1952000, -625000, -116000, -14000, -1000],
                   'Female': [52367000, 64959000, 67161000, 55388000, 45448000, 47129000, 33436000, 26710000, 25627000, 23612000, 20075000, 16368000, 14220000, 10125000, 5984000, 3131000, 1151000, 312000, 49000, 4000, 0]})

process_csv_from_data_folder("Made Up Data", dataframe=df)
Random Sample of 10 Records from 'Made Up Data' | ||
---|---|---|
Exploring Data | ||
Age | Male | Female |
0-4 | -49,228,000.00 | 52,367,000.00 |
85-89 | -625,000.00 | 312,000.00 |
75-79 | -4,653,000.00 | 3,131,000.00 |
5-9 | -61,283,000.00 | 64,959,000.00 |
40-44 | -22,390,000.00 | 25,627,000.00 |
25-29 | -44,667,000.00 | 47,129,000.00 |
55-59 | -15,450,000.00 | 16,368,000.00 |
15-19 | -52,437,000.00 | 55,388,000.00 |
90-94 | -116,000.00 | 49,000.00 |
80-84 | -1,952,000.00 | 1,151,000.00 |
🔍 Data Exploration: Made Up Data | Sample Size: 10 Records |
AgeClass = ['100+','95-99','90-94','85-89','80-84','75-79','70-74','65-69','60-64','55-59','50-54','45-49','40-44','35-39','30-34','25-29','20-24','15-19','10-14','5-9','0-4']

# Male counts are negative, so their bars extend to the left of zero
bar_plot = sns.barplot(x='Male', y='Age', data=df, order=AgeClass)
bar_plot = sns.barplot(x='Female', y='Age', data=df, order=AgeClass)

bar_plot.set(xlabel="Population (hundreds of millions)", ylabel="Age-Group", title = "Population Pyramid")
plt.grid()
If one of the main variables is categorical (divided into discrete groups), it may be helpful to use a more specialized approach to visualization. In seaborn, catplot() gives unified higher-level access to a number of axes-level functions for plotting categorical data in different ways.
df = pd.read_csv('data/train.csv')
process_csv_from_data_folder("train.csv", dataframe=df)
Random Sample of 10 Records from 'train.csv' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Passengerid | Survived | Pclass | Name | Sex | Age | Sibsp | Parch | Ticket | Fare |
710.00 | 1.00 | 3.00 | Moubarek, Master. Halim Gonios ("William George") | male | nan | 1.00 | 1.00 | 2661 | 15.25 |
440.00 | 0.00 | 2.00 | Kvillner, Mr. Johan Henrik Johannesson | male | 31.00 | 0.00 | 0.00 | C.A. 18723 | 10.50 |
841.00 | 0.00 | 3.00 | Alhomaki, Mr. Ilmari Rudolf | male | 20.00 | 0.00 | 0.00 | SOTON/O2 3101287 | 7.92 |
721.00 | 1.00 | 2.00 | Harper, Miss. Annie Jessie "Nina" | female | 6.00 | 0.00 | 1.00 | 248727 | 33.00 |
40.00 | 1.00 | 3.00 | Nicola-Yarred, Miss. Jamila | female | 14.00 | 1.00 | 0.00 | 2651 | 11.24 |
291.00 | 1.00 | 1.00 | Barber, Miss. Ellen "Nellie" | female | 26.00 | 0.00 | 0.00 | 19877 | 78.85 |
301.00 | 1.00 | 3.00 | Kelly, Miss. Anna Katherine "Annie Kate" | female | nan | 0.00 | 0.00 | 9234 | 7.75 |
334.00 | 0.00 | 3.00 | Vander Planke, Mr. Leo Edmondus | male | 16.00 | 2.00 | 0.00 | 345764 | 18.00 |
209.00 | 1.00 | 3.00 | Carr, Miss. Helen "Ellen" | female | 16.00 | 0.00 | 0.00 | 367231 | 7.75 |
137.00 | 1.00 | 1.00 | Newsom, Miss. Helen Monypeny | female | 19.00 | 0.00 | 2.00 | 11752 | 26.28 |
💡 Additional columns not displayed: Cabin, Embarked | |||||||||
🔍 Data Exploration: train.csv | Sample Size: 10 Records |
# https://www.geeksforgeeks.org/python-seaborn-catplot/
fig = plt.figure(figsize = (12, 6))   # note: catplot is figure-level and creates its own figure

ax = sns.catplot(x="Sex", y="Age",
                 data=df)

plt.grid()
# More info:
# https://seaborn.pydata.org/tutorial/categorical.html
A Waffle Chart is a gripping visualization technique that is normally created to display progress towards goals.
Using data we have used before.
df = pd.read_csv('data/mpg_ggplot2.csv')
process_csv_from_data_folder("mpg_ggplot2.csv", dataframe=df)
Random Sample of 10 Records from 'mpg_ggplot2.csv' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Manufacturer | Model | Displ | Year | Cyl | Trans | Drv | Cty | Hwy | Fl |
dodge | ram 1500 pickup 4wd | 4.70 | 2,008.00 | 8.00 | manual(m6) | 4 | 9.00 | 12.00 | e |
toyota | toyota tacoma 4wd | 4.00 | 2,008.00 | 6.00 | auto(l5) | 4 | 16.00 | 20.00 | r |
toyota | camry | 2.20 | 1,999.00 | 4.00 | auto(l4) | f | 21.00 | 27.00 | r |
audi | a4 quattro | 2.00 | 2,008.00 | 4.00 | manual(m6) | 4 | 20.00 | 28.00 | p |
jeep | grand cherokee 4wd | 4.70 | 2,008.00 | 8.00 | auto(l5) | 4 | 14.00 | 19.00 | r |
hyundai | sonata | 2.40 | 1,999.00 | 4.00 | manual(m5) | f | 18.00 | 27.00 | r |
toyota | corolla | 1.80 | 2,008.00 | 4.00 | manual(m5) | f | 28.00 | 37.00 | r |
ford | mustang | 4.00 | 2,008.00 | 6.00 | auto(l5) | r | 16.00 | 24.00 | r |
volkswagen | jetta | 2.00 | 1,999.00 | 4.00 | manual(m5) | f | 21.00 | 29.00 | r |
audi | a6 quattro | 2.80 | 1,999.00 | 6.00 | auto(l5) | 4 | 15.00 | 24.00 | p |
💡 Additional columns not displayed: class | |||||||||
🔍 Data Exploration: mpg_ggplot2.csv | Sample Size: 10 Records |
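The waffle itself can be drawn with pywaffle, imported at the top of this document. Here is a minimal sketch (not from the original notebook), assuming one square per vehicle in each class of the mpg data just loaded:

# a minimal sketch: waffle chart of mpg vehicle classes, one square per vehicle
d = df["class"].value_counts().to_dict()

fig = plt.figure(FigureClass = Waffle,
                 rows = 10,
                 values = d,
                 legend = {'loc': 'upper left', 'bbox_to_anchor': (1, 1)},
                 figsize = (12, 6))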
A Pie Chart is a circular statistical plot that can display only one series of data. The area of the chart is the total percentage of the given data.
Pie charts are typically to be avoided.
Using the same data as the plot above.
= df["manufacturer"].value_counts().to_dict()
d
= plt.figure(figsize = (18, 6))
fig = fig.add_subplot()
ax
# pass the values from our dictionary
ax.pie(d.values(), = d.keys(), # pass the labels from our dictonary
labels = '%1.1f%%', # specify the format to be plotted
autopct = {'fontsize': 10, 'color' : "white"} # change the font size and the color of the numbers inside the pie
textprops
)
# set the title
"Pie chart")
ax.set_title(
# set the legend and add a title to the legend
= "upper left", bbox_to_anchor = (1, 0, 0.5, 1), fontsize = 10, title = "Manufacturer");
ax.legend(loc
# More info:
# https://en.wikipedia.org/wiki/Pie_chart
A Treemap diagram is an appropriate type of visualization when the data set is structured in hierarchical order with a tree layout of roots, branches, and nodes. It allows us to show a large amount of information very efficiently in a limited space.
Reusing the same data as the plot above.
= df["manufacturer"].value_counts().to_dict()
label_value
= ["{} has {} obs".format(class_, obs) for class_, obs in label_value.items()]
labels
= [plt.cm.Spectral(i/float(len(labels))) for i in range(len(labels))]
colors
= (12, 10))
plt.figure(figsize
= label_value.values(), label = labels, color = colors, alpha = 0.8)
squarify.plot(sizes
# add a title to the plot
"Treemap using external libraries");
plt.title(
# More info:
# https://en.wikipedia.org/wiki/Treemapping
A bar plot or bar chart is a graph that represents categories of data with rectangular bars whose lengths are proportional to the values they represent. Bar plots can be drawn horizontally or vertically. A bar chart describes comparisons between discrete categories: one axis of the plot represents the specific categories being compared, while the other axis represents the measured values corresponding to those categories.
Reusing the same data as the plot above.
= df["class"].value_counts().to_dict()
d
# create n colors based on the number of labels we have
= [plt.cm.Spectral(i/float(len(d.keys()))) for i in range(len(d.keys()))]
colors
= plt.figure(figsize = (12, 8))
fig = fig.add_subplot()
ax
= colors)
ax.bar(d.keys(), d.values(), color
# iterate over every x and y
for i, (k, v) in enumerate(d.items()):
# where to put the text on the x coordinates
ax.text(k, + 1, # where to put the text on the y coordinates
v # value to text
v, = colors[i], # color corresponding to the bar
color = 10, # fontsize
fontsize = 'center', # center the text to be more pleasant
horizontalalignment = 'center'
verticalalignment
)
= 'x', labelrotation = 45, labelsize = 12)
ax.tick_params(axis = 'y', labelsize = 12)
ax.tick_params(axis
# set a title for the plot
"", fontsize = 14);
ax.set_title( plt.grid()
Time series data are data indexed by time; each point on the graph represents a measurement of both time and quantity. A time-series chart (aka a fever chart) connects the data points in chronological order with straight lines, forming a succession of peaks and troughs. The x-axis of the chart represents the time intervals, and the y-axis locates the values of the variable being monitored.
df = pd.read_csv('data/AirPassengers.csv')
process_csv_from_data_folder("AirPassengers.csv", dataframe=df)
Random Sample of 10 Records from 'AirPassengers.csv' | |
---|---|
Exploring Data | |
Date | Value |
1958-10-01 | 359.00 |
1950-08-01 | 170.00 |
1955-11-01 | 237.00 |
1957-02-01 | 301.00 |
1953-09-01 | 237.00 |
1950-01-01 | 115.00 |
1960-01-01 | 417.00 |
1954-06-01 | 264.00 |
1954-07-01 | 302.00 |
1950-07-01 | 170.00 |
🔍 Data Exploration: AirPassengers.csv | Sample Size: 10 Records |
def create_date_tick(df):
    '''
    Converts dates from this format: Timestamp('1949-01-01 00:00:00')
    To this format: 'Jan-1949'
    '''
    df["date"] = pd.to_datetime(df["date"])                      # convert to datetime
    df["month_name"] = df["date"].dt.month_name()                # extract the month name
    df["month_name"] = df["month_name"].apply(lambda x: x[:3])   # passes from January to Jan
    df["year"] = df["date"].dt.year                              # extract the year
    df["new_date"] = df["month_name"].astype(str) + "-" + df["year"].astype(str)  # concatenates Jan and year --> Jan-1949

# create the time column and the xticklabels column
create_date_tick(df)

# get the y values (the x is the index of the series)
y = df["value"]

# find local maximum INDEX using the scipy library
max_peaks_index, _ = find_peaks(y, height=0)

# find local minimum INDEX using the numpy library
doublediff2 = np.diff(np.sign(np.diff(-1*y)))
min_peaks_index = np.where(doublediff2 == -2)[0] + 1

fig = plt.figure(figsize = (12, 8))
ax = fig.add_subplot()

# plot the data using matplotlib
ax.plot(y, color = "blue", alpha = .5, label = "Air traffic")

# we have the index of max and min, so we must index the values in order to plot them
ax.scatter(x = y[max_peaks_index].index, y = y[max_peaks_index].values, marker = "^", s = 90, color = "green", alpha = .5, label = "Peaks")
ax.scatter(x = y[min_peaks_index].index, y = y[min_peaks_index].values, marker = "v", s = 90, color = "red", alpha = .5, label = "Troughs")

# iterate over some max and min in order to annotate the values
for max_annot, min_annot in zip(max_peaks_index[::3], min_peaks_index[1::5]):
    # extract the date to be plotted for max and min
    max_text = df.iloc[max_annot]["new_date"]
    min_text = df.iloc[min_annot]["new_date"]

    # add the text
    ax.text(df.index[max_annot], y[max_annot] + 50, s = max_text, fontsize = 8, horizontalalignment = 'center', verticalalignment = 'center')
    ax.text(df.index[min_annot], y[min_annot] - 50, s = min_text, fontsize = 8, horizontalalignment = 'center', verticalalignment = 'center')

# change the ylim
ax.set_ylim(0, 700)

# get the xticks and the xticks labels
xtick_location = df.index.tolist()[::6]
xtick_labels = df["new_date"].tolist()[::6]

# set the xticks to be every 6th entry (every 6 months)
ax.set_xticks(xtick_location)

ax.grid(alpha = .5)

# change the labels from '1949-01-01 00:00:00' to 'Jan-1949'
ax.set_xticklabels(xtick_labels, rotation=45, fontdict={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline'})

# change the size of the font of the x and y axis
ax.tick_params(axis = 'x', labelrotation = 45, labelsize = 12)
ax.tick_params(axis = 'y', labelsize = 12)

# set the title and the legend of the plot
ax.set_title("Air Passengers Traffic (1949 - 1969)", fontsize = 16)
ax.legend(loc = "upper left", fontsize = 10);
Time series decomposition is a statistical technique used to break down a time series into its individual components, such as trend, seasonality, cycles, and residuals.
Using the same data as the plot above.
def create_date_tick(df):
    '''
    Converts dates from this format: Timestamp('1949-01-01 00:00:00')
    To this format: 'Jan-1949'
    '''
    df["date"] = pd.to_datetime(df["date"])   # convert to datetime
    df.set_index("date", inplace = True)
    df["date"] = df.index
    df["month_name"] = df["date"].dt.month_name()                # extract the month name
    df["month_name"] = df["month_name"].apply(lambda x: x[:3])   # passes from January to Jan
    df["year"] = df["date"].dt.year                              # extract the year
    df["new_date"] = df["month_name"].astype(str) + "-" + df["year"].astype(str)  # concatenates Jan and year --> Jan-1949

create_date_tick(df)

result = seasonal_decompose(df["value"])

fig, axes = plt.subplots(ncols = 1, nrows = 4, sharex = True, figsize = (12, 10))

# https://stackoverflow.com/questions/45184055/how-to-plot-multiple-seasonal-decompose-plots-in-one-figure
# plot the original data
result.observed.plot(ax = axes[0], legend = False)
axes[0].set_ylabel('Observed')
axes[0].set_title("Decomposition of a series")

# plot the trend
result.trend.plot(ax = axes[1], legend = False)
axes[1].set_ylabel('Trend')

# plot the seasonal part
result.seasonal.plot(ax = axes[2], legend = False)
axes[2].set_ylabel('Seasonal')

# plot the residual
result.resid.plot(ax = axes[3], legend = False)
axes[3].set_ylabel('Residual')

xtick_location = df.index.tolist()[::6]
xtick_labels = df["new_date"].tolist()[::6]

# set the xticks on the bottom panel to be every 6th entry (every 6 months)
axes[3].set_xticks(xtick_location)

# change the labels from '1949-01-01 00:00:00' to 'Jan-1949'
axes[3].set_xticklabels(xtick_labels, rotation=90, fontdict={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline'});
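Note that seasonal_decompose defaults to an additive model. Because the seasonal swings in AirPassengers grow with the level of the series, a multiplicative model is arguably the better fit; the change is a single argument:

# a multiplicative decomposition of the same series
result = seasonal_decompose(df["value"], model = "multiplicative")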
We can use the plot() method to plot a line chart of multiple time series, provided that the indexes of all the DataFrames are aligned.
df = pd.read_csv('data/mortality.csv')
process_csv_from_data_folder("mortality.csv", dataframe=df)
Random Sample of 10 Records from 'mortality.csv' | ||
---|---|---|
Exploring Data | ||
Date | Mdeaths | Fdeaths |
May 1974 | 1,492.00 | 522.00 |
Mar 1979 | 1,846.00 | 727.00 |
Jul 1975 | 1,186.00 | 421.00 |
Jan 1974 | 2,134.00 | 901.00 |
May 1976 | 1,189.00 | 447.00 |
Mar 1978 | 1,942.00 | 737.00 |
Nov 1974 | 1,621.00 | 578.00 |
Nov 1976 | 1,467.00 | 546.00 |
Jan 1975 | 2,103.00 | 830.00 |
Jul 1978 | 1,098.00 | 431.00 |
🔍 Data Exploration: mortality.csv | Sample Size: 10 Records |
# set the date column to be the index
df.set_index("date", inplace = True)

fig = plt.figure(figsize = (10, 5))
ax = fig.add_subplot()

ax.plot(df["mdeaths"], color = "red", alpha = .5, label = "mdeaths")
ax.plot(df["fdeaths"], color = "blue", alpha = .5, label = "fdeaths")

# get the xticks and the xticks labels
xtick_location = df.index.tolist()[::6]
xtick_labels = df.index.tolist()[::6]

# set the xticks to be every 6th entry (every 6 months)
ax.set_xticks(xtick_location)
ax.set_xticklabels(xtick_labels, rotation=45, fontdict={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline'});

# change the x and y ticks to be smaller
ax.tick_params(axis = 'x', labelrotation = 45, labelsize = 12)
ax.tick_params(axis = 'y', labelsize = 12)

# add a legend, a title, and a grid to make it look nicer
ax.legend(loc = "upper left", fontsize = 10)
ax.set_title("Mdeaths and fdeaths over time", fontsize = 14)
ax.grid(axis = "y", alpha = .3)
# More info:
# https://study.com/academy/lesson/time-series-plots-definition-features.html
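As noted above, pandas can draw the same chart in a single call once the series share an index. A minimal sketch, assuming df still holds the mdeaths and fdeaths columns indexed by date:

# let pandas plot all aligned series at once
df[["mdeaths", "fdeaths"]].plot(figsize = (10, 5), alpha = .5,
                                title = "Mdeaths and fdeaths over time");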
Dual-axis time series charts make it possible to choose two vertical scales, so that the drawing on the page is equivalent to drawing two indexed series while retaining the meaningful mapping to the scale of the original variables.
Using data that we have seen before.
df = pd.read_csv('data/economics.csv')
process_csv_from_data_folder("economics.csv", dataframe=df)
Random Sample of 10 Records from 'economics.csv' | |||||
---|---|---|---|---|---|
Exploring Data | |||||
Date | Pce | Pop | Psavert | Uempmed | Unemploy |
2010-05-01 | 10,140.20 | 309,376.00 | 6.00 | 22.30 | 14,849.00 |
1973-05-01 | 843.10 | 211,577.00 | 12.80 | 4.90 | 4,329.00 |
1978-06-01 | 1,429.80 | 222,379.00 | 9.50 | 6.00 | 6,028.00 |
2002-09-01 | 7,426.10 | 288,618.00 | 4.90 | 9.50 | 8,251.00 |
2012-12-01 | 11,245.20 | 315,532.00 | 10.50 | 17.60 | 12,272.00 |
1994-04-01 | 4,690.70 | 262,631.00 | 5.80 | 9.10 | 8,331.00 |
1983-03-01 | 2,208.60 | 233,613.00 | 10.00 | 10.40 | 11,408.00 |
1969-12-01 | 623.70 | 203,675.00 | 11.70 | 4.60 | 2,884.00 |
1974-04-01 | 912.70 | 213,361.00 | 12.70 | 5.00 | 4,618.00 |
1993-05-01 | 4,441.30 | 259,680.00 | 7.70 | 8.10 | 9,149.00 |
🔍 Data Exploration: economics.csv | Sample Size: 10 Records |
# set the date column to be the index
df.set_index("date", inplace = True)

x_1 = df["psavert"]
x_2 = df["unemploy"]

fig = plt.figure(figsize = (14, 8))
ax = fig.add_subplot()

ax.plot(x_1, color = "red", alpha = .3, label = "Personal savings rate")

plt.legend(loc="lower right")

ax2 = ax.twinx()

ax2.plot(x_2, color = "blue", alpha = .3, label = "Unemployment rate")

plt.legend(loc="lower left")

xtick_location = df.index.tolist()[::12]
xtick_labels = df.index.tolist()[::12]

ax.set_xticks(xtick_location)
ax.set_xticklabels(xtick_labels, rotation = 90, fontdict={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline'});

# change the x and y ticks to be smaller for the main axis and for the secondary axis
ax.tick_params(axis = 'x', labelrotation = 90, labelsize = 10)
ax.tick_params(axis = 'y', labelsize = 12, colors='r')
ax2.tick_params(axis = 'y', labelsize = 12, colors='b')

# set a title and a grid
ax.set_title("Personal savings rate vs Unemployment rate: 2 axes", fontsize = 16)
ax.grid(axis = "y", alpha = .3)

plt.grid()
# More info:
# https://study.com/academy/lesson/time-series-plots-definition-features.html
Continuous error bands are a graphical representation of error or uncertainty as a shaded region around a main trace, rather than as discrete whisker-like error bars.
df = pd.read_csv('data/user_orders_hourofday.csv')
process_csv_from_data_folder("user_orders_hourofday.csv", dataframe=df)
Random Sample of 10 Records from 'user_orders_hourofday.csv' | ||
---|---|---|
Exploring Data | ||
User Id | Order Hour Of Day | Quantity |
110,283.00 | 17.00 | 4.00 |
186,203.00 | 23.00 | 1.00 |
176,972.00 | 18.00 | 6.00 |
189,725.00 | 16.00 | 12.00 |
27,310.00 | 6.00 | 4.00 |
186,783.00 | 16.00 | 3.00 |
79,861.00 | 14.00 | 24.00 |
180,139.00 | 15.00 | 19.00 |
8,815.00 | 14.00 | 11.00 |
34,197.00 | 20.00 | 12.00 |
🔍 Data Exploration: user_orders_hourofday.csv | Sample Size: 10 Records |
= df.groupby(["order_hour_of_day"])["quantity"].mean().to_frame()
gb_df
= gb_df["quantity"]
x = x*0.95
x_lower = x*1.05
x_upper
= plt.figure(figsize = (12, 8))
fig = fig.add_subplot()
ax
= "white", lw = 3)
ax.plot(x, color = "#bcbddc")
ax.plot(x_lower, color = "#bcbddc")
ax.plot(x_upper, color
= x > x_lower, facecolor='#bcbddc', interpolate = True)
ax.fill_between(x.index, x, x_lower, where = x_upper > x, facecolor='#bcbddc', interpolate = True)
ax.fill_between(x.index, x, x_upper, where
0, 25)
ax.set_ylim(
# set the x and y labels
"Hour of day")
ax.set_xlabel("# Orders")
ax.set_ylabel(
# get the xticks and the xticks labels
= gb_df.index.tolist()[::2]
xtick_location = gb_df.index.tolist()[::2]
xtick_labels
# set the xticks to be every 2'th entry
# every 2 months
ax.set_xticks(xtick_location)={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline', "fontsize":"12"})
ax.set_xticklabels(xtick_labels, fontdict
# change the x and y tick size
= 'x', labelsize = 12)
ax.tick_params(axis = 'y', labelsize = 12)
ax.tick_params(axis
# add a title and a gridline
"Mean orders +- 5% interval ", fontsize = 16)
ax.set_title(= "y", alpha = .5)
ax.grid(axis = "x", alpha = .5) ax.grid(axis
Here is another example, which looks nicer, brought to you directly by seaborn.
="darkgrid")
sns.set_theme(style
# Load an example dataset with long-form data
= sns.load_dataset("fmri")
fmri
# Plot the responses for different events and regions
="timepoint", y="signal",
sns.lineplot(x="region", style="event",
hue=fmri) data
A stacked area chart displays the evolution of a numeric variable for several groups of a dataset.
df = pd.read_csv('data/nightvisitors.csv')

# set the date as the index of the df
df.set_index("yearmon", inplace = True)
process_csv_from_data_folder("nightvisitors.csv", dataframe=df)
Random Sample of 10 Records from 'nightvisitors.csv' | |||||||
---|---|---|---|---|---|---|---|
Exploring Data | |||||||
Sydney | Nsw | Melbourne | Vic | Brisbanegc | Qld | Capitals | Other |
7,320.00 | 21,782.00 | 4,865.00 | 14,054.00 | 9,055.00 | 8,016.00 | 9,178.00 | 10,232.00 |
5,651.00 | 14,775.00 | 3,902.00 | 7,883.00 | 7,351.00 | 9,672.00 | 7,690.00 | 9,948.00 |
5,663.00 | 14,433.00 | 5,285.00 | 7,600.00 | 7,077.00 | 9,417.00 | 8,276.00 | 9,769.00 |
6,333.00 | 15,152.00 | 4,585.00 | 7,478.00 | 7,017.00 | 9,804.00 | 7,192.00 | 10,412.00 |
5,977.00 | 16,748.00 | 5,289.00 | 8,521.00 | 8,964.00 | 9,950.00 | 7,310.00 | 12,892.00 |
5,253.00 | 14,023.00 | 4,821.00 | 5,990.00 | 7,717.00 | 13,311.00 | 6,252.00 | 10,167.00 |
6,521.00 | 19,774.00 | 4,703.00 | 14,071.00 | 8,705.00 | 11,103.00 | 9,992.00 | 11,372.00 |
5,021.00 | 14,590.00 | 4,177.00 | 6,807.00 | 8,756.00 | 15,078.00 | 7,391.00 | 12,017.00 |
5,356.00 | 19,148.00 | 4,688.00 | 11,017.00 | 7,830.00 | 8,918.00 | 8,004.00 | 9,417.00 |
6,981.00 | 19,960.00 | 5,675.00 | 13,000.00 | 7,706.00 | 9,460.00 | 7,765.00 | 9,842.00 |
🔍 Data Exploration: nightvisitors.csv | Sample Size: 10 Records |
x = df.index
y = [df[col].values for col in df.columns]

labels = df.columns

# prepare some colors for each group to be plotted
colors = [plt.cm.Spectral(i/float(len(labels))) for i in range(len(labels))]

fig = plt.figure(figsize = (12, 10))
ax = fig.add_subplot()

ax.stackplot(x, y, labels = labels, colors = colors)

xtick_location = df.index.tolist()[::3]
xtick_labels = df.index.tolist()[::3]

ax.set_xticks(xtick_location)
ax.set_xticklabels(xtick_labels, fontdict={'horizontalalignment': 'center', 'verticalalignment': 'center_baseline', "fontsize":"12"})

ax.tick_params(axis = 'x', labelsize = 10, rotation = 45)
ax.tick_params(axis = 'y', labelsize = 10)

ax.set_xlabel("Date", fontsize = 12)
ax.set_ylabel("Visitors", fontsize = 12)

# change the ylim
ax.set_ylim(0, 90000)

# set a title and a legend
ax.set_title("Night visitors in Australian Regions", fontsize = 16)
ax.legend(fontsize=12);
ax.grid(axis = "y", alpha = .5)
ax.grid(axis = "x", alpha = .5)
An area plot displays quantitative data visually. Area plots are stacked by default. To produce an unstacked plot, pass stacked=False.
Returning to the economics data.
df = pd.read_csv('data/economics.csv')

# set the date as the index of the df
df.set_index("date", inplace = True)
process_csv_from_data_folder("economics.csv", dataframe=df)
Random Sample of 10 Records from 'economics.csv' | ||||
---|---|---|---|---|
Exploring Data | ||||
Pce | Pop | Psavert | Uempmed | Unemploy |
10,140.20 | 309,376.00 | 6.00 | 22.30 | 14,849.00 |
843.10 | 211,577.00 | 12.80 | 4.90 | 4,329.00 |
1,429.80 | 222,379.00 | 9.50 | 6.00 | 6,028.00 |
7,426.10 | 288,618.00 | 4.90 | 9.50 | 8,251.00 |
11,245.20 | 315,532.00 | 10.50 | 17.60 | 12,272.00 |
4,690.70 | 262,631.00 | 5.80 | 9.10 | 8,331.00 |
2,208.60 | 233,613.00 | 10.00 | 10.40 | 11,408.00 |
623.70 | 203,675.00 | 11.70 | 4.60 | 2,884.00 |
912.70 | 213,361.00 | 12.70 | 5.00 | 4,618.00 |
4,441.30 | 259,680.00 | 7.70 | 8.10 | 9,149.00 |
🔍 Data Exploration: economics.csv | Sample Size: 10 Records |
= df["psavert"]
x = df["uempmed"]
y
= plt.figure(figsize = (14, 8))
fig = fig.add_subplot()
ax
= "blue", alpha = .3, label = "Personal savings rate")
ax.plot(x, color = "red", alpha = .3, label = "Unemployment rate")
ax.plot(y, color
# fill the areas between the plots and the x axis
# this can create overlapping areas between lines
0, x, color = "blue", alpha = .2)
ax.fill_between(x.index, 0, y, color = "red", alpha = .2)
ax.fill_between(x.index,
# set the title
"Personal savings rate vs Unemployed rate", fontsize = 16)
ax.set_title(
= df.index.tolist()[::12]
xtick_location = df.index.tolist()[::12]
xtick_labels
ax.set_xticks(xtick_location)= 90, fontdict = {'horizontalalignment': 'center', 'verticalalignment': 'center_baseline'})
ax.set_xticklabels(xtick_labels, rotation
= 'x', labelrotation = 90, labelsize = 12)
ax.tick_params(axis = 'y', labelsize = 12)
ax.tick_params(axis
"right"].set_color("None")
ax.spines["top"].set_color("None")
ax.spines[
= 10)
ax.legend(fontsize = "y", alpha = .3);
ax.grid(axis = "x", alpha = .3); ax.grid(axis
Import calmap to create calendar heatmaps from Pandas time series data. For illustration purposes, create 500 events as random float values assigned to random days over a 700-day period.
all_days = pd.date_range('1/15/2022', periods=700, freq='D')
days = np.random.choice(all_days, 500)
events = pd.Series(np.random.randn(len(days)), index=days)
print(events)
2022-06-06 0.601210
2023-08-25 0.284106
2022-10-27 0.713959
2022-07-04 0.471265
2023-08-02 1.000837
...
2023-11-07 0.854588
2022-05-09 -0.522025
2023-01-04 -0.059316
2023-06-30 1.464429
2022-04-28 1.543473
Length: 500, dtype: float64
plt.figure(figsize=(10,8))
calmap.yearplot(events, year=2023)
calmap.calendarplot(events, monthticks=3, daylabels='MTWTFSS',
                    dayticks=[0, 2, 4, 6], cmap='YlGn',
                    fillcolor='grey', linewidth=0,
                    fig_kws=dict(figsize=(8, 4)))
The seasonal plot can be used to compare how the time series performed at the same point in previous seasons (year / month / week, etc.).
df = pd.read_csv('data/AirPassengers.csv')
process_csv_from_data_folder("AirPassengers.csv", dataframe=df)
Random Sample of 10 Records from 'AirPassengers.csv' | |
---|---|
Exploring Data | |
Date | Value |
1958-10-01 | 359.00 |
1950-08-01 | 170.00 |
1955-11-01 | 237.00 |
1957-02-01 | 301.00 |
1953-09-01 | 237.00 |
1950-01-01 | 115.00 |
1960-01-01 | 417.00 |
1954-06-01 | 264.00 |
1954-07-01 | 302.00 |
1950-07-01 | 170.00 |
🔍 Data Exploration: AirPassengers.csv | Sample Size: 10 Records |
index_ = [i for i in range(1, 13)]*12

# set the index into the dataframe
df["index_"] = index_

# create a dictionary with the month names
months_ = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
d = {k: v for k, v in zip(index_[:12], months_)}

# convert the date column to datetime
df["date"] = pd.to_datetime(df["date"])

# extract the year using pandas datetime (dt)
df["year"] = df["date"].dt.year

# drop the date
df.drop("date", inplace = True, axis = 1)

# create a pivot table
# transpose the rows into columns, where the column names are the years to plot
df = df.pivot(values = "value", columns = "year", index = "index_")

# create n colors, one for each season
colors = [plt.cm.gist_earth(i/float(len(df.columns))) for i in range(len(df.columns))]

x = df.index

fig = plt.figure(figsize = (12, 6))
ax = fig.add_subplot()

for col, color in zip(df.columns, colors):
    # get the y to plot
    y = df[col]

    # plot the data using matplotlib
    ax.plot(x, y, label = col, c = color)

    # get the x and y to annotate
    x_annotate = x[-1]
    y_annotate = df.iloc[11][col]

    ax.text(x_annotate + 0.3, y_annotate, col, fontsize = 8, c = color)

ax.set_xlabel("Months", fontsize = 13)
ax.set_ylabel("Air traffic", fontsize = 13)

# extract the x ticks location
xtick_location = df.index.tolist()

months = [d[tick] for tick in xtick_location]

ax.set_xticks(xtick_location)
ax.set_xticklabels(months, rotation = 45, fontdict = {'horizontalalignment': 'center', 'verticalalignment': 'center_baseline', "fontsize":"12"})

ax.tick_params(axis = 'y', labelsize = 12)

ax.set_ylim(0, 700)

ax.spines["right"].set_color("None")
ax.spines["top"].set_color("None")

ax.grid(axis = "y", alpha = .7)
ax.grid(axis = "x", alpha = .7)

# set the title for the plot
ax.set_title("Monthly seasonal plot of air traffic (1949 - 1969)", fontsize = 15);
The dendrogram illustrates how each cluster is composed by drawing a U-shaped link between a non-singleton cluster and its children. The top of the U-link indicates a cluster merge. The two legs of the U-link indicate which clusters were merged. The length of the two legs of the U-link represents the distance between the child clusters.
df = pd.read_csv('data/USArrests.csv')
process_csv_from_data_folder("USArrests.csv", dataframe=df)
Random Sample of 10 Records from 'USArrests.csv' | ||||
---|---|---|---|---|
Exploring Data | ||||
Murder | Assault | Urbanpop | Rape | State |
7.20 | 113.00 | 65.00 | 21.00 | Indiana |
14.40 | 279.00 | 48.00 | 22.50 | South Carolina |
11.40 | 285.00 | 70.00 | 32.10 | New Mexico |
8.50 | 156.00 | 63.00 | 20.70 | Virginia |
15.40 | 249.00 | 66.00 | 22.20 | Louisiana |
2.60 | 53.00 | 66.00 | 10.80 | Wisconsin |
4.30 | 102.00 | 62.00 | 16.50 | Nebraska |
6.00 | 109.00 | 53.00 | 16.40 | Montana |
13.00 | 337.00 | 45.00 | 16.10 | North Carolina |
11.30 | 300.00 | 67.00 | 27.80 | Maryland |
🔍 Data Exploration: USArrests.csv | Sample Size: 10 Records |
fig = plt.figure(figsize = (14, 7))

# plot the data using the scipy package
dend = shc.dendrogram(shc.linkage(df[['Murder', 'Assault', 'UrbanPop', 'Rape']], method = 'ward'),
                      labels = df["State"].values,
                      color_threshold = 100)

ax = plt.gca()

ax.set_xlabel("County level")
ax.set_ylabel("# of incidents")

ax.tick_params("x", labelsize = 10)
ax.tick_params("y", labelsize = 10)

ax.grid(axis = "y", alpha = .7)
ax.grid(axis = "x", alpha = .7)

# set a title
ax.set_title("US Arrests dendrograms");
There are different ways to label a scatter plot with different groups (or clusters) of data points using the Python packages matplotlib and seaborn. These labeling methods are useful to represent the results of clustering algorithms, such as K-means clustering.
from sklearn.datasets import load_iris
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import itertools
iris = load_iris()
X = iris['data']
print(X[:6])
[[5.1 3.5 1.4 0.2]
[4.9 3. 1.4 0.2]
[4.7 3.2 1.3 0.2]
[4.6 3.1 1.5 0.2]
[5. 3.6 1.4 0.2]
[5.4 3.9 1.7 0.4]]
# get flower families
labels = iris['target']
nclusters = np.unique(labels).size

# scale flower data
scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# instantiate k-means
seed = 0
km = KMeans(n_clusters=nclusters, random_state=seed)
km.fit(X_scaled)

# predict the cluster for each data point
y_cluster_kmeans = km.predict(X_scaled)

# Compute PCA of data set
pca = PCA(n_components=X.shape[1], random_state=seed)
pca.fit(X_scaled)
X_pca_array = pca.transform(X_scaled)
X_pca = pd.DataFrame(X_pca_array, columns=['PC%i' % (ii + 1) for ii in range(X_pca_array.shape[1])])  # PC = principal component

# decide which prediction labels to associate with observed labels
# - search each possible way of transforming observed labels
# - identify approach with maximum agreement
MAX = 0
for ii in itertools.permutations([kk for kk in range(np.unique(y_cluster_kmeans).size)]):
    change = {jj: ii[jj] for jj in range(len(ii))}

    changedPredictions = np.ones(y_cluster_kmeans.size) * -99
    for jj in range(len(ii)):
        changedPredictions[y_cluster_kmeans == jj] = change[jj]

    successful = np.sum(labels == changedPredictions)
    if successful > MAX:
        MAX = successful
        bestChange = change

# transform predictions to match observations
changedPredictions = np.ones(y_cluster_kmeans.size) * -99
for jj in range(len(ii)):
    changedPredictions[y_cluster_kmeans == jj] = bestChange[jj]

# plot clusters for observations and predictions
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
ax[0].scatter(X_pca['PC1'], X_pca['PC2'], c=changedPredictions)
ax[1].scatter(X_pca['PC1'], X_pca['PC2'], c=labels)
ax[0].set_title('Prediction')
ax[1].set_title('Truth')
ax[0].set_facecolor("green")
ax[1].set_facecolor("blue")
Given the complexity of this visualization, here is another example, using data we have seen before.
df = pd.read_csv('data/USArrests.csv')
process_csv_from_data_folder("USArrests.csv", dataframe=df)
Random Sample of 10 Records from 'USArrests.csv' | ||||
---|---|---|---|---|
Exploring Data | ||||
Murder | Assault | Urbanpop | Rape | State |
7.20 | 113.00 | 65.00 | 21.00 | Indiana |
14.40 | 279.00 | 48.00 | 22.50 | South Carolina |
11.40 | 285.00 | 70.00 | 32.10 | New Mexico |
8.50 | 156.00 | 63.00 | 20.70 | Virginia |
15.40 | 249.00 | 66.00 | 22.20 | Louisiana |
2.60 | 53.00 | 66.00 | 10.80 | Wisconsin |
4.30 | 102.00 | 62.00 | 16.50 | Nebraska |
6.00 | 109.00 | 53.00 | 16.40 | Montana |
13.00 | 337.00 | 45.00 | 16.10 | North Carolina |
11.30 | 300.00 | 67.00 | 27.80 | Maryland |
🔍 Data Exploration: USArrests.csv | Sample Size: 10 Records |
= df["Murder"]
x = df["Assault"]
y
# Create out cluster using the AgglomerativeClustering from sklearn
#https://scikit-learn.org/dev/modules/generated/sklearn.cluster.AgglomerativeClustering.html
= AgglomerativeClustering(n_clusters = 5, # notice that we specify the number of "optimal" clusters
cluster ='euclidean', # use the euclidean distance to compute similarity. The closer the better.
metric= 'ward'
linkage
)
# fit and predict the clusters based on this data
'Murder', 'Assault', 'UrbanPop', 'Rape']])
cluster.fit_predict(df[[
= plt.figure(figsize = (12, 10))
fig = fig.add_subplot()
ax
ax.scatter(x, y)
# Encircle
def encircle(x,y, ax = None, **kw):
'''
Takes an axes and the x and y and draws a polygon on the axes.
This code separates the differents clusters
'''
# get the axis if not passed
if not ax: ax=plt.gca()
# concatenate the x and y arrays
= np.c_[x,y]
p
# to calculate the limits of the polygon
= ConvexHull(p)
hull
# create a polygon from the hull vertices
= plt.Polygon(p[hull.vertices,:], **kw)
poly
# add the patch to the axes
ax.add_patch(poly)
# use our cluster fitted before to draw the clusters borders like we did at the beginning of the kernel
# basically go over each cluster and add a patch to the axes
== 0, 'Murder'], df.loc[cluster.labels_ == 0, 'Assault'], ec = "k", fc = "gold", alpha = 0.2, linewidth = 0)
encircle(df.loc[cluster.labels_ == 1, 'Murder'], df.loc[cluster.labels_ == 1, 'Assault'], ec = "k", fc = "tab:blue", alpha = 0.2, linewidth = 0)
encircle(df.loc[cluster.labels_ == 2, 'Murder'], df.loc[cluster.labels_ == 2, 'Assault'], ec = "k", fc = "tab:red", alpha = 0.2, linewidth = 0)
encircle(df.loc[cluster.labels_ == 3, 'Murder'], df.loc[cluster.labels_ == 3, 'Assault'], ec = "k", fc = "tab:green", alpha = 0.2, linewidth = 0)
encircle(df.loc[cluster.labels_ == 4, 'Murder'], df.loc[cluster.labels_ == 4, 'Assault'], ec = "k", fc = "tab:orange", alpha = 0.2, linewidth = 0)
encircle(df.loc[cluster.labels_
"x", labelsize = 10)
ax.tick_params("y", labelsize = 10)
ax.tick_params(
"Murder", fontsize = 12)
ax.set_xlabel("Assault", fontsize = 12)
ax.set_ylabel(
= "y", alpha = .7)
ax.grid(axis = "x", alpha = .7)
ax.grid(axis
# set a title for the plot
"Agglomerative clustering of US arrests (5 Groups)", fontsize = 14);
ax.set_title(
# More info:
# https://en.wikipedia.org/wiki/Cluster_analysis
Andrews curves are used for visualizing high-dimensional data by mapping each observation onto a function. The mapping preserves means, distances, and variances. Andrews curves can be plotted with the andrews_curves() method of the pandas plotting module.
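Concretely, an observation (x1, x2, x3, x4) is mapped to the finite Fourier series f(t) = x1/√2 + x2·sin(t) + x3·cos(t) + x4·sin(2t) over -π ≤ t ≤ π. A minimal sketch of that mapping for a single made-up observation:

# the Andrews mapping for one 4-dimensional observation
t = np.linspace(-np.pi, np.pi, 200)
x1, x2, x3, x4 = 6.1, 2.8, 4.7, 1.2   # made-up measurements for illustration
f_t = x1/np.sqrt(2) + x2*np.sin(t) + x3*np.cos(t) + x4*np.sin(2*t)

plt.plot(t, f_t)
plt.title("Andrews curve for a single observation");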
df = pd.read_csv('data/iris2.csv')
process_csv_from_data_folder("Iris.csv", dataframe=df)
Random Sample of 10 Records from 'Iris.csv' | ||||
---|---|---|---|---|
Exploring Data | ||||
Sepallength | Sepalwidth | Petallength | Petalwidth | Name |
6.10 | 2.80 | 4.70 | 1.20 | Iris-versicolor |
5.70 | 3.80 | 1.70 | 0.30 | Iris-setosa |
7.70 | 2.60 | 6.90 | 2.30 | Iris-virginica |
6.00 | 2.90 | 4.50 | 1.50 | Iris-versicolor |
6.80 | 2.80 | 4.80 | 1.40 | Iris-versicolor |
5.40 | 3.40 | 1.50 | 0.40 | Iris-setosa |
5.60 | 2.90 | 3.60 | 1.30 | Iris-versicolor |
6.90 | 3.10 | 5.10 | 2.30 | Iris-virginica |
6.20 | 2.20 | 4.50 | 1.50 | Iris-versicolor |
5.80 | 2.70 | 3.90 | 1.20 | Iris-versicolor |
🔍 Data Exploration: Iris.csv | Sample Size: 10 Records |
# Creating Andrews curves
x = pd.plotting.andrews_curves(df, 'Name')

# plotting the curve
x.plot()

# Display
plt.grid()
plt.show()
1–3D data can be viewed relatively straightforwardly using traditional plot types. Four or more dimensions, though, become increasingly difficult to display. Fortunately, parallel coordinate plots provide a mechanism for viewing results with higher dimensions.
Reusing the same data as the previous plot.
pd.plotting.parallel_coordinates(df, 'Name', color=('#556270', '#4ECDC4', '#C7F464'))
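A word cloud depicts the most frequent words in a body of text, with each word sized by how often it occurs. The example below builds one from the descriptions in a YouTube dataset.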
# import all necessary modules
from wordcloud import WordCloud, STOPWORDS

df = pd.read_csv('data/youtube.csv', encoding='latin-1')
process_csv_from_data_folder("Data from Kaggle", dataframe=df)
Random Sample of 10 Records from 'Data from Kaggle' | |||
---|---|---|---|
Exploring Data | |||
Link | Title | Description | Category |
174YLL | Amazing Indian food at Namaste in Miami!! Indian food reaction!! | Follow Me, I'm a Foodie - India 13K subscribers SUBSCRIBE In this episode, we vi... | food |
KJQkleg0QTQ | Greatest Indian Food Videos Compilation | Indian Food Preparations | Cooking Vid... | Crazy For Indian Food 500K subscribers SUBSCRIBE Indian food is colorful, intere... | food |
7iY3I6e5zNI | Pop Music 2021(2021 New Song) - Pop Hits 2021 New Popular Songs - Best English S... | Top Hits Music 408K subscribers SUBSCRIBE Pop Music 2021(2021 New Song) - Pop Hi... | art_music |
NGq3jj_bZy4 | New Hindi Song 2021 April 💖 Top Bollywood Romantic Love Songs 2021 💖 Best ... | Bollywood Hits Songs SUBSCRIBE New Hindi Song 2021 April 💖 Top Bollywood Roma... | art_music |
Md8e0VvU | Travel Blog से $400 महीना कमाने का Plan | Micro Nich... | Become Blogger 15.3K subscribers SUBSCRIBE Hello Bloggers, In this super energet... | travel |
xFH8DLqTQEA | AP European History Unit 4: Scientific, Philosophical, and Political Development... | Marco Learning 20.9K subscribers SUBSCRIBE Download our free AP European History... | history |
gdZLi9oWNZg | BTS (방탄소년단) 'Dynamite' Official MV | HYBE LABELS 57.1M subscribers SUBSCRIBE BTS (방탄소년단) 'Dynamite' Officia... | art_music |
S3Fz6bPu11 | Top things to do in Kerala! India Travel Vlog | Alex Outhwaite 152K subscribers SUBSCRIBE Top things to do in Kerala! If you w... | travel |
JFcgOboQZ08 | DILBAR Lyrical | Satyameva Jayate |John Abraham, Nora Fatehi,Tanishk B, Neha Kak... | T-Series 184M subscribers SUBSCRIBE Gulshan Kumar and T-Series in association wi... | art_music |
zFHPC4x8wk0 | Maroon 5, Ed Sheeran, Adele, Taylor Swift, Lady Gaga - english songs | Best Pop ... | Top Hits Music 1.56K subscribers SUBSCRIBE Maroon 5, Ed Sheeran, Adele, Taylor S... | art_music |
🔍 Data Exploration: Data from Kaggle | Sample Size: 10 Records |
# set STOPWORDS
comment_words = ""
stopwords = set(STOPWORDS)

# iterate through the csv file
for val in df.description:
    val = str(val)

    # split the value
    tokens = val.split()

    # convert each token into lowercase
    for i in range(len(tokens)):
        tokens[i] = tokens[i].lower()

    comment_words += " ".join(tokens) + " "

wordcloud = WordCloud(width = 800, height = 800, background_color = 'white', stopwords = stopwords,
                      min_font_size = 10).generate(comment_words)

# and plot the WordCloud image
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad = 0)

#plt.savefig('youtubewordcloudsongs.png')
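A contour map represents a three-dimensional surface in two dimensions by drawing lines of constant value; filled contours add a color gradient for the height. The example below contours a surface built from several Gaussian peaks.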
# Create a grid of x and y values
x = np.linspace(-5, 5, 200)
y = np.linspace(-5, 5, 200)
X, Y = np.meshgrid(x, y)

# Define a custom function that combines several Gaussian peaks
def multi_gaussian(X, Y):
    # Centers and heights of Gaussian peaks
    peaks = [
        {'x0': -2, 'y0': 1, 'amp': 2, 'sigma': 1.0},
        {'x0': 1, 'y0': -1, 'amp': 1.5, 'sigma': 1.2},
        {'x0': 0, 'y0': 0, 'amp': 2.5, 'sigma': 0.5},
        {'x0': 3, 'y0': 2, 'amp': 1, 'sigma': 1.5}
    ]
    Z = np.zeros_like(X)
    for peak in peaks:
        Z += peak['amp'] * np.exp(-((X - peak['x0'])**2 + (Y - peak['y0'])**2) / (2 * peak['sigma']**2))
    return Z

Z = multi_gaussian(X, Y)

# Create the contour plot
fig, ax = plt.subplots(figsize=(8, 6))

# Filled contour for a smooth color gradient
contourf = ax.contourf(X, Y, Z, levels=20, cmap='viridis')
# Line contours for more definition
contour = ax.contour(X, Y, Z, levels=10, colors='black', linewidths=0.5)

# Add a colorbar to show the "height" values
cbar = plt.colorbar(contourf, ax=ax)
cbar.set_label('Function Value')

# Add labels and title
ax.set_title('2D Contour Map of Multiple Gaussian Peaks')
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')

plt.show()
Sunburst charts can be used to display any kind of hierarchical or multi-level data.
import plotly.express as px

df = px.data.tips()
process_csv_from_data_folder("Tips Data from Plotly Express", dataframe=df)
Random Sample of 10 Records from 'Tips Data from Plotly Express' | ||||||
---|---|---|---|---|---|---|
Exploring Data | ||||||
Total Bill | Tip | Sex | Smoker | Day | Time | Size |
19.82 | 3.18 | Male | No | Sat | Dinner | 2.00 |
8.77 | 2.00 | Male | No | Sun | Dinner | 2.00 |
24.55 | 2.00 | Male | No | Sun | Dinner | 4.00 |
25.89 | 5.16 | Male | Yes | Sat | Dinner | 4.00 |
13.00 | 2.00 | Female | Yes | Thur | Lunch | 2.00 |
17.89 | 2.00 | Male | Yes | Sun | Dinner | 2.00 |
28.44 | 2.56 | Male | Yes | Thur | Lunch | 2.00 |
12.48 | 2.52 | Female | No | Thur | Lunch | 2.00 |
14.78 | 3.23 | Male | No | Sun | Dinner | 2.00 |
15.38 | 3.00 | Female | Yes | Fri | Dinner | 2.00 |
🔍 Data Exploration: Tips Data from Plotly Express | Sample Size: 10 Records |
df = px.data.tips()
fig = px.sunburst(df, path=['sex', 'day', 'time'], values='total_bill', color='day')
fig.show()
df = px.data.gapminder().query("year == 2007")

process_csv_from_data_folder("Gapminder Data from Plotly Express", dataframe=df)
Random Sample of 10 Records from 'Gapminder Data from Plotly Express' | |||||||
---|---|---|---|---|---|---|---|
Exploring Data | |||||||
Country | Continent | Year | Lifeexp | Pop | Gdppercap | Iso Alpha | Iso Num |
Turkey | Europe | 2,007.00 | 71.78 | 71,158,647.00 | 8,458.28 | TUR | 792.00 |
Cameroon | Africa | 2,007.00 | 50.43 | 17,696,293.00 | 2,042.10 | CMR | 120.00 |
Mauritius | Africa | 2,007.00 | 72.80 | 1,250,882.00 | 10,956.99 | MUS | 480.00 |
Oman | Asia | 2,007.00 | 75.64 | 3,204,897.00 | 22,316.19 | OMN | 512.00 |
Hungary | Europe | 2,007.00 | 73.34 | 9,956,108.00 | 18,008.94 | HUN | 348.00 |
Bosnia and Herzegovina | Europe | 2,007.00 | 74.85 | 4,552,198.00 | 7,446.30 | BIH | 70.00 |
Panama | Americas | 2,007.00 | 75.54 | 3,242,173.00 | 9,809.19 | PAN | 591.00 |
Jamaica | Americas | 2,007.00 | 72.57 | 2,780,132.00 | 7,320.88 | JAM | 388.00 |
Japan | Asia | 2,007.00 | 82.60 | 127,467,972.00 | 31,656.07 | JPN | 392.00 |
Cambodia | Asia | 2,007.00 | 59.72 | 14,131,858.00 | 1,713.78 | KHM | 116.00 |
🔍 Data Exploration: Gapminder Data from Plotly Express | Sample Size: 10 Records |
fig = px.sunburst(df, path=['continent', 'country'], values='pop',
                  color='lifeExp', hover_data=['iso_alpha'],
                  color_continuous_scale='RdBu',
                  color_continuous_midpoint=np.average(df['lifeExp'], weights=df['pop']))
fig.show()
Rose diagrams, also known as rose charts or wind roses, are circular statistical representations used primarily to display directional data. They consist of radiating spokes that represent different directions, typically based on compass points (North, East, South, West, and their intermediates).
df = pd.read_csv('data/vgsales.csv')
process_csv_from_data_folder("Video Games Sales Data from Kaggle", dataframe=df)
Random Sample of 10 Records from 'Video Games Sales Data from Kaggle' | |||||||||
---|---|---|---|---|---|---|---|---|---|
Exploring Data | |||||||||
Rank | Name | Platform | Year | Genre | Publisher | Na Sales | Eu Sales | Jp Sales | Other Sales |
8,930.00 | F1 2012 | PC | 2,012.00 | Racing | Codemasters | 0.01 | 0.11 | 0.00 | 0.03 |
4,791.00 | Transformers: The Game (XBox 360, PS2, PS3, Wii & PC Versions) | PS3 | 2,007.00 | Action | Activision | 0.32 | 0.04 | 0.01 | 0.04 |
15,495.00 | Commandos 3: Destination Berlin | PC | 2,003.00 | Strategy | Eidos Interactive | 0.00 | 0.02 | 0.00 | 0.00 |
14,770.00 | The Sims 2: Bon Voyage | PC | 2,007.00 | Simulation | Electronic Arts | 0.01 | 0.01 | 0.00 | 0.00 |
5,213.00 | Guitar Hero: Smash Hits | PS3 | 2,009.00 | Misc | Activision | 0.20 | 0.11 | 0.00 | 0.05 |
722.00 | Sonic Advance | GBA | 2,001.00 | Platform | Sega | 1.19 | 0.71 | 0.22 | 0.13 |
4,920.00 | Red Faction: Armageddon | X360 | 2,011.00 | Shooter | THQ | 0.18 | 0.17 | 0.01 | 0.04 |
3,109.00 | Real Heroes: Firefighter | Wii | 2,009.00 | Action | Rondomedia | 0.56 | 0.04 | 0.00 | 0.05 |
7,418.00 | WinBack: Covert Operations | N64 | 1,999.00 | Shooter | Virgin Interactive | 0.17 | 0.04 | 0.00 | 0.00 |
4,449.00 | Gundam SEED: Federation vs. Z.A.F.T. | PS2 | 2,005.00 | Shooter | Namco Bandai Games | 0.00 | 0.00 | 0.44 | 0.00 |
💡 Additional columns not displayed: Global_Sales | |||||||||
🔍 Data Exploration: Video Games Sales Data from Kaggle | Sample Size: 10 Records |
colors = ['#91DCEA', '#64CDCC', '#5FBB68',
          '#F9D23C', '#F9A729', '#FD6F30', 'grey', 'red', 'blue', 'cyan']

platform_freq = df['Platform'].value_counts()
#print('5 most frequent platforms:\n', platform_freq.iloc[0:5])

plt.figure(figsize=(7, 7))
values = platform_freq.iloc[0:5]
indexes = values.index
handles = [plt.Rectangle((0, 0), 1, 1, color=color, alpha=0.5) for color in colors]

ax = plt.subplot(111, polar=True)
height = values
width = 1.5*np.pi/(len(values))
angles = [(2/1.5)*element*width for element in range(5)]

bars = ax.bar(x=angles, height=height, width=width, bottom=0, linewidth=2,
              edgecolor='black', color=colors, alpha=0.5)
ax.bar_label(ax.containers[0], padding=5)

ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_theta_zero_location('N')
ax.set_title('5 most frequent platforms in the dataset')
ax.legend(handles, indexes, loc='best')
#plt.savefig('rosetop5platforms.png')
Radar charts, also known as spider charts or web charts, are two-dimensional graphical tools used to display multivariate data across three or more quantitative variables. They consist of a series of radial axes extending from a central point, with each axis representing a different variable or dimension.
Using fake data.
# Define categories and values
myc1 = 'Business Development & Support'
myc2 = 'Customer Service'
myc3 = 'Data Science & Analytics'
myc4 = 'Design & User Experience'
myc5 = 'Engineering'
myc6 = 'Finance'
myc7 = 'Finance & Legal'
myc8 = 'IT Services'
myc9 = 'Leadership'
myc10 = 'Security & Infrastructure'

# Create data dictionary with example values
mydata = {
    'Category': [myc1, myc2, myc3, myc4, myc5, myc6, myc7, myc8, myc9, myc10],
    'Value': [4, 3, 5, 2, 4, 3, 2, 3, 4, 5]  # Example values
}

# Create a DataFrame
mydf = pd.DataFrame(mydata)

# Plot radar graph
fig = px.line_polar(mydf, r='Value', theta='Category', line_close=True, title="Junior Endpoints Engineer")

# Show the plot
fig.show()
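One common refinement, not used in the call above, is to shade the polygon the trace encloses:

# fill the area enclosed by the radar trace
fig.update_traces(fill = 'toself')
fig.show()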
great_tables Examples

Recall that this module was built by Posit and is relatively new. It is based on the great success of the gt package from the R ecosystem. The tables generated so far have all been developed using great_tables.

Below are just a few examples of the possibilities.

The one below is the same style that has been used for all the data presented so far. Nothing new or exceptional.

The ones that follow were copied from https://posit-dev.github.io/great-tables/examples/
#from great_tables import GT, html
from great_tables.data import sza

process_csv_from_data_folder("Solar Zenith Angles great_tables", dataframe=sza)
Random Sample of 10 Records from 'Solar Zenith Angles great_tables' | |||
---|---|---|---|
Exploring Data | |||
Latitude | Month | Tst | Sza |
20 | dec | 0830 | 66.10 |
30 | jun | 0900 | 41.00 |
50 | aug | 1030 | 36.60 |
30 | feb | 0700 | 86.20 |
40 | aug | 1100 | 25.40 |
50 | dec | 0530 | nan |
20 | dec | 1000 | 50.90 |
30 | aug | 1030 | 23.60 |
30 | aug | 1130 | 13.70 |
40 | aug | 0500 | 89.30 |
🔍 Data Exploration: Solar Zenith Angles great_tables | Sample Size: 10 Records |
The table below was copied from https://posit-dev.github.io/great-tables/examples/
from great_tables import html
from great_tables.data import sza

# Convert 'latitude' and 'tst' columns to a consistent type if necessary.
# This step may be needed depending on your data.
# For example, if 'latitude' and 'tst' should be strings:
# sza['latitude'] = sza['latitude'].astype(str)
# sza['tst'] = sza['tst'].astype(str)

sza_pivot = (
    sza
    .query("latitude == '20' and tst <= '1200'")
    .drop(columns=["latitude"])
    .dropna()
    .pivot(index="month", columns="tst", values="sza")
    .sort_index(axis=1)  # Sort columns to mimic sort_columns=True in polars
)

# Reset index so 'month' is a column again
sza_pivot = sza_pivot.reset_index()

(
    GT(sza_pivot, rowname_col="month")
    .data_color(
        domain=[90, 0],
        palette=["rebeccapurple", "white", "orange"],
        na_color="white",
    )
    .tab_header(
        title="Solar Zenith Angles from 05:30 to 12:00",
        subtitle=html("Average monthly values at latitude of 20°N.")
    )
    .sub_missing(missing_text="")
)
Solar Zenith Angles from 05:30 to 12:00 | ||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Average monthly values at latitude of 20°N. | ||||||||||||||
0530 | 0600 | 0630 | 0700 | 0730 | 0800 | 0830 | 0900 | 0930 | 1000 | 1030 | 1100 | 1130 | 1200 | |
apr | 88.5 | 81.5 | 74.4 | 67.4 | 60.3 | 53.4 | 46.5 | 39.7 | 33.2 | 26.9 | 21.3 | 17.2 | 15.5 | |
aug | 83.8 | 77.1 | 70.2 | 63.3 | 56.4 | 49.4 | 42.4 | 35.4 | 28.3 | 21.3 | 14.3 | 7.3 | 1.9 | |
dec | 84.3 | 78.0 | 71.8 | 66.1 | 60.5 | 55.6 | 50.9 | 47.2 | 44.2 | 42.4 | 41.8 | |||
feb | 88.9 | 82.5 | 75.8 | 69.6 | 63.3 | 57.7 | 52.2 | 47.4 | 43.1 | 40.0 | 37.8 | 37.2 | ||
jan | 84.9 | 78.7 | 72.7 | 66.1 | 61.5 | 56.5 | 52.1 | 48.3 | 45.5 | 43.6 | 43.0 | |||
jul | 88.8 | 82.3 | 75.7 | 69.1 | 62.3 | 55.5 | 48.7 | 41.8 | 35.0 | 28.1 | 21.2 | 14.3 | 7.7 | 3.1 |
jun | 89.2 | 82.7 | 76.0 | 69.3 | 62.5 | 55.7 | 48.8 | 41.9 | 35.0 | 28.1 | 21.1 | 14.2 | 7.3 | 2.0 |
mar | 85.7 | 78.8 | 72.0 | 65.2 | 58.6 | 52.3 | 46.2 | 40.5 | 35.5 | 31.4 | 28.6 | 27.7 | ||
may | 85.0 | 78.2 | 71.2 | 64.3 | 57.2 | 50.2 | 43.2 | 36.1 | 29.1 | 26.1 | 15.2 | 8.8 | 5.0 | |
nov | 87.8 | 81.3 | 74.5 | 68.3 | 61.8 | 56.0 | 50.2 | 45.3 | 40.7 | 37.4 | 35.1 | 34.4 | ||
oct | 84.1 | 77.1 | 70.2 | 63.3 | 56.5 | 49.9 | 43.5 | 37.5 | 32.0 | 27.4 | 24.3 | 23.1 | ||
sep | 87.2 | 80.2 | 73.2 | 66.1 | 59.1 | 52.1 | 45.1 | 38.1 | 31.3 | 24.7 | 18.6 | 13.7 | 11.6 |
I decided the best way to show the power of gt tables is to share visualizations I have already created in R. The same capabilities are available in great_tables.
If you want to see a variety of interesting visualizations, see Cliff’s Blog.
The Great American Beer Festival Document is full of creative visualizations.