Module pandas_profiling.visualisation.plot

Plot functions for the profiling report.

Expand source code
"""Plot functions for the profiling report."""

from typing import Union

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap

from pandas.plotting import register_matplotlib_converters
from pandas_profiling.visualisation.utils import plot_360_n0sc0pe
from pkg_resources import resource_filename

from pandas_profiling.config import config
from pandas_profiling.model.base import Variable

register_matplotlib_converters()
matplotlib.style.use(resource_filename(__name__, "pandas_profiling.mplstyle"))
sns.set_style(style="white")


def _plot_histogram(
    series: pd.Series,
    series_description: dict,
    bins: Union[int, np.ndarray],
    figsize: tuple = (6, 4),
):
    """Plot an histogram from the data and return the AxesSubplot object.

    Args:
        series: The data to plot
        figsize: The size of the figure (width, height) in inches, default (6,4)
        bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
        The histogram plot.


    """
    if series_description["type"] == Variable.TYPE_DATE:
        # Workaround for https://github.com/pandas-dev/pandas/issues/17372
        fig = plt.figure(figsize=figsize)
        plot = fig.add_subplot(111)
        plot.set_ylabel("Frequency")
        plot.hist(
            series.dropna().values,
            facecolor=config["html"]["style"]["primary_color"].get(str),
            bins=bins,
        )

    else:
        plot = series.plot(
            kind="hist",
            figsize=figsize,
            facecolor=config["html"]["style"]["primary_color"].get(str),
            bins=bins,
        )
    return plot


def histogram(
    series: pd.Series, series_description: dict, bins: Union[int, np.ndarray]
) -> str:
    """Plot an histogram of the data.

    Args:
      series_description:
      series: The data to plot.
      bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
      The resulting histogram encoded as a string.

    """
    plot = _plot_histogram(series, series_description, bins)
    plot.xaxis.set_tick_params(rotation=45)
    plot.figure.tight_layout()

    return plot_360_n0sc0pe(plt)


def mini_histogram(
    series: pd.Series, series_description: dict, bins: Union[int, np.ndarray]
) -> str:
    """Plot a small (mini) histogram of the data.

    Args:
      series_description:
      series: The data to plot.
      bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
      The resulting mini histogram encoded as a string.
    """
    plot = _plot_histogram(series, series_description, bins, figsize=(2, 1.5))
    plot.axes.get_yaxis().set_visible(False)
    plot.set_facecolor("w")

    xticks = plot.xaxis.get_major_ticks()
    for tick in xticks:
        tick.label1.set_fontsize(8)
    plot.xaxis.set_tick_params(rotation=45)
    plot.figure.tight_layout()

    return plot_360_n0sc0pe(plt)


def get_cmap_half(cmap_name):
    """Get the upper half of the color map

    Args:
        cmap_name: the name of the color map

    Returns:
        A new color map based on the upper half of another color map

    References:
        https://stackoverflow.com/a/24746399/470433
    """
    # Evaluate an existing colormap from 0.5 (midpoint) to 1 (upper end)
    cmap = plt.get_cmap(cmap_name)
    colors = cmap(np.linspace(0.5, 1, cmap.N // 2))

    # Create a new colormap from those colors
    return LinearSegmentedColormap.from_list("cmap_half", colors)


def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Plot image of a matrix correlation.

    Args:
      data: The matrix correlation to plot.
      vmin: Minimum value of value range.

    Returns:
      The resulting correlation matrix encoded as a string.
    """
    fig_cor, axes_cor = plt.subplots()
    cmap = config["plot"]["correlation"]["cmap"].get(str)
    if vmin == 0:
        cmap = get_cmap_half(cmap)

    labels = data.columns
    matrix_image = axes_cor.imshow(
        data, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap
    )
    plt.colorbar(matrix_image)
    axes_cor.set_xticks(np.arange(0, data.shape[0], float(data.shape[0]) / len(labels)))
    axes_cor.set_yticks(np.arange(0, data.shape[1], float(data.shape[1]) / len(labels)))
    axes_cor.set_xticklabels(labels, rotation=90)
    axes_cor.set_yticklabels(labels)
    plt.subplots_adjust(bottom=0.2)

    return plot_360_n0sc0pe(plt)


def scatter_complex(series) -> str:
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    color = config["html"]["style"]["primary_color"].get(str)

    if len(series) > 1000:
        cmap = sns.light_palette(color, as_cmap=True)
        plt.hexbin(series.real, series.imag, cmap=cmap)
    else:
        plt.scatter(series.real, series.imag, color=color)

    return plot_360_n0sc0pe(plt)


def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series:
        x_label:
        y_label:

    Returns:

    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    color = config["html"]["style"]["primary_color"].get(str)

    if len(series) > 1000:
        cmap = sns.light_palette(color, as_cmap=True)
        plt.hexbin(*zip(*series.tolist()), cmap=cmap)
    else:
        plt.scatter(*zip(*series.tolist()), color=color)
    return plot_360_n0sc0pe(plt)


def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    color = config["html"]["style"]["primary_color"].get(str)

    if len(series1) > 1000:
        cmap = sns.light_palette(color, as_cmap=True)
        plt.hexbin(series1.tolist(), series2.tolist(), gridsize=15, cmap=cmap)
    else:
        plt.scatter(series1.tolist(), series2.tolist(), color=color)
    return plot_360_n0sc0pe(plt)

Functions

def correlation_matrix(data, vmin=-1)

Plot image of a matrix correlation.

Args

data
The matrix correlation to plot.
vmin
Minimum value of value range.

Returns

The resulting correlation matrix encoded as a string.

Expand source code
def correlation_matrix(data: pd.DataFrame, vmin: int = -1) -> str:
    """Plot image of a matrix correlation.

    Args:
      data: The matrix correlation to plot.
      vmin: Minimum value of value range.

    Returns:
      The resulting correlation matrix encoded as a string.
    """
    fig_cor, axes_cor = plt.subplots()
    cmap = config["plot"]["correlation"]["cmap"].get(str)
    if vmin == 0:
        cmap = get_cmap_half(cmap)

    labels = data.columns
    matrix_image = axes_cor.imshow(
        data, vmin=vmin, vmax=1, interpolation="nearest", cmap=cmap
    )
    plt.colorbar(matrix_image)
    axes_cor.set_xticks(np.arange(0, data.shape[0], float(data.shape[0]) / len(labels)))
    axes_cor.set_yticks(np.arange(0, data.shape[1], float(data.shape[1]) / len(labels)))
    axes_cor.set_xticklabels(labels, rotation=90)
    axes_cor.set_yticklabels(labels)
    plt.subplots_adjust(bottom=0.2)

    return plot_360_n0sc0pe(plt)
def get_cmap_half(cmap_name)

Get the upper half of the color map

Args

cmap_name
the name of the color map

Returns

A new color map based on the upper half of another color map
 

References

https://stackoverflow.com/a/24746399/470433

Expand source code
def get_cmap_half(cmap_name):
    """Get the upper half of the color map

    Args:
        cmap_name: the name of the color map

    Returns:
        A new color map based on the upper half of another color map

    References:
        https://stackoverflow.com/a/24746399/470433
    """
    # Evaluate an existing colormap from 0.5 (midpoint) to 1 (upper end)
    cmap = plt.get_cmap(cmap_name)
    colors = cmap(np.linspace(0.5, 1, cmap.N // 2))

    # Create a new colormap from those colors
    return LinearSegmentedColormap.from_list("cmap_half", colors)
def histogram(series, series_description, bins)

Plot an histogram of the data.

Args

series_description:
series
The data to plot.
bins
number of bins (int for equal size, ndarray for variable size)

Returns

The resulting histogram encoded as a string.

Expand source code
def histogram(
    series: pd.Series, series_description: dict, bins: Union[int, np.ndarray]
) -> str:
    """Plot an histogram of the data.

    Args:
      series_description:
      series: The data to plot.
      bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
      The resulting histogram encoded as a string.

    """
    plot = _plot_histogram(series, series_description, bins)
    plot.xaxis.set_tick_params(rotation=45)
    plot.figure.tight_layout()

    return plot_360_n0sc0pe(plt)
def mini_histogram(series, series_description, bins)

Plot a small (mini) histogram of the data.

Args

series_description:
series
The data to plot.
bins
number of bins (int for equal size, ndarray for variable size)

Returns

The resulting mini histogram encoded as a string.

Expand source code
def mini_histogram(
    series: pd.Series, series_description: dict, bins: Union[int, np.ndarray]
) -> str:
    """Plot a small (mini) histogram of the data.

    Args:
      series_description:
      series: The data to plot.
      bins: number of bins (int for equal size, ndarray for variable size)

    Returns:
      The resulting mini histogram encoded as a string.
    """
    plot = _plot_histogram(series, series_description, bins, figsize=(2, 1.5))
    plot.axes.get_yaxis().set_visible(False)
    plot.set_facecolor("w")

    xticks = plot.xaxis.get_major_ticks()
    for tick in xticks:
        tick.label1.set_fontsize(8)
    plot.xaxis.set_tick_params(rotation=45)
    plot.figure.tight_layout()

    return plot_360_n0sc0pe(plt)
def scatter_complex(series)
Expand source code
def scatter_complex(series) -> str:
    plt.ylabel("Imaginary")
    plt.xlabel("Real")

    color = config["html"]["style"]["primary_color"].get(str)

    if len(series) > 1000:
        cmap = sns.light_palette(color, as_cmap=True)
        plt.hexbin(series.real, series.imag, cmap=cmap)
    else:
        plt.scatter(series.real, series.imag, color=color)

    return plot_360_n0sc0pe(plt)
def scatter_pairwise(series1, series2, x_label, y_label)
Expand source code
def scatter_pairwise(series1, series2, x_label, y_label) -> str:
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    color = config["html"]["style"]["primary_color"].get(str)

    if len(series1) > 1000:
        cmap = sns.light_palette(color, as_cmap=True)
        plt.hexbin(series1.tolist(), series2.tolist(), gridsize=15, cmap=cmap)
    else:
        plt.scatter(series1.tolist(), series2.tolist(), color=color)
    return plot_360_n0sc0pe(plt)
def scatter_series(series, x_label='Width', y_label='Height')

Examples

>>> scatter_series(file_sizes, "Width", "Height")

Args

series: x_label: y_label: Returns:

Expand source code
def scatter_series(series, x_label="Width", y_label="Height") -> str:
    """

    Examples:
        >>> scatter_series(file_sizes, "Width", "Height")

    Args:
        series:
        x_label:
        y_label:

    Returns:

    """
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    color = config["html"]["style"]["primary_color"].get(str)

    if len(series) > 1000:
        cmap = sns.light_palette(color, as_cmap=True)
        plt.hexbin(*zip(*series.tolist()), cmap=cmap)
    else:
        plt.scatter(*zip(*series.tolist()), color=color)
    return plot_360_n0sc0pe(plt)