Module pandas_profiling.model.messages

Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros, constant values, high correlations).

Source code
"""Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros , constant
values, high correlations)."""
from enum import Enum, unique
from typing import List, Optional

import numpy as np

from pandas_profiling.config import config
from pandas_profiling.model.base import Variable


@unique
class MessageType(Enum):
    """Kinds of warning messages that can be raised about a dataset or its variables.

    NOTE: CONST, CORR, RECODED and UNSUPPORTED are looked up *by name* in
    ``check_variable_messages`` (``MessageType[description["type"].value]``), so
    renaming any of them breaks that lookup.
    """

    CONST = 1
    """This variable has a constant value."""

    ZEROS = 2
    """This variable contains zeros."""

    CORR = 3
    """This variable is highly correlated."""

    RECODED = 4
    """This variable is correlated (categorical)."""

    HIGH_CARDINALITY = 5
    """This variable has a high cardinality."""

    UNSUPPORTED = 6
    """This variable is unsupported."""

    DUPLICATES = 7
    """The dataset contains duplicates."""

    SKEWED = 8
    """This variable is highly skewed."""

    MISSING = 9
    """This variable contains missing values."""

    INFINITE = 10
    """This variable contains infinite values."""


class Message(object):
    """A message object (type, values, column).

    Attributes:
        message_type: The kind of warning this message represents.
        values: The statistics dictionary the warning was raised from (the check
            functions in this module pass the table or series description).
        column_name: Name of the column the warning refers to, or None when no
            column is given (``check_table_messages`` leaves it unset).
    """

    def __init__(
        self,
        message_type: MessageType,
        values: dict,
        # Optional[str] instead of `str or None`: the latter is evaluated as an
        # expression and collapses to plain `str`, hiding that None is allowed.
        column_name: Optional[str] = None,
    ):
        self.message_type = message_type
        self.values = values
        self.column_name = column_name


def check_table_messages(table: dict) -> List[Message]:
    """Inspect the dataset-level statistics and report any warnings.

    Args:
        table: Overall dataset statistics; the "n_duplicates" entry is read.

    Returns:
        A list holding a single DUPLICATES message when the duplicate statistic
        triggers ``warning_value``, otherwise an empty list.
    """
    # Guard clause: nothing to report unless the duplicate statistic is flagged.
    if not warning_value(table["n_duplicates"]):
        return []
    return [Message(message_type=MessageType.DUPLICATES, values=table)]


def check_variable_messages(col: str, description: dict) -> List[Message]:
    """Collect the warnings that apply to a single variable.

    Args:
        col: Name of the column being inspected.
        description: The series description; its "type" entry decides which
            checks are run.

    Returns:
        The messages raised for this column, in the order the checks run.
    """
    var_type = description["type"]
    # Types that get a dedicated message and skip the missing/infinite checks.
    is_special = var_type in {
        Variable.S_TYPE_UNSUPPORTED,
        Variable.S_TYPE_CORR,
        Variable.S_TYPE_CONST,
        Variable.S_TYPE_RECODED,
    }
    found = []

    def flag(kind):
        # Every message carries the same column name and description.
        found.append(Message(column_name=col, message_type=kind, values=description))

    # Special types: the message type shares its name with the variable type's value.
    if is_special:
        flag(MessageType[var_type.value])

    # High cardinality (categorical and boolean variables only)
    if var_type in {Variable.TYPE_CAT, Variable.TYPE_BOOL} and (
        description["distinct_count"] > config["cardinality_threshold"].get(int)
    ):
        flag(MessageType.HIGH_CARDINALITY)

    if var_type == Variable.TYPE_NUM:
        # Skewness
        if warning_skewness(description["skewness"]):
            flag(MessageType.SKEWED)
        # Zeros
        if warning_value(description["p_zeros"]):
            flag(MessageType.ZEROS)

    if not is_special:
        # Missing
        if warning_value(description["p_missing"]):
            flag(MessageType.MISSING)
        # Infinite values
        if warning_value(description["p_infinite"]):
            flag(MessageType.INFINITE)

    return found


def warning_value(value: float) -> bool:
    """Tell whether a statistic is large enough to warrant a warning.

    Args:
        value: The statistic to test; may be NaN. Callers in this module pass
            the "n_duplicates", "p_zeros", "p_missing" and "p_infinite" entries.

    Returns:
        True when ``value`` is not NaN and strictly greater than 0.01,
        otherwise False.
    """
    # bool() turns the np.bool_ produced by comparing numpy scalars into a plain
    # Python bool, so the result matches the declared return type.
    return bool(not np.isnan(value) and value > 0.01)


def warning_skewness(v: float) -> bool:
    """Tell whether a skewness value exceeds the configured threshold.

    Args:
        v: The skewness to test; may be NaN.

    Returns:
        True when ``v`` is not NaN and lies strictly outside
        ``[-threshold, threshold]``, where the threshold is read as an int from
        ``config["vars"]["num"]["skewness_threshold"]``; otherwise False.
    """
    # NaN never triggers a warning and needs no configuration lookup.
    if np.isnan(v):
        return False
    # Read the threshold once instead of once per comparison.
    threshold = config["vars"]["num"]["skewness_threshold"].get(int)
    # bool() normalises a possible np.bool_ to the declared return type.
    return bool(v < -threshold or v > threshold)

Functions

def check_table_messages(table)

Checks the overall dataset for warnings.

Args

table
Overall dataset statistics.

Returns

A list of messages.

Source code
def check_table_messages(table: dict) -> List[Message]:
    """Inspect the dataset-level statistics and report any warnings.

    Args:
        table: Overall dataset statistics; the "n_duplicates" entry is read.

    Returns:
        A list holding a single DUPLICATES message when the duplicate statistic
        triggers ``warning_value``, otherwise an empty list.
    """
    # Guard clause: nothing to report unless the duplicate statistic is flagged.
    if not warning_value(table["n_duplicates"]):
        return []
    return [Message(message_type=MessageType.DUPLICATES, values=table)]
def check_variable_messages(col, description)

Checks individual variables for warnings.

Args

col
The column name that is checked.
description
The series description.

Returns

A list of messages.

Source code
def check_variable_messages(col: str, description: dict) -> List[Message]:
    """Collect the warnings that apply to a single variable.

    Args:
        col: Name of the column being inspected.
        description: The series description; its "type" entry decides which
            checks are run.

    Returns:
        The messages raised for this column, in the order the checks run.
    """
    var_type = description["type"]
    # Types that get a dedicated message and skip the missing/infinite checks.
    is_special = var_type in {
        Variable.S_TYPE_UNSUPPORTED,
        Variable.S_TYPE_CORR,
        Variable.S_TYPE_CONST,
        Variable.S_TYPE_RECODED,
    }
    found = []

    def flag(kind):
        # Every message carries the same column name and description.
        found.append(Message(column_name=col, message_type=kind, values=description))

    # Special types: the message type shares its name with the variable type's value.
    if is_special:
        flag(MessageType[var_type.value])

    # High cardinality (categorical and boolean variables only)
    if var_type in {Variable.TYPE_CAT, Variable.TYPE_BOOL} and (
        description["distinct_count"] > config["cardinality_threshold"].get(int)
    ):
        flag(MessageType.HIGH_CARDINALITY)

    if var_type == Variable.TYPE_NUM:
        # Skewness
        if warning_skewness(description["skewness"]):
            flag(MessageType.SKEWED)
        # Zeros
        if warning_value(description["p_zeros"]):
            flag(MessageType.ZEROS)

    if not is_special:
        # Missing
        if warning_value(description["p_missing"]):
            flag(MessageType.MISSING)
        # Infinite values
        if warning_value(description["p_infinite"]):
            flag(MessageType.INFINITE)

    return found
def warning_skewness(v)
Source code
def warning_skewness(v: float) -> bool:
    """Tell whether a skewness value exceeds the configured threshold.

    Args:
        v: The skewness to test; may be NaN.

    Returns:
        True when ``v`` is not NaN and lies strictly outside
        ``[-threshold, threshold]``, where the threshold is read as an int from
        ``config["vars"]["num"]["skewness_threshold"]``; otherwise False.
    """
    # NaN never triggers a warning and needs no configuration lookup.
    if np.isnan(v):
        return False
    # Read the threshold once instead of once per comparison.
    threshold = config["vars"]["num"]["skewness_threshold"].get(int)
    # bool() normalises a possible np.bool_ to the declared return type.
    return bool(v < -threshold or v > threshold)
def warning_value(value)
Source code
def warning_value(value: float) -> bool:
    """Tell whether a statistic is large enough to warrant a warning.

    Args:
        value: The statistic to test; may be NaN. Callers in this module pass
            the "n_duplicates", "p_zeros", "p_missing" and "p_infinite" entries.

    Returns:
        True when ``value`` is not NaN and strictly greater than 0.01,
        otherwise False.
    """
    # bool() turns the np.bool_ produced by comparing numpy scalars into a plain
    # Python bool, so the result matches the declared return type.
    return bool(not np.isnan(value) and value > 0.01)

Classes

class Message (message_type, values, column_name=None)

A message object (type, values, column).

Source code
class Message(object):
    """A message object (type, values, column).

    Attributes:
        message_type: The kind of warning this message represents.
        values: The statistics dictionary the warning was raised from (the check
            functions in this module pass the table or series description).
        column_name: Name of the column the warning refers to, or None when no
            column is given (``check_table_messages`` leaves it unset).
    """

    def __init__(
        self,
        message_type: MessageType,
        values: dict,
        # Optional[str] instead of `str or None`: the latter is evaluated as an
        # expression and collapses to plain `str`, hiding that None is allowed.
        column_name: Optional[str] = None,
    ):
        self.message_type = message_type
        self.values = values
        self.column_name = column_name
class MessageType (*args, **kwargs)

Message Types

Source code
class MessageType(Enum):
    """Kinds of warning messages that can be raised about a dataset or its variables.

    NOTE: CONST, CORR, RECODED and UNSUPPORTED are looked up *by name* in
    ``check_variable_messages`` (``MessageType[description["type"].value]``), so
    renaming any of them breaks that lookup.
    """

    CONST = 1
    """This variable has a constant value."""

    ZEROS = 2
    """This variable contains zeros."""

    CORR = 3
    """This variable is highly correlated."""

    RECODED = 4
    """This variable is correlated (categorical)."""

    HIGH_CARDINALITY = 5
    """This variable has a high cardinality."""

    UNSUPPORTED = 6
    """This variable is unsupported."""

    DUPLICATES = 7
    """The dataset contains duplicates."""

    SKEWED = 8
    """This variable is highly skewed."""

    MISSING = 9
    """This variable contains missing values."""

    INFINITE = 10
    """This variable contains infinite values."""

Ancestors

  • enum.Enum

Class variables

var CONST

This variable has a constant value.

var CORR

This variable is highly correlated.

var DUPLICATES

The dataset contains duplicates.

var HIGH_CARDINALITY

This variable has a high cardinality.

var INFINITE

This variable contains infinite values.

var MISSING

This variable contains missing values.

var RECODED

This variable is correlated (categorical).

var SKEWED

This variable is highly skewed.

var UNSUPPORTED

This variable is unsupported.

var ZEROS

This variable contains zeros.