Module pandas_profiling.model.messages
Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros, constant values, high correlations).
Source code
"""Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros , constant
values, high correlations)."""
from enum import Enum, unique
from typing import List, Optional

import numpy as np

from pandas_profiling.config import config
from pandas_profiling.model.base import Variable
@unique
class MessageType(Enum):
    """Kinds of alerts that can be raised about a dataset or one of its variables.

    Members are retrieved by name as well as by attribute
    (``MessageType[name]``), so member names must stay stable.
    """

    CONST = 1
    """This variable has a constant value."""

    ZEROS = 2
    """This variable contains zeros."""

    CORR = 3
    """This variable is highly correlated."""

    RECODED = 4
    """This variable is correlated (categorical)."""

    HIGH_CARDINALITY = 5
    """This variable has a high cardinality."""

    UNSUPPORTED = 6
    """This variable is unsupported."""

    DUPLICATES = 7
    """This variable contains duplicates."""

    SKEWED = 8
    """This variable is highly skewed."""

    MISSING = 9
    """This variable contains missing values."""

    INFINITE = 10
    """This variable contains infinite values."""
class Message(object):
    """A single alert about the dataset or about one of its columns.

    Attributes:
        message_type: The kind of alert.
        values: The statistics dictionary the alert was raised from.
        column_name: Name of the column the alert refers to, or None when
            the alert is not tied to a column.
    """

    def __init__(
        self,
        message_type: MessageType,
        values: dict,
        # `str or None` evaluates to plain `str`; Optional states the intent.
        column_name: Optional[str] = None,
    ):
        self.message_type = message_type
        self.values = values
        self.column_name = column_name

    def __repr__(self) -> str:
        # `values` is left out on purpose: it can hold a full statistics dict.
        return "{}(message_type={!r}, column_name={!r})".format(
            type(self).__name__, self.message_type, self.column_name
        )
def check_table_messages(table: dict) -> List[Message]:
    """Checks the overall dataset for warnings.

    Args:
        table: Overall dataset statistics. Only the "n_duplicates" entry is
            inspected; the whole dictionary is attached to any message raised.

    Returns:
        A list of messages (empty when nothing is flagged).
    """
    messages = []
    # An absent "n_duplicates" entry is treated as NaN, which never triggers
    # a warning, instead of aborting with a KeyError.
    if warning_value(table.get("n_duplicates", np.nan)):
        messages.append(Message(message_type=MessageType.DUPLICATES, values=table))
    return messages
def check_variable_messages(col: str, description: dict) -> List[Message]:
    """Collects the alerts that apply to a single variable.

    Args:
        col: Name of the column being inspected.
        description: The series description. "type" is always read; depending
            on the type, "distinct_count", "skewness", "p_zeros", "p_missing"
            and "p_infinite" are read as well.

    Returns:
        The messages raised for this column, in the order the checks run.
    """
    var_type = description["type"]
    # Types that are reported as an alert themselves and that skip the
    # missing / infinite checks further down.
    special_types = {
        Variable.S_TYPE_UNSUPPORTED,
        Variable.S_TYPE_CORR,
        Variable.S_TYPE_CONST,
        Variable.S_TYPE_RECODED,
    }
    is_special = var_type in special_types

    def alert(kind):
        # Every message for this column carries the full description.
        return Message(column_name=col, message_type=kind, values=description)

    found = []

    if is_special:
        # The alert kind is the MessageType member named by the type's value.
        found.append(alert(MessageType[var_type.value]))

    if var_type in {Variable.TYPE_CAT, Variable.TYPE_BOOL}:
        # High cardinality
        if description["distinct_count"] > config["cardinality_threshold"].get(int):
            found.append(alert(MessageType.HIGH_CARDINALITY))

    if var_type in {Variable.TYPE_NUM}:
        # Skewness first, then zeros.
        numeric_checks = (
            ("skewness", warning_skewness, MessageType.SKEWED),
            ("p_zeros", warning_value, MessageType.ZEROS),
        )
        for key, is_alarming, kind in numeric_checks:
            if is_alarming(description[key]):
                found.append(alert(kind))

    if not is_special:
        # Missing first, then infinite values.
        share_checks = (
            ("p_missing", MessageType.MISSING),
            ("p_infinite", MessageType.INFINITE),
        )
        for key, kind in share_checks:
            if warning_value(description[key]):
                found.append(alert(kind))

    return found
def warning_value(value: float) -> bool:
    """Tells whether a statistic is large enough to warrant a warning.

    Args:
        value: The statistic to test; may be NaN.

    Returns:
        True when the value is not NaN and is strictly greater than 0.01.
    """
    # bool() keeps the declared return type even for NumPy scalars, whose
    # comparison yields numpy.bool_ rather than a built-in bool.
    return bool(not np.isnan(value) and value > 0.01)
def warning_skewness(v: float) -> bool:
    """Tells whether a skewness value lies outside the configured threshold.

    Args:
        v: The skewness to test; may be NaN.

    Returns:
        True when the value is not NaN and its magnitude is strictly greater
        than the "skewness_threshold" configured under vars -> num.
    """
    if np.isnan(v):
        return False
    # Read the threshold once; it bounds the skewness on both sides.
    threshold = config["vars"]["num"]["skewness_threshold"].get(int)
    return bool(abs(v) > threshold)
Functions
def check_table_messages(table)
-
Checks the overall dataset for warnings.
Args
table
- Overall dataset statistics.
Returns
A list of messages.
Source code
def check_table_messages(table: dict) -> List[Message]:
    """Checks the overall dataset for warnings.

    Args:
        table: Overall dataset statistics. Only the "n_duplicates" entry is
            inspected; the whole dictionary is attached to any message raised.

    Returns:
        A list of messages (empty when nothing is flagged).
    """
    messages = []
    # An absent "n_duplicates" entry is treated as NaN, which never triggers
    # a warning, instead of aborting with a KeyError.
    if warning_value(table.get("n_duplicates", np.nan)):
        messages.append(Message(message_type=MessageType.DUPLICATES, values=table))
    return messages
def check_variable_messages(col, description)
-
Checks individual variables for warnings.
Args
col
- The column name that is checked.
description
- The series description.
Returns
A list of messages.
Source code
def check_variable_messages(col: str, description: dict) -> List[Message]:
    """Collects the alerts that apply to a single variable.

    Args:
        col: Name of the column being inspected.
        description: The series description. "type" is always read; depending
            on the type, "distinct_count", "skewness", "p_zeros", "p_missing"
            and "p_infinite" are read as well.

    Returns:
        The messages raised for this column, in the order the checks run.
    """
    var_type = description["type"]
    # Types that are reported as an alert themselves and that skip the
    # missing / infinite checks further down.
    special_types = {
        Variable.S_TYPE_UNSUPPORTED,
        Variable.S_TYPE_CORR,
        Variable.S_TYPE_CONST,
        Variable.S_TYPE_RECODED,
    }
    is_special = var_type in special_types

    def alert(kind):
        # Every message for this column carries the full description.
        return Message(column_name=col, message_type=kind, values=description)

    found = []

    if is_special:
        # The alert kind is the MessageType member named by the type's value.
        found.append(alert(MessageType[var_type.value]))

    if var_type in {Variable.TYPE_CAT, Variable.TYPE_BOOL}:
        # High cardinality
        if description["distinct_count"] > config["cardinality_threshold"].get(int):
            found.append(alert(MessageType.HIGH_CARDINALITY))

    if var_type in {Variable.TYPE_NUM}:
        # Skewness first, then zeros.
        numeric_checks = (
            ("skewness", warning_skewness, MessageType.SKEWED),
            ("p_zeros", warning_value, MessageType.ZEROS),
        )
        for key, is_alarming, kind in numeric_checks:
            if is_alarming(description[key]):
                found.append(alert(kind))

    if not is_special:
        # Missing first, then infinite values.
        share_checks = (
            ("p_missing", MessageType.MISSING),
            ("p_infinite", MessageType.INFINITE),
        )
        for key, kind in share_checks:
            if warning_value(description[key]):
                found.append(alert(kind))

    return found
def warning_skewness(v)
-
Source code
def warning_skewness(v: float) -> bool:
    """Tells whether a skewness value lies outside the configured threshold.

    Args:
        v: The skewness to test; may be NaN.

    Returns:
        True when the value is not NaN and its magnitude is strictly greater
        than the "skewness_threshold" configured under vars -> num.
    """
    if np.isnan(v):
        return False
    # Read the threshold once; it bounds the skewness on both sides.
    threshold = config["vars"]["num"]["skewness_threshold"].get(int)
    return bool(abs(v) > threshold)
def warning_value(value)
-
Source code
def warning_value(value: float) -> bool:
    """Tells whether a statistic is large enough to warrant a warning.

    Args:
        value: The statistic to test; may be NaN.

    Returns:
        True when the value is not NaN and is strictly greater than 0.01.
    """
    # bool() keeps the declared return type even for NumPy scalars, whose
    # comparison yields numpy.bool_ rather than a built-in bool.
    return bool(not np.isnan(value) and value > 0.01)
Classes
class Message (message_type, values, column_name=None)
-
A message object (type, values, column).
Source code
class Message(object):
    """A single alert about the dataset or about one of its columns.

    Attributes:
        message_type: The kind of alert.
        values: The statistics dictionary the alert was raised from.
        column_name: Name of the column the alert refers to, or None when
            the alert is not tied to a column.
    """

    def __init__(
        self,
        message_type: MessageType,
        values: dict,
        # `str or None` evaluates to plain `str`; Optional states the intent.
        column_name: Optional[str] = None,
    ):
        self.message_type = message_type
        self.values = values
        self.column_name = column_name

    def __repr__(self) -> str:
        # `values` is left out on purpose: it can hold a full statistics dict.
        return "{}(message_type={!r}, column_name={!r})".format(
            type(self).__name__, self.message_type, self.column_name
        )
class MessageType (*args, **kwargs)
-
Message Types
Source code
class MessageType(Enum):
    """Kinds of alerts that can be raised about a dataset or one of its variables.

    Members are retrieved by name as well as by attribute
    (``MessageType[name]``), so member names must stay stable.
    """

    CONST = 1
    """This variable has a constant value."""

    ZEROS = 2
    """This variable contains zeros."""

    CORR = 3
    """This variable is highly correlated."""

    RECODED = 4
    """This variable is correlated (categorical)."""

    HIGH_CARDINALITY = 5
    """This variable has a high cardinality."""

    UNSUPPORTED = 6
    """This variable is unsupported."""

    DUPLICATES = 7
    """This variable contains duplicates."""

    SKEWED = 8
    """This variable is highly skewed."""

    MISSING = 9
    """This variable contains missing values."""

    INFINITE = 10
    """This variable contains infinite values."""
Ancestors
- enum.Enum
Class variables
var CONST
-
This variable has a constant value.
var CORR
-
This variable is highly correlated.
var DUPLICATES
-
This variable contains duplicates.
var HIGH_CARDINALITY
-
This variable has a high cardinality.
var INFINITE
-
This variable contains infinite values.
var MISSING
-
This variable contains missing values.
var RECODED
-
This variable is correlated (categorical).
var SKEWED
-
This variable is highly skewed.
var UNSUPPORTED
-
This variable is unsupported.
var ZEROS
-
This variable contains zeros.