Module pandas_profiling.model.messages
Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros , constant values, high correlations).
Expand source code
"""Logic for alerting the user on possibly problematic patterns in the data (e.g. high number of zeros , constant
values, high correlations)."""
from enum import Enum, unique
from typing import List, Union
import warnings
from contextlib import suppress
import re
from dateutil.parser import parse
import numpy as np
from pandas_profiling.model.correlations import perform_check_correlation
from pandas_profiling.config import config
from pandas_profiling.model.base import Variable
@unique
class MessageType(Enum):
"""Message Types"""
CONSTANT = 1
"""This variable has a constant value."""
ZEROS = 2
"""This variable contains zeros."""
HIGH_CORRELATION = 3
"""This variable is highly correlated."""
RECODED = 4
"""This variable is correlated (categorical)."""
HIGH_CARDINALITY = 5
"""This variable has a high cardinality."""
UNSUPPORTED = 6
"""This variable is unsupported."""
DUPLICATES = 7
"""This variable contains duplicates."""
SKEWED = 8
"""This variable is highly skewed."""
MISSING = 9
"""This variable contains missing values."""
INFINITE = 10
"""This variable contains infinite values."""
TYPE_DATE = 11
"""This variable is likely a datetime, but treated as categorical."""
UNIQUE = 12
"""This variable has unique values."""
CONSTANT_LENGTH = 13
"""This variable has a constant length"""
REJECTED = 15
"""Variables are rejected if we do not want to consider them for further analysis."""
UNIFORM = 14
"""The variable is uniformly distributed"""
class Message(object):
"""A message object (type, values, column)."""
def __init__(
self,
message_type: MessageType,
values: dict,
column_name: Union[str, None] = None,
fields=None,
):
if fields is None:
fields = set()
self.fields = fields
self.message_type = message_type
self.values = values
self.column_name = column_name
self.anchor_id = hash(column_name)
def fmt(self):
# TODO: render in template
name = self.message_type.name.replace("_", " ")
if name == "HIGH CORRELATION":
name = '<abbr title="This variable has a high correlation with {num} fields: {title}">HIGH CORRELATION</abbr>'.format(
num=len(self.values["fields"]), title=", ".join(self.values["fields"])
)
return name
def __repr__(self):
return "[{message_type}] warning on column {column}".format(
message_type=self.message_type.name, column=self.column_name
)
def check_table_messages(table: dict) -> List[Message]:
"""Checks the overall dataset for warnings.
Args:
table: Overall dataset statistics.
Returns:
A list of messages.
"""
messages = []
if warning_value(table["n_duplicates"]):
messages.append(
Message(
message_type=MessageType.DUPLICATES,
values=table,
fields={"n_duplicates"},
)
)
return messages
def check_variable_messages(col: str, description: dict) -> List[Message]:
"""Checks individual variables for warnings.
Args:
col: The column name that is checked.
description: The series description.
Returns:
A list of messages.
"""
messages = []
# Missing
if warning_value(description["p_missing"]):
messages.append(
Message(
column_name=col,
message_type=MessageType.MISSING,
values=description,
fields={"p_missing", "n_missing"},
)
)
if description["type"] == Variable.S_TYPE_UNSUPPORTED:
messages.append(
Message(
column_name=col,
message_type=MessageType.UNSUPPORTED,
values=description,
fields={},
)
)
if description["distinct_count_with_nan"] <= 1:
messages.append(
Message(
column_name=col,
message_type=MessageType.CONSTANT,
values=description,
fields={"n_unique"},
)
)
if (
description["type"] == Variable.S_TYPE_UNSUPPORTED
or description["distinct_count_with_nan"] <= 1
):
messages.append(
Message(
column_name=col,
message_type=MessageType.REJECTED,
values=description,
fields={},
)
)
if description["distinct_count_without_nan"] == description["n"]:
messages.append(
Message(
column_name=col,
message_type=MessageType.UNIQUE,
values=description,
fields={"n_unique", "p_unique"},
)
)
# Infinite values
if warning_value(description["p_infinite"]):
messages.append(
Message(
column_name=col,
message_type=MessageType.INFINITE,
values=description,
fields={"p_infinite", "n_infinite"},
)
)
# Date
if description["type"] == Variable.TYPE_DATE:
# Uniformity
chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
float
)
# chi_squared_threshold = 0.5
if 0.0 < chi_squared_threshold < description["chi_squared"][1]:
messages.append(
Message(column_name=col, message_type=MessageType.UNIFORM, values={})
)
# Categorical
if description["type"] == Variable.TYPE_CAT:
if description["date_warning"]:
messages.append(
Message(column_name=col, message_type=MessageType.TYPE_DATE, values={})
)
# Uniformity
chi_squared_threshold = config["vars"]["cat"]["chi_squared_threshold"].get(
float
)
if 0.0 < chi_squared_threshold < description["chi_squared"][1]:
messages.append(
Message(column_name=col, message_type=MessageType.UNIFORM, values={})
)
# High cardinality
if description["distinct_count"] > config["vars"]["cat"][
"cardinality_threshold"
].get(int):
messages.append(
Message(
column_name=col,
message_type=MessageType.HIGH_CARDINALITY,
values=description,
fields={"n_unique"},
)
)
# Constant length
if (
"composition" in description
and description["min_length"] == description["max_length"]
):
messages.append(
Message(
column_name=col,
message_type=MessageType.CONSTANT_LENGTH,
values=description,
fields={"composition_min_length", "composition_max_length"},
)
)
# Numerical
if description["type"] == Variable.TYPE_NUM:
# Skewness
if warning_skewness(description["skewness"]):
messages.append(
Message(
column_name=col,
message_type=MessageType.SKEWED,
values=description,
fields={"skewness"},
)
)
# Uniformity
chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get(
float
)
if 0.0 < chi_squared_threshold < description["chi_squared"][1]:
messages.append(
Message(column_name=col, message_type=MessageType.UNIFORM, values={})
)
# Zeros
if warning_value(description["p_zeros"]):
messages.append(
Message(
column_name=col,
message_type=MessageType.ZEROS,
values=description,
fields={"n_zeros", "p_zeros"},
)
)
return messages
def check_correlation_messages(correlations):
messages = []
for corr, matrix in correlations.items():
if config["correlations"][corr]["warn_high_correlations"].get(bool):
threshold = config["correlations"][corr]["threshold"].get(float)
correlated_mapping = perform_check_correlation(matrix, threshold)
if len(correlated_mapping) > 0:
for k, v in correlated_mapping.items():
messages.append(
Message(
column_name=k,
message_type=MessageType.HIGH_CORRELATION,
values={"corr": corr, "fields": v},
)
)
return messages
def warning_value(value: float) -> bool:
return not np.isnan(value) and value > 0.01
def warning_skewness(v: float) -> bool:
return not np.isnan(v) and (
v < -config["vars"]["num"]["skewness_threshold"].get(int)
or v > config["vars"]["num"]["skewness_threshold"].get(int)
)
def _date_parser(date_string):
pattern = re.compile(r"[.\-:]")
pieces = re.split(pattern, date_string)
if len(pieces) < 3:
raise ValueError("Must have at least year, month and date passed")
return parse(date_string)
def warning_type_date(series):
with suppress(ValueError, TypeError):
series.apply(_date_parser)
return True
return False
Functions
def check_correlation_messages(correlations)
-
Expand source code
def check_correlation_messages(correlations): messages = [] for corr, matrix in correlations.items(): if config["correlations"][corr]["warn_high_correlations"].get(bool): threshold = config["correlations"][corr]["threshold"].get(float) correlated_mapping = perform_check_correlation(matrix, threshold) if len(correlated_mapping) > 0: for k, v in correlated_mapping.items(): messages.append( Message( column_name=k, message_type=MessageType.HIGH_CORRELATION, values={"corr": corr, "fields": v}, ) ) return messages
def check_table_messages(table)
-
Checks the overall dataset for warnings.
Args
table
- Overall dataset statistics.
Returns
A list of messages.
Expand source code
def check_table_messages(table: dict) -> List[Message]: """Checks the overall dataset for warnings. Args: table: Overall dataset statistics. Returns: A list of messages. """ messages = [] if warning_value(table["n_duplicates"]): messages.append( Message( message_type=MessageType.DUPLICATES, values=table, fields={"n_duplicates"}, ) ) return messages
def check_variable_messages(col, description)
-
Checks individual variables for warnings.
Args
col
- The column name that is checked.
description
- The series description.
Returns
A list of messages.
Expand source code
def check_variable_messages(col: str, description: dict) -> List[Message]: """Checks individual variables for warnings. Args: col: The column name that is checked. description: The series description. Returns: A list of messages. """ messages = [] # Missing if warning_value(description["p_missing"]): messages.append( Message( column_name=col, message_type=MessageType.MISSING, values=description, fields={"p_missing", "n_missing"}, ) ) if description["type"] == Variable.S_TYPE_UNSUPPORTED: messages.append( Message( column_name=col, message_type=MessageType.UNSUPPORTED, values=description, fields={}, ) ) if description["distinct_count_with_nan"] <= 1: messages.append( Message( column_name=col, message_type=MessageType.CONSTANT, values=description, fields={"n_unique"}, ) ) if ( description["type"] == Variable.S_TYPE_UNSUPPORTED or description["distinct_count_with_nan"] <= 1 ): messages.append( Message( column_name=col, message_type=MessageType.REJECTED, values=description, fields={}, ) ) if description["distinct_count_without_nan"] == description["n"]: messages.append( Message( column_name=col, message_type=MessageType.UNIQUE, values=description, fields={"n_unique", "p_unique"}, ) ) # Infinite values if warning_value(description["p_infinite"]): messages.append( Message( column_name=col, message_type=MessageType.INFINITE, values=description, fields={"p_infinite", "n_infinite"}, ) ) # Date if description["type"] == Variable.TYPE_DATE: # Uniformity chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get( float ) # chi_squared_threshold = 0.5 if 0.0 < chi_squared_threshold < description["chi_squared"][1]: messages.append( Message(column_name=col, message_type=MessageType.UNIFORM, values={}) ) # Categorical if description["type"] == Variable.TYPE_CAT: if description["date_warning"]: messages.append( Message(column_name=col, message_type=MessageType.TYPE_DATE, values={}) ) # Uniformity chi_squared_threshold = config["vars"]["cat"]["chi_squared_threshold"].get( float ) if 0.0 < chi_squared_threshold < description["chi_squared"][1]: messages.append( Message(column_name=col, message_type=MessageType.UNIFORM, values={}) ) # High cardinality if description["distinct_count"] > config["vars"]["cat"][ "cardinality_threshold" ].get(int): messages.append( Message( column_name=col, message_type=MessageType.HIGH_CARDINALITY, values=description, fields={"n_unique"}, ) ) # Constant length if ( "composition" in description and description["min_length"] == description["max_length"] ): messages.append( Message( column_name=col, message_type=MessageType.CONSTANT_LENGTH, values=description, fields={"composition_min_length", "composition_max_length"}, ) ) # Numerical if description["type"] == Variable.TYPE_NUM: # Skewness if warning_skewness(description["skewness"]): messages.append( Message( column_name=col, message_type=MessageType.SKEWED, values=description, fields={"skewness"}, ) ) # Uniformity chi_squared_threshold = config["vars"]["num"]["chi_squared_threshold"].get( float ) if 0.0 < chi_squared_threshold < description["chi_squared"][1]: messages.append( Message(column_name=col, message_type=MessageType.UNIFORM, values={}) ) # Zeros if warning_value(description["p_zeros"]): messages.append( Message( column_name=col, message_type=MessageType.ZEROS, values=description, fields={"n_zeros", "p_zeros"}, ) ) return messages
def warning_skewness(v)
-
Expand source code
def warning_skewness(v: float) -> bool: return not np.isnan(v) and ( v < -config["vars"]["num"]["skewness_threshold"].get(int) or v > config["vars"]["num"]["skewness_threshold"].get(int) )
def warning_type_date(series)
-
Expand source code
def warning_type_date(series): with suppress(ValueError, TypeError): series.apply(_date_parser) return True return False
def warning_value(value)
-
Expand source code
def warning_value(value: float) -> bool: return not np.isnan(value) and value > 0.01
Classes
class Message (message_type, values, column_name=None, fields=None)
-
A message object (type, values, column).
Expand source code
class Message(object): """A message object (type, values, column).""" def __init__( self, message_type: MessageType, values: dict, column_name: Union[str, None] = None, fields=None, ): if fields is None: fields = set() self.fields = fields self.message_type = message_type self.values = values self.column_name = column_name self.anchor_id = hash(column_name) def fmt(self): # TODO: render in template name = self.message_type.name.replace("_", " ") if name == "HIGH CORRELATION": name = '<abbr title="This variable has a high correlation with {num} fields: {title}">HIGH CORRELATION</abbr>'.format( num=len(self.values["fields"]), title=", ".join(self.values["fields"]) ) return name def __repr__(self): return "[{message_type}] warning on column {column}".format( message_type=self.message_type.name, column=self.column_name )
Methods
def fmt(self)
-
Expand source code
def fmt(self): # TODO: render in template name = self.message_type.name.replace("_", " ") if name == "HIGH CORRELATION": name = '<abbr title="This variable has a high correlation with {num} fields: {title}">HIGH CORRELATION</abbr>'.format( num=len(self.values["fields"]), title=", ".join(self.values["fields"]) ) return name
class MessageType (*args, **kwargs)
-
Message Types
Expand source code
class MessageType(Enum): """Message Types""" CONSTANT = 1 """This variable has a constant value.""" ZEROS = 2 """This variable contains zeros.""" HIGH_CORRELATION = 3 """This variable is highly correlated.""" RECODED = 4 """This variable is correlated (categorical).""" HIGH_CARDINALITY = 5 """This variable has a high cardinality.""" UNSUPPORTED = 6 """This variable is unsupported.""" DUPLICATES = 7 """This variable contains duplicates.""" SKEWED = 8 """This variable is highly skewed.""" MISSING = 9 """This variable contains missing values.""" INFINITE = 10 """This variable contains infinite values.""" TYPE_DATE = 11 """This variable is likely a datetime, but treated as categorical.""" UNIQUE = 12 """This variable has unique values.""" CONSTANT_LENGTH = 13 """This variable has a constant length""" REJECTED = 15 """Variables are rejected if we do not want to consider them for further analysis.""" UNIFORM = 14 """The variable is uniformly distributed"""
Ancestors
- enum.Enum
Class variables
var CONSTANT
-
This variable has a constant value.
var CONSTANT_LENGTH
-
This variable has a constant length
var DUPLICATES
-
This variable contains duplicates.
var HIGH_CARDINALITY
-
This variable has a high cardinality.
var HIGH_CORRELATION
-
This variable is highly correlated.
var INFINITE
-
This variable contains infinite values.
var MISSING
-
This variable contains missing values.
var RECODED
-
This variable is correlated (categorical).
var REJECTED
-
Variables are rejected if we do not want to consider them for further analysis.
var SKEWED
-
This variable is highly skewed.
var TYPE_DATE
-
This variable is likely a datetime, but treated as categorical.
var UNIFORM
-
The variable is uniformly distributed
var UNIQUE
-
This variable has unique values.
var UNSUPPORTED
-
This variable is unsupported.
var ZEROS
-
This variable contains zeros.