Module pandas_profiling.report.structure.variables.render_categorical
Expand source code
import pandas as pd
from pandas_profiling.config import config
from pandas_profiling.report.presentation.frequency_table_utils import freq_table
from pandas_profiling.visualisation.plot import histogram
from pandas_profiling.report.presentation.core import (
Image,
FrequencyTable,
FrequencyTableSmall,
Sequence,
Table,
VariableInfo,
)
from pandas_profiling.report.structure.variables.render_common import render_common
def render_categorical(summary):
n_obs_cat = config["vars"]["cat"]["n_obs"].get(int)
image_format = config["plot"]["image_format"].get(str)
template_variables = render_common(summary)
# TODO: merge with boolean
mini_freq_table_rows = freq_table(
freqtable=summary["value_counts"],
n=summary["count"],
max_number_to_print=n_obs_cat,
)
# Top
# Element composition
info = VariableInfo(
summary["varid"], summary["varname"], "Categorical", summary["warnings"]
)
table = Table(
[
{
"name": "Distinct count",
"value": summary["n_unique"],
"fmt": "fmt",
"alert": "n_unique" in summary["warn_fields"],
},
{
"name": "Unique (%)",
"value": summary["p_unique"],
"fmt": "fmt_percent",
"alert": "p_unique" in summary["warn_fields"],
},
{
"name": "Missing",
"value": summary["n_missing"],
"fmt": "fmt",
"alert": "n_missing" in summary["warn_fields"],
},
{
"name": "Missing (%)",
"value": summary["p_missing"],
"fmt": "fmt_percent",
"alert": "p_missing" in summary["warn_fields"],
},
{
"name": "Memory size",
"value": summary["memory_size"],
"fmt": "fmt_bytesize",
"alert": False,
},
]
)
fqm = FrequencyTableSmall(mini_freq_table_rows)
# TODO: settings 3,3,6
template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid")
# Bottom
items = []
frequency_table = FrequencyTable(
template_variables["freq_table_rows"],
name="Common Values",
anchor_id="{varid}common_values".format(varid=summary["varid"]),
)
items.append(frequency_table)
check_compositions = config["vars"]["cat"]["check_composition"].get(bool)
if check_compositions:
length_table = Table(
[
{
"name": "Max length",
"value": summary["max_length"],
"fmt": "fmt_numeric",
"alert": False,
},
{
"name": "Mean length",
"value": summary["mean_length"],
"fmt": "fmt_numeric",
"alert": False,
},
{
"name": "Min length",
"value": summary["min_length"],
"fmt": "fmt_numeric",
"alert": False,
},
],
name="Length",
anchor_id="{varid}lengthstats".format(varid=summary["varid"]),
)
histogram_bins = 10
length = Image(
histogram(summary["length"], summary, histogram_bins),
image_format=image_format,
alt="Scatter",
name="Length",
anchor_id="{varid}length".format(varid=summary["varid"]),
)
tbl = Sequence(
[length, length_table],
anchor_id="{varid}tbl".format(varid=summary["varid"]),
name="Length",
sequence_type="grid",
)
items.append(tbl)
n_freq_table_max = config["n_freq_table_max"].get(int)
citems = []
vc = pd.Series(summary["category_alias_values"]).value_counts()
citems.append(
FrequencyTable(
freq_table(
freqtable=vc, n=vc.sum(), max_number_to_print=n_freq_table_max
),
name="Categories",
anchor_id="{varid}category_long_values".format(varid=summary["varid"]),
)
)
vc = pd.Series(summary["script_values"]).value_counts()
citems.append(
FrequencyTable(
freq_table(
freqtable=vc, n=vc.sum(), max_number_to_print=n_freq_table_max
),
name="Scripts",
anchor_id="{varid}script_values".format(varid=summary["varid"]),
)
)
vc = pd.Series(summary["block_alias_values"]).value_counts()
citems.append(
FrequencyTable(
freq_table(
freqtable=vc, n=vc.sum(), max_number_to_print=n_freq_table_max
),
name="Blocks",
anchor_id="{varid}block_alias_values".format(varid=summary["varid"]),
)
)
characters = Sequence(
citems,
name="Characters",
sequence_type="tabs",
anchor_id="{varid}characters".format(varid=summary["varid"]),
)
items.append(characters)
template_variables["bottom"] = Sequence(
items,
sequence_type="tabs",
anchor_id="{varid}bottom".format(varid=summary["varid"]),
)
return template_variables
Functions
def render_categorical(summary)
-
Expand source code
def render_categorical(summary): n_obs_cat = config["vars"]["cat"]["n_obs"].get(int) image_format = config["plot"]["image_format"].get(str) template_variables = render_common(summary) # TODO: merge with boolean mini_freq_table_rows = freq_table( freqtable=summary["value_counts"], n=summary["count"], max_number_to_print=n_obs_cat, ) # Top # Element composition info = VariableInfo( summary["varid"], summary["varname"], "Categorical", summary["warnings"] ) table = Table( [ { "name": "Distinct count", "value": summary["n_unique"], "fmt": "fmt", "alert": "n_unique" in summary["warn_fields"], }, { "name": "Unique (%)", "value": summary["p_unique"], "fmt": "fmt_percent", "alert": "p_unique" in summary["warn_fields"], }, { "name": "Missing", "value": summary["n_missing"], "fmt": "fmt", "alert": "n_missing" in summary["warn_fields"], }, { "name": "Missing (%)", "value": summary["p_missing"], "fmt": "fmt_percent", "alert": "p_missing" in summary["warn_fields"], }, { "name": "Memory size", "value": summary["memory_size"], "fmt": "fmt_bytesize", "alert": False, }, ] ) fqm = FrequencyTableSmall(mini_freq_table_rows) # TODO: settings 3,3,6 template_variables["top"] = Sequence([info, table, fqm], sequence_type="grid") # Bottom items = [] frequency_table = FrequencyTable( template_variables["freq_table_rows"], name="Common Values", anchor_id="{varid}common_values".format(varid=summary["varid"]), ) items.append(frequency_table) check_compositions = config["vars"]["cat"]["check_composition"].get(bool) if check_compositions: length_table = Table( [ { "name": "Max length", "value": summary["max_length"], "fmt": "fmt_numeric", "alert": False, }, { "name": "Mean length", "value": summary["mean_length"], "fmt": "fmt_numeric", "alert": False, }, { "name": "Min length", "value": summary["min_length"], "fmt": "fmt_numeric", "alert": False, }, ], name="Length", anchor_id="{varid}lengthstats".format(varid=summary["varid"]), ) histogram_bins = 10 length = Image( histogram(summary["length"], summary, histogram_bins), image_format=image_format, alt="Scatter", name="Length", anchor_id="{varid}length".format(varid=summary["varid"]), ) tbl = Sequence( [length, length_table], anchor_id="{varid}tbl".format(varid=summary["varid"]), name="Length", sequence_type="grid", ) items.append(tbl) n_freq_table_max = config["n_freq_table_max"].get(int) citems = [] vc = pd.Series(summary["category_alias_values"]).value_counts() citems.append( FrequencyTable( freq_table( freqtable=vc, n=vc.sum(), max_number_to_print=n_freq_table_max ), name="Categories", anchor_id="{varid}category_long_values".format(varid=summary["varid"]), ) ) vc = pd.Series(summary["script_values"]).value_counts() citems.append( FrequencyTable( freq_table( freqtable=vc, n=vc.sum(), max_number_to_print=n_freq_table_max ), name="Scripts", anchor_id="{varid}script_values".format(varid=summary["varid"]), ) ) vc = pd.Series(summary["block_alias_values"]).value_counts() citems.append( FrequencyTable( freq_table( freqtable=vc, n=vc.sum(), max_number_to_print=n_freq_table_max ), name="Blocks", anchor_id="{varid}block_alias_values".format(varid=summary["varid"]), ) ) characters = Sequence( citems, name="Characters", sequence_type="tabs", anchor_id="{varid}characters".format(varid=summary["varid"]), ) items.append(characters) template_variables["bottom"] = Sequence( items, sequence_type="tabs", anchor_id="{varid}bottom".format(varid=summary["varid"]), ) return template_variables