Module pandas_profiling.report.structure.overview

Expand source code
from urllib.parse import quote

from pandas_profiling.report.presentation.core import HTML, Table, Sequence, Warnings


def get_dataset_overview(summary):
    """Build the "Overview" section from the dataset-level summary.

    Args:
        summary: Mapping whose ``"table"`` entry provides the statistics
            read below (``n_var``, ``n``, ``n_cells_missing``,
            ``p_cells_missing``, ``n_duplicates``, ``p_duplicates``,
            ``memory_size``, ``record_size``) and a ``types`` mapping of
            type name to count.

    Returns:
        A ``Sequence`` of type ``"grid"`` holding two ``Table`` items:
        "Dataset statistics" and "Variable types".
    """
    # (row label, key in summary["table"], "fmt" value), in display order.
    statistic_specs = (
        ("Number of variables", "n_var", "fmt_numeric"),
        ("Number of observations", "n", "fmt_numeric"),
        ("Missing cells", "n_cells_missing", "fmt_numeric"),
        ("Missing cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
        ("Total size in memory", "memory_size", "fmt_bytesize"),
        ("Average record size in memory", "record_size", "fmt_bytesize"),
    )

    statistic_rows = []
    for label, key, formatter in statistic_specs:
        statistic_rows.append(
            {"name": label, "value": summary["table"][key], "fmt": formatter}
        )
    dataset_info = Table(statistic_rows, name="Dataset statistics")

    # One row per entry of the "types" mapping, in its iteration order.
    type_rows = []
    for type_name, count in summary["table"]["types"].items():
        type_rows.append({"name": type_name, "value": count, "fmt": "fmt_numeric"})
    dataset_types = Table(type_rows, name="Variable types")

    sections = [dataset_info, dataset_types]
    return Sequence(
        sections,
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )


def get_dataset_reproduction(summary, date_start, date_end):
    """Build the "Reproduction" table of the report.

    Args:
        summary: Mapping whose ``"package"`` entry provides
            ``pandas_profiling_version`` and ``pandas_profiling_config``.
        date_start: Value shown in the "Analysis started" row.
        date_end: Value shown in the "Analysis finished" row.

    Returns:
        A ``Table`` named "Reproduction" with the start/end values, a link
        labelled with the package version, an example command line, and a
        download link for the configuration.
    """
    version_link = '<a href="https://github.com/pandas-profiling/pandas-profiling">pandas-profiling v{version}</a>'.format(
        version=summary["package"]["pandas_profiling_version"]
    )

    # The configuration text is percent-encoded so it can be embedded
    # directly in the href as a data: URI.
    encoded_config = quote(summary["package"]["pandas_profiling_config"])
    config_link = '<a download="config.yaml" href="data:text/plain;charset=utf-8,{config}">config.yaml</a>'.format(
        config=encoded_config
    )

    command_line = (
        "<code>pandas_profiling --config_file config.yaml [YOUR_FILE.csv]</code>"
    )

    rows = [
        {"name": "Analysis started", "value": date_start, "fmt": "fmt"},
        {"name": "Analysis finished", "value": date_end, "fmt": "fmt"},
        {"name": "Version", "value": version_link, "fmt": "raw"},
        {"name": "Command line", "value": command_line, "fmt": "raw"},
        {"name": "Download configuration", "value": config_link, "fmt": "raw"},
    ]
    return Table(rows, name="Reproduction", anchor_id="reproduction")


def get_dataset_warnings(warnings, count):
    """Wrap the warnings in a ``Warnings`` item titled "Warnings (<count>)".

    Args:
        warnings: Passed through unchanged as the ``warnings`` argument.
        count: Value interpolated into the item's name.

    Returns:
        A ``Warnings`` item with anchor id ``"warnings"``.
    """
    section_title = "Warnings ({count})".format(count=count)
    return Warnings(warnings=warnings, name=section_title, anchor_id="warnings")

Functions

def get_dataset_overview(summary)
Expand source code
def get_dataset_overview(summary):
    """Build the "Overview" section from the dataset-level summary.

    Args:
        summary: Mapping whose ``"table"`` entry provides the statistics
            read below (``n_var``, ``n``, ``n_cells_missing``,
            ``p_cells_missing``, ``n_duplicates``, ``p_duplicates``,
            ``memory_size``, ``record_size``) and a ``types`` mapping of
            type name to count.

    Returns:
        A ``Sequence`` of type ``"grid"`` holding two ``Table`` items:
        "Dataset statistics" and "Variable types".
    """
    # (row label, key in summary["table"], "fmt" value), in display order.
    statistic_specs = (
        ("Number of variables", "n_var", "fmt_numeric"),
        ("Number of observations", "n", "fmt_numeric"),
        ("Missing cells", "n_cells_missing", "fmt_numeric"),
        ("Missing cells (%)", "p_cells_missing", "fmt_percent"),
        ("Duplicate rows", "n_duplicates", "fmt_numeric"),
        ("Duplicate rows (%)", "p_duplicates", "fmt_percent"),
        ("Total size in memory", "memory_size", "fmt_bytesize"),
        ("Average record size in memory", "record_size", "fmt_bytesize"),
    )

    statistic_rows = []
    for label, key, formatter in statistic_specs:
        statistic_rows.append(
            {"name": label, "value": summary["table"][key], "fmt": formatter}
        )
    dataset_info = Table(statistic_rows, name="Dataset statistics")

    # One row per entry of the "types" mapping, in its iteration order.
    type_rows = []
    for type_name, count in summary["table"]["types"].items():
        type_rows.append({"name": type_name, "value": count, "fmt": "fmt_numeric"})
    dataset_types = Table(type_rows, name="Variable types")

    sections = [dataset_info, dataset_types]
    return Sequence(
        sections,
        anchor_id="dataset_overview",
        name="Overview",
        sequence_type="grid",
    )
def get_dataset_reproduction(summary, date_start, date_end)
Expand source code
def get_dataset_reproduction(summary, date_start, date_end):
    """Build the "Reproduction" table of the report.

    Args:
        summary: Mapping whose ``"package"`` entry provides
            ``pandas_profiling_version`` and ``pandas_profiling_config``.
        date_start: Value shown in the "Analysis started" row.
        date_end: Value shown in the "Analysis finished" row.

    Returns:
        A ``Table`` named "Reproduction" with the start/end values, a link
        labelled with the package version, an example command line, and a
        download link for the configuration.
    """
    version_link = '<a href="https://github.com/pandas-profiling/pandas-profiling">pandas-profiling v{version}</a>'.format(
        version=summary["package"]["pandas_profiling_version"]
    )

    # The configuration text is percent-encoded so it can be embedded
    # directly in the href as a data: URI.
    encoded_config = quote(summary["package"]["pandas_profiling_config"])
    config_link = '<a download="config.yaml" href="data:text/plain;charset=utf-8,{config}">config.yaml</a>'.format(
        config=encoded_config
    )

    command_line = (
        "<code>pandas_profiling --config_file config.yaml [YOUR_FILE.csv]</code>"
    )

    rows = [
        {"name": "Analysis started", "value": date_start, "fmt": "fmt"},
        {"name": "Analysis finished", "value": date_end, "fmt": "fmt"},
        {"name": "Version", "value": version_link, "fmt": "raw"},
        {"name": "Command line", "value": command_line, "fmt": "raw"},
        {"name": "Download configuration", "value": config_link, "fmt": "raw"},
    ]
    return Table(rows, name="Reproduction", anchor_id="reproduction")
def get_dataset_warnings(warnings, count)
Expand source code
def get_dataset_warnings(warnings, count):
    """Wrap the warnings in a ``Warnings`` item titled "Warnings (<count>)".

    Args:
        warnings: Passed through unchanged as the ``warnings`` argument.
        count: Value interpolated into the item's name.

    Returns:
        A ``Warnings`` item with anchor id ``"warnings"``.
    """
    section_title = "Warnings ({count})".format(count=count)
    return Warnings(warnings=warnings, name=section_title, anchor_id="warnings")