Module pandas_profiling.utils.dataframe

Utils for pandas DataFrames.

Source code
"""Utils for pandas DataFrames."""
import warnings
from pathlib import Path

import pandas as pd


def warn_read(extension):
    """Warn the user when an extension is not supported.

    Args:
        extension: The extension that is not supported.
    """
    warnings.warn(
        "There was an attempt to read a file with extension {extension}, we assume it to be in CSV format.\n"
        "To prevent this warning from showing up, please rename the file to any of the extensions supported by pandas\n"
        "(docs: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html)\n"
        "If you think this extension should be supported, please report this as an issue:\n"
        "https://github.com/pandas-profiling/pandas-profiling/issues".format(
            extension=extension
        )
    )


def read_pandas(file_name: Path) -> pd.DataFrame:
    """Read DataFrame based on the file extension. This function is used when the file is in a standard format.
    Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet)

    Args:
        file_name: the file to read

    Returns:
        DataFrame

    Notes:
        This function is based on pandas IO tools:
        https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
        https://pandas.pydata.org/pandas-docs/stable/reference/io.html

        This function is not intended to be flexible or complete. The main use case is reading files without user
        input, as is currently done in the editor integration. For more advanced use cases, the user should load
        the DataFrame in code.
    """
    extension = file_name.suffix.lower()
    if extension == ".json":
        df = pd.read_json(str(file_name))
    elif extension == ".jsonl":
        df = pd.read_json(str(file_name), lines=True)
    elif extension == ".dta":
        df = pd.read_stata(str(file_name))
    elif extension == ".tsv":
        df = pd.read_csv(str(file_name), sep="\t")
    elif extension in [".xls", ".xlsx"]:
        df = pd.read_excel(str(file_name))
    elif extension in [".hdf", ".h5"]:
        df = pd.read_hdf(str(file_name))
    elif extension in [".sas7bdat", ".xpt"]:
        df = pd.read_sas(str(file_name))
    elif extension == ".parquet":
        df = pd.read_parquet(str(file_name))
    elif extension in [".pkl", ".pickle"]:
        df = pd.read_pickle(str(file_name))
    else:
        if extension != ".csv":
            warn_read(extension)

        df = pd.read_csv(str(file_name))
    return df


def clean_column_names(df):
    """Removes spaces and colons from pandas DataFrame column names

    Args:
        df: DataFrame

    Returns:
        DataFrame with spaces in column names replaced by underscores, colons removed.
    """
    df.columns = df.columns.str.replace(" ", "_")
    df.columns = df.columns.str.replace(":", "")
    return df


def rename_index(df):
    """If the DataFrame contains a column or index named `index`, this will produce errors. We rename the {index,column}
    to be `df_index`.

    Args:
        df: DataFrame to process.

    Returns:
        The DataFrame with any column or index level named `index` renamed to `df_index`; unchanged if no such name is present.
    """
    df.rename(columns={"index": "df_index"}, inplace=True)

    if "index" in df.index.names:
        df.index.names = [x if x != "index" else "df_index" for x in df.index.names]
    return df


def expand_mixed(df: pd.DataFrame, types=None) -> pd.DataFrame:
    """Expand non-nested lists, dicts and tuples in a DataFrame into columns with a prefix.

    Args:
        types: list of types to expand (Default: list, dict, tuple)
        df: DataFrame

    Returns:
        DataFrame with the dict values as series, identified by the combination of the series and dict keys.

    Notes:
        TODO: allow for list expansion by duplicating rows.
        TODO: allow for expansion of nested data structures.
    """
    if types is None:
        types = [list, dict, tuple]

    for column_name in df.columns:
        # True for every non-null value that is a list, dict or tuple containing no nested list/dict/tuple
        non_nested_enumeration = (
            df[column_name]
            .dropna()
            .map(lambda x: type(x) in types and not any(type(y) in types for y in x))
        )

        if non_nested_enumeration.all():
            # Expand and prefix
            expanded = pd.DataFrame(df[column_name].dropna().tolist())
            expanded = expanded.add_prefix(column_name + "_")

            # Recursively expand the newly created columns
            expanded = expand_mixed(expanded)

            # Drop the expanded column from the original DataFrame
            df.drop(columns=[column_name], inplace=True)

            df = pd.concat([df, expanded], axis=1)
    return df

Functions

def clean_column_names(df)

Replaces spaces with underscores and removes colons in pandas DataFrame column names.

Args

df
DataFrame

Returns

DataFrame with spaces in column names replaced by underscores, colons removed.

Source code
def clean_column_names(df):
    """Removes spaces and colons from pandas DataFrame column names

    Args:
        df: DataFrame

    Returns:
        DataFrame with spaces in column names replaced by underscores, colons removed.
    """
    df.columns = df.columns.str.replace(" ", "_")
    df.columns = df.columns.str.replace(":", "")
    return df
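
A short, hedged example of the effect on column names (the data is illustrative):

import pandas as pd

from pandas_profiling.utils.dataframe import clean_column_names

df = pd.DataFrame({"first name": [1, 2], "ratio: a/b": [0.5, 0.7]})
df = clean_column_names(df)
print(list(df.columns))  # ['first_name', 'ratio_a/b']
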
def expand_mixed(df, types=None)

Expand non-nested lists, dicts and tuples in a DataFrame into columns with a prefix.

Args

types
list of types to expand (Default: list, dict, tuple)
df
DataFrame

Returns

DataFrame with the dict values as series, identified by the combination of the series and dict keys.

Notes

TODO
allow for list expansion by duplicating rows.
TODO
allow for expansion of nested data structures.
Source code
def expand_mixed(df: pd.DataFrame, types=None) -> pd.DataFrame:
    """Expand non-nested lists, dicts and tuples in a DataFrame into columns with a prefix.

    Args:
        types: list of types to expand (Default: list, dict, tuple)
        df: DataFrame

    Returns:
        DataFrame with the dict values as series, identified by the combination of the series and dict keys.

    Notes:
        TODO: allow for list expansion by duplicating rows.
        TODO: allow for expansion of nested data structures.
    """
    if types is None:
        types = [list, dict, tuple]

    for column_name in df.columns:
        # True for every non-null value that is a list, dict or tuple containing no nested list/dict/tuple
        non_nested_enumeration = (
            df[column_name]
            .dropna()
            .map(lambda x: type(x) in types and not any(type(y) in types for y in x))
        )

        if non_nested_enumeration.all():
            # Expand and prefix
            expanded = pd.DataFrame(df[column_name].dropna().tolist())
            expanded = expanded.add_prefix(column_name + "_")

            # Recursively expand the newly created columns
            expanded = expand_mixed(expanded)

            # Drop the expanded column from the original DataFrame
            df.drop(columns=[column_name], inplace=True)

            df = pd.concat([df, expanded], axis=1)
    return df
def read_pandas(file_name)

Read a DataFrame based on the file extension. This function is used when the file is in a standard format. Various file types are supported (.csv, .json, .jsonl, .dta, .tsv, .xls, .xlsx, .hdf, .h5, .xpt, .sas7bdat, .parquet, .pkl, .pickle).

Args

file_name
the file to read

Returns

DataFrame

Notes

This function is based on pandas IO tools: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html https://pandas.pydata.org/pandas-docs/stable/reference/io.html

This function is not intended to be flexible or complete. The main use case is reading files without user input, as is currently done in the editor integration. For more advanced use cases, the user should load the DataFrame in code.

Source code
def read_pandas(file_name: Path) -> pd.DataFrame:
    """Read DataFrame based on the file extension. This function is used when the file is in a standard format.
    Various file types are supported (.csv, .json, .jsonl, .data, .tsv, .xls, .xlsx, .xpt, .sas7bdat, .parquet)

    Args:
        file_name: the file to read

    Returns:
        DataFrame

    Notes:
        This function is based on pandas IO tools:
        https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
        https://pandas.pydata.org/pandas-docs/stable/reference/io.html

        This function is not intended to be flexible or complete. The main use case is reading files without user
        input, as is currently done in the editor integration. For more advanced use cases, the user should load
        the DataFrame in code.
    """
    extension = file_name.suffix.lower()
    if extension == ".json":
        df = pd.read_json(str(file_name))
    elif extension == ".jsonl":
        df = pd.read_json(str(file_name), lines=True)
    elif extension == ".dta":
        df = pd.read_stata(str(file_name))
    elif extension == ".tsv":
        df = pd.read_csv(str(file_name), sep="\t")
    elif extension in [".xls", ".xlsx"]:
        df = pd.read_excel(str(file_name))
    elif extension in [".hdf", ".h5"]:
        df = pd.read_hdf(str(file_name))
    elif extension in [".sas7bdat", ".xpt"]:
        df = pd.read_sas(str(file_name))
    elif extension == ".parquet":
        df = pd.read_parquet(str(file_name))
    elif extension in [".pkl", ".pickle"]:
        df = pd.read_pickle(str(file_name))
    else:
        if extension != ".csv":
            warn_read(extension)

        df = pd.read_csv(str(file_name))
    return df
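
A usage sketch with hypothetical paths; the reader dispatches on the file suffix and falls back to CSV (with a warning) for unknown extensions:

from pathlib import Path

from pandas_profiling.utils.dataframe import read_pandas

df_csv = read_pandas(Path("records.csv"))          # uses pd.read_csv
df_parquet = read_pandas(Path("records.parquet"))  # uses pd.read_parquet
df_unknown = read_pandas(Path("records.data"))     # warns, then falls back to pd.read_csv
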
def rename_index(df)

If the DataFrame contains a column or index level named index, this will produce errors. That column or index level is therefore renamed to df_index.

Args

df
DataFrame to process.

Returns

The DataFrame with any column or index level named index renamed to df_index; unchanged if no such name is present.

Source code
def rename_index(df):
    """If the DataFrame contains a column or index named `index`, this will produce errors. We rename the {index,column}
    to be `df_index`.

    Args:
        df: DataFrame to process.

    Returns:
        The DataFrame with any column or index level named `index` renamed to `df_index`; unchanged if no such name is present.
    """
    df.rename(columns={"index": "df_index"}, inplace=True)

    if "index" in df.index.names:
        df.index.names = [x if x != "index" else "df_index" for x in df.index.names]
    return df
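
A brief example covering both cases handled (illustrative data):

import pandas as pd

from pandas_profiling.utils.dataframe import rename_index

# A column named "index" is renamed.
df = pd.DataFrame({"index": [0, 1], "value": [10, 20]})
df = rename_index(df)
print(list(df.columns))  # ['df_index', 'value']

# An index level named "index" is renamed as well.
df2 = pd.DataFrame({"value": [10, 20]}).rename_axis("index")
df2 = rename_index(df2)
print(list(df2.index.names))  # ['df_index']
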
def warn_read(extension)

Warn the user when an extension is not supported.

Args

extension
The extension that is not supported.
Source code
def warn_read(extension):
    """Warn the user when an extension is not supported.

    Args:
        extension: The extension that is not supported.
    """
    warnings.warn(
        "There was an attempt to read a file with extension {extension}, we assume it to be in CSV format.\n"
        "To prevent this warning from showing up, please rename the file to any of the extensions supported by pandas\n"
        "(docs: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html)\n"
        "If you think this extension should be supported, please report this as an issue:\n"
        "https://github.com/pandas-profiling/pandas-profiling/issues".format(
            extension=extension
        )
    )
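
A quick sketch of capturing the warning for a hypothetical unsupported extension:

import warnings

from pandas_profiling.utils.dataframe import warn_read

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    warn_read(".foo")

print(caught[0].category.__name__)       # UserWarning
print(".foo" in str(caught[0].message))  # True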