Module imodels.tree.c45_tree.c45_tree

Modified from https://github.com/RaczeQ/scikit-learn-C4.5-tree-classifier References


.. [1] https://en.wikipedia.org/wiki/Decision_tree_learning .. [2] https://en.wikipedia.org/wiki/C4.5_algorithm

Expand source code
"""Modified from https://github.com/RaczeQ/scikit-learn-C4.5-tree-classifier
References
----------
.. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] https://en.wikipedia.org/wiki/C4.5_algorithm
"""
from copy import deepcopy
from typing import List
from xml.dom import minidom
from xml.etree import ElementTree as ET

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

from imodels.tree.c45_tree.c45_utils import decision, is_numeric_feature, gain, gain_ratio, get_best_split, \
    set_as_leaf_node


class C45TreeClassifier(BaseEstimator, ClassifierMixin):
    """A C4.5 tree classifier.

    Parameters
    ----------
    max_rules : int, optional (default=None)
        Maximum number of split nodes allowed in the tree
    """

    def __init__(self, max_rules: int = None):
        super().__init__()
        self.max_rules = max_rules

    def fit(self, X, y, feature_names: str = None):
        self.complexity_ = 0
        X, y = check_X_y(X, y)
        self.resultType = type(y[0])
        if feature_names is None:
            self.feature_names = [f'X_{x}' for x in range(X.shape[1])]
        else:
            # only include alphanumeric chars / replace spaces with underscores
            self.feature_names = [''.join([i for i in x if i.isalnum()]).replace(' ', '_')
                                  for x in feature_names]
            self.feature_names = ['X_' + x if x[0].isdigit()
                                  else x
                                  for x in self.feature_names]

        assert len(self.feature_names) == X.shape[1]

        data = [[] for i in range(len(self.feature_names))]
        categories = []

        for i in range(len(X)):
            categories.append(str(y[i]))
            for j in range(len(self.feature_names)):
                data[j].append(X[i][j])
        root = ET.Element('GreedyTree')
        self.grow_tree(data, categories, root, self.feature_names)  # adds to root
        self.tree_ = ET.tostring(root, encoding="unicode")
        # print('self.tree_', self.tree_)
        self.dom_ = minidom.parseString(self.tree_)
        return self

    def raw_preds(self, X):
        check_is_fitted(self, ['tree_', 'resultType', 'feature_names'])
        X = check_array(X)
        if isinstance(X, pd.DataFrame):
            X = deepcopy(X)
            X.columns = self.feature_names
        root = self.dom_.childNodes[0]
        prediction = []
        for i in range(X.shape[0]):
            answerlist = decision(root, X[i], self.feature_names, 1)
            answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True)
            answer = answerlist[0][0]
            prediction.append(self.resultType(answer))
        return np.array(prediction)

    def predict(self, X):
        return (self.raw_preds(X) > 0.5).astype(int)

    def predict_proba(self, X):
        raw_preds = self.raw_preds(X)
        return np.vstack((1 - raw_preds, raw_preds)).transpose()

    def __str__(self):
        check_is_fitted(self, ['tree_'])
        return self.dom_.toprettyxml(newl="\r\n")

    def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names):
        """
        Parameters
        ----------
        X_t: List[list]
            input data transposed (num_features x num_observations)
        y_str: List[str]
            outcome represented as strings

        parent
        attrs_names

        """
        # check that y contains more than 1 distinct value
        if len(set(y_str)) > 1:
            split = []

            # loop over features and build up potential splits
            for i in range(len(X_t)):
                if set(X_t[i]) == set("?"):
                    split.append(0)
                else:
                    if is_numeric_feature(X_t[i]):
                        split.append(gain(y_str, X_t[i]))
                    else:
                        split.append(gain_ratio(y_str, X_t[i]))

            # no good split, return child node
            if max(split) == 0:
                set_as_leaf_node(parent, y_str)

            # there is a good split
            else:
                index_selected = split.index(max(split))
                name_selected = str(attrs_names[index_selected])
                self.complexity_ += 1
                if is_numeric_feature(X_t[index_selected]):
                    # split on this point
                    split_point = get_best_split(y_str, X_t[index_selected])

                    # build up children nodes
                    r_child_X = [[] for i in range(len(X_t))]
                    r_child_y = []
                    l_child_X = [[] for i in range(len(X_t))]
                    l_child_y = []
                    for i in range(len(y_str)):
                        if not X_t[index_selected][i] == "?":
                            if float(X_t[index_selected][i]) < float(split_point):
                                l_child_y.append(y_str[i])
                                for j in range(len(X_t)):
                                    l_child_X[j].append(X_t[j][i])
                            else:
                                r_child_y.append(y_str[i])
                                for j in range(len(X_t)):
                                    r_child_X[j].append(X_t[j][i])

                    # grow child nodes as well
                    if len(l_child_y) > 0 and len(r_child_y) > 0 and (
                            self.max_rules is None or
                            self.complexity_ <= self.max_rules
                    ):
                        p_l = float(len(l_child_y)) / (len(X_t[index_selected]) - X_t[index_selected].count("?"))
                        son = ET.SubElement(parent, name_selected,
                                            {'feature': str(split_point), "flag": "l", "p": str(round(p_l, 3))})
                        self.grow_tree(l_child_X, l_child_y, son, attrs_names)
                        son = ET.SubElement(parent, name_selected,
                                            {'feature': str(split_point), "flag": "r", "p": str(round(1 - p_l, 3))})
                        self.grow_tree(r_child_X, r_child_y, son, attrs_names)
                    else:
                        num_max = 0
                        for cat in set(y_str):
                            num_cat = y_str.count(cat)
                            if num_cat > num_max:
                                num_max = num_cat
                                most_cat = cat
                        parent.text = most_cat
                else:
                    # split on non-numeric variable (e.g. categorical)
                    # create a leaf for each unique value
                    for k in set(X_t[index_selected]):
                        if not k == "?" and (
                                self.max_rules is None or
                                self.complexity_ <= self.max_rules
                        ):
                            child_X = [[] for i in range(len(X_t))]
                            child_y = []
                            for i in range(len(y_str)):
                                if X_t[index_selected][i] == k:
                                    child_y.append(y_str[i])
                                    for j in range(len(X_t)):
                                        child_X[j].append(X_t[j][i])
                            son = ET.SubElement(parent, name_selected, {
                                'feature': k, "flag": "m",
                                'p': str(round(
                                    float(len(child_y)) / (
                                            len(X_t[index_selected]) - X_t[index_selected].count("?")),
                                    3))})
                            self.grow_tree(child_X, child_y, son, attrs_names)
        else:
            parent.text = y_str[0]


if __name__ == '__main__':
    from imodels.util.data_util import get_clean_dataset

    X, y, feature_names = get_clean_dataset('ionosphere', data_source='pmlb')
    m = C45TreeClassifier(max_rules=3)
    m.fit(X, y)
    print('mse', np.mean(np.square(m.predict(X) - y)))
    print(m)
    m.predict(X)

Classes

class C45TreeClassifier (max_rules: int = None)

A C4.5 tree classifier.

Parameters

max_rules : int, optional (default=None)
Maximum number of split nodes allowed in the tree
Expand source code
class C45TreeClassifier(BaseEstimator, ClassifierMixin):
    """A C4.5 tree classifier.

    Parameters
    ----------
    max_rules : int, optional (default=None)
        Maximum number of split nodes allowed in the tree
    """

    def __init__(self, max_rules: int = None):
        super().__init__()
        self.max_rules = max_rules

    def fit(self, X, y, feature_names: str = None):
        self.complexity_ = 0
        X, y = check_X_y(X, y)
        self.resultType = type(y[0])
        if feature_names is None:
            self.feature_names = [f'X_{x}' for x in range(X.shape[1])]
        else:
            # only include alphanumeric chars / replace spaces with underscores
            self.feature_names = [''.join([i for i in x if i.isalnum()]).replace(' ', '_')
                                  for x in feature_names]
            self.feature_names = ['X_' + x if x[0].isdigit()
                                  else x
                                  for x in self.feature_names]

        assert len(self.feature_names) == X.shape[1]

        data = [[] for i in range(len(self.feature_names))]
        categories = []

        for i in range(len(X)):
            categories.append(str(y[i]))
            for j in range(len(self.feature_names)):
                data[j].append(X[i][j])
        root = ET.Element('GreedyTree')
        self.grow_tree(data, categories, root, self.feature_names)  # adds to root
        self.tree_ = ET.tostring(root, encoding="unicode")
        # print('self.tree_', self.tree_)
        self.dom_ = minidom.parseString(self.tree_)
        return self

    def raw_preds(self, X):
        check_is_fitted(self, ['tree_', 'resultType', 'feature_names'])
        X = check_array(X)
        if isinstance(X, pd.DataFrame):
            X = deepcopy(X)
            X.columns = self.feature_names
        root = self.dom_.childNodes[0]
        prediction = []
        for i in range(X.shape[0]):
            answerlist = decision(root, X[i], self.feature_names, 1)
            answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True)
            answer = answerlist[0][0]
            prediction.append(self.resultType(answer))
        return np.array(prediction)

    def predict(self, X):
        return (self.raw_preds(X) > 0.5).astype(int)

    def predict_proba(self, X):
        raw_preds = self.raw_preds(X)
        return np.vstack((1 - raw_preds, raw_preds)).transpose()

    def __str__(self):
        check_is_fitted(self, ['tree_'])
        return self.dom_.toprettyxml(newl="\r\n")

    def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names):
        """
        Parameters
        ----------
        X_t: List[list]
            input data transposed (num_features x num_observations)
        y_str: List[str]
            outcome represented as strings

        parent
        attrs_names

        """
        # check that y contains more than 1 distinct value
        if len(set(y_str)) > 1:
            split = []

            # loop over features and build up potential splits
            for i in range(len(X_t)):
                if set(X_t[i]) == set("?"):
                    split.append(0)
                else:
                    if is_numeric_feature(X_t[i]):
                        split.append(gain(y_str, X_t[i]))
                    else:
                        split.append(gain_ratio(y_str, X_t[i]))

            # no good split, return child node
            if max(split) == 0:
                set_as_leaf_node(parent, y_str)

            # there is a good split
            else:
                index_selected = split.index(max(split))
                name_selected = str(attrs_names[index_selected])
                self.complexity_ += 1
                if is_numeric_feature(X_t[index_selected]):
                    # split on this point
                    split_point = get_best_split(y_str, X_t[index_selected])

                    # build up children nodes
                    r_child_X = [[] for i in range(len(X_t))]
                    r_child_y = []
                    l_child_X = [[] for i in range(len(X_t))]
                    l_child_y = []
                    for i in range(len(y_str)):
                        if not X_t[index_selected][i] == "?":
                            if float(X_t[index_selected][i]) < float(split_point):
                                l_child_y.append(y_str[i])
                                for j in range(len(X_t)):
                                    l_child_X[j].append(X_t[j][i])
                            else:
                                r_child_y.append(y_str[i])
                                for j in range(len(X_t)):
                                    r_child_X[j].append(X_t[j][i])

                    # grow child nodes as well
                    if len(l_child_y) > 0 and len(r_child_y) > 0 and (
                            self.max_rules is None or
                            self.complexity_ <= self.max_rules
                    ):
                        p_l = float(len(l_child_y)) / (len(X_t[index_selected]) - X_t[index_selected].count("?"))
                        son = ET.SubElement(parent, name_selected,
                                            {'feature': str(split_point), "flag": "l", "p": str(round(p_l, 3))})
                        self.grow_tree(l_child_X, l_child_y, son, attrs_names)
                        son = ET.SubElement(parent, name_selected,
                                            {'feature': str(split_point), "flag": "r", "p": str(round(1 - p_l, 3))})
                        self.grow_tree(r_child_X, r_child_y, son, attrs_names)
                    else:
                        num_max = 0
                        for cat in set(y_str):
                            num_cat = y_str.count(cat)
                            if num_cat > num_max:
                                num_max = num_cat
                                most_cat = cat
                        parent.text = most_cat
                else:
                    # split on non-numeric variable (e.g. categorical)
                    # create a leaf for each unique value
                    for k in set(X_t[index_selected]):
                        if not k == "?" and (
                                self.max_rules is None or
                                self.complexity_ <= self.max_rules
                        ):
                            child_X = [[] for i in range(len(X_t))]
                            child_y = []
                            for i in range(len(y_str)):
                                if X_t[index_selected][i] == k:
                                    child_y.append(y_str[i])
                                    for j in range(len(X_t)):
                                        child_X[j].append(X_t[j][i])
                            son = ET.SubElement(parent, name_selected, {
                                'feature': k, "flag": "m",
                                'p': str(round(
                                    float(len(child_y)) / (
                                            len(X_t[index_selected]) - X_t[index_selected].count("?")),
                                    3))})
                            self.grow_tree(child_X, child_y, son, attrs_names)
        else:
            parent.text = y_str[0]

Ancestors

  • sklearn.base.BaseEstimator
  • sklearn.base.ClassifierMixin

Methods

def fit(self, X, y, feature_names: str = None)
Expand source code
def fit(self, X, y, feature_names: str = None):
    self.complexity_ = 0
    X, y = check_X_y(X, y)
    self.resultType = type(y[0])
    if feature_names is None:
        self.feature_names = [f'X_{x}' for x in range(X.shape[1])]
    else:
        # only include alphanumeric chars / replace spaces with underscores
        self.feature_names = [''.join([i for i in x if i.isalnum()]).replace(' ', '_')
                              for x in feature_names]
        self.feature_names = ['X_' + x if x[0].isdigit()
                              else x
                              for x in self.feature_names]

    assert len(self.feature_names) == X.shape[1]

    data = [[] for i in range(len(self.feature_names))]
    categories = []

    for i in range(len(X)):
        categories.append(str(y[i]))
        for j in range(len(self.feature_names)):
            data[j].append(X[i][j])
    root = ET.Element('GreedyTree')
    self.grow_tree(data, categories, root, self.feature_names)  # adds to root
    self.tree_ = ET.tostring(root, encoding="unicode")
    # print('self.tree_', self.tree_)
    self.dom_ = minidom.parseString(self.tree_)
    return self
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names)

Parameters

X_t : List[list]
input data transposed (num_features x num_observations)
y_str : List[str]
outcome represented as strings
parent
 
attrs_names
 
Expand source code
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names):
    """
    Parameters
    ----------
    X_t: List[list]
        input data transposed (num_features x num_observations)
    y_str: List[str]
        outcome represented as strings

    parent
    attrs_names

    """
    # check that y contains more than 1 distinct value
    if len(set(y_str)) > 1:
        split = []

        # loop over features and build up potential splits
        for i in range(len(X_t)):
            if set(X_t[i]) == set("?"):
                split.append(0)
            else:
                if is_numeric_feature(X_t[i]):
                    split.append(gain(y_str, X_t[i]))
                else:
                    split.append(gain_ratio(y_str, X_t[i]))

        # no good split, return child node
        if max(split) == 0:
            set_as_leaf_node(parent, y_str)

        # there is a good split
        else:
            index_selected = split.index(max(split))
            name_selected = str(attrs_names[index_selected])
            self.complexity_ += 1
            if is_numeric_feature(X_t[index_selected]):
                # split on this point
                split_point = get_best_split(y_str, X_t[index_selected])

                # build up children nodes
                r_child_X = [[] for i in range(len(X_t))]
                r_child_y = []
                l_child_X = [[] for i in range(len(X_t))]
                l_child_y = []
                for i in range(len(y_str)):
                    if not X_t[index_selected][i] == "?":
                        if float(X_t[index_selected][i]) < float(split_point):
                            l_child_y.append(y_str[i])
                            for j in range(len(X_t)):
                                l_child_X[j].append(X_t[j][i])
                        else:
                            r_child_y.append(y_str[i])
                            for j in range(len(X_t)):
                                r_child_X[j].append(X_t[j][i])

                # grow child nodes as well
                if len(l_child_y) > 0 and len(r_child_y) > 0 and (
                        self.max_rules is None or
                        self.complexity_ <= self.max_rules
                ):
                    p_l = float(len(l_child_y)) / (len(X_t[index_selected]) - X_t[index_selected].count("?"))
                    son = ET.SubElement(parent, name_selected,
                                        {'feature': str(split_point), "flag": "l", "p": str(round(p_l, 3))})
                    self.grow_tree(l_child_X, l_child_y, son, attrs_names)
                    son = ET.SubElement(parent, name_selected,
                                        {'feature': str(split_point), "flag": "r", "p": str(round(1 - p_l, 3))})
                    self.grow_tree(r_child_X, r_child_y, son, attrs_names)
                else:
                    num_max = 0
                    for cat in set(y_str):
                        num_cat = y_str.count(cat)
                        if num_cat > num_max:
                            num_max = num_cat
                            most_cat = cat
                    parent.text = most_cat
            else:
                # split on non-numeric variable (e.g. categorical)
                # create a leaf for each unique value
                for k in set(X_t[index_selected]):
                    if not k == "?" and (
                            self.max_rules is None or
                            self.complexity_ <= self.max_rules
                    ):
                        child_X = [[] for i in range(len(X_t))]
                        child_y = []
                        for i in range(len(y_str)):
                            if X_t[index_selected][i] == k:
                                child_y.append(y_str[i])
                                for j in range(len(X_t)):
                                    child_X[j].append(X_t[j][i])
                        son = ET.SubElement(parent, name_selected, {
                            'feature': k, "flag": "m",
                            'p': str(round(
                                float(len(child_y)) / (
                                        len(X_t[index_selected]) - X_t[index_selected].count("?")),
                                3))})
                        self.grow_tree(child_X, child_y, son, attrs_names)
    else:
        parent.text = y_str[0]
def predict(self, X)
Expand source code
def predict(self, X):
    return (self.raw_preds(X) > 0.5).astype(int)
def predict_proba(self, X)
Expand source code
def predict_proba(self, X):
    raw_preds = self.raw_preds(X)
    return np.vstack((1 - raw_preds, raw_preds)).transpose()
def raw_preds(self, X)
Expand source code
def raw_preds(self, X):
    check_is_fitted(self, ['tree_', 'resultType', 'feature_names'])
    X = check_array(X)
    if isinstance(X, pd.DataFrame):
        X = deepcopy(X)
        X.columns = self.feature_names
    root = self.dom_.childNodes[0]
    prediction = []
    for i in range(X.shape[0]):
        answerlist = decision(root, X[i], self.feature_names, 1)
        answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True)
        answer = answerlist[0][0]
        prediction.append(self.resultType(answer))
    return np.array(prediction)