Module imodels.tree.c45_tree.c45_tree
Modified from https://github.com/RaczeQ/scikit-learn-C4.5-tree-classifier

References
----------
.. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] https://en.wikipedia.org/wiki/C4.5_algorithm
Expand source code
"""Modified from https://github.com/RaczeQ/scikit-learn-C4.5-tree-classifier
References
----------
.. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
.. [2] https://en.wikipedia.org/wiki/C4.5_algorithm
"""
from copy import deepcopy
from typing import List
from xml.dom import minidom
from xml.etree import ElementTree as ET
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y
from imodels.tree.c45_tree.c45_utils import decision, is_numeric_feature, gain, gain_ratio, get_best_split, \
set_as_leaf_node
class C45TreeClassifier(BaseEstimator, ClassifierMixin):
"""A C4.5 tree classifier.
Parameters
----------
max_rules : int, optional (default=None)
Maximum number of split nodes allowed in the tree
"""
def __init__(self, max_rules: int = None):
super().__init__()
self.max_rules = max_rules
def fit(self, X, y, feature_names: str = None):
self.complexity_ = 0
X, y = check_X_y(X, y)
self.resultType = type(y[0])
if feature_names is None:
self.feature_names = [f'X_{x}' for x in range(X.shape[1])]
else:
# only include alphanumeric chars / replace spaces with underscores
self.feature_names = [''.join([i for i in x if i.isalnum()]).replace(' ', '_')
for x in feature_names]
self.feature_names = ['X_' + x if x[0].isdigit()
else x
for x in self.feature_names]
assert len(self.feature_names) == X.shape[1]
data = [[] for i in range(len(self.feature_names))]
categories = []
for i in range(len(X)):
categories.append(str(y[i]))
for j in range(len(self.feature_names)):
data[j].append(X[i][j])
root = ET.Element('GreedyTree')
self.grow_tree(data, categories, root, self.feature_names) # adds to root
self.tree_ = ET.tostring(root, encoding="unicode")
# print('self.tree_', self.tree_)
self.dom_ = minidom.parseString(self.tree_)
return self
def raw_preds(self, X):
check_is_fitted(self, ['tree_', 'resultType', 'feature_names'])
X = check_array(X)
if isinstance(X, pd.DataFrame):
X = deepcopy(X)
X.columns = self.feature_names
root = self.dom_.childNodes[0]
prediction = []
for i in range(X.shape[0]):
answerlist = decision(root, X[i], self.feature_names, 1)
answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True)
answer = answerlist[0][0]
prediction.append(self.resultType(answer))
return np.array(prediction)
def predict(self, X):
return (self.raw_preds(X) > 0.5).astype(int)
def predict_proba(self, X):
raw_preds = self.raw_preds(X)
return np.vstack((1 - raw_preds, raw_preds)).transpose()
def __str__(self):
check_is_fitted(self, ['tree_'])
return self.dom_.toprettyxml(newl="\r\n")
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names):
"""
Parameters
----------
X_t: List[list]
input data transposed (num_features x num_observations)
y_str: List[str]
outcome represented as strings
parent
attrs_names
"""
# check that y contains more than 1 distinct value
if len(set(y_str)) > 1:
split = []
# loop over features and build up potential splits
for i in range(len(X_t)):
if set(X_t[i]) == set("?"):
split.append(0)
else:
if is_numeric_feature(X_t[i]):
split.append(gain(y_str, X_t[i]))
else:
split.append(gain_ratio(y_str, X_t[i]))
# no good split, return child node
if max(split) == 0:
set_as_leaf_node(parent, y_str)
# there is a good split
else:
index_selected = split.index(max(split))
name_selected = str(attrs_names[index_selected])
self.complexity_ += 1
if is_numeric_feature(X_t[index_selected]):
# split on this point
split_point = get_best_split(y_str, X_t[index_selected])
# build up children nodes
r_child_X = [[] for i in range(len(X_t))]
r_child_y = []
l_child_X = [[] for i in range(len(X_t))]
l_child_y = []
for i in range(len(y_str)):
if not X_t[index_selected][i] == "?":
if float(X_t[index_selected][i]) < float(split_point):
l_child_y.append(y_str[i])
for j in range(len(X_t)):
l_child_X[j].append(X_t[j][i])
else:
r_child_y.append(y_str[i])
for j in range(len(X_t)):
r_child_X[j].append(X_t[j][i])
# grow child nodes as well
if len(l_child_y) > 0 and len(r_child_y) > 0 and (
self.max_rules is None or
self.complexity_ <= self.max_rules
):
p_l = float(len(l_child_y)) / (len(X_t[index_selected]) - X_t[index_selected].count("?"))
son = ET.SubElement(parent, name_selected,
{'feature': str(split_point), "flag": "l", "p": str(round(p_l, 3))})
self.grow_tree(l_child_X, l_child_y, son, attrs_names)
son = ET.SubElement(parent, name_selected,
{'feature': str(split_point), "flag": "r", "p": str(round(1 - p_l, 3))})
self.grow_tree(r_child_X, r_child_y, son, attrs_names)
else:
num_max = 0
for cat in set(y_str):
num_cat = y_str.count(cat)
if num_cat > num_max:
num_max = num_cat
most_cat = cat
parent.text = most_cat
else:
# split on non-numeric variable (e.g. categorical)
# create a leaf for each unique value
for k in set(X_t[index_selected]):
if not k == "?" and (
self.max_rules is None or
self.complexity_ <= self.max_rules
):
child_X = [[] for i in range(len(X_t))]
child_y = []
for i in range(len(y_str)):
if X_t[index_selected][i] == k:
child_y.append(y_str[i])
for j in range(len(X_t)):
child_X[j].append(X_t[j][i])
son = ET.SubElement(parent, name_selected, {
'feature': k, "flag": "m",
'p': str(round(
float(len(child_y)) / (
len(X_t[index_selected]) - X_t[index_selected].count("?")),
3))})
self.grow_tree(child_X, child_y, son, attrs_names)
else:
parent.text = y_str[0]
if __name__ == '__main__':
from imodels.util.data_util import get_clean_dataset
X, y, feature_names = get_clean_dataset('ionosphere', data_source='pmlb')
m = C45TreeClassifier(max_rules=3)
m.fit(X, y)
print('mse', np.mean(np.square(m.predict(X) - y)))
print(m)
m.predict(X)
Classes
class C45TreeClassifier (max_rules: int = None)

    A C4.5 tree classifier.

    Parameters
    ----------
    max_rules : int, optional (default=None)
        Maximum number of split nodes allowed in the tree
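A minimal usage sketch (not part of the module itself), assuming a small synthetic binary-classification dataset; the import path follows this module:

    # Usage sketch with synthetic data (hypothetical example).
    import numpy as np
    from imodels.tree.c45_tree.c45_tree import C45TreeClassifier

    rng = np.random.default_rng(0)
    X = rng.random((200, 4))                   # 200 observations, 4 numeric features
    y = (X[:, 0] + X[:, 1] > 1).astype(int)    # binary labels

    model = C45TreeClassifier(max_rules=5)     # cap the tree at 5 split nodes
    model.fit(X, y)
    print(model.predict(X[:3]))                # hard 0/1 labels
    print(model.predict_proba(X[:3]))          # shape (3, 2) class probabilities
    print(model)                               # pretty-printed XML of the fitted tree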
Expand source code
class C45TreeClassifier(BaseEstimator, ClassifierMixin): """A C4.5 tree classifier. Parameters ---------- max_rules : int, optional (default=None) Maximum number of split nodes allowed in the tree """ def __init__(self, max_rules: int = None): super().__init__() self.max_rules = max_rules def fit(self, X, y, feature_names: str = None): self.complexity_ = 0 X, y = check_X_y(X, y) self.resultType = type(y[0]) if feature_names is None: self.feature_names = [f'X_{x}' for x in range(X.shape[1])] else: # only include alphanumeric chars / replace spaces with underscores self.feature_names = [''.join([i for i in x if i.isalnum()]).replace(' ', '_') for x in feature_names] self.feature_names = ['X_' + x if x[0].isdigit() else x for x in self.feature_names] assert len(self.feature_names) == X.shape[1] data = [[] for i in range(len(self.feature_names))] categories = [] for i in range(len(X)): categories.append(str(y[i])) for j in range(len(self.feature_names)): data[j].append(X[i][j]) root = ET.Element('GreedyTree') self.grow_tree(data, categories, root, self.feature_names) # adds to root self.tree_ = ET.tostring(root, encoding="unicode") # print('self.tree_', self.tree_) self.dom_ = minidom.parseString(self.tree_) return self def raw_preds(self, X): check_is_fitted(self, ['tree_', 'resultType', 'feature_names']) X = check_array(X) if isinstance(X, pd.DataFrame): X = deepcopy(X) X.columns = self.feature_names root = self.dom_.childNodes[0] prediction = [] for i in range(X.shape[0]): answerlist = decision(root, X[i], self.feature_names, 1) answerlist = sorted(answerlist.items(), key=lambda x: x[1], reverse=True) answer = answerlist[0][0] prediction.append(self.resultType(answer)) return np.array(prediction) def predict(self, X): return (self.raw_preds(X) > 0.5).astype(int) def predict_proba(self, X): raw_preds = self.raw_preds(X) return np.vstack((1 - raw_preds, raw_preds)).transpose() def __str__(self): check_is_fitted(self, ['tree_']) return self.dom_.toprettyxml(newl="\r\n") def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names): """ Parameters ---------- X_t: List[list] input data transposed (num_features x num_observations) y_str: List[str] outcome represented as strings parent attrs_names """ # check that y contains more than 1 distinct value if len(set(y_str)) > 1: split = [] # loop over features and build up potential splits for i in range(len(X_t)): if set(X_t[i]) == set("?"): split.append(0) else: if is_numeric_feature(X_t[i]): split.append(gain(y_str, X_t[i])) else: split.append(gain_ratio(y_str, X_t[i])) # no good split, return child node if max(split) == 0: set_as_leaf_node(parent, y_str) # there is a good split else: index_selected = split.index(max(split)) name_selected = str(attrs_names[index_selected]) self.complexity_ += 1 if is_numeric_feature(X_t[index_selected]): # split on this point split_point = get_best_split(y_str, X_t[index_selected]) # build up children nodes r_child_X = [[] for i in range(len(X_t))] r_child_y = [] l_child_X = [[] for i in range(len(X_t))] l_child_y = [] for i in range(len(y_str)): if not X_t[index_selected][i] == "?": if float(X_t[index_selected][i]) < float(split_point): l_child_y.append(y_str[i]) for j in range(len(X_t)): l_child_X[j].append(X_t[j][i]) else: r_child_y.append(y_str[i]) for j in range(len(X_t)): r_child_X[j].append(X_t[j][i]) # grow child nodes as well if len(l_child_y) > 0 and len(r_child_y) > 0 and ( self.max_rules is None or self.complexity_ <= self.max_rules ): p_l = float(len(l_child_y)) / 
(len(X_t[index_selected]) - X_t[index_selected].count("?")) son = ET.SubElement(parent, name_selected, {'feature': str(split_point), "flag": "l", "p": str(round(p_l, 3))}) self.grow_tree(l_child_X, l_child_y, son, attrs_names) son = ET.SubElement(parent, name_selected, {'feature': str(split_point), "flag": "r", "p": str(round(1 - p_l, 3))}) self.grow_tree(r_child_X, r_child_y, son, attrs_names) else: num_max = 0 for cat in set(y_str): num_cat = y_str.count(cat) if num_cat > num_max: num_max = num_cat most_cat = cat parent.text = most_cat else: # split on non-numeric variable (e.g. categorical) # create a leaf for each unique value for k in set(X_t[index_selected]): if not k == "?" and ( self.max_rules is None or self.complexity_ <= self.max_rules ): child_X = [[] for i in range(len(X_t))] child_y = [] for i in range(len(y_str)): if X_t[index_selected][i] == k: child_y.append(y_str[i]) for j in range(len(X_t)): child_X[j].append(X_t[j][i]) son = ET.SubElement(parent, name_selected, { 'feature': k, "flag": "m", 'p': str(round( float(len(child_y)) / ( len(X_t[index_selected]) - X_t[index_selected].count("?")), 3))}) self.grow_tree(child_X, child_y, son, attrs_names) else: parent.text = y_str[0]
Ancestors
- sklearn.base.BaseEstimator
- sklearn.base.ClassifierMixin
Methods
def fit(self, X, y, feature_names: str = None)
    Fit the tree: sanitize or generate feature names, transpose X into a
    feature-by-observation layout, grow the tree as an XML element via
    grow_tree, and store it as tree_ / dom_.
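A brief sketch of how feature_names are handled (hypothetical data and names): fit keeps only alphanumeric characters and prefixes names that start with a digit with 'X_'.

    # Sketch: passing explicit feature names to fit (hypothetical data/names).
    # Non-alphanumeric characters are dropped and names starting with a digit
    # get an 'X_' prefix, so 'temp (C)' -> 'tempC' and '2nd reading' -> 'X_2ndreading'.
    import numpy as np
    from imodels.tree.c45_tree.c45_tree import C45TreeClassifier

    X = np.array([[1.0, 0.2], [0.3, 0.9], [0.8, 0.1], [0.1, 0.7]])
    y = np.array([1, 0, 1, 0])
    m = C45TreeClassifier().fit(X, y, feature_names=['temp (C)', '2nd reading'])
    print(m.feature_names)    # expected: ['tempC', 'X_2ndreading']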
def grow_tree(self, X_t: List[list], y_str: List[str], parent, attrs_names)
    Parameters
    ----------
    X_t : List[list]
        input data transposed (num_features x num_observations)
    y_str : List[str]
        outcome represented as strings
    parent
    attrs_names
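The expected layout is easiest to see in a small sketch (hypothetical data; grow_tree is normally called only by fit, which also initializes complexity_ and builds the transposed lists):

    # Sketch of the layout grow_tree expects; normally fit() builds this itself.
    from xml.etree import ElementTree as ET
    from imodels.tree.c45_tree.c45_tree import C45TreeClassifier

    X_t = [[0.1, 0.9, 0.2],        # feature 0 across 3 observations
           [3.0, 2.0, 1.0]]        # feature 1 across the same observations
    y_str = ['0', '1', '0']        # outcomes as strings

    m = C45TreeClassifier()
    m.complexity_ = 0              # fit() sets this before growing
    root = ET.Element('GreedyTree')
    m.grow_tree(X_t, y_str, root, attrs_names=['X_0', 'X_1'])
    print(ET.tostring(root, encoding='unicode'))   # split nodes appear as XML children of root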
def predict(self, X)
    Return hard 0/1 labels by thresholding raw_preds(X) at 0.5.
def predict_proba(self, X)
    Return an (n_samples, 2) array of class probabilities by stacking
    1 - raw_preds(X) and raw_preds(X) column-wise.
def raw_preds(self, X)
    Traverse the fitted XML tree for each row of X and return the
    highest-weight answer, cast back to the original label type.
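A short sketch relating the three prediction methods (hypothetical data): raw_preds returns per-row labels recovered from the XML tree, predict thresholds them at 0.5, and predict_proba stacks them into two columns.

    # Sketch relating raw_preds, predict, and predict_proba (hypothetical data).
    import numpy as np
    from imodels.tree.c45_tree.c45_tree import C45TreeClassifier

    rng = np.random.default_rng(1)
    X = rng.random((100, 3))
    y = (X[:, 2] > 0.5).astype(int)
    m = C45TreeClassifier().fit(X, y)

    raw = m.raw_preds(X[:4])         # labels recovered from the fitted XML tree
    print(raw)                       # e.g. [1 0 1 0]
    print(m.predict(X[:4]))          # equivalent to (raw > 0.5).astype(int)
    print(m.predict_proba(X[:4]))    # columns: 1 - raw, raw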