Tutorial: Classification

Class probability estimation with Partition Trees

Partition Trees model the conditional distribution \(p(y \mid x)\) as a piecewise-constant density over the outcome space. Depending on the dataset and hyperparameters, this can yield not only better probabilistic classification but also improved accuracy.

Setup

We use the UCI letter recognition dataset — a well-known classification benchmark with 26 classes. (Note: the code below subsequently overrides this with scikit-learn's digits dataset — the shapes reported later, (1257, 64), correspond to digits — so the results shown are for digits, not letter recognition.) We initially use a simple train-test split for clarity, but the same code works with cross-validation and pipelines.

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score, log_loss
import pandas as pd

# fetch dataset
# Download the UCI "Letter Recognition" dataset (repository id 59) via ucimlrepo.
letter_recognition = fetch_ucirepo(id=59)

# data (as pandas dataframes)
X = letter_recognition.data.features
y = letter_recognition.data.targets

# NOTE(review): the line below replaces the UCI data with sklearn's digits
# dataset — the downstream split shapes (1257, 64) match digits, not letter
# recognition — so the fetch above is effectively discarded. Confirm whether
# the UCI download is still intended.
X, y = load_digits(return_X_y=True)  # --- IGNORE ---

# Hold out 30% of the data for testing; fix the seed for reproducibility.
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split
X_train.shape, X_test.shape
((1257, 64), (540, 64))

Partition Tree Classifier

from partition_tree.sklearn import PartitionTreeClassifier


# A single partition tree, grown essentially without constraints: every
# minimum-sample threshold is at its lowest value and no minimum leaf
# volume is enforced.
pt_params = dict(
    random_state=42,
    min_samples_split=2,
    min_samples_xy=1,
    min_samples_x=1,
    min_samples_y=1,
    min_volume_fraction=0,
)
pt = PartitionTreeClassifier(**pt_params)

pt.fit(X_train, y_train)
/home/runner/work/partition_tree/partition_tree/partition_tree/src/partition_tree/sklearn/partition_tree.py:27: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  y_dtype = y_df.dtypes[0]
PartitionTreeClassifier(min_samples_split=2, min_samples_x=1, min_samples_xy=1,
                        min_samples_y=1, min_volume_fraction=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Hard predictions and class-probability estimates on both splits.
y_pred_pt = pt.predict(X_test)
y_pred_train_pt = pt.predict(X_train)
y_proba_pt = pt.predict_proba(X_test)
y_proba_train_pt = pt.predict_proba(X_train)

# Test-set metrics.
acc_pt = accuracy_score(y_test, y_pred_pt)
logloss_pt = log_loss(y_test, y_proba_pt)

# Display (test log loss, test accuracy, train log loss, train accuracy).
logloss_pt, acc_pt, log_loss(y_train, y_proba_train_pt), accuracy_score(y_train, y_pred_train_pt)
(0.9631333727990169,
 0.7703703703703704,
 0.33763977806183815,
 0.8695306284805091)
from sklearn.tree import DecisionTreeClassifier

# Baseline: an unpruned CART decision tree grown with the log-loss criterion.
cart = DecisionTreeClassifier(criterion="log_loss", random_state=42)
cart.fit(X_train, y_train)
DecisionTreeClassifier(criterion='log_loss', random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Same evaluation protocol as above, applied to the CART baseline.
y_pred_cart = cart.predict(X_test)
y_pred_train_cart = cart.predict(X_train)
y_proba_cart = cart.predict_proba(X_test)
y_proba_train_cart = cart.predict_proba(X_train)

acc_cart = accuracy_score(y_test, y_pred_cart)
logloss_cart = log_loss(y_test, y_proba_cart)

# Display (test log loss, test accuracy, train log loss, train accuracy).
logloss_cart, acc_cart, log_loss(y_train, y_proba_train_cart), accuracy_score(y_train, y_pred_train_cart)
(4.2050928953970015, 0.8833333333333333, 2.2204460492503136e-16, 1.0)
from partition_tree.sklearn import PartitionForestClassifier

# An ensemble of 50 partition trees. Depth and leaf-count caps are set high
# enough to be effectively non-binding.
pf_params = {
    "n_estimators": 50,
    "random_state": 42,
    "max_samples": 1,
    "max_features": 1,
    "min_samples_xy": 0,
    "max_leaves": 10000,
    "max_depth": 10000,
}
pf = PartitionForestClassifier(**pf_params)

pf.fit(X_train, y_train)
/home/runner/work/partition_tree/partition_tree/partition_tree/src/partition_tree/sklearn/partition_tree.py:27: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  y_dtype = y_df.dtypes[0]
PartitionForestClassifier(max_depth=10000, max_features=1, max_leaves=10000,
                          max_samples=1, min_samples_xy=0, n_estimators=50,
                          random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Evaluate the partition forest on both splits.
y_pred_pf = pf.predict(X_test)
y_pred_train_pf = pf.predict(X_train)
y_proba_pf = pf.predict_proba(X_test)
y_proba_train_pf = pf.predict_proba(X_train)

acc_pf = accuracy_score(y_test, y_pred_pf)
logloss_pf = log_loss(y_test, y_proba_pf)
# Display (test log loss, test accuracy, train log loss, train accuracy).
logloss_pf, acc_pf, log_loss(y_train, y_proba_train_pf), accuracy_score(y_train, y_pred_train_pf)
(0.2304735176510748, 0.9740740740740741, 0.08552331466917275, 1.0)
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline: 50 trees, no bootstrap resampling, log-loss split
# criterion, matched in size to the partition forest above.
rf_params = dict(
    n_estimators=50,
    random_state=42,
    max_depth=50,
    min_samples_leaf=1,
    bootstrap=False,
    criterion="log_loss",
)
random_forest = RandomForestClassifier(**rf_params)
random_forest.fit(X_train, y_train)
RandomForestClassifier(bootstrap=False, criterion='log_loss', max_depth=50,
                       n_estimators=50, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Evaluate the random forest on both splits.
y_pred_rf = random_forest.predict(X_test)
y_pred_train_rf = random_forest.predict(X_train)
y_proba_rf = random_forest.predict_proba(X_test)
y_proba_train_rf = random_forest.predict_proba(X_train)

acc_rf = accuracy_score(y_test, y_pred_rf)
logloss_rf = log_loss(y_test, y_proba_rf)
# Display (test log loss, test accuracy, train log loss, train accuracy).
logloss_rf, acc_rf, log_loss(y_train, y_proba_train_rf), accuracy_score(y_train, y_pred_train_rf)
(0.24943876980216306, 0.9722222222222222, 2.2204460492503136e-16, 1.0)
# Summarize test-set metrics for all four models, ranked best-first by
# log loss (the probabilistic-quality metric this tutorial focuses on).
results = [
    ("PartitionTree", logloss_pt, acc_pt),
    ("PartitionForest", logloss_pf, acc_pf),
    ("DecisionTree (CART)", logloss_cart, acc_cart),
    ("RandomForest", logloss_rf, acc_rf),
]
comparison_df = pd.DataFrame(results, columns=["Model", "Log Loss", "Accuracy"])

comparison_df.sort_values("Log Loss")
Model Log Loss Accuracy
1 PartitionForest 0.230474 0.974074
3 RandomForest 0.249439 0.972222
0 PartitionTree 0.963133 0.770370
2 DecisionTree (CART) 4.205093 0.883333