Examples#

  1. Henrik Boström, 2024

[1]:
import xrf
print(f"xrf v. {xrf.__version__}")
xrf v. 0.1.1

Classification forests#

Tic-tac-toe#

Let us start by importing the tic-tac-toe dataset from openml.org.

[2]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder

dataset = fetch_openml(name="tic-tac-toe", parser="auto")

y = dataset.target.values

X_orig = dataset.data.values
X = OneHotEncoder().fit_transform(X_orig).toarray()

Let us split the dataset into a training and a test set.

[3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, X_orig_train, X_orig_test = train_test_split(
    X, y, X_orig, test_size=0.75)

Let us first generate a standard random forest classifier and apply it to the test set.

[4]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

np.random.seed(42)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=500)
rf.fit(X_train, y_train)
rf_predictions = rf.predict_proba(X_test)

Similarly, we may generate and apply an explainable random forest classifier, here without constraining the number of training examples involved in the predictions.

[5]:
from xrf import XRandomForestClassifier

np.random.seed(42)

rfx = XRandomForestClassifier(n_jobs=-1, n_estimators=500)
rfx.fit(X_train, y_train)
rfx_predictions = rfx.predict_proba(X_test)

Let us check that we indeed get the same predictions.

[6]:
assert np.allclose(rf_predictions, rfx_predictions)

We may now limit the number of examples involved in a prediction, e.g., to at most 5 (k=5).

[7]:
k=5

rfx_predictions_k = rfx.predict_proba(X_test, k=k)

Let us compare the predictive performance of the original and the constrained predictions.

[8]:
from sklearn.metrics import roc_auc_score, accuracy_score
import pandas as pd

accuracy_orig = accuracy_score(y_test==rf.classes_[1],
                               np.round(rf_predictions[:,1]))
auc_orig = roc_auc_score(y_test == rf.classes_[1],
                         rf_predictions[:,1])

accuracy_k = accuracy_score(y_test==rfx.classes_[1],
                              np.round(rfx_predictions_k[:,1]))
auc_k = roc_auc_score(y_test == rfx.classes_[1],
                        rfx_predictions_k[:,1])

df_result = pd.DataFrame([[accuracy_orig, auc_orig],[accuracy_k, auc_k]],
                  index=["Original", f"k={k}"],
                  columns=["Accuracy", "AUC"])

display(df_result.round(3))
Accuracy AUC
Original 0.876 0.974
k=5 0.933 0.982

Let us take a look at the example attribution for some prediction.

[9]:
rfx_predictions_k, examples, weights = rfx.predict_proba(
    X_test, k=k, return_examples=True, return_weights=True)
[10]:
rf_predicted_labels = np.array([rf.classes_[p.argmax()] for p in rf_predictions])
rfx_predicted_labels = np.array([rfx.classes_[p.argmax()] for p in rfx_predictions_k])
[11]:
def display_board(squares, Caption, Styles):
    df = pd.DataFrame(squares.reshape(3,3),
                      columns=["","",""],
                      index=["","",""])
    display(df.style.set_caption(Caption).set_table_styles(Styles))
[12]:
test_index = 0 # Select some test example

props = [("color", "grey"), ("font-weight", "bold")]
Styles = [dict(selector = "caption", props = props)]
props_b = [("caption-side", "bottom")]
Styles_b = [dict(selector = "caption", props = props_b)]

df = pd.DataFrame([rf_predictions[test_index]], index = [""],
                  columns=rf.classes_).round(2)
display(df.style.format(precision=2).set_caption("Original prediction").set_table_styles(Styles))

df = pd.DataFrame([rfx_predictions_k[test_index]], index = [""],
                  columns=rfx.classes_).round(2)
display(df.style.format(precision=2).set_caption("Constrained prediction").set_table_styles(Styles))

display_board(X_orig_test[test_index],
             f"Test example [{y_test[test_index]}]",
             Styles_b)

for i, e in enumerate(examples[test_index]):
    caption = f"Example #{i+1} [{y_train[e]}, {weights[test_index][i]:.2f}]"
    display_board(X_orig_train[examples[test_index][i]],
                 caption, Styles_b)
Original prediction
  negative positive
0.37 0.63
Constrained prediction
  negative positive
0.67 0.33
Test example [negative]
 
x o x
b x x
o o o
Example #1 [negative, 0.32]
 
b b x
b x x
o o o
Example #2 [positive, 0.21]
 
b o x
b x x
o o x
Example #3 [negative, 0.19]
 
x b x
b b x
o o o
Example #4 [negative, 0.16]
 
x x b
b x b
o o o
Example #5 [positive, 0.12]
 
x o o
b x x
o b x

Let us check how many training examples contribute to the predictions in the original forest.

[13]:
import matplotlib.pyplot as plt

xrfc_predictions, all_non_zero_weights = rfx.predict_proba(X_test, c=1.0, return_weights=True)

lengths = [len(w) for w in all_non_zero_weights]

plt.hist(lengths, 10)
plt.xlabel("# training examples")
plt.ylabel("# test examples")
plt.show()

assert np.allclose(rf_predictions, xrfc_predictions)
_images/Examples_24_0.png

Rather than constraining the predictions to a fixed number by setting a value for k, we could set a limit on the cumulative sum of the highest weights, e.g., to 30% (c=0.3).

[14]:
xrfc_predictions_c, all_non_zero_weights = rfx.predict_proba(
    X_test, c=0.3, return_weights=True)

lengths = [len(w) for w in all_non_zero_weights]

plt.hist(lengths, 10)
plt.xlabel("# training examples")
plt.ylabel("# test examples")
plt.show()

accuracy_c = accuracy_score(y_test==rfx.classes_[1],
                            np.round(xrfc_predictions_c[:,1]))
auc_c = roc_auc_score(y_test == rfx.classes_[1], xrfc_predictions_c[:,1])

df_result.loc["c=0.3"] = [accuracy_c, auc_c]
display(df_result.round(3))
_images/Examples_26_0.png
Accuracy AUC
Original 0.876 0.974
k=5 0.933 0.982
c=0.3 0.932 0.985

MNIST#

Let us also consider the MNIST dataset at openml.org.

[15]:
dataset = fetch_openml(name="mnist_784", parser="auto")

X = dataset.data.values.astype(float)
y = dataset.target.values.astype(int)

print(f"Original dataset size: {X.shape}")
Original dataset size: (70000, 784)

Let us split the dataset into a training and a test set.

[16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, = train_test_split(
    X, y, test_size=0.1)

Let us first generate a standard random forest classifier and apply it to the test set.

[17]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

np.random.seed(42)

rf = RandomForestClassifier(n_jobs=-1, n_estimators=200)
rf.fit(X_train, y_train)
rf_predictions = rf.predict_proba(X_test)

Similarly, we may generate and apply an explainable random forest classifier, here without constraining the number of training examples involved in the predictions.

[18]:
from xrf import XRandomForestClassifier

np.random.seed(42)

rfx = XRandomForestClassifier(n_jobs=-1, n_estimators=200)
rfx.fit(X_train, y_train)
[18]:
XRandomForestClassifier(model=RandomForestClassifier(n_estimators=200, n_jobs=-1, oob_score=True)
[19]:
rfx_predictions, all_non_zero_weights = rfx.predict_proba(X_test, c=1.0, return_weights=True)

Let us check that we indeed get the same predictions.

[20]:
assert np.allclose(rf_predictions, rfx_predictions)

Let us check how many training examples contribute to the predictions.

[21]:
import matplotlib.pyplot as plt

lengths = [len(w) for w in all_non_zero_weights]

plt.hist(lengths, 10)
plt.xlabel("# training examples")
plt.ylabel("# test examples")
plt.show()
_images/Examples_40_0.png

We may now limit the number of examples involved in a prediction, e.g., to at most 10 (k=10) and request that the top-weighted examples and the corresponding weights are returned.

[22]:
k=10

rfx_predictions_k, examples, weights  = rfx.predict_proba(X_test,
                                                          k=k,
                                                          return_examples=True,
                                                          return_weights=True)
[23]:
lengths = [len(w) for w in weights]

plt.hist(lengths, 10)
plt.xlabel("# training examples")
plt.ylabel("# test examples")
plt.show()
_images/Examples_43_0.png

Let us compare the predictive performance of the original and the constrained predictions.

[24]:
rf_predicted_labels = np.array([rf.classes_[p.argmax()] for p in rf_predictions])
rfx_predicted_labels = np.array([rfx.classes_[p.argmax()] for p in rfx_predictions_k])
[25]:
from sklearn.metrics import roc_auc_score, accuracy_score

accuracy_orig = accuracy_score(y_test, rf_predicted_labels)
auc_orig = roc_auc_score(y_test, rf_predictions, average='weighted',
                         multi_class='ovr', labels=rf.classes_)

roc_auc_score(y_test == rf.classes_[1],
                         rf_predictions[:,1])

accuracy_k = accuracy_score(y_test, rfx_predicted_labels)
auc_k = roc_auc_score(y_test, rfx_predictions, average='weighted',
                      multi_class='ovr', labels=rfx.classes_)

df_result = pd.DataFrame([[accuracy_orig, auc_orig],[accuracy_k, auc_k]],
                  index=["Original", f"k={k}"],
                  columns=["Accuracy", "AUC"])

display(df_result.round(3))
Accuracy AUC
Original 0.965 0.999
k=10 0.932 0.999

Let us take a look at the training examples that are used for some prediction.

[26]:
test_index = 5 # Select any test example

print("Test example:")
test_pixels = X_test[test_index].reshape((28, 28))

plt.figure(figsize=(2,2))
plt.imshow(test_pixels, cmap='Blues')
plt.axis('off')
plt.text(0.1, 0.1, r"$\hat{y}$" +f" = {rfx_predicted_labels[test_index]} y = {y_test[test_index]}",
         fontsize = 12)
plt.show()

df = pd.DataFrame([rf_predictions[test_index]], index = [""], columns=rf.classes_)
display(df.style.format(precision=2).set_caption("Original prediction").set_table_styles(Styles))

df = pd.DataFrame([rfx_predictions_k[test_index]], index = [""], columns=rfx.classes_)
display(df.style.format(precision=2).set_caption("Constrained prediction").set_table_styles(Styles))

print("Training examples:")

no_rows = 2
no_cols = 5

fig, axs = plt.subplots(no_rows, no_cols, figsize=(10, 4))

for i, train_ind in enumerate(examples[test_index]):
    pixels = X_train[train_ind].reshape((28, 28))
    row = i // no_cols
    col = i % no_cols
    axs[row, col].imshow(pixels, cmap='Blues')
    axs[row, col].axis('off')
    axs[row, col].text(0.1, 0.5, f"y = {y_train[train_ind]} w = {weights[test_index][i]:.2f}", fontsize = 12)
plt.show()
Test example:
_images/Examples_48_1.png
Original prediction
  0 1 2 3 4 5 6 7 8 9
0.03 0.01 0.58 0.03 0.08 0.03 0.17 0.01 0.06 0.01
Constrained prediction
  0 1 2 3 4 5 6 7 8 9
0.00 0.00 0.47 0.10 0.09 0.09 0.16 0.00 0.08 0.00
Training examples:
_images/Examples_48_5.png

Regression forests#

Miami housing#

Let us import the Miami housing dataset from openml.org.

[27]:
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import OneHotEncoder

dataset = fetch_openml(name="miami_housing", parser="auto")

y = dataset.target.values
X = dataset.data.values

Let us split the dataset into a training and a test set.

[28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75)

Let us first generate a standard random forest regressor and apply it to the test set.

[29]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

np.random.seed(42)

rf = RandomForestRegressor(n_jobs=-1, n_estimators=500)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)

Similarly, we may generate and apply an explainable random forest regressor, here without constraining the number of training examples involved in the predictions.

[30]:
from xrf import XRandomForestRegressor

np.random.seed(42)

rfx = XRandomForestRegressor(n_jobs=-1, n_estimators=500)
rfx.fit(X_train, y_train)
rfx_predictions = rfx.predict(X_test)

Let us check that we indeed get the same predictions.

[31]:
assert np.allclose(rf_predictions, rfx_predictions)

We may now limit the number of examples involved in a prediction, e.g., to at most 5 (k=5).

[32]:
k=5

rfx_predictions_k, examples, weights  = rfx.predict(X_test,
                                                    k=k,
                                                    return_examples=True,
                                                    return_weights=True)

Let us compare the correctness of the constrained and original predictions.

[33]:
from sklearn.metrics import mean_squared_error

rmse_orig = np.sqrt(mean_squared_error(y_test, rf_predictions))
corr_orig = np.corrcoef(y_test, rf_predictions)[0,1]

rmse_k = np.sqrt(mean_squared_error(y_test, rfx_predictions_k))
corr_k = np.corrcoef(y_test, rfx_predictions_k)[0,1]

df_result = pd.DataFrame([[rmse_orig, corr_orig],[rmse_k, corr_k]],
                  index=["Original", f"k={k}"],
                  columns=["RMSE", "CORR"])

display(df_result.round(3))
RMSE CORR
Original 111896.571 0.941
k=5 113617.611 0.936

Let us take a look at the training examples that are used for some prediction.

[34]:
test_index = 0 # Select any test example
#test_index = np.argwhere(rfx_predicted_labels != y_test)[1,0] # Select a misclassified test example
#test_index = np.argwhere(rfx_predicted_labels != rf_predicted_labels)[2,0] # Select a test example on which there is disagreement

import pandas as pd

print(f"Original prediction: {rf_predictions[test_index]:.1f}")
print(f"Constrained prediction: {rfx_predictions_k[test_index]:.1f}")

df = pd.DataFrame(np.hstack((np.vstack((y_test[test_index], np.nan, X_test[test_index].reshape(-1,1))),
                             np.hstack((y_train[examples[test_index]].reshape(k,1),
                                        weights[test_index].reshape(k,1),
                                        X_train[examples[test_index]])).T)),
                            index = ["Target", "Weight"]+dataset.feature_names,
                  columns=["Test"]+["#"+str(i+1) for i in range(k)])

df.apply(pd.to_numeric).style.format(precision=2, thousands=" ", na_rep=" ").\
   background_gradient(cmap="Reds", axis=1)
Original prediction: 137202.8
Constrained prediction: 128643.4
[34]:
  Test #1 #2 #3 #4 #5
Target 133 000.00 168 000.00 115 000.00 115 000.00 120 000.00 81 500.00
Weight 0.33 0.20 0.19 0.15 0.14
LATITUDE 25.89 25.90 25.89 25.89 25.88 25.86
LONGITUDE -80.23 -80.23 -80.24 -80.23 -80.23 -80.23
LND_SQFOOT 7 575.00 8 925.00 7 200.00 8 100.00 5 300.00 8 025.00
TOT_LVG_AREA 1 176.00 1 251.00 1 170.00 884.00 1 264.00 1 178.00
SPEC_FEAT_VAL 1 512.00 1 176.00 3 088.00 0.00 0.00 2 197.00
RAIL_DIST 4 979.20 5 150.20 2 972.90 5 511.10 7 500.90 5 401.80
OCEAN_DIST 35 765.40 35 205.00 37 758.60 35 228.30 35 436.70 35 229.30
WATER_DIST 2 692.60 2 457.80 497.70 3 187.40 2 060.70 3 323.60
CNTR_DIST 43 454.60 44 715.80 43 128.10 43 460.00 37 986.10 30 764.50
SUBCNTR_DI 43 454.60 44 715.80 43 128.10 43 460.00 37 986.10 30 764.50
HWY_DIST 6 964.90 6 394.70 8 940.20 6 429.30 6 332.90 5 949.80
age 63.00 63.00 55.00 63.00 76.00 59.00
avno60plus 0.00 0.00 0.00 0.00 0.00 0.00
month_sold 3.00 2.00 1.00 10.00 3.00 3.00
structure_quality 4.00 4.00 4.00 4.00 4.00 4.00

Lipophilicty#

[35]:
import numpy as np
import pandas as pd
import rdkit
from rdkit.Chem import AllChem

url = ("https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/"
       "Lipophilicity.csv")

df = pd.read_csv(url)

y = df["exp"].values

molecules = [rdkit.Chem.MolFromSmiles(s) for s in df["smiles"]]

fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=1024)

X = np.array([fpgen.GetFingerprint(m) for m in molecules])

print(X.shape)
(4200, 1024)

Let us split the dataset into a training and a test set.

[36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, molecules_train, molecules_test = \
    train_test_split(X, y, molecules, test_size=0.75)

Let us first generate a standard random forest regressor and apply it to the test set.

[37]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

np.random.seed(42)

rf = RandomForestRegressor(n_estimators=500, n_jobs=-1)
rf.fit(X_train, y_train)
rf_predictions = rf.predict(X_test)

Similarly, we may generate and apply an explainable random forest regressor, here without constraining the number of training examples involved in the predictions.

[38]:
from xrf import XRandomForestRegressor

np.random.seed(42)

rfx = XRandomForestRegressor(n_estimators=500, n_jobs=-1)
rfx.fit(X_train, y_train)
rfx_predictions = rfx.predict(X_test)

Let us check that we indeed get the same predictions.

[39]:
assert np.allclose(rf_predictions, rfx_predictions)

We may now limit the number of examples involved in a prediction, e.g., to at most 10 (k=10).

[40]:
k=10

rfx_predictions_k, examples, weights  = rfx.predict(X_test,
                                                    k=k,
                                                    return_examples=True,
                                                    return_weights=True)

Let us compare the correctness of the constrained and original predictions.

[41]:
from sklearn.metrics import mean_squared_error

rmse_orig = np.sqrt(mean_squared_error(y_test, rf_predictions))
corr_orig = np.corrcoef(y_test, rf_predictions)[0,1]

rmse_k = np.sqrt(mean_squared_error(y_test, rfx_predictions_k))
corr_k = np.corrcoef(y_test, rfx_predictions_k)[0,1]

df_result = pd.DataFrame([[rmse_orig, corr_orig],[rmse_k, corr_k]],
                  index=["Original", f"k={k}"],
                  columns=["RMSE", "CORR"])

display(df_result.round(3))
RMSE CORR
Original 0.969 0.602
k=10 1.013 0.567

Let us take a look at some test example and the training examples that are used for the prediction.

[42]:
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import rdFMCS
from rdkit.Chem.Draw import rdDepictor

IPythonConsole.ipython_useSVG=False
IPythonConsole.drawOptions.minFontSize=20

rdDepictor.SetPreferCoordGen(True)

def show_difference(mol1, w, y, mol2):
    mcs = rdFMCS.FindMCS([mol1,mol2])
    mcs_mol = Chem.MolFromSmarts(mcs.smartsString)
    match1 = mol1.GetSubstructMatch(mcs_mol)
    target_atm1 = []
    for atom in mol1.GetAtoms():
        if atom.GetIdx() not in match1:
            target_atm1.append(atom.GetIdx())
    return Draw.MolsToGridImage([mol1],
                                highlightAtomLists=[target_atm1],
                                useSVG=True)
[43]:
test_index = 5 # Select any test example

print("Test molecule:")
test_mol = molecules_test[test_index]
test_mol_fig = Draw.MolsToGridImage([test_mol], useSVG=True)
display(test_mol_fig)
print(f"True target: {y_test[test_index]:.2f}")
print(f"Original prediction: {rf_predictions[test_index]:.2f}")
print(f"Constrained prediction: {rfx_predictions_k[test_index]:.2f}\n")

for i, training_index in enumerate(examples[test_index]):
    print(f"Training example {i+1} "
          f"[{y_train[training_index]}, "
          f"weight: {weights[test_index][i]:.2f}]")
    training_mol = molecules_train[training_index]
    train_mol_fig = show_difference(training_mol, weights[test_index][i],
                                    y_train[training_index], test_mol)
    display(train_mol_fig)
Test molecule:
_images/Examples_83_1.svg
True target: 1.42
Original prediction: 2.28
Constrained prediction: 2.58

Training example 1 [2.5, weight: 0.22]
_images/Examples_83_3.svg
Training example 2 [3.77, weight: 0.19]
_images/Examples_83_5.svg
Training example 3 [2.7, weight: 0.11]
_images/Examples_83_7.svg
Training example 4 [1.85, weight: 0.08]
_images/Examples_83_9.svg
Training example 5 [2.8, weight: 0.08]
_images/Examples_83_11.svg
Training example 6 [2.39, weight: 0.07]
_images/Examples_83_13.svg
Training example 7 [3.4, weight: 0.07]
_images/Examples_83_15.svg
Training example 8 [1.4, weight: 0.06]
_images/Examples_83_17.svg
Training example 9 [1.09, weight: 0.05]
_images/Examples_83_19.svg
Training example 10 [1.3, weight: 0.05]
_images/Examples_83_21.svg