In [1]:
from sklearn.datasets import make_circles, load_iris
from sklearn.model_selection import train_test_split

import torch

import numpy as np

import yellowbrick as yb
import matplotlib
import matplotlib.pyplot as plt

# dtype = torch.long
# device = torch.device("cpu")

Load & prepare data

In [2]:
X, y = make_circles(n_samples=1000, noise=0.1)

# 75/25 train/test split
orig_X_train, orig_X_test, orig_y_train, orig_y_test = train_test_split(X, y, test_size=0.25)

# Transform data into tensors.
X = torch.tensor(orig_X_train, dtype=torch.float)
y = torch.tensor(orig_y_train, dtype=torch.long)

Visualize data

In [3]:
import yellowbrick.contrib.scatter
visualizer = yellowbrick.contrib.scatter.ScatterVisualizer()

visualizer.fit(orig_X_train, orig_y_train)
visualizer.poof()

Basic Neural Net

Three things are needed to set up an optimization problem:

  1. Model
  2. Loss function
  3. Optimizer
In [4]:
from torch import nn

# Sequential model allows easy model experimentation
model = nn.Sequential(
          nn.Linear(2, 16),    # input dim 2. 16 neurons in first layer.
          nn.ReLU(),           # ReLU activation
          #nn.Dropout(p=0.2),  # Optional dropout
          nn.Linear(16, 4),    # Linear from 16 neurons down to 4
          nn.ReLU(),
          nn.Linear(4, 2),     # Linear from 4 neurons down to the 2 output classes
          nn.Softmax(dim=1)    # Softmax to normalize outputs into class probabilities
        )


# Loss function. CrossEntropy is valid for classification problems.
loss_fn = nn.CrossEntropyLoss()

# Optimizer. Many to choose from. 
optimizer = torch.optim.Adam(params=model.parameters())

# Optimizer iterations
for i in range(1000):
    # Clear the gradient at the start of each step.
    optimizer.zero_grad()
    
    # Compute the forward pass
    output = model(X)
    
    # Compute the loss
    loss = loss_fn(output, y)
    
    # Backprop to compute the gradients
    loss.backward()
    
    # Update the model parameters
    optimizer.step()

print(loss.item())
0.46360188722610474

What do the activation regions look like?

(an exercise in Tensor math)

In [5]:
%matplotlib inline

# Make a grid 
ns = 25
xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 2*ns), np.linspace(-1.5, 1.5, 2*ns))
# Shape of each is [2*ns, 2*ns]

# Combine into a single tensor
G = torch.tensor(np.array([xx, yy]), dtype=torch.float)
# Shape is [2, 2*ns, 2*ns]

# reshape to be convenient to work with
G = G.reshape((2, G.shape[1]*G.shape[2])).transpose(0,1)
# Now a tensor of shape [(2*ns)**2, 2]: a sequence of (x, y) coordinate pairs

result = model(G).detach()
# For each row (sample) in G, get the prediction under the model
# The variables inside the model are tracked for gradients. 
# Call "detach()" to stop tracking gradient for further computations.
# Result is shape [(2*ns)**2, 2] since the model maps each 2-dim input to a 2-dim prediction

c0 = result[:,0]
# weights assigned to class 0

c1 = result[:,1]
# weights assigned to class 1

plt.hexbin(G[:,0].detach().numpy(), G[:,1].detach().numpy(), c0.numpy(), gridsize=ns, cmap='viridis')
# Gridsize is half that of the meshgrid for clean rendering.

plt.title("Class 0 Activation")
plt.axis('equal')
plt.show()
plt.hexbin(G[:,0].detach().numpy(), G[:,1].detach().numpy(), c1.numpy(), gridsize=ns, cmap='viridis')
plt.title("Class 1 Activation")
plt.axis('equal')
plt.show()
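A quick sanity check on the reshaping above (a minimal sketch; the expected sizes follow from ns = 25 and the 2*ns-point-per-axis meshgrid):

# The meshgrid has 2*ns points per axis, so the flattened grid G and the
# model output both hold (2*ns)**2 = 2500 rows of 2 values each.
assert G.shape == (2 * ns * 2 * ns, 2)
assert result.shape == (2 * ns * 2 * ns, 2)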

What is the classification performance?

Case study in working with Yellowbrick

In [6]:
from sklearn.base import BaseEstimator

class NetWrapper(BaseEstimator):
    """
    Wrap our model as a BaseEstimator
    """
    _estimator_type = "classifier"
    # Tell yellowbrick this is a classifier
    
    def __init__(self, model):
        # save a reference to the model
        self.model = model
        self.classes_ = None
    
    def fit(self, X, y):
        # save the list of classes and return self, per the sklearn estimator convention
        self.classes_ = list(set(y))
        return self
    
    def predict_proba(self, X):
        """
        Define predict_proba or decision_function
        
        Compute predictions from model. 
        Transform input into a Tensor, compute the prediction, 
        transform the prediction back into a numpy array
        """
        v = self.model(torch.tensor(X, dtype=torch.float)).detach().numpy()
        print("v:", v.shape)
        return v
        

wrapped_net = NetWrapper(model)
# Wrap the model

# Use ROCAUC as per usual
ROCAUC = yb.classifier.ROCAUC(wrapped_net)

ROCAUC.fit(orig_X_train, orig_y_train)
print(orig_X_test.shape, orig_y_test.shape)
print(orig_X_train.shape, orig_y_train.shape)
ROCAUC.score(orig_X_test, orig_y_test)
ROCAUC.poof()
(250, 2) (250,)
(750, 2) (750,)
v: (250, 2)
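As an aside, a quick way to confirm that the wrapper duck-types as an sklearn classifier (a minimal sketch; is_classifier simply checks the _estimator_type attribute set above):

from sklearn.base import is_classifier

# True, because NetWrapper sets _estimator_type = "classifier"
print(is_classifier(wrapped_net))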

Custom Modules

Implementing new functionality, e.g. radial activation regions for "circular" neurons

In [7]:
# Each neuron computes a weight alpha * (x - c)^T (x - c), where alpha is a real scalar and c is a learned center

class Circle(torch.nn.Module):
    """
    Extend torch.nn.Module for a new "layer" in a neural network
    """
    def __init__(self, k, data):
        """
        k is the number of neurons to use
        data is passed in to use as samples to initialize centers
        """
        super().__init__()
        
        # k is not a Parameter, so there is no gradient and this is not updated in optimization
        self.k = int(k)
        
        # Parameters always have gradients computed
        self.alpha = torch.nn.Parameter(torch.normal(mean=torch.zeros(k), std=torch.ones(k)*0.5).unsqueeze(1))
        self.C = torch.nn.Parameter(data[np.random.choice(data.shape[0], k, replace=False), :].unsqueeze(1))
        
        
    def forward(self, x):
        # Broadcasting: x is [N, 2], self.C is [k, 1, 2], so diff is [k, N, 2]
        diff = x - self.C
        # einsum is a compact way of writing inner products, outer products, etc.
        # Here it gives the squared distance ||x - c||^2 for each (neuron k, sample i) pair.
        sqdist = torch.einsum('kij,kij->ki', [diff, diff])

        return (self.alpha * sqdist).transpose(0, 1)
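The einsum above is just a batched squared distance. A minimal sketch of the equivalence, using a small random tensor rather than the notebook's data:

# 'kij,kij->ki' multiplies diff with itself elementwise and sums over the
# last axis, i.e. the squared Euclidean norm per (neuron k, sample i) pair.
diff = torch.randn(3, 5, 2)                     # hypothetical [k, N, 2] tensor
via_einsum = torch.einsum('kij,kij->ki', [diff, diff])
via_sum = (diff * diff).sum(dim=-1)
print(torch.allclose(via_einsum, via_sum))      # expected: True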
In [8]:
from tqdm import tqdm
loss_fn = torch.nn.CrossEntropyLoss()
model = nn.Sequential(
          Circle(16, X),
          nn.ReLU(),
          nn.Linear(16,4),
          nn.ReLU(),
          nn.Linear(4,2),
          nn.Softmax(dim=1)
        )
optimizer = torch.optim.Adam(params=model.parameters())
for i in tqdm(range(1000)):
    optimizer.zero_grad()
    output = model(X)
    loss = loss_fn(output, y)
    loss.backward()
    optimizer.step()
100%|██████████| 1000/1000 [00:26<00:00, 37.85it/s]
In [9]:
%matplotlib inline

ns = 25
xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 2*ns), np.linspace(-1.5, 1.5, 2*ns))
G = torch.tensor(np.array([xx, yy]), dtype=torch.float)


# reshape...
G = G.reshape((2, G.shape[1]*G.shape[2])).transpose(0,1)
result = model(G).detach()

c0 = result[:,0]
c1 = result[:,1]

plt.hexbin(G[:,0].detach().numpy(), G[:,1].detach().numpy(), c0.numpy(), gridsize=ns, cmap='viridis')
plt.title("Class 0 Activation")
plt.axis('equal')
plt.show()
plt.hexbin(G[:,0].detach().numpy(), G[:,1].detach().numpy(), c1.numpy(), gridsize=ns, cmap='viridis')
plt.title("Class 1 Activation")
plt.axis('equal')
plt.show()
In [10]:
wrapped_net = NetWrapper(model)
ROCAUC = yb.classifier.ROCAUC(wrapped_net)

ROCAUC.fit(orig_X_train, orig_y_train)
wrapped_net.predict_proba(orig_X_test)
ROCAUC.score(orig_X_test, orig_y_test)
ROCAUC.poof()
v: (250, 2)
v: (250, 2)
In [11]:
%matplotlib inline

# Show the centers of each "kernel" 

centers = model[0].C.squeeze().detach().numpy()
scales = model[0].alpha.squeeze().detach().numpy()

plt.scatter(centers[:,0], centers[:,1])
plt.scatter(X[:,0], X[:,1], alpha=0.1)
plt.axis('equal')

print(centers.shape)
(16, 2)
In [12]:
%matplotlib inline
from matplotlib import cm

# Show the contours of the activation regions of each kernel

ns = 25
xx, yy = np.meshgrid(np.linspace(-2, 2, ns), np.linspace(-2, 2, ns))
G = torch.tensor(np.array([xx, yy]), dtype=torch.float)
G = G.reshape((2, G.shape[1]*G.shape[2])).transpose(0,1)
G = G.expand(centers.shape[0], ns*ns, 2)
diff = G - torch.tensor(centers).unsqueeze(1)
Z = torch.tensor(scales).unsqueeze(1) * torch.einsum('kij,kij->ki', [diff, diff])

plt.scatter(centers[:,0], centers[:,1])
plt.scatter(X[:,0], X[:,1], alpha=0.1)
cmap = cm.get_cmap('tab20')
for i in range(Z.shape[0]):
    if scales[i] > 0:   
        plt.contour(np.linspace(-2, 2, ns), np.linspace(-2, 2, ns), Z[i].reshape(ns, ns), [-0.5,0.5], antialiased=True, colors=[cmap(i)], alpha=0.8, linestyles='dotted')
    else:
        plt.contour(np.linspace(-2, 2, ns), np.linspace(-2, 2, ns), Z[i].reshape(ns, ns), [-0.5,0.5], antialiased=True, colors=[cmap(i)], alpha=0.3, linestyles='solid')

plt.axis('equal')
/Users/andrewgodbehere/environments/torch/lib/python3.7/site-packages/matplotlib/contour.py:1230: UserWarning: No contour levels were found within the data range.
  warnings.warn("No contour levels were found"
Out[12]:
(-2.0, 2.0, -2.0, 2.0)