
Hardware & Software

Deep Learning Hardware

  • Refer to Slides

Deep Learning Software

PyTorch

Basic Concepts

  • Tensor

  • Create

import torch

x = torch.empty(3, 4)
print(type(x))
print(x)
'''Out (torch.empty returns uninitialized memory, so values may differ):
<class 'torch.Tensor'>
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])
'''
zeros = torch.zeros(2, 3)
ones = torch.ones(2, 3)
torch.manual_seed(1729)
random = torch.rand(2, 3)
  • _like methods
x = torch.empty(2, 2, 3)
print(x.shape)
print(x)

empty_like_x = torch.empty_like(x)
print(empty_like_x.shape)
print(empty_like_x)

zeros_like_x = torch.zeros_like(x)
print(zeros_like_x.shape)
print(zeros_like_x)

ones_like_x = torch.ones_like(x)
print(ones_like_x.shape)
print(ones_like_x)

rand_like_x = torch.rand_like(x)
print(rand_like_x.shape)
print(rand_like_x)

Fundamental Concepts

  • Tensor: Like a numpy array, but can run on GPU
import torch
device = torch.device('cpu')
#device = torch.device('cuda:0')
#device = torch.device('mps')
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in,device=device)
y = torch.randn(N,D_out,device=device)
w1 = torch.randn(D_in,H,device=device)
w2 = torch.randn(H,D_out,device=device)
learning_rate=1e-6
for t in range(500):
  h = x.mm(w1)
  h_relu = h.clamp(min=0)
  y_pred = h_relu.mm(w2)
  loss = (y_pred-y).pow(2).sum()

  grad_y_pred = 2.0*(y_pred-y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  grad_h = grad_h_relu.clone()
  grad_h[h<0]=0
  grad_w1 = x.t().mm(grad_h)

  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2
  • Data types: a = torch.ones((2, 3), dtype=torch.int16)
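Dtypes can also be converted after creation; a minimal sketch (variable names are illustrative):

b = torch.ones((2, 3), dtype=torch.int16)
c = b.to(torch.float64)   # .to() handles dtype conversion as well as devices
print(b.dtype)   # torch.int16
print(c.dtype)   # torch.float64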
  • Tensor Broadcasting
    • Broadcasting is a way to perform an operation between tensors that have similarities in their shapes. In the example below, the one-row, four-column tensor is multiplied by both rows of the two-row, four-column tensor.
rand = torch.rand(2, 4)
doubled = rand * (torch.ones(1, 4) * 2)
print(rand)
print(doubled)
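The general rule: shapes are compared right to left, and each pair of dimensions must either match or contain a 1, which gets expanded. A quick sketch:

a = torch.rand(3, 1)
b = torch.rand(1, 4)
print((a * b).shape)   # torch.Size([3, 4]); both inputs were expanded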
  • Moving to GPU

    device = torch.device("mps")
    model = ModelName(xxx).to(device)
    data = torch.Tensor(dataset.x).to(device)
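A hedged sketch for picking the best available device at runtime (assumes a PyTorch build recent enough to expose the MPS backend):

    import torch

    if torch.cuda.is_available():
        device = torch.device("cuda:0")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    print(device)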
    
  • Changing dimensions

    a = torch.rand(3, 226, 226)
    b = a.unsqueeze(0)
    print(a.shape)
    print(b.shape)
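Going the other way, squeeze removes size-1 dimensions, and reshape changes the layout (a small sketch continuing the example above):

    c = b.squeeze(0)             # drop the size-1 batch dimension again
    d = a.reshape(3, 226 * 226)  # flatten the spatial dimensions
    print(c.shape)   # torch.Size([3, 226, 226])
    print(d.shape)   # torch.Size([3, 51076])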
    
  • numpy

    import numpy as np
    
    numpy_array = np.ones((2, 3))
    print(numpy_array)
    
    pytorch_tensor = torch.from_numpy(numpy_array)
    print(pytorch_tensor)
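The conversion also works in the other direction with .numpy(), and both directions share the underlying memory, so in-place edits are visible on both sides:

    back_to_numpy = pytorch_tensor.numpy()
    numpy_array[0, 0] = 23       # the tensor sees this change too
    print(pytorch_tensor)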
    
  • Autograd: Package for building computational graphs out of Tensors, and automatically computing gradients

import torch
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
w1 = torch.randn(D_in, H ,requires_grad=True)
w2 = torch.randn(H,D_out,requires_grad=True)
learning_rate=1e-6
for t in range(500):
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  loss = (y_pred-y).pow(2).sum()
  loss.backward()
  with torch.no_grad():
    # Don't build a computational graph for these weight updates
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad
    w1.grad.zero_()
    w2.grad.zero_()


  • After backward finishes, gradients are accumulated into w1.grad and w2.grad and the graph is destroyed. Remember to zero the gradients after each update; forgetting this is a common bug!
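A minimal demo of the accumulation behavior:

w = torch.ones(3, requires_grad=True)
(w * 2).sum().backward()
print(w.grad)    # tensor([2., 2., 2.])
(w * 2).sum().backward()
print(w.grad)    # tensor([4., 4., 4.]) -- accumulated, not overwritten
w.grad.zero_()   # reset before the next backward pass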

  • Can define new operations using Python functions

def sigmoid(x):
  return 1.0/(1.0+(-x).exp())
# y_pred = sigmoid(x.mm(w1)).mm(w2)
  • new functions
def sigmoid(x):
  return 1.0/(1.0+(-x).exp())
import torch
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
w1 = torch.randn(D_in, H ,requires_grad=True)
w2 = torch.randn(H,D_out,requires_grad=True)
learning_rate=1e-6
for t in range(500):
  y_pred = sigmoid(x.mm(w1)).mm(w2)
  loss = (y_pred-y).pow(2).sum()
  loss.backward()
  with torch.no_grad():
    # Don't build a computational graph for these weight updates
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad
    w1.grad.zero_()
    w2.grad.zero_()
  • Improvement

    class Sigmoid(torch.autograd.Function):
      @staticmethod
      def forward(ctx, x):
        y = 1.0 / (1.0 + (-x).exp())
        ctx.save_for_backward(y)
        return y
      @staticmethod
      def backward(ctx, grad_y):
        y, = ctx.saved_tensors
        grad_x = grad_y * y * (1.0 - y)
        return grad_x
    def sigmoid(x):
      return Sigmoid.apply(x)
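One way to sanity-check a custom Function is torch.autograd.gradcheck, which compares the analytic backward against numerical gradients (it expects double precision):

    x = torch.randn(4, dtype=torch.double, requires_grad=True)
    print(torch.autograd.gradcheck(sigmoid, (x,)))   # True if backward is correct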
    


    • In practice this is pretty rare – in most cases Python functions are good enough
  • Module: A neural network layer; may store state or learnable weights

  • nn: Higher-level wrapper for working with neural nets

import torch
N,D_in,H,D_out = 64,1000,100,10

x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
'''
Object-oriented API: define the model as a sequence of layer objects, each of which holds its own weight tensors
'''
model = torch.nn.Sequential(
  torch.nn.Linear(D_in,H),
  torch.nn.ReLU(),
  torch.nn.Linear(H,D_out)
)
learning_rate = 1e-4
for t in range(500):
  y_pred = model(x)
  loss = torch.nn.functional.mse_loss(y_pred,y)  # torch.nn.functional has useful helpers like loss functions
  loss.backward()
  with torch.no_grad():
    for param in model.parameters():
      param -= learning_rate * param.grad
  model.zero_grad()
  • Use an optimizer for different update rules
import torch
N,D_in,H,D_out = 64,1000,100,10

x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
'''
Object-oriented API: define the model as a sequence of layer objects, each of which holds its own weight tensors
'''
model = torch.nn.Sequential(
  torch.nn.Linear(D_in,H),
  torch.nn.ReLU(),
  torch.nn.Linear(H,D_out)
)
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
for t in range(500):
  y_pred = model(x)
  loss = torch.nn.functional.mse_loss(y_pred,y)  # torch.nn.functional has useful helpers like loss functions
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()
  • nn: Defining Modules
import torch 
class TwoLayerNet(torch.nn.Module):
  def __init__(self,D_in,H,D_out):
    super(TwoLayerNet,self).__init__()
    self.linear1 = torch.nn.Linear(D_in,H)
    self.linear2 = torch.nn.Linear(H,D_out)
  def forward(self,x):
    h_relu = self.linear1(x).clamp(min=0)
    y_pred = self.linear2(h_relu)
    return y_pred
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
model = TwoLayerNet(D_in,H,D_out)
optimizer = torch.optim.SGD(model.parameters(),lr=1e-4)
for t in range(500):
  y_pred = model(x)
  loss = torch.nn.functional.mse_loss(y_pred,y)
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()
  • Very common to mix and match custom Module subclasses and Sequential containers
  • Very easy to quickly build complex network architectures!
import torch
class ParallelBlock(torch.nn.Module):
  def __init__(self,D_in,D_out):
    super(ParallelBlock,self).__init__()
    self.linear1 = torch.nn.Linear(D_in,D_out)
    self.linear2 = torch.nn.Linear(D_in,D_out)
  def forward(self,x):
    h1 = self.linear1(x)
    h2 = self.linear2(x)
    return (h1*h2).clamp(min = 0)
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
model = torch.nn.Sequential(
  ParallelBlock(D_in,H),
  ParallelBlock(H,H),
  torch.nn.Linear(H,D_out)
)
optimizer = torch.optim.Adam(model.parameters(),lr = 1e-4)
for t in range(500):
  y_pred = model(x)
  loss = torch.nn.functional.mse_loss(y_pred,y)
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()
  • DataLoaders
import torch
from torch.utils.data import TensorDataset,DataLoader
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
loader = DataLoader(TensorDataset(x,y),batch_size = 8)
model = TwoLayerNet(D_in,H,D_out)
optimizer = torch.optim.SGD(model.parameters(),lr = 1e-2)
for epoch in range(20):
  # Iterate over loader to form minibatches
  for x_batch,y_batch in loader:
    y_pred = model(x_batch)
    loss = torch.nn.functional.mse_loss(y_pred,y_batch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
  • Pretrained Models
import torch
import torchvision
alexnet = torchvision.models.alexnet(pretrained = True)
vgg16 = torchvision.models.vgg16(pretrained = True)
resnet101 = torchvision.models.resnet101(pretrained = True)
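Note that in newer torchvision releases (0.13+) pretrained=True is deprecated in favor of explicit weights enums; a sketch:

from torchvision.models import ResNet101_Weights
resnet101 = torchvision.models.resnet101(weights=ResNet101_Weights.DEFAULT)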
  • Dynamic Computation Graphs

Note: this model doesn't make sense! It is just a simple example of dynamic graph construction.

import torch
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
w1 = torch.randn(D_in,H,requires_grad = True)
w2a = torch.randn(H,D_out,requires_grad = True)
w2b = torch.randn(H,D_out,requires_grad = True)
learning_rate=1e-6
prev_loss = 5.0  # initialize so the first iteration has a value to branch on
for t in range(500):
  # Decide which one to use at each layer based on loss at previous iteration
  w2 = w2a if prev_loss <5.0 else w2b
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  loss = (y_pred - y).pow(2).sum()
  loss.backward()
  prev_loss = loss.item()
  • Static Computation Graphs

  • Step 1: Build computational graph describing our computation (including finding paths for backprop)

  • Step 2: Reuse the same graph on every iteration
import torch 
def model(x,y,w1,w2a,w2b,prev_loss):
  w2 = w2a if prev_loss <5.0 else w2b
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  loss = (y_pred - y).pow(2).sum()
  return loss
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
w1 = torch.randn(D_in,H,requires_grad = True)
w2a = torch.randn(H,D_out,requires_grad = True)
w2b = torch.randn(H,D_out,requires_grad = True)
#Just-In-Time compilation: Introspect the source code of the function, compile it into a graph object.
graph = torch.jit.script(model)
prev_loss = 5.0
learning_rate = 1e-6
for t in range(500):
  loss = graph(x,y,w1,w2a,w2b,prev_loss)
  loss.backward()
  prev_loss = loss.item()
  • Even easier: add annotation to function, Python function compiled to a graph when it is defined
import torch
@torch.jit.script
def model(x,y,w1,w2a,w2b,prev_loss):
  w2 = w2a if prev_loss <5.0 else w2b
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  loss = (y_pred - y).pow(2).sum()
  return loss
N,D_in,H,D_out = 64,1000,100,10
x = torch.randn(N,D_in)
y = torch.randn(N,D_out)
w1 = torch.randn(D_in,H,requires_grad = True)
w2a = torch.randn(H,D_out,requires_grad = True)
w2b = torch.randn(H,D_out,requires_grad = True)
prev_loss = 5.0
learning_rate = 1e-6
for t in range(500):
  loss = model(x,y,w1,w2a,w2b,prev_loss)
  loss.backward()
  prev_loss = loss.item()
  • Static vs Dynamic Graphs: Debugging

Static

  • With static graphs, framework can optimize the graph for you before it runs!

  • Once graph is built, can serialize it and run it without the code that built the graph!

    e.g. train the model in Python, deploy it in C++ (see the serialization sketch after this list)

  • Lots of indirection between the code you write and the code that runs – can be hard to debug, benchmark, etc
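A minimal serialization sketch, assuming the scripted graph from the static-graph example above (the file name is illustrative):

torch.jit.save(graph, 'model_graph.pt')    # serialize the graph; no Python source needed
loaded = torch.jit.load('model_graph.pt')  # reload here, or from C++ via libtorch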

Dynamic

  • Graph building and execution are intertwined, so always need to keep code around

  • The code you write is the code that runs! Easy to reason about, debug, profile, etc

  • Dynamic Graph Applications

Model structure depends on the input (a minimal sketch follows this list):

  • Recurrent Networks
  • Recursive Networks
  • Modular Networks
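For example, with a recurrent network the number of graph nodes depends on the length of the input sequence; a hedged sketch:

import torch
def rnn_step(h, x, w):
  return torch.tanh(h.mm(w) + x)
w = torch.randn(100, 100, requires_grad=True)
h = torch.zeros(1, 100)
sequence = [torch.randn(1, 100) for _ in range(7)]   # length varies per input
for x_t in sequence:
  h = rnn_step(h, x_t, w)    # the graph grows by one step per element
h.sum().backward()           # backprop through exactly as many steps as were run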

TensorFlow

  • TensorFlow 1.0 : Static Graphs

  • First define computational graph

  • Then run the graph many times

import numpy as np
import tensorflow as tf
N,D,H = 64,1000,100
x = tf.placeholder(tf.float32,shape = (N,D))
y = tf.placeholder(tf.float32,shape = (N,D))
w1 = tf.placeholder(tf.float32,shape = (D,H))
w2 = tf.placeholder(tf.float32,shape = (H,D))

h = tf.maximum(tf.matmul(x,w1),0)
y_pred = tf.matmul(h,w2)
diff = y_pred - y
loss = tf.reduce_mean(tf.reduce_sum(diff ** 2,axis = 1))
grad_w1,grad_w2 = tf.gradients(loss,[w1,w2])

with tf.Session() as sess:
  values = {
    x:np.random.randn(N,D),
    w1:np.random.randn(D,H),
    w2:np.random.randn(H,D),
    y:np.random.randn(N,D),
  }
  out = sess.run(
    [loss,grad_w1,grad_w2],
    feed_dict = values
  )
  loss_val,grad_w1_val,grad_w2_val = out
  • TensorFlow 2.0: Dynamic Graphs

  • Create TensorFlow Tensors for data and weights

  • Weights need to be wrapped in tf.Variable so we can mutate them

import tensorflow as tf
N,Din,H,Dout = 16,1000,100,10
x = tf.random.normal((N,Din))
y = tf.random.normal((N,Dout))
w1 = tf.Variable(tf.random.normal((Din,H)))
w2 = tf.Variable(tf.random.normal((H,Dout)))
learning_rate = 1e-6
for t in range(1000):
  #Scope forward pass under a GradientTape to tell TensorFlow to start building a graph
  with tf.GradientTape() as tape:
    h = tf.maximum(tf.matmul(x,w1),0)
    y_pred = tf.matmul(h,w2)
    diff = y_pred - y
    loss = tf.reduce_mean(tf.reduce_sum(diff **2 , axis = 1))
  # Ask the tape to compute gradients (outside the tape's context)
  grad_w1,grad_w2 = tape.gradient(loss,[w1,w2])
  # Gradient descent step: update the weights in place
  w1.assign(w1 - learning_rate * grad_w1)
  w2.assign(w2 - learning_rate * grad_w2)
  • TensorFlow 2.0: Static Graphs
@tf.function
def step(x,y,w1,w2):
  with tf.GradientTape() as tape:
    h = tf.maximum(tf.matmul(x,w1),0)
    y_pred = tf.matmul(h,w2)
    diff = y_pred - y
    loss = tf.reduce_mean(tf.reduce_sum(diff **2 , axis = 1))
  # Ask the tape to compute gradients (outside the tape's context)
  grad_w1,grad_w2 = tape.gradient(loss,[w1,w2])
  w1.assign(w1 - learning_rate * grad_w1)
  w2.assign(w2 - learning_rate * grad_w2)
  return loss
N,Din,H,Dout = 16,1000,100,10
x = tf.random.normal((N,Din))
y = tf.random.normal((N,Dout))
w1 = tf.Variable(tf.random.normal((Din,H)))
w2 = tf.Variable(tf.random.normal((H,Dout)))
learning_rate = 1e-6
for t in range(1000):
  loss = step(x,y,w1,w2)
  • Keras: High-level API
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer,Dense
N,Din,H,Dout = 16,1000,100,10

model = Sequential()
model.add(InputLayer(input_shape=(Din,)))
model.add(Dense(units = H,activation = 'relu'))
model.add(Dense(units = Dout))

params = model.trainable_variables
loss_fn = tf.keras.losses.MeanSquaredError()
opt = tf.keras.optimizers.SGD(learning_rate = 1e-6)
x = tf.random.normal((N,Din))
y = tf.random.normal((N,Dout))
def step():
  y_pred = model(x)
  loss = loss_fn(y_pred,y)
  return loss
for t in range(1000):
  opt.minimize(step,params)
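Keras can also manage the training loop itself via compile/fit; a sketch using the same model (the epoch and batch-size values are arbitrary):

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=1e-6), loss='mse')
model.fit(x, y, epochs=10, batch_size=16)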


