I have recently started working through Neural Networks: Zero to Hero, Andrej Karpathy’s excellent AI course. In the first video, he demonstrates how to build micrograd, a small library that implements backpropagation, the algorithm used to train neural networks.

As an exercise, I tried to reimplement it in JavaScript to check my understanding. I was able to do it mostly from memory, which is a testament to Andrej’s qualities as a teacher. It has proven an incredibly useful exercise, and I recommend it to anyone with a couple of hours of free time.

You can try my implementation in this Observable notebook.

Here is how I implemented the Value class:

class Value {
  constructor(value, label = "", operator = "", children = [], exponent = 1) {
    this.value = value;
    this.label = label;
    this.operator = operator;   // the operation that produced this node ("" for leaf values)
    this.gradient = 0;
    this.exponent = exponent;   // only used by "^" nodes in the backward pass
    this.children = children;   // the operand Values this node was computed from
  }

  add(other) {
    if (typeof other === "number") other = new Value(other);
    const newValue = this.value + other.value;
    return new Value(newValue, "", "+", [this, other]);
  }

  sub(other) {
    if (typeof other === "number") other = new Value(other);
    return this.add(other.mul(-1));
  }

  mul(other) {
    if (typeof other === "number") other = new Value(other);
    const newValue = this.value * other.value;
    return new Value(newValue, "", "*", [this, other]);
  }

  div(other) {
    if (typeof other === "number") other = new Value(other);
    return this.mul(other.pow(-1));
  }

  neg() {
    return this.mul(-1);
  }

  pow(x) {
    // x is a plain number; it is stored on the result node so the
    // backward pass can apply the power rule
    const newValue = Math.pow(this.value, x);
    return new Value(newValue, "", "^", [this], x);
  }

  exp() {
    const newValue = Math.exp(this.value);
    return new Value(newValue, "", "exp", [this]);
  }

  tanh() {
    const newValue = Math.tanh(this.value);
    return new Value(newValue, "", "tanh", [this]);
  }

  backward() {
    this.gradient = 1;

    let topo = [];
    let visited = new Set();

    const buildTopo = (v) => {
      if (!visited.has(v)) {
        visited.add(v);
        for (let child of v.children) {
          buildTopo(child);
        }
        topo.push(v);
      }
    };

    buildTopo(this);

    for (let node of topo.reverse()) {
      node._setChildGradients();
    }
  }

  // Applies the chain rule: propagates this node's gradient to the children
  // that produced it, depending on the operator.
  _setChildGradients() {
    switch (this.operator) {
      case "+": {
        // d(a + b)/da = 1 and d(a + b)/db = 1
        const [left, right] = this.children;
        left.gradient += this.gradient;
        right.gradient += this.gradient;
        break;
      }
      case "*": {
        // d(a * b)/da = b and d(a * b)/db = a
        const [left, right] = this.children;
        left.gradient += this.gradient * right.value;
        right.gradient += this.gradient * left.value;
        break;
      }
      case "^": {
        // d(a^n)/da = n * a^(n - 1)
        const [c] = this.children;
        c.gradient += this.exponent * Math.pow(c.value, this.exponent - 1) * this.gradient;
        break;
      }
      case "tanh": {
        // d(tanh(a))/da = 1 - tanh(a)^2, and this.value already holds tanh(a)
        const [c] = this.children;
        c.gradient += this.gradient * (1 - Math.pow(this.value, 2));
        break;
      }
      case "exp": {
        // d(exp(a))/da = exp(a), and this.value already holds exp(a)
        const [c] = this.children;
        c.gradient += this.gradient * this.value;
        break;
      }
      case "":
        // leaf value: nothing to propagate
        break;
      default:
        throw new Error(`Operator '${this.operator}' not implemented!`);
    }
  }
}

I’m not sure if my solution with the giant switch is better than Andrej’s, but I wanted to try something different. At the very least, reimplementing it this way was a real test of my understanding: since the code that updates each child’s gradient lives in the parent node, I had to think through each of the derivatives thoroughly.
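
A quick way to check that each case is right is to run a tiny expression through backward() and compare the gradients against derivatives done by hand. A minimal check of this sort (the variable names are only for illustration) looks like this:

const a = new Value(2.0, "a");
const b = new Value(-3.0, "b");
const c = a.mul(b).add(a.pow(2)); // c = a*b + a^2 = -2

c.backward();

// By hand: dc/da = b + 2a = 1 and dc/db = a = 2
console.log(a.gradient); // 1
console.log(b.gradient); // 2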

The rest of the code is a straightforward translation to JavaScript:

class Neuron {
  constructor(nin) {
    // one weight per input plus a bias, all initialized uniformly in [-1, 1)
    this.w = [];
    for (let i = 0; i < nin; i++)
      this.w.push(new Value((Math.random() * 2) - 1));
    this.b = new Value((Math.random() * 2) - 1);
  }

  call(x) {
    // weighted sum of the inputs plus the bias, squashed through tanh
    const z = x
      .map((xi, i) => this.w[i].mul(xi))
      .reduce((sum, p) => sum.add(p), new Value(0))
      .add(this.b);
    return z.tanh();
  }

  parameters() {
    return [...this.w, this.b];
  }
}

class Layer {
  constructor(nin, nout) {
    this.neurons = [];
    for (let i = 0; i < nout; i++) {
      this.neurons.push(new Neuron(nin));
    }
  }

  call(x) {
    const outs = this.neurons.map(n => n.call(x));
    return outs.length === 1 ? outs[0] : outs;
  }

  parameters() {
    return this.neurons.reduce((p, n) => p.concat(n.parameters()), []);
  }
}

class MLP {
  constructor(nin, nouts) {
    const sizes = [nin].concat(nouts);
    this.layers = [];
    for (let i = 0; i < nouts.length; i++) {
      this.layers.push(new Layer(sizes[i], sizes[i + 1]));
    }
  }

  call(x) {
    for (const layer of this.layers) {
      x = layer.call(x);
    }
    return x;
  }

  parameters() {
    return this.layers.reduce((p, n) => p.concat(n.parameters()), []);
  }
}
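
For example, new MLP(3, [4, 4, 1]) (the shape used in the training loop below) builds sizes = [3, 4, 4, 1] and therefore three layers: Layer(3, 4), Layer(4, 4) and Layer(4, 1). Calling it on a three-element input runs a forward pass and returns a single Value, since the last layer has only one neuron:

const mlp = new MLP(3, [4, 4, 1]);
const out = mlp.call([2.0, 3.0, -1.0]);
console.log(out.value); // some number in (-1, 1), because every neuron ends in tanh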

And finally, the training loop:

{
  const n = new MLP(3, [4, 4, 1]);
  const xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
  ];
  const ys = [1.0, -1.0, -1.0, 1.0];
  let ypred;
  let loss;
  const iterations = 1000;
  const learningRate = 0.1;
  
  for (let k = 0; k < iterations; k++) {
    // forward pass: compute the predictions and the summed squared error
    ypred = xs.map(x => n.call(x));
    loss = ys.reduce((acc, ygt, index) => {
      const yout = ypred[index];
      return yout.sub(ygt).pow(2).add(acc);
    }, 0);

    // backward pass
    for (const p of n.parameters()) {
      p.gradient = 0;
    }
    loss.backward();

    // learning: take a gradient-descent step on every parameter
    for (const p of n.parameters()) {
      p.value -= p.gradient * learningRate;
    }
  }
  return { loss, ypred };
}
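
If training went well, loss.value should be small and the predictions close to the targets. Logging them just before the return makes that easy to check:

console.log(loss.value);              // should shrink toward 0 over the iterations
console.log(ypred.map(p => p.value)); // should end up close to [1, -1, -1, 1]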