I have recently started working through Neural Networks: Zero to Hero, Andrej Karpathy’s excellent AI course. In the first video, he demonstrates how to build micrograd, a library that can be used to implement backpropagation, the algorithm used to train neural networks.
As an exercise, I tried to reimplement it in JavaScript to check my understanding. I was able to do it mostly from memory, which is a testament to Andrej’s qualities as a teacher. The exercise has proven incredibly useful, and I recommend it to anyone with a couple of hours of free time.
You can try my implementation in this Observable notebook.
Here is how I implemented the Value class:
class Value {
  constructor(value, label = "", operator = "", children = [], exponent = 1) {
    this.value = value;
    this.label = label;
    this.operator = operator;
    this.gradient = 0;
    this.exponent = exponent;
    this.children = children;
  }
  add(other) {
    if (typeof other === "number") other = new Value(other);
    const newValue = this.value + other.value;
    return new Value(newValue, "", "+", [this, other]);
  }

  sub(other) {
    if (typeof other === "number") other = new Value(other);
    return this.add(other.mul(-1));
  }

  mul(other) {
    if (typeof other === "number") other = new Value(other);
    const newValue = this.value * other.value;
    return new Value(newValue, "", "*", [this, other]);
  }

  div(other) {
    if (typeof other === "number") other = new Value(other);
    return this.mul(other.pow(-1));
  }

  neg() {
    return this.mul(-1);
  }
  pow(x) {
    const newValue = Math.pow(this.value, x);
    // Store the exponent on the result node (last constructor argument) so backward() can use it.
    return new Value(newValue, "", "^", [this], x);
  }
  exp() {
    const newValue = Math.exp(this.value);
    return new Value(newValue, "", "exp", [this]);
  }

  tanh() {
    const newValue = Math.tanh(this.value);
    return new Value(newValue, "", "tanh", [this]);
  }
  backward() {
    this.gradient = 1;
    // Topologically sort the graph, then propagate gradients from the output
    // back towards the leaves.
    const topo = [];
    const visited = new Set();
    const buildTopo = (v) => {
      if (!visited.has(v)) {
        visited.add(v);
        for (const child of v.children) {
          buildTopo(child);
        }
        topo.push(v);
      }
    };
    buildTopo(this);
    for (const node of topo.reverse()) {
      node._setChildGradients();
    }
  }
  _setChildGradients() {
    // Apply the chain rule: each node distributes its gradient to its children
    // according to the local derivative of its operator.
    switch (this.operator) {
      case "+": {
        const [left, right] = this.children;
        left.gradient += this.gradient;
        right.gradient += this.gradient;
        break;
      }
      case "*": {
        const [left, right] = this.children;
        left.gradient += this.gradient * right.value;
        right.gradient += this.gradient * left.value;
        break;
      }
      case "^": {
        // d/dx x^n = n * x^(n-1)
        const [c] = this.children;
        c.gradient += this.exponent * Math.pow(c.value, this.exponent - 1) * this.gradient;
        break;
      }
      case "tanh": {
        // d/dx tanh(x) = 1 - tanh(x)^2, and this.value already holds tanh(x).
        const [c] = this.children;
        c.gradient += this.gradient * (1 - Math.pow(this.value, 2));
        break;
      }
      case "exp": {
        // d/dx exp(x) = exp(x), which is this.value.
        const [c] = this.children;
        c.gradient += this.gradient * this.value;
        break;
      }
      case "":
        // Leaf node: nothing to propagate.
        break;
      default:
        throw new Error(`Operator '${this.operator}' not implemented!`);
    }
  }
}
I’m not sure if my solution with the giant switch is better than Andrej’s, but I wanted to try something different. Reimplementing it this way turned out to be a real test of my understanding: the gradient-update code lives in the parent node of the expression graph, so I had to think through each of the derivatives thoroughly.
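As a quick sanity check of the switch-based gradients, a tiny expression can be pushed through backward() and the results compared against hand-computed derivatives. A minimal sketch (the variable names here are just illustrative):

// Build c = a * b + 1 and backpropagate through it.
const a = new Value(2, "a");
const b = new Value(-3, "b");
const c = a.mul(b).add(1); // c.value === -5

c.backward();

// dc/da = b = -3 and dc/db = a = 2
console.log(a.gradient, b.gradient); // -3 2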
The rest of the code is a straightforward translation to JavaScript:
class Neuron {
  constructor(nin) {
    // One random weight per input, plus a bias, all initialized in [-1, 1).
    this.w = [];
    for (let i = 0; i < nin; i++)
      this.w.push(new Value((Math.random() * 2) - 1));
    this.b = new Value((Math.random() * 2) - 1);
  }

  call(x) {
    // z = w · x + b, squashed through tanh
    const z = x.map((xi, i) => this.w[i].mul(xi)).reduce((sum, p) => sum.add(p), new Value(0)).add(this.b);
    return z.tanh();
  }

  parameters() {
    return [...this.w, this.b];
  }
}
class Layer {
  constructor(nin, nout) {
    this.neurons = [];
    for (let i = 0; i < nout; i++) {
      this.neurons.push(new Neuron(nin));
    }
  }

  call(x) {
    const outs = this.neurons.map(n => n.call(x));
    return outs.length === 1 ? outs[0] : outs;
  }

  parameters() {
    return this.neurons.reduce((p, n) => p.concat(n.parameters()), []);
  }
}
class MLP {
  constructor(nin, nouts) {
    const sizes = [nin].concat(nouts);
    this.layers = [];
    for (let i = 0; i < nouts.length; i++) {
      this.layers.push(new Layer(sizes[i], sizes[i + 1]));
    }
  }

  call(x) {
    for (const layer of this.layers) {
      x = layer.call(x);
    }
    return x;
  }

  parameters() {
    return this.layers.reduce((p, layer) => p.concat(layer.parameters()), []);
  }
}
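Before training, a single forward pass through a freshly initialized network can be run like this (a minimal sketch; the names are illustrative, and the output differs on every run because the weights are random):

const mlp = new MLP(3, [4, 4, 1]);
const out = mlp.call([2.0, 3.0, -1.0]); // a single Value, since the last layer has one neuron
console.log(out.value); // somewhere in (-1, 1), thanks to the tanh activation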
And finally, the training loop:
{
  const n = new MLP(3, [4, 4, 1]);
  const xs = [
    [2.0, 3.0, -1.0],
    [3.0, -1.0, 0.5],
    [0.5, 1.0, 1.0],
    [1.0, 1.0, -1.0],
  ];
  const ys = [1.0, -1.0, -1.0, 1.0];
  let ypred;
  let loss;
  const iterations = 1000;
  const learningRate = 0.1;
  for (let k = 0; k < iterations; k++) {
    // forward pass
    ypred = xs.map(x => n.call(x));
    loss = ys.reduce((acc, ygt, index) => {
      const yout = ypred[index];
      return yout.sub(ygt).pow(2).add(acc);
    }, 0);
    // backward pass
    for (const p of n.parameters()) {
      p.gradient = 0;
    }
    loss.backward();
    // learning
    for (const p of n.parameters()) {
      p.value -= p.gradient * learningRate;
    }
  }
  return { loss, ypred };
}
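Since the block returns { loss, ypred }, the result can be inspected once training finishes. A minimal sketch, assuming the returned object is available under the hypothetical name result:

const { loss, ypred } = result;       // `result` stands in for the object returned above
console.log(loss.value);              // should end up close to 0
console.log(ypred.map(y => y.value)); // should end up close to the targets [1, -1, -1, 1]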