# Pico GPT-2 in Aesara
import aesara.tensor as at
import numpy as np
def gelu(x):
    """Tanh approximation of the Gaussian Error Linear Unit.

    Computes, elementwise,
    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.

    Args:
        x: Symbolic tensor (any value supporting the arithmetic below works).

    Returns:
        An expression with the same shape as ``x``.
    """
    # The cubic polynomial is the *argument* of tanh. Closing the tanh call
    # right after the sqrt constant would instead scale the polynomial by
    # the fixed number tanh(sqrt(2 / pi)), which is a different function.
    inner = at.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)
    return 0.5 * x * (1 + at.tanh(inner))
# Build the GELU expression on a symbolic scalar and print its graph.
x = at.scalar('x')
y = gelu(x)
import aesara
aesara.dprint(y)
# Rebind x to a symbolic vector; the softmax example further down uses it.
x = at.vector('x')
def softmax(x):
    """Softmax over the last axis of ``x``.

    Args:
        x: Symbolic tensor with at least one dimension.

    Returns:
        An expression shaped like ``x`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    # Subtracting the maximum along the last axis does not change the result
    # mathematically, but keeps exp() from overflowing on large inputs.
    shifted = x - at.max(x, axis=-1, keepdims=True)
    exp_x = at.exp(shifted)
    # Normalise along the last axis only, so that a matrix of attention
    # scores is normalised row by row instead of over every entry at once.
    return exp_x / at.sum(exp_x, axis=-1, keepdims=True)
# Build the softmax expression on the symbolic vector x bound just before
# the softmax definition.
y = softmax(x)
# Compile a callable that takes x as its only input and returns y.
fn = aesara.function([x], y)
# Print the expression graph first, then the compiled function, so the two
# printouts can be compared.
aesara.dprint(y)
aesara.dprint(fn)
def layer_norm(x, g, b, eps=1e-5):
    """Normalise ``x`` over its last axis, then scale by ``g`` and shift by ``b``.

    Args:
        x: Symbolic tensor with at least one dimension.
        g: Multiplicative gain applied after normalisation.
        b: Additive bias applied after the gain.
        eps: Constant added to the variance before the square root so the
            division stays finite when the variance is zero.

    Returns:
        An expression shaped like ``x``.
    """
    # Statistics are taken along the last axis with keepdims=True so that
    # each row of a 2-D input is normalised independently and the results
    # broadcast back against x. For a 1-D input this is identical to
    # reducing over the whole tensor.
    mean = at.mean(x, axis=-1, keepdims=True)
    var = at.var(x, axis=-1, keepdims=True)
    return g * (x - mean) / at.sqrt(var + eps) + b
# Build layer_norm on a fresh symbolic vector, passing the constant 1.0 for
# both the gain and the bias, and print the resulting graph.
x = at.vector('x')
y = layer_norm(x, 1., 1.)
aesara.dprint(y)
def linear(x, w, b):
    """Affine map: matrix-multiply ``x`` by ``w``, then add ``b``.

    Args:
        x: Left operand of the matrix product.
        w: Right operand of the matrix product.
        b: Term added to the product.

    Returns:
        ``x @ w + b``.
    """
    projected = x @ w
    return projected + b
def ffn(x, c_fc, c_proj):
    """Apply ``linear``, then ``gelu``, then ``linear`` again.

    Args:
        x: Input handed to the first ``linear`` call.
        c_fc: Iterable unpacked as the remaining positional arguments of the
            first ``linear`` call.
        c_proj: Iterable unpacked as the remaining positional arguments of
            the second ``linear`` call.

    Returns:
        The output of the second ``linear`` call.
    """
    hidden = linear(x, *c_fc)
    activated = gelu(hidden)
    return linear(activated, *c_proj)
def attention(q, k, v, mask):
    """Scaled dot-product attention with an additive mask.

    Computes ``softmax(q @ k.T / sqrt(q.shape[-1]) + mask) @ v``.

    Args:
        q: Query tensor; the size of its last axis sets the scaling factor.
        k: Key tensor; transposed before being multiplied with ``q``.
        v: Value tensor, multiplied by the softmax output.
        mask: Added to the scaled scores before the softmax.

    Returns:
        The softmax weights multiplied by ``v``.
    """
    similarity = q @ k.T
    scaled = similarity / at.sqrt(q.shape[-1])
    weights = softmax(scaled + mask)
    return weights @ v
def mha(x, c_attn, c_proj, n_head):
x = linear(x, *c_attn)
qkv = at.split(x, 3, axis=-1)
qkv_heads =
causal_mask =
out_heads =
x =
x = linear(x, *c_proj)