I find it quite interesting for educational purposes, as it gives an idea of how transformers (the architecture behind ChatGPT) work:
attention.prg
#define MIN_DOUBLE Val( "-1.7976931348623158e+308" )
PROCEDURE Main()
   LOCAL aTokenIds, aEmbeddings, aInputEmbeddings, aPositionalEncoding, aEncodedEmbeddings, aWQ, aWK, aWV, aSelfAttentionOutput, nHeads, nDim, i

   nDim := 4 // Set the number of dimensions for the embeddings, weight matrices, etc.
   aTokenIds := {1, 5, 8, 2} // Example token IDs
   aEmbeddings := InitializeEmbeddings(10, nDim) // Initialize embeddings for 10 tokens, each with nDim dimensions
   aInputEmbeddings := GetInputEmbeddings(aTokenIds, aEmbeddings)
   aPositionalEncoding := GeneratePositionalEncoding(LEN(aTokenIds), nDim)
   aEncodedEmbeddings := AddPositionalEncoding(aInputEmbeddings, aPositionalEncoding)

   // Initialize the weight matrices for Query (aWQ), Key (aWK) and Value (aWV) for each head
   nHeads := 4
   aWQ := Array(nHeads)
   aWK := Array(nHeads)
   aWV := Array(nHeads)
   FOR i := 1 TO nHeads
      aWQ[i] := InitializeEmbeddings(nDim, nDim)
      aWK[i] := InitializeEmbeddings(nDim, nDim)
      aWV[i] := InitializeEmbeddings(nDim, nDim)
   NEXT

   aSelfAttentionOutput := MultiHeadSelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nHeads, nDim)

   // Use hb_ValToExp() to render the nested arrays as readable literals
   ? "Input Token IDs:", hb_ValToExp(aTokenIds)
   ? "Input Embeddings:", hb_ValToExp(aInputEmbeddings)
   ? "Positional Encoding:", hb_ValToExp(aPositionalEncoding)
   ? "Encoded Embeddings:", hb_ValToExp(aEncodedEmbeddings)
   ? "Multi-Head Self Attention Output:", hb_ValToExp(aSelfAttentionOutput)

RETURN
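// Data flow for the values hard-coded above: the encoded embeddings form a
// 4 x 4 matrix (sequence length x nDim); each attention head produces another
// 4 x 4 matrix, and concatenating the 4 heads along the feature axis yields a
// 4 x 16 result (sequence length x nHeads * nDim).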
FUNCTION InitializeEmbeddings(nTokens, nDimensions)
   LOCAL aEmbeddings, nIndex, nDim
   aEmbeddings := Array( nTokens, nDimensions )
   FOR nIndex := 1 TO nTokens
      FOR nDim := 1 TO nDimensions
         aEmbeddings[nIndex][nDim] := (HB_Random() - 0.5) * 2 // Random number between -1 and 1
      NEXT
   NEXT
RETURN aEmbeddings
FUNCTION GetInputEmbeddings(aTokenIds, aEmbeddings)
   LOCAL aInputEmbeddings, nIndex
   aInputEmbeddings := Array( LEN(aTokenIds) )
   FOR nIndex := 1 TO LEN(aTokenIds)
      aInputEmbeddings[nIndex] := aEmbeddings[aTokenIds[nIndex]]
   NEXT
RETURN aInputEmbeddings
FUNCTION GeneratePositionalEncoding(nSequenceLength, nDimensions)
   LOCAL aPositionalEncoding, nIndex, nDim, nPos, nDivTerm
   aPositionalEncoding := Array( nSequenceLength, nDimensions )
   FOR nIndex := 1 TO nSequenceLength
      nPos := nIndex - 1
      FOR nDim := 1 TO nDimensions
         nDivTerm := 10000 ^ ((nDim - 1) / nDimensions)
         IF Mod(nDim, 2) == 1
            aPositionalEncoding[nIndex][nDim] := Sin(nPos / nDivTerm)
         ELSE
            aPositionalEncoding[nIndex][nDim] := Cos(nPos / nDivTerm)
         ENDIF
      NEXT
   NEXT
RETURN aPositionalEncoding
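// The encoding above is the sinusoidal scheme used in transformers, in a slightly
// simplified form: for 0-based position pos and 1-based dimension d,
//    PE(pos, d) = Sin( pos / 10000 ^ ((d - 1) / nDimensions) )   for odd d
//    PE(pos, d) = Cos( pos / 10000 ^ ((d - 1) / nDimensions) )   for even d
// The original "Attention Is All You Need" formulation pairs each sin/cos at the same
// frequency (exponent 2 * Int((d - 1) / 2) / nDimensions); this version gives every
// dimension its own frequency, which is fine for illustration.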
FUNCTION AddPositionalEncoding(aInputEmbeddings, aPositionalEncoding)
   LOCAL nIndex, nDim, aEncodedEmbeddings
   aEncodedEmbeddings := Array(LEN(aInputEmbeddings))
   FOR nIndex := 1 TO LEN(aInputEmbeddings)
      aEncodedEmbeddings[nIndex] := Array(LEN(aInputEmbeddings[nIndex]))
      FOR nDim := 1 TO LEN(aInputEmbeddings[nIndex])
         aEncodedEmbeddings[nIndex][nDim] := aInputEmbeddings[nIndex][nDim] + aPositionalEncoding[nIndex][nDim]
      NEXT
   NEXT
RETURN aEncodedEmbeddings
FUNCTION MultiHeadSelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nHeads, nDim)
   LOCAL aHeadOutputs, aConcatenatedHeads, nIndex
   aHeadOutputs := Array(nHeads) // Create an array to store the output of each attention head
   FOR nIndex := 1 TO nHeads
      aHeadOutputs[nIndex] := SelfAttention(aEncodedEmbeddings, aWQ[nIndex], aWK[nIndex], aWV[nIndex], nDim, nHeads)
   NEXT
   // Concatenate the outputs of all heads along the feature dimension
   aConcatenatedHeads := ConcatenateHeads(aHeadOutputs)
RETURN aConcatenatedHeads
FUNCTION SelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nDim, nHeads)
   LOCAL aQ, aK, aV, aScores, aSoftmaxScores, aAttentionOutput, nRow, nCol
   // Reshape the weight matrices aWQ, aWK and aWV
   aWQ := Reshape(aWQ, nDim, nHeads)
   aWK := Reshape(aWK, nDim, nHeads)
   aWV := Reshape(aWV, nDim, nHeads)
   // Compute the Query (Q), Key (K) and Value (V) matrices
   aQ := MultiplyMatrix(aEncodedEmbeddings, aWQ)
   aK := MultiplyMatrix(aEncodedEmbeddings, aWK)
   aV := MultiplyMatrix(aEncodedEmbeddings, aWV)
   // Transpose the Key (K) matrix
   aK := TransposeMatrix(aK)
   // Compute the attention scores and scale them by the square root of the dimension
   aScores := MultiplyMatrix(aQ, aK)
   FOR nRow := 1 TO LEN(aScores)
      FOR nCol := 1 TO LEN(aScores[nRow])
         aScores[nRow][nCol] /= SQRT(nDim)
      NEXT
   NEXT
   // Apply the softmax function to the scores
   aSoftmaxScores := Softmax(aScores)
   // Compute the attention output
   aAttentionOutput := MultiplyMatrix(aSoftmaxScores, aV)
RETURN aAttentionOutput
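// In matrix form this is the standard scaled dot-product attention:
//    Attention(Q, K, V) = Softmax( Q * K^T / Sqrt(d) ) * V
// where d is the dimension of the key vectors (nDim here) and the softmax is applied
// row by row, so each token's attention weights sum to 1.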
FUNCTION SplitMatrix(aMatrix, nSplitSize)
   LOCAL nRows, nCols, nSplits, aSplitMatrices, i, j, k, aSplitMatrix
   nRows := LEN(aMatrix)
   nCols := LEN(aMatrix[1])
   nSplits := INT(nCols / nSplitSize) // Calculate the number of splits
   // Initialize the array of split matrices
   aSplitMatrices := Array(nSplits)
   // Split the matrix into multiple matrices of size nSplitSize
   FOR i := 1 TO nSplits
      aSplitMatrix := Array(nRows)
      FOR j := 1 TO nRows
         aSplitMatrix[j] := Array(nSplitSize)
         FOR k := 1 TO nSplitSize
            aSplitMatrix[j][k] := aMatrix[j][(i - 1) * nSplitSize + k]
         NEXT
      NEXT
      aSplitMatrices[i] := aSplitMatrix
   NEXT
RETURN aSplitMatrices
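// SplitMatrix() is not called from Main(); it splits a matrix column-wise into
// nSplits blocks of nSplitSize columns each (e.g. a 4 x 4 matrix with nSplitSize = 2
// becomes two 4 x 2 matrices), presumably how Q, K and V would be divided per head
// in a fuller implementation.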
FUNCTION ConcatenateHeads(aMultiHeadOutput)
   LOCAL nRow, nHead, aConcatenated
   // Concatenate the head outputs along the feature dimension, so each token row
   // ends up with all the heads' values side by side
   aConcatenated := Array(LEN(aMultiHeadOutput[1]))
   FOR nRow := 1 TO LEN(aMultiHeadOutput[1])
      aConcatenated[nRow] := {}
      FOR nHead := 1 TO LEN(aMultiHeadOutput)
         aConcatenated[nRow] := ConcatenateArrays(aConcatenated[nRow], aMultiHeadOutput[nHead][nRow])
      NEXT
   NEXT
RETURN aConcatenated
FUNCTION ConcatenateArrays(aArray1, aArray2)
   LOCAL aResult, nIndex
   aResult := Array(LEN(aArray1) + LEN(aArray2))
   FOR nIndex := 1 TO LEN(aArray1)
      aResult[nIndex] := aArray1[nIndex]
   NEXT
   FOR nIndex := 1 TO LEN(aArray2)
      aResult[LEN(aArray1) + nIndex] := aArray2[nIndex]
   NEXT
RETURN aResult
function MultiplyMatrix(m1, m2)
   local nRows1, nCols1, nRows2, nCols2, i, j, k
   local result := {}
   nRows1 := Len(m1)
   nCols1 := Len(m1[1])
   nRows2 := Len(m2)
   nCols2 := Len(m2[1])
   // Initialize result matrix with correct dimensions
   result := Array(nRows1, nCols2 )
   // Calculate result matrix: each cell accumulates the dot product of a row of m1 and a column of m2
   for i := 1 to nRows1
      for j := 1 to nCols2
         result[i][j] := 0
         for k := 1 to nCols1
            result[i][j] += m1[i][k] * m2[k][j]
         next
      next
   next
return result
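// Quick sanity check for MultiplyMatrix():
//    { {1, 2}, {3, 4} } * { {5, 6}, {7, 8} } = { {19, 22}, {43, 50} }
// e.g. the top-left cell is 1*5 + 2*7 = 19.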
FUNCTION TransposeMatrix(aMatrix)
   LOCAL nRows, nCols, aResult, i, j
   nRows := LEN(aMatrix)
   nCols := LEN(aMatrix[1])
   aResult := {}
   FOR i := 1 TO nCols
      AAdd(aResult, {})
      FOR j := 1 TO nRows
         AAdd(aResult[i], aMatrix[j][i])
      NEXT
   NEXT
RETURN aResult
FUNCTION NormalizeAndSoftmax(aMatrix)
   LOCAL nRows, nCols, aResult, i, j, nMax, nSum
   nRows := LEN(aMatrix)
   nCols := LEN(aMatrix[1])
   aResult := {}
   FOR i := 1 TO nRows
      AAdd(aResult, {})
      nMax := MaxElem(aMatrix[i])
      nSum := 0
      FOR j := 1 TO nCols
         aMatrix[i][j] := Exp(aMatrix[i][j] - nMax)
         nSum += aMatrix[i][j]
      NEXT
      FOR j := 1 TO nCols
         AAdd(aResult[i], aMatrix[i][j] / nSum)
      NEXT
   NEXT
RETURN aResult
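// Subtracting the row maximum before Exp() does not change the softmax result
// (the common factor cancels between numerator and denominator) but keeps the
// exponentials from overflowing for large scores. Softmax() below uses the same trick.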
FUNCTION MaxElem(aArray)
   LOCAL nMax, nElem
   nMax := aArray[1]
   FOR EACH nElem IN aArray
      IF nElem > nMax
         nMax := nElem
      ENDIF
   NEXT
RETURN nMax
FUNCTION ScaledDotProductAttention(aQuery, aKey, aValue)
   // Alternative helper, not called from Main()
   LOCAL nHeads, nBatchSize, nSeqLength, nDimPerHead, aQueryTranspose, aDotProduct, aScaledDotProduct, aAttentionScores, aAttentionScoresTranspose, aSoftmaxWeights, aMultiplied, aMultipliedTranspose, i, j
   nHeads := LEN(aQuery)
   nBatchSize := LEN(aQuery[1])
   nSeqLength := LEN(aQuery[1][1])
   nDimPerHead := LEN(aQuery[1][1])
   // Compute the dot product of the Query and Key matrices
   aQueryTranspose := TransposeMatrix(aQuery)
   aDotProduct := MultiplyMatrix(aQueryTranspose, aKey)
   // Scale the dot product by the square root of the number of dimensions per head (element by element)
   aScaledDotProduct := aDotProduct
   FOR i := 1 TO LEN(aScaledDotProduct)
      FOR j := 1 TO LEN(aScaledDotProduct[i])
         aScaledDotProduct[i][j] /= SQRT(nDimPerHead)
      NEXT
   NEXT
   // Compute the softmax weights for the attention scores
   aAttentionScores := TransposeMatrix(aScaledDotProduct)
   aAttentionScoresTranspose := NormalizeAndSoftmax(aAttentionScores)
   // Compute the matrix multiplication of the softmax weights and the Value matrix
   aSoftmaxWeights := TransposeMatrix(aAttentionScoresTranspose)
   aMultiplied := MultiplyMatrix(aSoftmaxWeights, aValue)
   aMultipliedTranspose := TransposeMatrix(aMultiplied)
RETURN aMultipliedTranspose
FUNCTION ConcatenateMatrices(aMatrices)
   LOCAL nSplits, nBatchSize, nSeqLength, nDim, aResultMatrix, i, j, k, n, nOffset, aSubMatrix
   nSplits := LEN(aMatrices)
   nBatchSize := LEN(aMatrices[1])
   nSeqLength := LEN(aMatrices[1][1])
   nDim := 0
   // Calculate the total number of dimensions across all splits
   FOR i := 1 TO nSplits
      nDim += LEN(aMatrices[i][1][1])
   NEXT
   // Initialize the result matrix with the correct shape
   aResultMatrix := Array(nBatchSize)
   FOR i := 1 TO nBatchSize
      aResultMatrix[i] := Array(nSeqLength)
      FOR j := 1 TO nSeqLength
         aResultMatrix[i][j] := Array(nDim)
      NEXT
   NEXT
   // Concatenate the splits along the dimension axis
   nOffset := 0
   FOR i := 1 TO nSplits
      FOR j := 1 TO nBatchSize
         FOR k := 1 TO nSeqLength
            aSubMatrix := aMatrices[i][j][k]
            FOR n := 1 TO LEN(aSubMatrix)
               aResultMatrix[j][k][nOffset + n] := aSubMatrix[n]
            NEXT
         NEXT
      NEXT
      nOffset += LEN(aMatrices[i][1][1])
   NEXT
RETURN aResultMatrix
FUNCTION Reshape(aMatrix, nRows, nCols)
   LOCAL aResult, i, j, k, l
   aResult := Array(nRows)
   k := 1
   l := 1
   FOR i := 1 TO nRows
      aResult[i] := Array(nCols)
      FOR j := 1 TO nCols
         aResult[i][j] := aMatrix[k][l]
         l += 1
         IF l > LEN(aMatrix[k])
            l := 1
            k += 1
         ENDIF
      NEXT
   NEXT
RETURN aResult
FUNCTION Softmax(aMatrix)
   LOCAL aResult, nRows, nCols, i, j, nMax, nSum, nExp
   nRows := LEN(aMatrix)
   nCols := LEN(aMatrix[1])
   aResult := Array(nRows) // Create an array to store the result
   FOR i := 1 TO nRows
      aResult[i] := Array(nCols) // Create an array for each row of the result
      nMax := MIN_DOUBLE // Initialize the maximum value for numerical stability
      FOR j := 1 TO nCols
         IF aMatrix[i][j] > nMax
            nMax := aMatrix[i][j]
         ENDIF
      NEXT
      nSum := 0 // Initialize the sum for the softmax function
      FOR j := 1 TO nCols
         nExp := EXP(aMatrix[i][j] - nMax) // Calculate the exponent for each entry
         aResult[i][j] := nExp
         nSum += nExp
      NEXT
      FOR j := 1 TO nCols
         aResult[i][j] /= nSum // Divide each entry by the sum
      NEXT
   NEXT
RETURN aResult
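A quick way to try the building blocks in isolation (this small test is not part of attention.prg, and the procedure name TestBlocks is just a placeholder) is to feed a 2 x 2 matrix through Softmax() and MultiplyMatrix(); every softmax row should sum to 1, and a row with equal scores yields the plain average of the value rows:

PROCEDURE TestBlocks() // Demo name only, not part of the listing above
   LOCAL aScores := { { 1.0, 2.0 }, { 0.5, 0.5 } } // Raw attention scores for two tokens
   LOCAL aValues := { { 1.0, 10.0 }, { 2.0, 20.0 } } // Two value vectors to be blended
   LOCAL aWeights := Softmax( aScores )
   ? "Softmax weights (each row sums to 1):", hb_ValToExp( aWeights )
   ? "Weighted values:", hb_ValToExp( MultiplyMatrix( aWeights, aValues ) )
   // The second output row is { 1.5, 15.0 }, the average of the two value rows
RETURN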