Harbour's multi-head self-attention transformer

Post by **Antonio Linares** » Mon Apr 17, 2023 1:38 pm

This code was mostly written by GPT4, I just supervised errors and tried to make it work.

I find it quite interesting for educational purposes, getting an idea about how transformers (chatGPT) work:

attention.prg

Code: Select all | Expand

#define MIN_DOUBLE Val( "-1.7976931348623158e+308" )

PROCEDURE Main()
    LOCAL aTokenIds, aEmbeddings, aInputEmbeddings, aPositionalEncoding, aEncodedEmbeddings, aWQ, aWK, aWV, aSelfAttentionOutput, nHeads, nDim, i

    nDim := 4 // Set the number of dimensions for the embeddings, weight matrices, etc.
    aTokenIds := {1, 5, 8, 2} // Example token IDs
    aEmbeddings := InitializeEmbeddings(10, nDim) // Initialize embeddings for 10 tokens, each with nDim dimensions
    aInputEmbeddings := GetInputEmbeddings(aTokenIds, aEmbeddings)
    aPositionalEncoding := GeneratePositionalEncoding(LEN(aTokenIds), nDim)
    aEncodedEmbeddings := AddPositionalEncoding(aInputEmbeddings, aPositionalEncoding)

    // Initialize the weight matrices for Query (aWQ), Key (aWK), and Value (aWV) for each head
    nHeads := 4
    aWQ := Array(nHeads)
    aWK := Array(nHeads)
    aWV := Array(nHeads)
    FOR i := 1 TO nHeads
       aWQ[i] := InitializeEmbeddings(nDim, nDim)
       aWK[i] := InitializeEmbeddings(nDim, nDim)
       aWV[i] := InitializeEmbeddings(nDim, nDim)
    NEXT

    aSelfAttentionOutput := MultiHeadSelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nHeads, nDim)

    ? "Input Token IDs:", aTokenIds
    ? "Input Embeddings:", aInputEmbeddings
    ? "Positional Encoding:", aPositionalEncoding
    ? "Encoded Embeddings:", aEncodedEmbeddings
    ? "Multi-Head Self Attention Output:", aSelfAttentionOutput

RETURN

 
 
 FUNCTION InitializeEmbeddings(nTokens, nDimensions)
    LOCAL aEmbeddings, nIndex, nDim
 
    aEmbeddings := Array( nTokens, nDimensions )
    FOR nIndex := 1 TO nTokens
       FOR nDim := 1 TO nDimensions
          aEmbeddings[nIndex][nDim] := (HB_Random() - 0.5) * 2 // Random number between -1 and 1
       NEXT
    NEXT
 
 RETURN aEmbeddings
 
 FUNCTION GetInputEmbeddings(aTokenIds, aEmbeddings)
    LOCAL aInputEmbeddings, nIndex
 
    aInputEmbeddings := Array( LEN(aTokenIds) )
    FOR nIndex := 1 TO LEN(aTokenIds)
       aInputEmbeddings[nIndex] := aEmbeddings[aTokenIds[nIndex]]
    NEXT
 
 RETURN aInputEmbeddings
 
 FUNCTION GeneratePositionalEncoding(nSequenceLength, nDimensions)
    LOCAL aPositionalEncoding, nIndex, nDim, nPos, nDivTerm
 
    aPositionalEncoding := Array( nSequenceLength, nDimensions )
    FOR nIndex := 1 TO nSequenceLength
       nPos := nIndex - 1
       FOR nDim := 1 TO nDimensions
          nDivTerm := 10000 ^ ((nDim - 1) / nDimensions)
          IF Mod(nDim, 2) == 1
             aPositionalEncoding[nIndex][nDim] := Sin(nPos / nDivTerm)
          ELSE
             aPositionalEncoding[nIndex][nDim] := Cos(nPos / nDivTerm)
          ENDIF
       NEXT
    NEXT
 
 RETURN aPositionalEncoding
 
 FUNCTION AddPositionalEncoding(aInputEmbeddings, aPositionalEncoding)
    LOCAL nIndex, nDim, aEncodedEmbeddings
 
    aEncodedEmbeddings := Array(LEN(aInputEmbeddings))
    FOR nIndex := 1 TO LEN(aInputEmbeddings)
       aEncodedEmbeddings[nIndex] := Array(LEN(aInputEmbeddings[nIndex]))
       FOR nDim := 1 TO LEN(aInputEmbeddings[nIndex])
          aEncodedEmbeddings[nIndex][nDim] := aInputEmbeddings[nIndex][nDim] + aPositionalEncoding[nIndex][nDim]
       NEXT
    NEXT
 
 RETURN aEncodedEmbeddings
 
 FUNCTION MultiHeadSelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nHeads, nDim )
    LOCAL aHeadOutputs, aConcatenatedHeads, nIndex

    aHeadOutputs := Array(nHeads) // Create an array to store the output of each attention head

    FOR i := 1 TO nHeads
        aHeadOutputs[i] := SelfAttention(aEncodedEmbeddings, aWQ[i], aWK[i], aWV[i], nDim, nHeads)
    NEXT

    // Concatenate the outputs of all heads
    aConcatenatedHeads := ConcatenateHeads(aHeadOutputs)

    RETURN aConcatenatedHeads
    
FUNCTION SelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nDim, nHeads)
    LOCAL aQ, aK, aV, aScores, aSoftmaxScores, aAttentionOutput

    // Reshape the weight matrices aWQ, aWK, and aWV
    aWQ := Reshape(aWQ, nDim, nHeads)
    aWK := Reshape(aWK, nDim, nHeads)
    aWV := Reshape(aWV, nDim, nHeads)

    // Compute the Query (Q), Key (K), and Value (V) matrices
    aQ := MultiplyMatrix(aEncodedEmbeddings, aWQ)
    aK := MultiplyMatrix(aEncodedEmbeddings, aWK)
    aV := MultiplyMatrix(aEncodedEmbeddings, aWV)

    // Transpose the Key (K) matrix
    aK := TransposeMatrix(aK)

    // Compute the attention scores
    aScores := MultiplyMatrix(aQ, aK)

    // Apply the softmax function to the scores
    aSoftmaxScores := Softmax(aScores)

    // Compute the attention output
    aAttentionOutput := MultiplyMatrix(aSoftmaxScores, aV)

    RETURN aAttentionOutput    
 
 FUNCTION SplitMatrix(aMatrix, nSplitSize)
    LOCAL nRows, nCols, nSplits, aSplitMatrices, i, j, k, aSplitMatrix
 
    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])
    nSplits := INT(nCols / nSplitSize) // Calculate the number of splits
 
    // Initialize the array of split matrices
    aSplitMatrices := Array(nSplits)
 
    // Split the matrix into multiple matrices of size nSplitSize
    FOR i := 1 TO nSplits
       aSplitMatrix := Array(nRows)
       FOR j := 1 TO nRows
          aSplitMatrix[j] := Array(nSplitSize)
          FOR k := 1 TO nSplitSize
             aSplitMatrix[j][k] := aMatrix[j][(i - 1) * nSplitSize + k]
          NEXT
       NEXT
       aSplitMatrices[i] := aSplitMatrix
    NEXT
 
 RETURN aSplitMatrices
 
 FUNCTION ConcatenateHeads(aMultiHeadOutput)
    LOCAL nIndex, nHead, aConcatenated
 
    aConcatenated := aMultiHeadOutput[1]
 
    FOR nHead := 2 TO LEN(aMultiHeadOutput)
       aConcatenated := ConcatenateArrays(aConcatenated, aMultiHeadOutput[nHead])
    NEXT
 
 RETURN aConcatenated
 
 FUNCTION ConcatenateArrays(aArray1, aArray2)
    LOCAL aResult, nIndex
 
    aResult := Array(LEN(aArray1) + LEN(aArray2))
 
    FOR nIndex := 1 TO LEN(aArray1)
       aResult[nIndex] := aArray1[nIndex]
    NEXT
 
    FOR nIndex := 1 TO LEN(aArray2)
       aResult[LEN(aArray1) + nIndex] := aArray2[nIndex]
    NEXT
 
 RETURN aResult
 
 function MultiplyMatrix(m1, m2)
    local nRows1, nCols1, nRows2, nCols2, i, j, k
    local result := {}

    nRows1 := Len(m1)
    nCols1 := Len(m1[1])
    nRows2 := Len(m2)
    nCols2 := Len(m2[1])

    // Initialize result matrix with correct dimensions
    result := Array(nRows1, nCols2 )

    // Calculate result matrix
    for i := 1 to nRows1
        for j := 1 to nCols2
            for k := 1 to nCols1
                result[i][j] = m1[i][k] * m2[k][j]
            next
        next
    next

return result

 FUNCTION TransposeMatrix(aMatrix)
    LOCAL nRows, nCols, aResult, i, j
 
    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])
 
    aResult := {}
    FOR i := 1 TO nCols
       AAdd(aResult, {})
       FOR j := 1 TO nRows
          AAdd(aResult[i], aMatrix[j][i])
       NEXT
    NEXT
 
 RETURN aResult
 
 FUNCTION NormalizeAndSoftmax(aMatrix)
    LOCAL nRows, nCols, aResult, i, j, nMax, nSum
 
    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])
 
    aResult := {}
    FOR i := 1 TO nRows
       AAdd(aResult, {})
       nMax := MaxElem(aMatrix[i])
       nSum := 0
       FOR j := 1 TO nCols
          aMatrix[i][j] := Exp(aMatrix[i][j] - nMax)
          nSum += aMatrix[i][j]
       NEXT
 
       FOR j := 1 TO nCols
          AAdd(aResult[i], aMatrix[i][j] / nSum)
       NEXT
    NEXT
 
 RETURN aResult
 
 FUNCTION MaxElem(aArray)
    LOCAL nMax, nElem
 
    nMax := aArray[1]
    FOR EACH nElem IN aArray
       IF nElem > nMax
          nMax := nElem
       ENDIF
    NEXT
 
 RETURN nMax
 
 FUNCTION ScaledDotProductAttention(aQuery, aKey, aValue)
    LOCAL nHeads, nBatchSize, nSeqLength, nDimPerHead, aQueryTranspose, aDotProduct, aAttentionScores, aAttentionScoresTranspose, aSoftmaxWeights, aMultiplied, aMultipliedTranspose
 
    nHeads := LEN(aQuery)
    nBatchSize := LEN(aQuery[1])
    nSeqLength := LEN(aQuery[1][1])
    nDimPerHead := LEN(aQuery[1][1])
 
    // Compute the dot product of the Query and Key matrices
    aQueryTranspose := TransposeMatrix(aQuery)
    aDotProduct := MultiplyMatrix(aQueryTranspose, aKey)
 
    // Scale the dot product by the square root of the number of dimensions per head
    aScaledDotProduct := aDotProduct / SQRT(nDimPerHead)
 
    // Compute the softmax weights for the attention scores
    aAttentionScores := TransposeMatrix(aScaledDotProduct)
    aAttentionScoresTranspose := NormalizeAndSoftmax(aAttentionScores)
 
    // Compute the matrix multiplication of the softmax weights and the Value matrix
    aSoftmaxWeights := TransposeMatrix(aAttentionScoresTranspose)
    aMultiplied := MultiplyMatrix(aSoftmaxWeights, aValue)
    aMultipliedTranspose := TransposeMatrix(aMultiplied)
 
    // Reshape the output to match the input shape
RETURN aMultipliedTranspose

FUNCTION ConcatenateMatrices(aMatrices)
   LOCAL nSplits, nBatchSize, nSeqLength, nDim, aResultMatrix, i, j, k, nOffset, aSubMatrix

   nSplits := LEN(aMatrices)
   nBatchSize := LEN(aMatrices[1])
   nSeqLength := LEN(aMatrices[1][1])
   nDim := 0

   // Calculate the total number of dimensions across all splits
   FOR i := 1 TO nSplits
      nDim += LEN(aMatrices[i][1][1])
   NEXT

   // Initialize the result matrix with the correct shape
   aResultMatrix := Array(nBatchSize)
   FOR i := 1 TO nBatchSize
      aResultMatrix[i] := Array(nSeqLength)
      FOR j := 1 TO nSeqLength
         aResultMatrix[i][j] := Array(nDim)
      NEXT
   NEXT

   // Concatenate the splits along the dimension axis
   nOffset := 0
   FOR i := 1 TO nSplits
      FOR j := 1 TO nBatchSize
         FOR k := 1 TO nSeqLength
            aSubMatrix := aMatrices[i][j][k]
            FOR n := 1 TO LEN(aSubMatrix)
               aResultMatrix[j][k][nOffset+n] := aSubMatrix[n]
            NEXT
         NEXT
      NEXT
      nOffset += LEN(aMatrices[i][1][1])
   NEXT

RETURN aResultMatrix

FUNCTION Reshape(aMatrix, nRows, nCols)
    LOCAL aResult, i, j, k, l

    aResult := Array(nRows)

    k := 1
    l := 1

    FOR i := 1 TO nRows
        aResult[i] := Array(nCols)

        FOR j := 1 TO nCols
            aResult[i][j] := aMatrix[k][l]
            l += 1
            IF l > LEN(aMatrix[k])
                l := 1
                k += 1
            ENDIF
        NEXT
    NEXT

    RETURN aResult
    
FUNCTION Softmax(aMatrix)
    LOCAL aResult, nRows, nCols, i, j, nMax, nSum, nExp

    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])

    aResult := Array(nRows) // Create an array to store the result

    FOR i := 1 TO nRows
        aResult[i] := Array(nCols) // Create an array for each row of the result

        nMax := MIN_DOUBLE // Initialize the maximum value for numerical stability
        FOR j := 1 TO nCols
            IF aMatrix[i][j] > nMax
                nMax := aMatrix[i][j]
            ENDIF
        NEXT

        nSum := 0 // Initialize the sum for the softmax function
        FOR j := 1 TO nCols
            nExp := EXP(aMatrix[i][j] - nMax) // Calculate the exponent for each entry
            aResult[i][j] := nExp
            nSum += nExp
        NEXT

        FOR j := 1 TO nCols
            aResult[i][j] /= nSum // Divide each entry by the sum
        NEXT
    NEXT

    RETURN aResult

Post by **Jimmy** » Mon Apr 17, 2023 2:06 pm

hi Antonio,

does the CODE really work

i can compile it (with some Warning) but i got "no Display" (and no Error LOG)

Post by **Antonio Linares** » Mon Apr 17, 2023 3:57 pm

Providing it some sentences. Still needs training...

attention.prg

Code: Select all | Expand

#define MIN_DOUBLE Val( "-1.7976931348623158e+308" )

PROCEDURE Main()
    LOCAL aSentences, aVocabulary := {}, aTokenIds, aEmbeddings, aInputEmbeddings, aPositionalEncoding, aEncodedEmbeddings, aWQ, aWK, aWV, aSelfAttentionOutput, nHeads, nDim, i, aNextWordEmbedding, aOutput, aNextWordTokens, aNextWords

    aSentences := {;
        "The quick brown fox jumps over the lazy dog.",;
        "I have a pen. I have an apple. Apple pen.",;
        "A journey of a thousand miles begins with a single step."}

    aTokenIds := SentencesToTokenIds( aSentences, @aVocabulary )

    nDim := 4 // Set the number of dimensions for the embeddings, weight matrices, etc.
    aEmbeddings := InitializeEmbeddings( Len( aTokenIds ), nDim) // Initialize embeddings for 10 tokens, each with nDim dimensions
    aInputEmbeddings := GetInputEmbeddings(aTokenIds, aEmbeddings)
    aPositionalEncoding := GeneratePositionalEncoding(LEN(aTokenIds), nDim)
    aEncodedEmbeddings := AddPositionalEncoding(aInputEmbeddings, aPositionalEncoding)

    // Initialize the weight matrices for Query (aWQ), Key (aWK), and Value (aWV) for each head
    nHeads := 4
    aWQ := Array(nHeads)
    aWK := Array(nHeads)
    aWV := Array(nHeads)
    FOR i := 1 TO nHeads
       aWQ[i] := InitializeEmbeddings(nDim, nDim)
       aWK[i] := InitializeEmbeddings(nDim, nDim)
       aWV[i] := InitializeEmbeddings(nDim, nDim)
    NEXT

    aSelfAttentionOutput := MultiHeadSelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nHeads, nDim)

    aNextWordEmbedding := InitializeEmbeddings(nDim, nDim) // Initialize output weight matrix for the next word prediction
    aOutput := GetNextWordPrediction(aSelfAttentionOutput, aNextWordEmbedding)
    aNextWordTokens := GetTokensFromEmbeddings(aOutput, aEmbeddings)
    aNextWords := TokensToWords( aNextWordTokens, aVocabulary )

    // ? "Input Token IDs:", aTokenIds
    // ? "Input Embeddings:", aInputEmbeddings
    // ? "Positional Encoding:", aPositionalEncoding
    // ? "Encoded Embeddings:", aEncodedEmbeddings
    // ? "Multi-Head Self Attention Output:", aSelfAttentionOutput
    ? "Next Word Prediction:", aNextWords

RETURN

FUNCTION TokensToWords(aTokenIds, aVocabulary)
    LOCAL aReversedVocabulary, aWords, nTokenId, cWord

    aWords := {}

    FOR EACH nTokenId IN aTokenIds
        if nTokenId <= Len( aVocabulary ) 
           cWord := aVocabulary[ nTokenId ]
           AAdd(aWords, cWord)
        endif   
    NEXT

RETURN aWords

FUNCTION ReverseVocabulary(aVocabulary)
    LOCAL aReversedVocabulary, nTokenId, cWord

    aReversedVocabulary := Array( Len( aVocabulary ) )
    
    FOR EACH cWord IN aVocabulary
        nTokenId := AScan( aVocabulary, cWord )
        aReversedVocabulary[nTokenId] := cWord
    NEXT

RETURN aReversedVocabulary

FUNCTION GetNextWordPrediction(aSelfAttentionOutput, aNextWordEmbedding)
    LOCAL aOutput, nRow, nCol, nSum

    aOutput := Array(LEN(aSelfAttentionOutput))
    FOR nRow := 1 TO LEN(aSelfAttentionOutput)
        aOutput[nRow] := Array(LEN(aNextWordEmbedding))
        FOR nCol := 1 TO LEN(aNextWordEmbedding)
            nSum := 0
            FOR i := 1 TO LEN(aSelfAttentionOutput[nRow])
                nSum := nSum + aSelfAttentionOutput[nRow][i] * aNextWordEmbedding[i][nCol]
            NEXT
            aOutput[nRow][nCol] := nSum
        NEXT
    NEXT

RETURN aOutput

FUNCTION SentencesToTokenIds(aSentences, aVocabulary)
    LOCAL aTokenIds, aSentence, aTokenId, nToken, cWord, aWords

    aTokenIds := {}

    FOR EACH aSentence IN aSentences
       aWords = hb_ATokens( aSentence )
       for each cWord in aWords
          if AScan( aVocabulary, cWord ) == 0
             AAdd( aVocabulary, cWord )
             nToken = Len( aVocabulary )
          else
             nToken = AScan( aVocabulary, cWord )
          endif     
          AAdd( aTokenIds, nToken )
        NEXT
    NEXT

RETURN aTokenIds

FUNCTION GetTokensFromEmbeddings(aOutputEmbeddings, aEmbeddings)
    LOCAL aTokenIds, nIndex, nRow

    aTokenIds := Array(LEN(aOutputEmbeddings))

    FOR nRow := 1 TO LEN(aOutputEmbeddings)
        nIndex := FindClosestEmbedding(aOutputEmbeddings[nRow], aEmbeddings)
        aTokenIds[nRow] := nIndex
    NEXT

RETURN aTokenIds

FUNCTION FindClosestEmbedding(aTargetEmbedding, aEmbeddings)
    LOCAL nMinDistance, nDistance, nClosestIndex, nRow

    nMinDistance := Val( "1e99" )
    nClosestIndex := -1

    FOR nRow := 1 TO LEN(aEmbeddings)
        nDistance := CalculateDistance(aTargetEmbedding, aEmbeddings[nRow])
        IF nDistance < nMinDistance
            nMinDistance := nDistance
            nClosestIndex := nRow
        ENDIF
    NEXT

RETURN nClosestIndex

FUNCTION CalculateDistance(aVector1, aVector2)
    LOCAL nSum, i

    nSum := 0

    FOR i := 1 TO LEN(aVector1)
        nSum := nSum + (aVector1[i] - aVector2[i])^2
    NEXT

RETURN SQRT(nSum)

FUNCTION InitializeEmbeddings(nTokens, nDimensions)
    LOCAL aEmbeddings, nIndex, nDim
 
    aEmbeddings := Array( nTokens, nDimensions )
    FOR nIndex := 1 TO nTokens
       FOR nDim := 1 TO nDimensions
          aEmbeddings[nIndex][nDim] := (HB_Random() - 0.5) * 2 // Random number between -1 and 1
       NEXT
    NEXT
 
 RETURN aEmbeddings
 
 FUNCTION GetInputEmbeddings(aTokenIds, aEmbeddings)
    LOCAL aInputEmbeddings, nIndex
 
    aInputEmbeddings := Array( LEN(aTokenIds) )
    FOR nIndex := 1 TO LEN(aTokenIds)
       aInputEmbeddings[nIndex] := aEmbeddings[aTokenIds[nIndex]]
    NEXT
 
 RETURN aInputEmbeddings
 
 FUNCTION GeneratePositionalEncoding(nSequenceLength, nDimensions)
    LOCAL aPositionalEncoding, nIndex, nDim, nPos, nDivTerm
 
    aPositionalEncoding := Array( nSequenceLength, nDimensions )
    FOR nIndex := 1 TO nSequenceLength
       nPos := nIndex - 1
       FOR nDim := 1 TO nDimensions
          nDivTerm := 10000 ^ ((nDim - 1) / nDimensions)
          IF Mod(nDim, 2) == 1
             aPositionalEncoding[nIndex][nDim] := Sin(nPos / nDivTerm)
          ELSE
             aPositionalEncoding[nIndex][nDim] := Cos(nPos / nDivTerm)
          ENDIF
       NEXT
    NEXT
 
 RETURN aPositionalEncoding
 
 FUNCTION AddPositionalEncoding(aInputEmbeddings, aPositionalEncoding)
    LOCAL nIndex, nDim, aEncodedEmbeddings
 
    aEncodedEmbeddings := Array(LEN(aInputEmbeddings))
    FOR nIndex := 1 TO LEN(aInputEmbeddings)
       aEncodedEmbeddings[nIndex] := Array(LEN(aInputEmbeddings[nIndex]))
       FOR nDim := 1 TO LEN(aInputEmbeddings[nIndex])
          aEncodedEmbeddings[nIndex][nDim] := aInputEmbeddings[nIndex][nDim] + aPositionalEncoding[nIndex][nDim]
       NEXT
    NEXT
 
 RETURN aEncodedEmbeddings
 
 FUNCTION MultiHeadSelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nHeads, nDim )
    LOCAL aHeadOutputs, aConcatenatedHeads, nIndex

    aHeadOutputs := Array(nHeads) // Create an array to store the output of each attention head

    FOR i := 1 TO nHeads
        aHeadOutputs[i] := SelfAttention(aEncodedEmbeddings, aWQ[i], aWK[i], aWV[i], nDim, nHeads)
    NEXT

    // Concatenate the outputs of all heads
    aConcatenatedHeads := ConcatenateHeads(aHeadOutputs)

    RETURN aConcatenatedHeads
    
FUNCTION SelfAttention(aEncodedEmbeddings, aWQ, aWK, aWV, nDim, nHeads)
    LOCAL aQ, aK, aV, aScores, aSoftmaxScores, aAttentionOutput

    // Reshape the weight matrices aWQ, aWK, and aWV
    aWQ := Reshape(aWQ, nDim, nHeads)
    aWK := Reshape(aWK, nDim, nHeads)
    aWV := Reshape(aWV, nDim, nHeads)

    // Compute the Query (Q), Key (K), and Value (V) matrices
    aQ := MultiplyMatrix(aEncodedEmbeddings, aWQ)
    aK := MultiplyMatrix(aEncodedEmbeddings, aWK)
    aV := MultiplyMatrix(aEncodedEmbeddings, aWV)

    // Transpose the Key (K) matrix
    aK := TransposeMatrix(aK)

    // Compute the attention scores
    aScores := MultiplyMatrix(aQ, aK)

    // Apply the softmax function to the scores
    aSoftmaxScores := Softmax(aScores)

    // Compute the attention output
    aAttentionOutput := MultiplyMatrix(aSoftmaxScores, aV)

    RETURN aAttentionOutput    
 
 FUNCTION SplitMatrix(aMatrix, nSplitSize)
    LOCAL nRows, nCols, nSplits, aSplitMatrices, i, j, k, aSplitMatrix
 
    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])
    nSplits := INT(nCols / nSplitSize) // Calculate the number of splits
 
    // Initialize the array of split matrices
    aSplitMatrices := Array(nSplits)
 
    // Split the matrix into multiple matrices of size nSplitSize
    FOR i := 1 TO nSplits
       aSplitMatrix := Array(nRows)
       FOR j := 1 TO nRows
          aSplitMatrix[j] := Array(nSplitSize)
          FOR k := 1 TO nSplitSize
             aSplitMatrix[j][k] := aMatrix[j][(i - 1) * nSplitSize + k]
          NEXT
       NEXT
       aSplitMatrices[i] := aSplitMatrix
    NEXT
 
 RETURN aSplitMatrices
 
 FUNCTION ConcatenateHeads(aMultiHeadOutput)
    LOCAL nIndex, nHead, aConcatenated
 
    aConcatenated := aMultiHeadOutput[1]
 
    FOR nHead := 2 TO LEN(aMultiHeadOutput)
       aConcatenated := ConcatenateArrays(aConcatenated, aMultiHeadOutput[nHead])
    NEXT
 
 RETURN aConcatenated
 
 FUNCTION ConcatenateArrays(aArray1, aArray2)
    LOCAL aResult, nIndex
 
    aResult := Array(LEN(aArray1) + LEN(aArray2))
 
    FOR nIndex := 1 TO LEN(aArray1)
       aResult[nIndex] := aArray1[nIndex]
    NEXT
 
    FOR nIndex := 1 TO LEN(aArray2)
       aResult[LEN(aArray1) + nIndex] := aArray2[nIndex]
    NEXT
 
 RETURN aResult
 
 function MultiplyMatrix(m1, m2)
    local nRows1, nCols1, nRows2, nCols2, i, j, k
    local result := {}

    nRows1 := Len(m1)
    nCols1 := Len(m1[1])
    nRows2 := Len(m2)
    nCols2 := Len(m2[1])

    // Initialize result matrix with correct dimensions
    result := Array(nRows1, nCols2 )

    // Calculate result matrix
    for i := 1 to nRows1
        for j := 1 to nCols2
            for k := 1 to nCols1
                result[i][j] = m1[i][k] * m2[k][j]
            next
        next
    next

return result

 FUNCTION TransposeMatrix(aMatrix)
    LOCAL nRows, nCols, aResult, i, j
 
    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])
 
    aResult := {}
    FOR i := 1 TO nCols
       AAdd(aResult, {})
       FOR j := 1 TO nRows
          AAdd(aResult[i], aMatrix[j][i])
       NEXT
    NEXT
 
 RETURN aResult
 
 FUNCTION NormalizeAndSoftmax(aMatrix)
    LOCAL nRows, nCols, aResult, i, j, nMax, nSum
 
    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])
 
    aResult := {}
    FOR i := 1 TO nRows
       AAdd(aResult, {})
       nMax := MaxElem(aMatrix[i])
       nSum := 0
       FOR j := 1 TO nCols
          aMatrix[i][j] := Exp(aMatrix[i][j] - nMax)
          nSum += aMatrix[i][j]
       NEXT
 
       FOR j := 1 TO nCols
          AAdd(aResult[i], aMatrix[i][j] / nSum)
       NEXT
    NEXT
 
 RETURN aResult
 
 FUNCTION MaxElem(aArray)
    LOCAL nMax, nElem
 
    nMax := aArray[1]
    FOR EACH nElem IN aArray
       IF nElem > nMax
          nMax := nElem
       ENDIF
    NEXT
 
 RETURN nMax
 
 FUNCTION ScaledDotProductAttention(aQuery, aKey, aValue)
    LOCAL nHeads, nBatchSize, nSeqLength, nDimPerHead, aQueryTranspose, aDotProduct, aAttentionScores, aAttentionScoresTranspose, aSoftmaxWeights, aMultiplied, aMultipliedTranspose
 
    nHeads := LEN(aQuery)
    nBatchSize := LEN(aQuery[1])
    nSeqLength := LEN(aQuery[1][1])
    nDimPerHead := LEN(aQuery[1][1])
 
    // Compute the dot product of the Query and Key matrices
    aQueryTranspose := TransposeMatrix(aQuery)
    aDotProduct := MultiplyMatrix(aQueryTranspose, aKey)
 
    // Scale the dot product by the square root of the number of dimensions per head
    aScaledDotProduct := aDotProduct / SQRT(nDimPerHead)
 
    // Compute the softmax weights for the attention scores
    aAttentionScores := TransposeMatrix(aScaledDotProduct)
    aAttentionScoresTranspose := NormalizeAndSoftmax(aAttentionScores)
 
    // Compute the matrix multiplication of the softmax weights and the Value matrix
    aSoftmaxWeights := TransposeMatrix(aAttentionScoresTranspose)
    aMultiplied := MultiplyMatrix(aSoftmaxWeights, aValue)
    aMultipliedTranspose := TransposeMatrix(aMultiplied)
 
    // Reshape the output to match the input shape
RETURN aMultipliedTranspose

FUNCTION ConcatenateMatrices(aMatrices)
   LOCAL nSplits, nBatchSize, nSeqLength, nDim, aResultMatrix, i, j, k, nOffset, aSubMatrix

   nSplits := LEN(aMatrices)
   nBatchSize := LEN(aMatrices[1])
   nSeqLength := LEN(aMatrices[1][1])
   nDim := 0

   // Calculate the total number of dimensions across all splits
   FOR i := 1 TO nSplits
      nDim += LEN(aMatrices[i][1][1])
   NEXT

   // Initialize the result matrix with the correct shape
   aResultMatrix := Array(nBatchSize)
   FOR i := 1 TO nBatchSize
      aResultMatrix[i] := Array(nSeqLength)
      FOR j := 1 TO nSeqLength
         aResultMatrix[i][j] := Array(nDim)
      NEXT
   NEXT

   // Concatenate the splits along the dimension axis
   nOffset := 0
   FOR i := 1 TO nSplits
      FOR j := 1 TO nBatchSize
         FOR k := 1 TO nSeqLength
            aSubMatrix := aMatrices[i][j][k]
            FOR n := 1 TO LEN(aSubMatrix)
               aResultMatrix[j][k][nOffset+n] := aSubMatrix[n]
            NEXT
         NEXT
      NEXT
      nOffset += LEN(aMatrices[i][1][1])
   NEXT

RETURN aResultMatrix

FUNCTION Reshape(aMatrix, nRows, nCols)
    LOCAL aResult, i, j, k, l

    aResult := Array(nRows)

    k := 1
    l := 1

    FOR i := 1 TO nRows
        aResult[i] := Array(nCols)

        FOR j := 1 TO nCols
            aResult[i][j] := aMatrix[k][l]
            l += 1
            IF l > LEN(aMatrix[k])
                l := 1
                k += 1
            ENDIF
        NEXT
    NEXT

    RETURN aResult
    
FUNCTION Softmax(aMatrix)
    LOCAL aResult, nRows, nCols, i, j, nMax, nSum, nExp

    nRows := LEN(aMatrix)
    nCols := LEN(aMatrix[1])

    aResult := Array(nRows) // Create an array to store the result

    FOR i := 1 TO nRows
        aResult[i] := Array(nCols) // Create an array for each row of the result

        nMax := MIN_DOUBLE // Initialize the maximum value for numerical stability
        FOR j := 1 TO nCols
            IF aMatrix[i][j] > nMax
                nMax := aMatrix[i][j]
            ENDIF
        NEXT

        nSum := 0 // Initialize the sum for the softmax function
        FOR j := 1 TO nCols
            nExp := EXP(aMatrix[i][j] - nMax) // Calculate the exponent for each entry
            aResult[i][j] := nExp
            nSum += nExp
        NEXT

        FOR j := 1 TO nCols
            aResult[i][j] /= nSum // Divide each entry by the sum
        NEXT
    NEXT

    RETURN aResult

Post by **Antonio Linares** » Mon Apr 17, 2023 3:58 pm

Dear Jimmy,

Please run it from here:
https://www.fivetechsoft.com/counter/modpro.php

I love this (mod_harbour) webapp for testing Harbour code and ideas quickly

Post by **Jimmy** » Tue Apr 18, 2023 12:18 am

hi Antonio,

Antonio Linares wrote: Please run it from here:
https://www.fivetechsoft.com/counter/modpro.php
I love this (mod_harbour) webapp for testing Harbour code and ideas quickly

WOW
i never have see something like this

FiveTech Software tech support forums

Harbour's multi-head self-attention transformer

Harbour's multi-head self-attention transformer

Re: Harbour's multi-head self-attention transformer

Re: Harbour's multi-head self-attention transformer

Re: Harbour's multi-head self-attention transformer

Re: Harbour's multi-head self-attention transformer