Embeddings for Harbour/xHarbour !!!

Post Reply
User avatar
Antonio Linares
Site Admin
Posts: 42716
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain
Has thanked: 93 times
Been thanked: 103 times
Contact:

Embeddings for Harbour/xHarbour !!!

Post by Antonio Linares »

The new TEmbeddings class provides embeddings support for Harbour/xHarbour and can also be used with DBFs or other database engines :) :!:

Included in the next FWH build! Thanks to my son Lolo for his inspiration and for explaining it to me! <3

This class uses the free HuggingFace embeddings API (384 values for each embedding with the default all-MiniLM-L6-v2 model) or the commercial OpenAI embeddings API (3,072 values for each embedding with text-embedding-3-large). We convert those values to a JSON string that can be saved into a DBF, TXT, INI, etc.

tembeddings.prg

Code: Select all | Expand

#include "FiveWin.ch"
#include "hbcurl.ch"

//----------------------------------------------------------------------------//

// TEmbeddings: requests a text embedding from the Hugging Face or OpenAI
// HTTP API through libcurl and computes distances between embeddings.
CLASS TEmbeddings

   DATA   cModel                 // Model name; IsOpenAI() inspects it to pick the provider
   DATA   cPrompt                // Text sent by GetEmbeddings()
   DATA   cResponse              // Raw response body, or "Error code ..." when curl fails
   DATA   cUrl                   // Endpoint selected in New()
   DATA   cKey                   // API key sent as "Authorization: Bearer ..."
   DATA   hCurl                  // libcurl easy handle; set to nil by End()
   DATA   nError     INIT 0      // Result of the last curl_easy_perform()
   DATA   nHttpCode  INIT 0      // Written by GetEmbeddings() from curl_easy_getinfo()
   DATA   aEmbeddings            // Vector stored by GetEmbeddings()

   METHOD New( cModel, cKey )
   METHOD Distance( aVector2, lCosine )
   // lCosine is optional and forwarded as is; when omitted Distance() applies its own default
   METHOD DistanceFrom( oEmbeddings, lCosine ) INLINE ::Distance( oEmbeddings:aEmbeddings, lCosine )
   METHOD End()
   METHOD GetEmbeddings( cPrompt )
   // Both substring tests are done on the lower-cased model name so they agree on case
   METHOD IsOpenAI() INLINE ( "openai" $ Lower( ::cModel ) .or. "text-embedding-" $ Lower( ::cModel ) )
   // oA - oB yields the distance between the two stored embeddings
   OPERATOR "-" ARG oEmbeddings INLINE ::DistanceFrom( oEmbeddings )
   DESTRUCTOR Destroy()

ENDCLASS

//----------------------------------------------------------------------------//

// Constructor.
//   cModel  Model name. Defaults to "sentence-transformers/all-MiniLM-L6-v2".
//   cKey    API key. When omitted it is read from the environment: the
//           variable matching the selected provider is tried first and the
//           other provider's variable is used as a fallback.
// Returns Self. Shows a MsgAlert() when no key could be found.
METHOD New( cModel, cKey ) CLASS TEmbeddings

   local cPrimaryVar, cFallbackVar

   DEFAULT cModel := "sentence-transformers/all-MiniLM-L6-v2"  // Default to Hugging Face

   // The model must be stored first: IsOpenAI() reads ::cModel
   ::cModel = cModel

   // Pick the endpoint and the order of the environment variables from the provider
   if ::IsOpenAI()
      cPrimaryVar  := "OPENAI_API_KEY"
      cFallbackVar := "HF_API_KEY"
      ::cUrl = "https://api.openai.com/v1/embeddings"
   else
      cPrimaryVar  := "HF_API_KEY"
      cFallbackVar := "OPENAI_API_KEY"
      ::cUrl = "https://api-inference.huggingface.co/pipeline/feature-extraction/" + ::cModel  // Feature extraction for embeddings
   endif

   // Only an omitted key triggers the primary lookup; an explicit empty
   // key goes straight to the fallback variable
   if cKey == nil
      cKey := hb_GetEnv( cPrimaryVar )
   endif

   if Empty( cKey )
      cKey := hb_GetEnv( cFallbackVar )
   endif

   ::cKey  = cKey
   ::hCurl = curl_easy_init()

   if Empty( ::cKey )
      MsgAlert( "API key is required! Set it via parameter or HF_API_KEY/OPENAI_API_KEY environment variables." )
   endif

return Self    

//----------------------------------------------------------------------------//

// Releases the libcurl handle. Safe to call more than once: the handle is
// only cleaned up while it is still set, and it is cleared afterwards.
// Returns nil.
METHOD End() CLASS TEmbeddings

    if ::hCurl != nil
       curl_easy_cleanup( ::hCurl )
       ::hCurl = nil
    endif

return nil    

//----------------------------------------------------------------------------//

// Sends the prompt to the embeddings endpoint and stores the result.
//   cPrompt  Text to embed. When empty, the previously stored ::cPrompt is sent.
// Returns the embedding array on success. On failure returns the error
// information found in the response, or ::cResponse when none can be
// extracted, and leaves ::aEmbeddings as nil.
// Also updates ::nError, ::nHttpCode and ::cResponse.
METHOD GetEmbeddings( cPrompt ) CLASS TEmbeddings 

   local aHeaders, cJson, hRequest := { => }, hResponse, uValue

   if ! Empty( cPrompt )
      ::cPrompt = cPrompt
   endif   

   curl_easy_setopt( ::hCurl, HB_CURLOPT_POST, .T. )
   curl_easy_setopt( ::hCurl, HB_CURLOPT_URL, ::cUrl )

   aHeaders := { ;
      "Content-Type: application/json", ;
      "Authorization: Bearer " + ::cKey ;
   }

   curl_easy_setopt( ::hCurl, HB_CURLOPT_HTTPHEADER, aHeaders )
   curl_easy_setopt( ::hCurl, HB_CURLOPT_USERNAME, '' )
   curl_easy_setopt( ::hCurl, HB_CURLOPT_DL_BUFF_SETUP )
   // NOTE(review): peer certificate verification is turned off although the
   // API key travels on this connection. Consider enabling it once a CA
   // bundle is available to libcurl.
   curl_easy_setopt( ::hCurl, HB_CURLOPT_SSL_VERIFYPEER, .F. )

   if ::IsOpenAI()
      hRequest[ "model" ] = ::cModel
      hRequest[ "input" ] = ::cPrompt  // OpenAI uses "input" instead of "inputs"
   else
      // Hugging Face: Correct structure for feature-extraction pipeline
      hRequest[ "inputs" ] = ::cPrompt
   endif

   cJson = hb_jsonEncode( hRequest )
   curl_easy_setopt( ::hCurl, HB_CURLOPT_POSTFIELDS, cJson )

   ::nError = curl_easy_perform( ::hCurl )

   // hbcurl returns the requested value; its optional third argument
   // receives the curl result code, not the HTTP status
   uValue = curl_easy_getinfo( ::hCurl, HB_CURLINFO_RESPONSE_CODE )
   ::nHttpCode = If( ValType( uValue ) == "N", uValue, 0 )
   uValue = nil

   if ::nError == HB_CURLE_OK
      ::cResponse = curl_easy_dl_buff_get( ::hCurl )
      hb_jsonDecode( ::cResponse, @hResponse )
   else
      // Nothing to decode: hResponse stays nil and is handled below
      ::cResponse := "Error code " + Str( ::nError )
   endif

   ::aEmbeddings = nil

   if ::IsOpenAI()
      TRY 
         uValue = hResponse[ "data" ][ 1 ][ "embedding" ]  // OpenAI returns embeddings in "data[0].embedding"
      CATCH
         uValue = nil
      END   
   else  // Hugging Face
      uValue = hResponse  // HF feature-extraction returns the vector directly as an array
   endif   

   if ValType( uValue ) == "A"
      ::aEmbeddings = uValue
   else
      // Not a vector: report the provider's error text when there is one.
      // The lookup is protected because hResponse may be nil or may lack
      // the "error" key.
      TRY
         uValue = If( ::IsOpenAI(), hResponse[ "error" ][ "message" ], hResponse[ "error" ] )
      CATCH
         uValue = ::cResponse
      END
   endif
    
return uValue

//----------------------------------------------------------------------------//

// Destructor: hands the curl handle over to End() unless it is already nil.
PROCEDURE Destroy() CLASS TEmbeddings

   // No handle left, nothing to release
   if ::hCurl == nil
      return
   endif

   ::End()

return    

//----------------------------------------------------------------------------//

// Distance between the stored embedding and another vector.
//   aVector2  Array of numbers with the same length as ::aEmbeddings.
//   lCosine   .T. (default) for cosine distance, .F. for Euclidean distance.
// Returns the distance, or -1 when either vector is missing, is not an
// array, or the lengths differ. Cosine distance is 1 when either vector
// has zero norm.
METHOD Distance( aVector2, lCosine ) CLASS TEmbeddings

    local nDistance, nSum, nNorm1, nNorm2, nDotProduct, nI, nLen

    DEFAULT lCosine := .T.  // Default to cosine distance
 
    // Error: no usable embeddings stored in ::aEmbeddings
    if ! ( ValType( ::aEmbeddings ) == "A" ) .or. Empty( ::aEmbeddings )
       return -1
    endif
 
    // Error: the other vector is missing (e.g. a failed request) or not an array
    if ! ( ValType( aVector2 ) == "A" )
       return -1
    endif

    nLen := Len( ::aEmbeddings )

    if nLen != Len( aVector2 )
       return -1  // Error: vectors must have the same length
    endif
 
    if lCosine
       // Calculate dot product and norms for cosine similarity
       nDotProduct := 0
       nNorm1 := 0
       nNorm2 := 0
       for nI := 1 to nLen
          nDotProduct += ::aEmbeddings[nI] * aVector2[nI]
          nNorm1 += ::aEmbeddings[nI] ^ 2
          nNorm2 += aVector2[nI] ^ 2
       next
       nNorm1 := Sqrt( nNorm1 )
       nNorm2 := Sqrt( nNorm2 )
       
       // Avoid division by zero
       if nNorm1 == 0 .OR. nNorm2 == 0
          return 1  // Maximum distance if one vector is zero
       endif
       
       // Cosine similarity and distance
       nDistance := 1 - ( nDotProduct / ( nNorm1 * nNorm2 ) )
    else
       // Euclidean distance
       nSum := 0
       for nI := 1 to nLen
          nSum += ( ::aEmbeddings[nI] - aVector2[nI] ) ^ 2
       next
       nDistance := Sqrt( nSum )
    endif
 
return nDistance
 
//----------------------------------------------------------------------------//
Samples:

embeddings1.prg

Code: Select all | Expand

// Please remember to set HF_API_KEY=hf_... or OPENAI_API_KEY=... in your environment

#include "FiveWin.ch"

// Sample: embeds three sentences and shows the distance from the first
// sentence to each of the other two, using the "-" operator of TEmbeddings.
function Main()

    local aTexts   := { "Me siento muy bien", "¿ Como estás ?", "he visto a Pepe" }
    local aVectors := Array( Len( aTexts ) )
    local nAt

    // Create every object first ...
    for nAt := 1 to Len( aTexts )
       aVectors[ nAt ] := TEmbeddings():New()
    next

    // ... then request the embeddings, one sentence per object
    for nAt := 1 to Len( aTexts )
       aVectors[ nAt ]:GetEmbeddings( aTexts[ nAt ] )
    next

    ? aVectors[ 1 ] - aVectors[ 2 ]
    ? aVectors[ 1 ] - aVectors[ 3 ]

return nil    
embeddings2.prg

Code: Select all | Expand

// Please remember to set HF_API_KEY=hf_... or OPENAI_API_KEY=... in your environment

#include "FiveWin.ch"

// Sample: embeds one sentence and appends it, together with its vector
// encoded as JSON, to embeddings.dbf (created on first use), then browses
// the table.
function Main()

    local oEmbeddings := TEmbeddings():New()
    local cJson, cSentence := "Hello how are you ?" 
    
    oEmbeddings:GetEmbeddings( cSentence )

    // Do not store anything unless a vector was really received; otherwise
    // the table would get the JSON text "null" or an error payload
    if ! ( ValType( oEmbeddings:aEmbeddings ) == "A" )
       MsgAlert( "Unable to get the embeddings for: " + cSentence )
       return nil
    endif
    
    cJson = hb_jsonEncode( oEmbeddings:aEmbeddings )

    if ! File( "embeddings.dbf" )
       DbCreate( "embeddings", { { "id", "+", 4, 0 },;
                                 { "sentence", "M", 10, 0 },;
                                 { "vectors", "M", 10 , 0 } } ) 
    endif
    
    USE embeddings 
    APPEND BLANK
    REPLACE sentence WITH cSentence
    REPLACE vectors WITH cJson
    XBrowse()
    USE
    
return nil    
regards, saludos

Antonio Linares
www.fivetechsoft.com
Post Reply