Word Embeddings

Word Embeddings

Post by Antonio Linares » Sat May 25, 2024 9:12 am

GoogleNews-vectors-negative300.bin
https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing

https://www.h-schmidt.net/FloatConverter/IEEE754.html

embeddings.prg
Code: Select all  Expand view  RUN
// Imports the word2vec binary model "GoogleNews-vectors-negative300.bin"
// into embeddings.dbf, one record per word.
//
// The file is read as: one header line ended by Chr( 10 ), then repeated
// entries made of a word ended by a space followed by 1200 bytes of vector
// data. A record is only appended once both its word and its full vector
// have been read, so the table never ends with a blank or partial record.
function Main()

    local hFile := FOpen( "GoogleNews-vectors-negative300.bin" )  // read-only is the default mode
    local cHeader, cWord
    local cEmbeddings := Space( 1200 )  // preallocated buffer that FRead() fills in place

    // Without this check a missing file made the read loops spin forever,
    // because FRead() on an invalid handle never delivers the stop byte.
    if hFile < 0
        ? "Cannot open GoogleNews-vectors-negative300.bin, error", FError()
        return nil
    endif

    // WORD is 50 wide so that longer words are not cut at 10 characters.
    // The vector is kept in a fixed-width character field: the text of a DBT
    // memo ends at the first Chr( 26 ), which raw binary data may contain.
    DbCreate( "embeddings.dbf", { { "WORD", "C", 50, 0 }, { "EMBEDDINGS", "C", 1200, 0 } } )
    USE embeddings

    // Echo the header line
    cHeader := ReadUntil( hFile, Chr( 10 ) )
    if cHeader != nil
        ?? cHeader
    endif

    while ( cWord := ReadUntil( hFile, Chr( 32 ) ) ) != nil
        if FRead( hFile, @cEmbeddings, 1200 ) < 1200
            exit   // truncated vector: drop the incomplete entry
        endif
        ? cWord
        APPEND BLANK
        field->Word := cWord
        field->Embeddings := cEmbeddings
    end
    ? "End of file"

    FClose( hFile )
    USE

return nil

// Reads bytes from hFile up to the next cStop byte. Returns the text before
// cStop (cStop itself is consumed but not returned), or nil when the end of
// the file is reached before cStop is found.
static function ReadUntil( hFile, cStop )

    local cByte := Chr( 0 ), cText := ""

    while FRead( hFile, @cByte, 1 ) == 1
        if cByte == cStop
            return cText
        endif
        cText += cByte
    end

return nil
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42107
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: Word Embeddings

Post by Antonio Linares » Mon May 27, 2024 8:51 am

Code: Select all  Expand view  RUN
// Shows, with 10 decimals, the number Bin2D() returns for 0x3A940000.
function Main()

   local nValue

   SET DECIMALS TO 10

   nValue := Bin2D( 0x3A940000 )
   QOut( nValue )

return nil

#pragma BEGINDUMP

#include <hbapi.h>

/* Bit-field view of a 32-bit value split into 23 mantissa bits, 8 exponent
   bits and 1 sign bit (the field widths of an IEEE 754 single-precision
   float).
   NOTE(review): the order in which bit-fields are packed inside the word is
   compiler-defined -- confirm it before reading the members directly. */
struct st_ieee
{
   unsigned int uiMantissa:23;
   unsigned int uiExponent:8;
   unsigned int uiSign:1;
};

/* Reinterprets the low 32 bits of ulValue as an IEEE 754 single-precision
   float and returns it (e.g. 0x3F800000 -> 1.0).

   The bits are copied through a union instead of being stored through
   "unsigned long *" casts: where unsigned long is 8 bytes wide those stores
   wrote past the end of the 4-byte objects they were aimed at. */
float ULToFloat( unsigned long ulValue )
{
   union
   {
      unsigned int uiBits;   /* assumed to be 32 bits wide, like float */
      float        fValue;
   } uConv;

   /* Keep only the 32 bits that make up the float */
   uConv.uiBits = ( unsigned int ) ( ulValue & 0xFFFFFFFFUL );

   return uConv.fValue;
}

/* BIN2D( nBits ) --> nValue
   Reads the first parameter as a long, hands it to ULToFloat() and returns
   the resulting float to the caller as a double. */
HB_FUNC( BIN2D )
{
   unsigned long ulBits = hb_parnl( 1 );
   float fResult = ULToFloat( ulBits );

   hb_retnd( fResult );
}

#pragma ENDDUMP
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42107
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: Word Embeddings

Post by Antonio Linares » Tue May 28, 2024 5:48 am

Code: Select all  Expand view  RUN
// Loads the first 10000 entries of the word2vec binary model
// "GoogleNews-vectors-negative300.bin" into embeddings.dbf, indexes them by
// word and prints the cosine similarity of "man" against "woman" and "child".
//
// The file is read as: one header line ended by Chr( 10 ), then repeated
// entries made of a word ended by a space followed by 1200 bytes of vector
// data. A record is only appended once both its word and its full vector
// have been read, so no record is left with an empty VECTORS field.
function Main()

    local hFile := FOpen( "GoogleNews-vectors-negative300.bin" )  // read-only is the default mode
    local cHeader, cWord, n := 0
    local cEmbeddings := Space( 1200 )  // preallocated buffer that FRead() fills in place
    local cVectors1, cVectors2, cVectors3

    // Without this check a missing file made the read loops spin forever,
    // because FRead() on an invalid handle never delivers the stop byte.
    if hFile < 0
        ? "Cannot open GoogleNews-vectors-negative300.bin, error", FError()
        return nil
    endif

    SET DECIMALS TO 10

    DbCreate( "embeddings.dbf", { { "WORD", "C", 50, 0 },;
                                  { "VECTORS", "C", 1200, 0 } } )
    USE embeddings

    // Echo the header line
    cHeader := ReadUntil( hFile, Chr( 10 ) )
    if cHeader != nil
        ?? cHeader
    endif

    while n < 10000 .and. ( cWord := ReadUntil( hFile, Chr( 32 ) ) ) != nil
        if FRead( hFile, @cEmbeddings, 1200 ) < 1200
            exit   // truncated vector: drop the incomplete entry
        endif
        APPEND BLANK
        field->Word := cWord
        field->Vectors := cEmbeddings
        n++
    end
    if n < 10000
        ? "End of file"
    endif
    FClose( hFile )

    // Build the index once, after the bulk load, instead of updating it on
    // every append. INDEX ON leaves the new index open and in control.
    INDEX ON field->Word TO "words"

    cVectors1 := VectorsOf( "man" )
    cVectors2 := VectorsOf( "woman" )
    if cVectors1 != nil .and. cVectors2 != nil
        ? CosineSim( cVectors1, cVectors2 )
    endif
    cVectors3 := VectorsOf( "child" )
    if cVectors1 != nil .and. cVectors3 != nil
        ? CosineSim( cVectors1, cVectors3 )
    endif

    USE

return nil

// Reads bytes from hFile up to the next cStop byte. Returns the text before
// cStop (cStop itself is consumed but not returned), or nil when the end of
// the file is reached before cStop is found.
static function ReadUntil( hFile, cStop )

    local cByte := Chr( 0 ), cText := ""

    while FRead( hFile, @cByte, 1 ) == 1
        if cByte == cStop
            return cText
        endif
        cText += cByte
    end

return nil

// Looks cWord up in the current work area, prints the word found and returns
// its VECTORS field, or prints a message and returns nil when it is missing.
// The key is padded to the full field width so that a shorter word cannot
// match the beginning of a longer one.
static function VectorsOf( cWord )

    if DbSeek( PadR( cWord, Len( field->Word ) ) )
        ? field->Word
        return field->Vectors
    endif
    ? cWord + " not found"

return nil

#pragma BEGINDUMP

#include <math.h>
#include <hbapi.h>

/* Bit-field view of a 32-bit value split into 23 mantissa bits, 8 exponent
   bits and 1 sign bit (the field widths of an IEEE 754 single-precision
   float).
   NOTE(review): the order in which bit-fields are packed inside the word is
   compiler-defined -- confirm it before reading the members directly. */
struct st_ieee
{
   unsigned int uiMantissa:23;
   unsigned int uiExponent:8;
   unsigned int uiSign:1;
};

/* Reinterprets the low 32 bits of ulValue as an IEEE 754 single-precision
   float and returns it (e.g. 0x3F800000 -> 1.0).

   The bits are copied through a union instead of being stored through
   "unsigned long *" casts: where unsigned long is 8 bytes wide those stores
   wrote past the end of the 4-byte objects they were aimed at. */
float ULToFloat( unsigned long ulValue )
{
   union
   {
      unsigned int uiBits;   /* assumed to be 32 bits wide, like float */
      float        fValue;
   } uConv;

   /* Keep only the 32 bits that make up the float */
   uConv.uiBits = ( unsigned int ) ( ulValue & 0xFFFFFFFFUL );

   return uConv.fValue;
}

/* BIN2D( cBytes ) --> nValue | NIL
   Takes the first 4 bytes of the string parameter as the bit pattern of a
   single-precision float and returns its value as a double.
   Returns NIL when the parameter is not a string of at least 4 bytes. */
HB_FUNC( BIN2D )
{
   const char * pszData = hb_parc( 1 );
   unsigned int uiBits = 0;   /* assumed to be 32 bits wide, like float */

   /* Never read from a missing parameter or past the end of a short string */
   if( ! pszData || hb_parclen( 1 ) < sizeof( uiBits ) )
      return;

   /* Copy into a fully initialised, exactly 4-byte wide variable: copying 4
      bytes into an 8-byte unsigned long left half of it indeterminate */
   memcpy( &uiBits, pszData, sizeof( uiBits ) );

   hb_retnd( ULToFloat( uiBits ) );
}

float cosine_similarity( unsigned long * vec1, unsigned long * vec2, int size )
{
   float dot_product = 0.0, norm_a = 0.0, norm_b = 0.0;

   for( int i = 0; i < size; i++ )
   {
      float fl1, fl2;

      dot_product += ( fl1 = ULToFloat( vec1[ i ] ) ) * ( fl2 = ULToFloat( vec2[ i ] ) );
      norm_a += fl1 * fl1;
      norm_b += fl2 * fl2;
   }
   
   return ( float ) ( dot_product / ( sqrt( norm_a ) * sqrt( norm_b ) ) );
}

/* COSINESIM( cVector1, cVector2 ) --> nSimilarity | NIL
   Returns the cosine similarity of two vectors of 300 floats each, passed
   as strings holding their raw bytes.
   Returns NIL when either parameter is not a string long enough to hold
   300 floats, instead of reading past the end of the data. */
HB_FUNC( COSINESIM )
{
   enum { VECTOR_DIM = 300 };
   const char * pszVec1 = hb_parc( 1 );
   const char * pszVec2 = hb_parc( 2 );

   if( ! pszVec1 || ! pszVec2 ||
       hb_parclen( 1 ) < VECTOR_DIM * sizeof( float ) ||
       hb_parclen( 2 ) < VECTOR_DIM * sizeof( float ) )
      return;

   hb_retnd( cosine_similarity( ( unsigned long * ) pszVec1,
                                ( unsigned long * ) pszVec2, VECTOR_DIM ) );
}

#pragma ENDDUMP
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42107
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain


Return to Utilities / Utilidades

Who is online

Users browsing this forum: No registered users and 6 guests