Page 1 of 3

phpBB to LLM

Posted: Thu Dec 21, 2023 9:57 am
by Antonio Linares
1. Create a SQL dump file from your phpBB forums:

mysqldump --user=username --password=YourPassword database_phpbb > dumpfile.sql

2. We "clean" the dump file using this code:

Code: Select all | Expand

#include "FiveWin.ch"

// Cleans a phpBB SQL dump: reads "dumpfile.sql", replaces every HTML named
// entity listed in hHTMLCodes with its plain character through one
// hb_strReplace() call and writes the result to "forums.sql".
// NOTE(review): the keys for the punctuation entities (&quot; &num; &dollar;
// ... &tilde;) were reconstructed, because the published listing showed them
// already decoded (each character mapped to itself and the first pair was
// not even valid syntax). Confirm the names against the entities that really
// appear in the dump before relying on them.
function Main()

    local cSQL := hb_memoRead( "dumpfile.sql" )
    local hHTMLCodes := { ;
        "&quot;" => '"',;
        "&num;" => '#',;
        "&dollar;" => '$',;
        "&percnt;" => '%',;
        "&amp;" => '&',;
        "&apos;" => "'",;
        "&lpar;" => '(',;
        "&rpar;" => ')',;
        "&ast;" => '*',;
        "&plus;" => '+',;
        "&comma;" => ',',;
        "&hyphen-minus;" => '-',;
        "&period;" => '.',;
        "&sol;" => '/',;
        "&colon;" => ':',;
        "&semi;" => ';',;
        "&lt;" => '<',;
        "&equals;" => '=',;
        "&gt;" => '>',;
        "&quest;" => '?',;
        "&commat;" => '@',;
        "&lsqb;" => '[',;
        "&bsol;" => '\\',;
        "&rsqb;" => ']',;
        "&Hat;" => '^',;
        "&lowbar;" => '_',;
        "&grave;" => '`',;
        "&lcub;" => '{',;
        "&verbar;" => '|',;
        "&rcub;" => '}',;
        "&tilde;" => '~' }

    // hb_memoRead() yields an empty string when the file cannot be read;
    // do not overwrite forums.sql with nothing in that case
    if Empty( cSQL )
        MsgInfo( "dumpfile.sql is missing or empty" )
        return nil
    endif

    hb_memoWrit( "forums.sql", hb_strReplace( cSQL, hHTMLCodes ) )

return nil
3. Create a local MySQL "forums" database and, using HeidiSQL, restore the cleaned forums.sql (generated in step 2) into it

4. Create a DBF from it using this code:

Code: Select all | Expand

#include "FiveWin.ch"

request dbfcdx

// Exports the forum posts from the local "forums" database to "posts.dbf".
// The query returns one row per post with its date, time, forum name, topic
// title, author name and text. The tables are combined with inner joins, so
// a post whose user, topic or forum row is missing is not exported.
function Main()

    local oCn := Maria_Connect( { "localhost", "forums", "username", "YourPassword" } )
    local cSQL 

    // Stop here instead of failing on oCn:SaveToDbf() when the server
    // could not be reached or the credentials were rejected
    if oCn == nil
        MsgInfo( "Could not connect to the forums database" )
        return nil
    endif

    TEXT INTO cSQL 
    SELECT
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%Y-%m-%d') AS date,
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%H:%i') AS time,
        phpbb_forums.forum_name AS forum,
        phpbb_topics.topic_title AS topic,
        phpbb_users.username AS username,
        phpbb_posts.post_text AS text
    FROM
        phpbb_posts
    JOIN
        phpbb_users ON phpbb_posts.poster_id = phpbb_users.user_id
    JOIN
        phpbb_topics ON phpbb_posts.topic_id = phpbb_topics.topic_id
    JOIN
        phpbb_forums ON phpbb_posts.forum_id = phpbb_forums.forum_id;
    ENDTEXT

    oCn:SaveToDbf( cSQL, "posts.dbf" )

    // release the server connection once the export is done
    oCn:Close()

return nil
5. Now we create a small dataset in json to make tests:

Code: Select all | Expand

#include "FiveWin.ch"

request dbfcdx

// Builds a small test dataset: opens posts.dbf, orders it by topic, date,
// time and forum, collects up to 20 topics through GetTopic(), writes them
// to "forums.json" and shows them in a browse.
function Main()

    local aPosts := {}, n

    USE posts VIA "dbfcdx"

    INDEX ON posts->topic + posts->date + posts->time + posts->forum TO subject
    GO TOP

    for n = 1 to 20
       // GetTopic() advances the record pointer; stop as soon as the table
       // is exhausted so no empty topics are appended
       if EoF()
          exit
       endif
       AAdd( aPosts, GetTopic() )
    next

    // the table is no longer needed once the array has been filled
    dbCloseArea()

    hb_memoWrit( "forums.json", hb_jsonEncode( aPosts ) )
    XBrowser( aPosts )

return nil

// Collects the current record and all following records that share its
// topic into one hash: "topic" holds the trimmed UTF-8 title and "messages"
// the array of posts built by GetPost().
// On return the record pointer is on the first record of the next topic
// (or at EoF).
function GetTopic()

    local hTopic := {=>}
    // Keep the field value exactly as stored: the loop below compares it
    // with "==" against the same field of the following records, so it must
    // be neither trimmed nor converted
    local cRawTopic := posts->topic

    hTopic[ "topic" ]    = StrToUtf8( RTrim( cRawTopic ) )
    hTopic[ "messages" ] = {}

    AAdd( hTopic[ "messages" ], GetPost() )
    SKIP 
    while ! EoF() .and. posts->topic == cRawTopic
       AAdd( hTopic[ "messages" ], GetPost() ) 
       SKIP 
    end

return hTopic    

// Returns the current "posts" record as a hash. Topic, forum, username and
// text go through StrToUtf8(); topic, forum, username and time are
// right-trimmed; date is copied as stored.
function GetPost() 

return { "topic"    => StrToUtf8( RTrim( posts->topic ) ), ;
         "forum"    => StrToUtf8( RTrim( posts->forum ) ), ;
         "username" => StrToUtf8( RTrim( posts->username ) ), ;
         "date"     => posts->date, ;
         "time"     => RTrim( posts->time ), ;
         "text"     => StrToUtf8( posts->text ) }

#pragma BEGINDUMP

#include <windows.h>
#include <hbapi.h>

/*
 * STRTOUTF8( cText ) --> cUtf8
 *
 * Converts cText from the system ANSI code page (CP_ACP) to UTF-8 by going
 * through a temporary UTF-16 buffer. Returns an empty string when the
 * parameter is missing, is not a string or is empty.
 */
HB_FUNC( STRTOUTF8 )
{
   const char * szSource = hb_parc( 1 );
   int iSourceLen = ( int ) hb_parclen( 1 );
   int iWideLen;
   int iUtf8Len;
   LPWSTR szWideText;
   char * szDest;

   /* Both Win32 conversion calls reject zero-length input, so answer the
      trivial case directly */
   if( szSource == NULL || iSourceLen == 0 )
   {
      hb_retc_null();
      return;
   }

   /* First call only measures, second call converts ANSI -> UTF-16 */
   iWideLen = MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, NULL, 0 );
   szWideText = ( LPWSTR ) hb_xgrab( ( iWideLen + 1 ) * sizeof( WCHAR ) );
   MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, szWideText, iWideLen );
   szWideText[ iWideLen ] = 0;

   /* Same measure-then-convert sequence for UTF-16 -> UTF-8 */
   iUtf8Len = WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, NULL, 0, NULL, NULL );
   szDest = ( char * ) hb_xgrab( iUtf8Len + 1 );
   WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, szDest, iUtf8Len, NULL, NULL );
   hb_xfree( ( void * ) szWideText );
   szDest[ iUtf8Len ] = '\0';

   /* Hand the buffer over to the Harbour VM: no extra copy, the explicit
      length keeps any embedded NUL bytes, and the VM frees it later */
   hb_retclen_buffer( szDest, iUtf8Len );
}

#pragma ENDDUMP
 
6. Next, we upload this forums.json as a dataset to HuggingFace to verify that it is correct. Open a free account at HuggingFace, create a dataset and upload forums.json. If you can properly inspect forums.json from HuggingFace, then the file is OK.

The structure of the generated json file is as follows:

Code: Select all | Expand

[
   {  "topic": the title of the topic,
      "messages":
      [ 
         {
            "topic": the title of the topic,
            "forum": the forum name,
            "username": name of the author,
            "date": date of the post,
            "time": time of the post,
            "text": text of the post
         },
        next posts for the same topic
      ]
   },
   next topic,
   ...
]
so basically it is a list of the topics, with the name of the topic and the list of messages for such topic.

Here is the Google Colab notebook to run the fine-tuning training of Microsoft Phi-2:
https://github.com/FiveTechSoft/FWH_too ... rums.ipynb

======== original post

Some days ago our friend Bruno Cantero suggested me a great idea:

to generate an AI LLM from these forums, as these forums have been running for 18 years, so it may be a great dataset to train an AI LLM :-)

So first thing we need is to build a dataset from it. Here I am posting some initial tests that I expect that we may be able to complete with the help of Uwe and Rao:

topics.prg

Code: Select all | Expand

#include "FiveWin.ch"

// Downloads one forum topic page and shows how many posts GetTopics()
// extracted from it.
function Main()

    // phpBB expects "f=<forum id>&t=<topic id>"; the "=" after "f" was missing
    local cURL  := "http://forums.fivetechsupport.com/viewtopic.php?f=3&t=8"
    local cHTML := WebPageContents( cUrl )
 
    // nothing to parse when the download failed or returned no data
    if Empty( cHTML )
        MsgInfo( "Could not download " + cURL )
        return nil
    endif

    MsgInfo( Len( GetTopics( cHtml ) ) )
 
return nil

// Splits a topic page into its posts.
// Every post starts at the marker 'class="post bg' and runs up to the next
// '<hr class="divider"'. Each fragment is handed to GetTopic() and the
// resulting hashes are returned in page order.
//   cHtml   --> HTML source of the page
//   returns --> array with one hash per post (empty when cHtml is not a
//               string or contains no post marker)
function GetTopics( cHtml )

   local cPostTag := 'class="post bg'
   local cDivider := '<hr class="divider"'
   local aTopics  := {}
   local nAt, nEnd, cTopic

   if ValType( cHtml ) != "C"
      return aTopics
   endif

   while ( nAt := At( cPostTag, cHtml ) ) != 0
      // Drop everything up to and including the post marker, so the divider
      // is always searched AFTER the post it closes
      cHtml = SubStr( cHtml, nAt + Len( cPostTag ) )
      nEnd  = At( cDivider, cHtml )

      if nEnd == 0
         // no closing divider: the rest of the page belongs to this post
         cTopic = cHtml
         cHtml  = ""
      else
         cTopic = SubStr( cHtml, 1, nEnd + Len( cDivider ) )
         cHtml  = SubStr( cHtml, nEnd + Len( cDivider ) )
      endif

      AAdd( aTopics, GetTopic( cTopic ) )
   end 
   
return aTopics   

// Extracts the body and the author from the HTML fragment of one post.
//   cTopic  --> HTML fragment of a single post
//   returns --> hash with the keys "contents" and "author"
// Both values are also shown in a message box.
function GetTopic( cTopic )

   local cContentTag := '<div class="content">'
   local cAuthorTag  := '<p class="author">'
   local cContent, cAuthor

   // body: text between the content tag and the first "</div>" after it
   cContent = SubStr( cTopic, At( cContentTag, cTopic ) + Len( cContentTag ) )
   cContent = Left( cContent, At( "</div>", cContent ) - 1 )

   // author: cut 4 characters before "</strong>" (presumably a closing
   // "</a>" - verify against the page markup), then keep what follows the
   // last ">"
   cAuthor = SubStr( cTopic, At( cAuthorTag, cTopic ) + Len( cAuthorTag ) )
   cAuthor = Left( cAuthor, At( "</strong>", cAuthor ) - 5 )
   cAuthor = SubStr( cAuthor, RAt( ">", cAuthor ) + 1 )

   MsgInfo( cContent, cAuthor )

return { "contents" => cContent, "author" => cAuthor }
 

Re: phpBB to LLM

Posted: Thu Dec 21, 2023 11:49 am
by Antonio Linares
Another ideal use would be a DBF to LLM :-)

We already have llama64.dll to run it !

Re: phpBB to LLM

Posted: Thu Dec 21, 2023 12:22 pm
by paquitohm
Antonio,

Pues me parece una buenisima idea.

Se podria interrogar a la IA sobre cual es el mejor vendedor, cuando vendio mas, que vendedor ha decrecido, etc

slds

Re: phpBB to LLM

Posted: Thu Dec 21, 2023 12:27 pm
by Jimmy
hi Antonio,
Antonio Linares wrote:Another ideal use would be a DBF to LLM :-)

We already have llama64.dll to run it !
i do have a DBF of Fivewin Forum

i use my phpBB "Codebox" Reader based on Idea of Uwe
https://www.hmgforum.com/viewtopic.php?t=7281

extract "Codebox" Tag is not my Problem, it are HTML "Sign" which i try to STRTRAN()
it work so far with CODE but in "Body" i still have a lot HTML "Sign"

Question : is there a Function HTML2TEXT() to get plain TEXT from HTML Message :?:

Re: phpBB to LLM

Posted: Thu Dec 21, 2023 4:31 pm
by Antonio Linares
paquitohm wrote:Antonio,

Pues me parece una buenisima idea.

Se podria interrogar a la IA sobre cual es el mejor vendedor, cuando vendio mas, que vendedor ha decrecido, etc

slds
totalmente de acuerdo. Tenemos que conseguirlo :-)

Re: phpBB to LLM

Posted: Thu Dec 21, 2023 8:47 pm
by Marc Venken
I also generate a DBF based on Rao and Uwe code.

I only extract the source samples from all the posts. Offline I then search the dbf for keywords of any kind. It helped me many many times in finding solutions because It is showing sample code.

Re: phpBB to LLM

Posted: Fri Dec 22, 2023 6:43 am
by Antonio Linares
This is the type of csv file that we have to generate from a DBF:

customer.csv
ID,FIRST,LAST,STREET,CITY,STATE,ZIP,HIREDATE,MARRIED,AGE,SALARY,NOTES
1,Homer,Simpson,32179 Maiden Lane,Springfield,IL,20503-8202,1992-09-18,True,50,5900.0,This is a test for record 1
2,Ceci,Gibbard,9540 Raynes Park Road,Miami,MA,55774-2304,1984-10-17,False,28,123700.0,This is a test for record 2
3,Reg,Kaczocha,30522 Park Ten Place,Scottsdale,WY,09226-1483,1989-05-23,True,43,82900.0,This is a test for record 3
4,David,Jochum,8211 Carnegie Center,Hingham,IL,71947-5114,1900-10-10,True,34,120000.0,This is a test for record 4
5,Simpson,Cafee,32736 Meadowbrook Drive,Nedlands,ID,38179-3789,1990-12-11,True,88,51800.0,This is a test for record 5
6,Tom,Logan,6180 Roselle Street,West Covina,CT,82378-0904,1992-02-24,True,90,20400.0,This is a test for record 6
7,Gary,Brock,3893 Canandaigua Road,Senford,WV,94177-5329,1987-09-12,True,58,145300.0,This is a test for record 7
8,Frank,Fonseca,18712 Sherman Way,Ashby,RI,08218-8409,1988-02-16,False,46,118900.0,This is a test for record 8
9,Rick,Sencovici,13802 South University,Arcadia,HI,82063-8091,1987-01-17,True,55,23700.0,This is a test for record 9
10,Hugh,Lupton,16472 S. LaSalle Street,Tarzana,AK,79021-0643,1989-08-28,False,89,96700.0,This is a test for record 10
11,_,Farley,19123 Washington Street,Boston,IN,25885-0851,1985-08-31,True,46,77300.0,This is a test for record 11
12,Johnny,Fischer,30621 Inridge Drive,McLean,WA,86275-8035,1988-11-12,False,37,2300.0,This is a test for record 12
13,Corkey,Young,9069 Avon Place,Lund,NC,36199-1793,1988-12-24,True,54,30000.0,This is a test for record 13
14,Phyllis,Lechuga,1457 Indianapolis Ave,Council Bluffs,AR,73036-5749,1987-01-29,False,94,84600.0,This is a test for record 14
15,Chester,Padilla,32385 Federal Street,Ashby,MS,82882-2447,1985-12-22,True,90,144000.0,This is a test for record 15

Re: phpBB to LLM

Posted: Fri Dec 22, 2023 9:31 am
by Otto
Dear Antonio,

Can we download the DBF (phpBB) file from somewhere?

Best regards,

Otto

Re: phpBB to LLM

Posted: Fri Dec 22, 2023 9:47 am
by VictorCasajuana
Hola Antonio, tienes acceso a la base de datos del foro? en este caso sería más sencillo extraer la información.

Re: phpBB to LLM

Posted: Fri Dec 22, 2023 10:22 am
by Antonio Linares
Estimado Victor,

Si, claro, tenemos acceso a la base de datos.

De todas formas la cuestión ahora es como organizar esos datos para que puedan ser entrenados en un modelo de IA pre entrenado.

Por el momento hemos encontrado este modelo que nos da una primera idea:
# Sample dataset layout: 'text' holds six forum messages and 'label' holds
# six category names.
# NOTE(review): entries presumably pair up by position (text[i] <-> label[i]) - confirm.
data = {
'text': ['Initial post for programming question.', 'Reply 1: I think the issue is with your code.',
'Reply 2: Can you provide more details?', 'Initial post for hardware issue.',
'Reply 1: Have you checked the connections?', 'Reply 2: Try updating your drivers.'],
'label': ['programming', 'programming', 'programming', 'hardware', 'hardware', 'hardware']
}
poco a poco :-)

Re: phpBB to LLM

Posted: Fri Dec 22, 2023 10:24 am
by Antonio Linares
Otto wrote:Dear Antonio,

Can we download the DBF (phpBB) file from somewhere?

Best regards,

Otto
Dear Otto,

We have not generated a DBF from these forums yet, but surely we will do it :-)

Re: phpBB to LLM

Posted: Fri Dec 22, 2023 1:32 pm
by Antonio Linares
Dear Otto, Victor,

We already have a posts.dbf with all the forums posts :-)

We are reviewing if it has some private info that must not be shared...

Re: phpBB to LLM

Posted: Sat Dec 23, 2023 4:51 pm
by Antonio Linares
This seems to be fine:

Code: Select all | Expand

#include "FiveWin.ch"

request dbfcdx

// Exports the forum posts from the local "forums" database to
// "c:\temp\posts.dbf". The query returns one row per post with its date,
// time, forum name, topic title, author name and text. The tables are
// combined with inner joins, so a post whose user, topic or forum row is
// missing is not exported.
function Main()

    local oCn := Maria_Connect( { "localhost", "forums", "root", "password" } )
    local cSQL 

    // Stop here instead of failing on oCn:SaveToDbf() when the server
    // could not be reached or the credentials were rejected
    if oCn == nil
        MsgInfo( "Could not connect to the forums database" )
        return nil
    endif

    TEXT INTO cSQL 
    SELECT
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%Y-%m-%d') AS date,
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%H:%i') AS time,
        phpbb_forums.forum_name AS forum,
        phpbb_topics.topic_title AS topic,
        phpbb_users.username AS username,
        phpbb_posts.post_text AS text
    FROM
        phpbb_posts
    JOIN
        phpbb_users ON phpbb_posts.poster_id = phpbb_users.user_id
    JOIN
        phpbb_topics ON phpbb_posts.topic_id = phpbb_topics.topic_id
    JOIN
        phpbb_forums ON phpbb_posts.forum_id = phpbb_forums.forum_id;
    ENDTEXT

    oCn:SaveToDbf( cSQL, "c:\temp\posts.dbf" )

    // release the server connection once the export is done
    oCn:Close()

return nil

Re: phpBB to LLM

Posted: Sat Dec 23, 2023 5:22 pm
by Antonio Linares
Here you have a posts.dbf and posts.fpt with all the contents of these forums :-)

We would appreciate it if you could review them and check whether they work fine for you:
https://github.com/FiveTechSoft/forums/ ... ts_dbf.zip

join these parts using Total Commander so you get a posts_fpt.zip
https://github.com/FiveTechSoft/forums/ ... ts_fpt.001
https://github.com/FiveTechSoft/forums/ ... ts_fpt.002
https://github.com/FiveTechSoft/forums/ ... ts_fpt.003

Re: phpBB to LLM

Posted: Sat Dec 23, 2023 9:19 pm
by Antonio Linares
First try building the dataset. hb_jsonEncode() GPFs...

Code: Select all | Expand

#include "FiveWin.ch"

request dbfcdx

// Opens posts.dbf, orders it by topic, date, time and forum, gathers every
// topic through GetTopic() and reports how many topics were collected.
function Main()

    local aTopics := {}

    USE posts VIA "dbfcdx"

    INDEX ON posts->topic + posts->date + posts->time + posts->forum TO subject
    dbGoTop()

    // GetTopic() moves the record pointer forward, so this loop ends at EoF
    do while ! EoF()
       AAdd( aTopics, GetTopic() )
    enddo

    // JSON encoding of the array is left disabled here
    // hb_jsonEncode( aTopics )
    MsgInfo( Len( aTopics ) )

return nil

// Gathers the current record and the following records carrying the same
// topic value into one hash with the keys "topic" and "messages".
// The record pointer is left on the first record whose topic differs.
function GetTopic()

    local cKey     := posts->topic
    local aReplies := {}

    do while .T.
       AAdd( aReplies, GetPost() )
       dbSkip()
       // leave as soon as the next record belongs to another topic
       if !( posts->topic == cKey )
          exit
       endif
    enddo

return { "topic" => cKey, "messages" => aReplies }

// Returns the topic, username and text fields of the current "posts" record
// as a hash, copied exactly as stored.
function GetPost() 

return { "topic"    => posts->topic, ;
         "username" => posts->username, ;
         "text"     => posts->text }