phpBB to LLM
Posted: Thu Dec 21, 2023 9:57 am
1. Create a SQL dump file from your phpBB forums:
mysqldump --user=username --password=YourPassword database_phpbb > dumpfile.sql
2. We "clean" the dump file using this code:
3. Create a local MySQL "forums" database and use HeidiSQL to restore dumpfile.sql into it
4. Create a DBF from it using this code:
5. Now we create a small dataset in json to make tests:
6. Next, we upload forums.json as a dataset to HuggingFace to verify that it is correct. Open a free account at HuggingFace, create a dataset and upload forums.json. If you can properly inspect forums.json on HuggingFace, then the file is OK.
The structure of the generated json file is as follows:
so basically it is a list of the topics, with the name of the topic and the list of messages for such topic.
Here is the Google Colab notebook to run the fine-tuning training of Microsoft Phi-2:
https://github.com/FiveTechSoft/FWH_too ... rums.ipynb
======== original post
Some days ago our friend Bruno Cantero suggested a great idea to me:
to generate an AI LLM from these forums. As these forums have been running for 18 years, they may be a great dataset to train an AI LLM.
So the first thing we need is to build a dataset from them. Here I am posting some initial tests that I hope we will be able to complete with the help of Uwe and Rao:
topics.prg
mysqldump --user=username --password=YourPassword database_phpbb > dumpfile.sql
2. We "clean" the dump file using this code:
Code: Select all | Expand
#include "FiveWin.ch"
// Decodes HTML character entities found in a phpBB SQL dump.
// Reads "dumpfile.sql" from the current directory, replaces every entity
// listed in hHTMLCodes with its plain character and writes the result to
// "forums.sql". Always returns nil.
function Main()

   local cSQL := hb_memoRead( "dumpfile.sql" )

   // Keys are the entity spellings to search for in the dump text; values are
   // the characters written in their place. The keys must be the *encoded*
   // form: using the plain character as key turns the replacement into a no-op.
   // "&hyphen-minus;" is kept exactly as the original table had it.
   // "&tilde;" names U+02DC in HTML, so the ASCII tilde uses its numeric reference.
   // A backslash is written doubled - presumably so it stays escaped inside the
   // SQL string literals of the dump (TODO confirm).
   // NOTE(review): "&apos;" becomes a bare single quote, which could terminate
   // a single-quoted SQL string literal in the dump - verify against real data.
   local hHTMLCodes := { ;
      "&quot;"         => '"',;
      "&num;"          => '#',;
      "&dollar;"       => '$',;
      "&percnt;"       => '%',;
      "&amp;"          => '&',;
      "&apos;"         => "'",;
      "&lpar;"         => '(',;
      "&rpar;"         => ')',;
      "&ast;"          => '*',;
      "&plus;"         => '+',;
      "&comma;"        => ',',;
      "&hyphen-minus;" => '-',;
      "&period;"       => '.',;
      "&sol;"          => '/',;
      "&colon;"        => ':',;
      "&semi;"         => ';',;
      "&lt;"           => '<',;
      "&equals;"       => '=',;
      "&gt;"           => '>',;
      "&quest;"        => '?',;
      "&commat;"       => '@',;
      "&lsqb;"         => '[',;
      "&bsol;"         => '\\',;
      "&rsqb;"         => ']',;
      "&Hat;"          => '^',;
      "&lowbar;"       => '_',;
      "&grave;"        => '`',;
      "&lcub;"         => '{',;
      "&verbar;"       => '|',;
      "&rcub;"         => '}',;
      "&#126;"         => '~' }

   // hb_memoRead() yields an empty string when the file cannot be read;
   // do not silently produce an empty forums.sql in that case.
   if Empty( cSQL )
      MsgInfo( "dumpfile.sql is missing or empty" )
      return nil
   endif

   hb_memoWrit( "forums.sql", hb_strReplace( cSQL, hHTMLCodes ) )

return nil
4. Create a DBF from it using this code:
Code: Select all | Expand
#include "FiveWin.ch"
request dbfcdx
// Exports the forum posts from the local "forums" database into posts.dbf.
// Each exported row carries the post date (%Y-%m-%d) and time (%H:%i), both
// derived from phpbb_posts.post_time, plus the forum name, topic title,
// author user name and post text. Always returns nil.
function Main()

   local cQuery
   local oServer := Maria_Connect( { "localhost", "forums", "username", "YourPassword" } )

   // The query text is stored verbatim (including line breaks) into cQuery.
TEXT INTO cQuery
SELECT
DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%Y-%m-%d') AS date,
DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%H:%i') AS time,
phpbb_forums.forum_name AS forum,
phpbb_topics.topic_title AS topic,
phpbb_users.username AS username,
phpbb_posts.post_text AS text
FROM
phpbb_posts
JOIN
phpbb_users ON phpbb_posts.poster_id = phpbb_users.user_id
JOIN
phpbb_topics ON phpbb_posts.topic_id = phpbb_topics.topic_id
JOIN
phpbb_forums ON phpbb_posts.forum_id = phpbb_forums.forum_id;
ENDTEXT

   // Run the query on the server and dump its result set into a DBF file.
   oServer:SaveToDbf( cQuery, "posts.dbf" )

return nil
Code: Select all | Expand
#include "FiveWin.ch"
request dbfcdx
// Builds a small JSON sample ("forums.json") out of posts.dbf.
// The table is ordered by topic + date + time + forum so that all posts of a
// topic are contiguous; up to 20 topics are then collected with GetTopic(),
// encoded as JSON, written to disk and shown in a browse. Always returns nil.
function Main()

   local aPosts  := {}
   local nTopics := 0

   USE posts VIA "dbfcdx"
   INDEX ON posts->topic + posts->date + posts->time + posts->forum TO subject
   GO TOP

   // Stop at 20 topics OR at end of file, whichever comes first; without the
   // Eof() test a table holding fewer than 20 topics produced phantom entries.
   while nTopics < 20 .and. ! Eof()
      AAdd( aPosts, GetTopic() )
      nTopics++
   end

   // Everything needed is in aPosts now, so release the work area.
   dbCloseArea()

   hb_memoWrit( "forums.json", hb_jsonEncode( aPosts ) )
   XBrowser( aPosts )

return nil
// Collects the current topic of the "posts" work area.
// Starting at the current record, gathers every consecutive record whose
// topic field equals the one of the starting record, leaving the record
// pointer on the first record of the next topic (or at end of file).
// Returns a hash: "topic" => UTF-8 topic title, "messages" => array of the
// hashes returned by GetPost().
function GetTopic()

   // Keep the raw field value as grouping key. The exact comparison (==) is
   // sensitive to trailing blanks and to any re-encoding, so comparing the
   // padded field against a trimmed / UTF-8 converted copy never matched and
   // every post ended up in its own topic.
   local cKey   := posts->topic
   local hTopic := {=>}

   hTopic[ "topic" ]    = StrToUtf8( RTrim( cKey ) )
   hTopic[ "messages" ] = {}

   // The Eof() test prevents adding the blank phantom record found past the
   // last row and guarantees termination when the topic title is blank.
   while ! Eof() .and. posts->topic == cKey
      AAdd( hTopic[ "messages" ], GetPost() )
      SKIP
   end

return hTopic
// Converts the current record of the "posts" work area into a hash.
// Topic, forum and username are right-trimmed and passed through StrToUtf8();
// the text is passed through StrToUtf8() untrimmed; date is copied as is and
// time is right-trimmed only.
function GetPost()

   local hMsg := { ;
      "topic"    => StrToUtf8( RTrim( posts->topic ) ),;
      "forum"    => StrToUtf8( RTrim( posts->forum ) ),;
      "username" => StrToUtf8( RTrim( posts->username ) ),;
      "date"     => posts->date,;
      "time"     => RTrim( posts->time ),;
      "text"     => StrToUtf8( posts->text ) }

return hMsg
#pragma BEGINDUMP
#include <windows.h>
#include <hbapi.h>
/*
 * STRTOUTF8( cText ) --> cUtf8
 *
 * Converts a string from the system ANSI code page (CP_ACP) to UTF-8 by
 * going through UTF-16 with the Win32 conversion functions.
 * Returns an empty string when the argument is missing, is not a string,
 * is empty, or when the conversion fails.
 */
HB_FUNC( STRTOUTF8 )
{
   const char * szSource   = hb_parc( 1 );
   int          iSourceLen = ( int ) hb_parclen( 1 );
   int          iWideLen;
   int          iUtf8Len;
   LPWSTR       szWideText;
   char *       szDest;

   /* MultiByteToWideChar() fails for a zero-length input, so handle the
      empty / non-string cases explicitly instead of relying on that failure. */
   if( szSource == NULL || iSourceLen <= 0 )
   {
      hb_retc( "" );
      return;
   }

   /* First pass: ask for the number of UTF-16 units required. */
   iWideLen = MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, NULL, 0 );
   if( iWideLen <= 0 )
   {
      hb_retc( "" );
      return;
   }

   szWideText = ( LPWSTR ) hb_xgrab( ( iWideLen + 1 ) * sizeof( WCHAR ) );
   MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, szWideText, iWideLen );
   szWideText[ iWideLen ] = L'\0';   /* a character, not the NULL pointer */

   /* Second conversion: UTF-16 to UTF-8, again sizing the buffer first. */
   iUtf8Len = WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, NULL, 0, NULL, NULL );
   if( iUtf8Len <= 0 )
   {
      hb_xfree( ( void * ) szWideText );
      hb_retc( "" );
      return;
   }

   szDest = ( char * ) hb_xgrab( iUtf8Len + 1 );
   WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, szDest, iUtf8Len, NULL, NULL );
   szDest[ iUtf8Len ] = '\0';
   hb_xfree( ( void * ) szWideText );

   /* Return by explicit length so text containing an embedded NUL byte is
      not cut short, as it was with hb_retc(). */
   hb_retclen( szDest, iUtf8Len );
   hb_xfree( ( void * ) szDest );
}
#pragma ENDDUMP
The structure of the generated json file is as follows:
Code: Select all | Expand
[
{ "topic": the title of the topic,
"messages":
[
{
"topic": the title of the topic,
"forum": the forum name,
"username": name of the author,
"date": date of the post,
"time": time of the post,
"text": text of the post
},
next posts for the same topic
]
},
next topic,
...
]
Here is the Google Colab notebook to run the fine-tuning training of Microsoft Phi-2:
https://github.com/FiveTechSoft/FWH_too ... rums.ipynb
======== original post
Some days ago our friend Bruno Cantero suggested a great idea to me:
to generate an AI LLM from these forums. As these forums have been running for 18 years, they may be a great dataset to train an AI LLM.
So the first thing we need is to build a dataset from them. Here I am posting some initial tests that I hope we will be able to complete with the help of Uwe and Rao:
topics.prg
Code: Select all | Expand
#include "FiveWin.ch"
// Downloads one forum topic page, splits it into posts with GetTopics()
// and shows how many posts were found. Always returns nil.
function Main()

   // NOTE(review): the query string reads "f3" with no "=" - possibly meant
   // to be "f=3"; left untouched, confirm against the forum URL scheme.
   local cURL    := "http://forums.fivetechsupport.com/viewtopic.php?f3&t=8"
   local aTopics := GetTopics( WebPageContents( cURL ) )

   MsgInfo( Len( aTopics ) )

return nil
// Splits the HTML of a topic page into its posts.
// A post starts right after the text 'class="post bg' and runs up to the
// next '<hr class="divider"'. Each fragment is handed to GetTopic() and the
// resulting hashes are returned as an array (empty when no post is found).
function GetTopics( cHtml )

   local cPostMark := 'class="post bg'
   local cDivider  := '<hr class="divider"'
   local aTopics   := {}
   local nAt, nEnd

   while ( nAt := At( cPostMark, cHtml ) ) != 0

      // Discard everything up to and including the post marker, so that the
      // divider is always searched AFTER the start of this post. Searching
      // from the top of the page could pick a divider located before the post.
      cHtml = SubStr( cHtml, nAt + Len( cPostMark ) )
      nEnd  = At( cDivider, cHtml )

      if nEnd == 0
         // No closing divider: the rest of the page belongs to this post.
         // Previously this case truncated the post and re-scanned the same
         // post on the following iterations.
         AAdd( aTopics, GetTopic( cHtml ) )
         exit
      endif

      // Same fragment length as before: everything through the divider text.
      AAdd( aTopics, GetTopic( SubStr( cHtml, 1, nEnd + Len( cDivider ) ) ) )

      // Continue right after the divider.
      cHtml = SubStr( cHtml, nEnd + Len( cDivider ) )
   end

return aTopics
// Extracts the message body and the author name from the HTML of one post.
// Shows both values in a message box and returns them in a hash with the
// keys "contents" and "author".
function GetTopic( cTopic )

   local cContentTag := '<div class="content">'
   local cAuthorTag  := '<p class="author">'
   local hResult     := {=>}
   local cBody, cWho

   // Body: what follows the content marker, cut right before the first "</div>".
   cBody = SubStr( cTopic, At( cContentTag, cTopic ) + Len( cContentTag ) )
   cBody = Left( cBody, At( "</div>", cBody ) - 1 )

   // Author: what follows the author marker, stopping 4 characters short of
   // "</strong>" (presumably to drop a closing "</a>" tag - TODO confirm),
   // then keeping only the text after the last ">" that remains.
   cWho = SubStr( cTopic, At( cAuthorTag, cTopic ) + Len( cAuthorTag ) )
   cWho = Left( cWho, At( "</strong>", cWho ) - 5 )
   cWho = SubStr( cWho, RAt( ">", cWho ) + 1 )

   MsgInfo( cBody, cWho )

   hResult[ "contents" ] = cBody
   hResult[ "author" ]   = cWho

return hResult