mysqldump --user=username --password=YourPassword database_phpbb > dumpfile.sql
2. We "clean" the dump file using this code:
Code: Select all | Expand
#include "FiveWin.ch"
// Cleans the phpBB SQL dump: reads "dumpfile.sql", replaces the HTML named
// character references listed below with their literal characters and writes
// the result to "forums.sql".
//
// NOTE(review): in the published listing the keys of this table had been
// entity-decoded (each key was identical to its value and the first pair was
// not a valid string literal), so the replacement could not do anything.
// The entity names below were reconstructed from the HTML named character
// reference list - confirm them against the entities that really appear in
// the dump. "&hyphen-minus;" and the '\\' replacement value are kept exactly
// as they were in the listing.
function Main()

   local cSQL := hb_memoRead( "dumpfile.sql" )
   local hHTMLCodes := { ;
      "&quot;"         => '"',;
      "&num;"          => '#',;
      "&dollar;"       => '$',;
      "&percnt;"       => '%',;
      "&amp;"          => '&',;
      "&apos;"         => "'",;
      "&lpar;"         => '(',;
      "&rpar;"         => ')',;
      "&ast;"          => '*',;
      "&plus;"         => '+',;
      "&comma;"        => ',',;
      "&hyphen-minus;" => '-',;
      "&period;"       => '.',;
      "&sol;"          => '/',;
      "&colon;"        => ':',;
      "&semi;"         => ';',;
      "&lt;"           => '<',;
      "&equals;"       => '=',;
      "&gt;"           => '>',;
      "&quest;"        => '?',;
      "&commat;"       => '@',;
      "&lsqb;"         => '[',;
      "&bsol;"         => '\\',;
      "&rsqb;"         => ']',;
      "&Hat;"          => '^',;
      "&lowbar;"       => '_',;
      "&grave;"        => '`',;
      "&lbrace;"       => '{',;
      "&vert;"         => '|',;
      "&rbrace;"       => '}',;
      "&tilde;"        => '~' }

   // hb_strReplace() receives the whole table at once
   hb_memoWrit( "forums.sql", hb_strReplace( cSQL, hHTMLCodes ) )

return nil
4. Create a DBF from it using this code:
Code: Select all | Expand
#include "FiveWin.ch"
request dbfcdx
// Exports every forum post, together with its forum name, topic title and
// author name, from the "forums" MySQL database into posts.dbf.
function Main()

   local oCn := Maria_Connect( { "localhost", "forums", "username", "YourPassword" } )
   local cSQL

   // NOTE(review): assumes Maria_Connect() returns nil when the connection
   // cannot be established - confirm against the FWH documentation
   if oCn == nil
      MsgInfo( "Could not connect to the forums database" )
      return nil
   endif

   // post_time is a unix timestamp: it is split into a date and a time column
TEXT INTO cSQL
SELECT
DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%Y-%m-%d') AS date,
DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%H:%i') AS time,
phpbb_forums.forum_name AS forum,
phpbb_topics.topic_title AS topic,
phpbb_users.username AS username,
phpbb_posts.post_text AS text
FROM
phpbb_posts
JOIN
phpbb_users ON phpbb_posts.poster_id = phpbb_users.user_id
JOIN
phpbb_topics ON phpbb_posts.topic_id = phpbb_topics.topic_id
JOIN
phpbb_forums ON phpbb_posts.forum_id = phpbb_forums.forum_id;
ENDTEXT

   oCn:SaveToDbf( cSQL, "posts.dbf" )

   // release the server connection once the export is done
   oCn:Close()

return nil
Code: Select all | Expand
#include "FiveWin.ch"
request dbfcdx
// Builds forums.json from posts.dbf: indexes the posts so that the messages
// of a topic are contiguous, collects up to 20 topics, writes them as JSON
// and shows them in a browse.
function Main()

   local aPosts := {}

   USE posts VIA "dbfcdx"
   INDEX ON posts->topic + posts->date + posts->time + posts->forum TO subject
   GO TOP

   // GetTopic() moves the record pointer past the topic it returns, so stop
   // as soon as the table is exhausted instead of appending blank topics
   while Len( aPosts ) < 20 .and. ! Eof()
      AAdd( aPosts, GetTopic() )
   end

   hb_memoWrit( "forums.json", hb_jsonEncode( aPosts ) )
   XBrowser( aPosts )

return nil
// Returns a hash { "topic" => title, "messages" => { post, ... } } built from
// the current record of the posts work area and the following records that
// carry the same topic. On return the record pointer is on the first record
// of the next topic (or at end of file).
function GetTopic()

   local hTopic := {=>}
   // Raw field value used as the grouping key. The key must not be trimmed
   // or converted: "==" is an exact comparison, and it is compared below
   // against the raw field of the following records.
   local cKey := posts->topic

   hTopic[ "topic" ]    = StrToUtf8( RTrim( cKey ) )
   hTopic[ "messages" ] = {}

   // the current record always belongs to the topic
   AAdd( hTopic[ "messages" ], GetPost() )
   SKIP

   while ! Eof() .and. posts->topic == cKey
      AAdd( hTopic[ "messages" ], GetPost() )
      SKIP
   end

return hTopic
// Returns the current record of the posts work area as a hash. The topic,
// forum, username and text values go through StrToUtf8(); date and time are
// stored as read from the table (time is right-trimmed).
function GetPost()

return { ;
   "topic"    => StrToUtf8( RTrim( posts->topic ) ),;
   "forum"    => StrToUtf8( RTrim( posts->forum ) ),;
   "username" => StrToUtf8( RTrim( posts->username ) ),;
   "date"     => posts->date,;
   "time"     => RTrim( posts->time ),;
   "text"     => StrToUtf8( posts->text ) }
#pragma BEGINDUMP
#include <windows.h>
#include <hbapi.h>
/*
 * STRTOUTF8( cText ) --> cUtf8
 *
 * Converts a string from the system ANSI code page (CP_ACP) to UTF-8 by
 * going through UTF-16. Returns an empty string when the parameter is not
 * a string or is empty.
 */
HB_FUNC( STRTOUTF8 )
{
   const char * szSource = hb_parc( 1 );
   int iSourceLen = ( int ) hb_parclen( 1 );
   int iWideLen;
   int iUtf8Len;
   LPWSTR szWideText;
   char * szDest;

   /* nothing to convert: also avoids handing a NULL pointer to the API */
   if( szSource == NULL || iSourceLen == 0 )
   {
      hb_retc( "" );
      return;
   }

   /* first call only asks for the number of wide characters required */
   iWideLen = MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, NULL, 0 );
   szWideText = ( LPWSTR ) hb_xgrab( ( iWideLen + 1 ) * sizeof( WCHAR ) );
   MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, szWideText, iWideLen );
   szWideText[ iWideLen ] = 0;

   /* same two-step scheme for the UTF-16 -> UTF-8 leg */
   iUtf8Len = WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, NULL, 0, NULL, NULL );
   szDest = ( char * ) hb_xgrab( iUtf8Len + 1 );
   WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, szDest, iUtf8Len, NULL, NULL );
   hb_xfree( ( void * ) szWideText );
   szDest[ iUtf8Len ] = 0;

   /* return by explicit length so an embedded NUL does not cut the result */
   hb_retclen( szDest, iUtf8Len );
   hb_xfree( ( void * ) szDest );
}
#pragma ENDDUMP
The structure of the generated json file is as follows:
Code: Select all | Expand
[
{ "topic": the title of the topic,
"messages":
[
{
"topic": the title of the topic,
"forum": the forum name,
"username": name of the author,
"date": date of the post,
"time": time of the post,
"text": text of the post
},
next posts for the same topic
]
},
next topic,
...
]
Here you have the Google Colab file to run the fine-tuning training of Microsoft Phi-2:
https://github.com/FiveTechSoft/FWH_too ... rums.ipynb
======== original post
Some days ago our friend Bruno Cantero suggested a great idea to me:
to generate an AI LLM from these forums, as these forums have been running for 18 years, so it may be a great dataset to train an AI LLM
So the first thing we need is to build a dataset from them. Here I am posting some initial tests that I hope we will be able to complete with the help of Uwe and Rao:
topics.prg
Code: Select all | Expand
#include "FiveWin.ch"
// Downloads one forum topic page and reports how many posts were found in it.
function Main()

   local cPage  := WebPageContents( "http://forums.fivetechsupport.com/viewtopic.php?f3&t=8" )
   local aFound := GetTopics( cPage )

   MsgInfo( Len( aFound ) )

return nil
// Splits the HTML of a topic page into its posts and returns an array with
// one GetTopic() hash per post. A post starts at 'class="post bg' and runs
// up to the '<hr class="divider"' that follows it.
function GetTopics( cHtml )

   local cPostMark := 'class="post bg'
   local cDivider  := '<hr class="divider"'
   local aTopics   := {}
   local nStart, nDivider, cTopic

   while ( nStart := At( cPostMark, cHtml ) ) != 0
      nStart += Len( cPostMark )

      // Look for the divider AFTER this post. Searching from the start of
      // the text could pick a divider that precedes the post and make the
      // same post be matched again.
      nDivider := hb_At( cDivider, cHtml, nStart )

      if nDivider == 0
         // no closing divider: the rest of the page is the last post
         cTopic := SubStr( cHtml, nStart )
         cHtml  := ""
      else
         cTopic := SubStr( cHtml, nStart, nDivider + Len( cDivider ) - nStart )
         cHtml  := SubStr( cHtml, nDivider + Len( cDivider ) )
      endif

      AAdd( aTopics, GetTopic( cTopic ) )
   end

return aTopics
// Extracts the message body and the author name from the HTML of one post,
// shows them in a message box and returns them as
// { "contents" => ..., "author" => ... }.
function GetTopic( cTopic )

   local cBodyTag   := '<div class="content">'
   local cAuthorTag := '<p class="author">'
   local cContent, cAuthor

   // body: what follows the content <div>, cut right before its first "</div>"
   cContent := SubStr( cTopic, At( cBodyTag, cTopic ) + Len( cBodyTag ) )
   cContent := Left( cContent, At( "</div>", cContent ) - 1 )

   // author: what follows the author <p>, cut 5 characters before "</strong>",
   // keeping only the text after the last ">"
   cAuthor := SubStr( cTopic, At( cAuthorTag, cTopic ) + Len( cAuthorTag ) )
   cAuthor := Left( cAuthor, At( "</strong>", cAuthor ) - 5 )
   cAuthor := SubStr( cAuthor, RAt( ">", cAuthor ) + 1 )

   MsgInfo( cContent, cAuthor )

return { "contents" => cContent, "author" => cAuthor }