http://www.codeproject.com/script/Content/ViewAssociatedFile.aspx?rzp=%2FKB%2Fcpp%2FExtractPDFText%2Fextractpdftext_src.zip&zep=cp%2Fpdf.cpp&obid=7056&obtid=2&ovid=1
Here is my code. I have not been able to make it work, not sure if I am testing wrong PDFs or if there is some hidden bug somewhere. In case some of you want to test it, and maybe we find whats missing:
- Code: Select all Expand view
- #include "FiveWin.ch"
function Main()
local cPDF := MemoRead( "c:\test.pdf" )
local nStart := At( "stream", cPDF )
local nEnd := At( "endstream", cPDF )
local cBuf := Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
local cText, nResult
local hFile
while nStart <= Len( cPDF )
nStart = At( "stream", cPDF )
nEnd = At( "endstream", cPDF )
cBuf = Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
cText = SubStr( cPDF, nStart + 6, nEnd - nStart )
if Left( cText, 1 ) == Chr( 0x0d ) .and. ;
SubStr( cText, 2, 1 ) == Chr( 0x0a )
nStart += 2
elseif Left( cText, 1 ) == Chr( 0x0a )
nStart++
endif
if SubStr( cText, nEnd - 2, 1 ) == Chr( 0x0d ) .and. ;
SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
nEnd -= 2
elseif SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
nEnd--
endif
HB_ZUNCOMPRESS( SubStr( cPDF, nStart + 6, nEnd - nStart ), @cBuf, @nResult )
cPDF = SubStr( cPDF, nEnd + Len( "endstream" ) + 1 )
ProcessOutput( hFile := fcreate( "c:\test.out", "wb" ), cBuf )
FClose( hFile )
if ! Empty( MemoRead( "c:\test.out" ) )
// MsgInfo( MemoRead( "c:\test.out" ), nResult )
endif
end
return nil
#pragma BEGINDUMP
#include <hbapi.h>
#include <wtypes.h>
#define oldchar 15
float ExtractNumber(const char* search, int lastcharoffset)
{
float flt=-1.0;
int i = lastcharoffset;
char buffer[oldchar+5];
while (i>0 && search[i]==' ') i--;
while (i>0 && (isdigit(search[i]) || search[i]=='.')) i--;
ZeroMemory(buffer,sizeof(buffer));
strncpy(buffer, search+i+1, lastcharoffset-i);
if (buffer[0] && sscanf(buffer, "%f", &flt))
{
return flt;
}
return -1.0;
}
BOOL seen2(const char* search, char* recent)
{
if ( recent[oldchar-3]==search[0]
&& recent[oldchar-2]==search[1]
&& (recent[oldchar-1]==' ' || recent[oldchar-1]==0x0d ||
recent[oldchar-1]==0x0a)
&& (recent[oldchar-4]==' ' || recent[oldchar-4]==0x0d ||
recent[oldchar-4]==0x0a)
)
{
return TRUE;
}
return FALSE;
}
#include <hbapifs.h>
static int xputc( unsigned char c, FILE * fo )
{
static int iPos = 0;
return hb_fsWriteAt( ( HB_FHANDLE ) fo, &c, 1, iPos++ );
}
void ProcessOutput(FILE* file, char* output, size_t len)
{
//Are we currently inside a text object?
BOOL intextobject = FALSE;
//Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
BOOL nextliteral = FALSE;
//() Bracket nesting level. Text appears inside ()
int rbdepth = 0;
//Keep previous chars to get extract numbers etc.:
char oc[oldchar];
int j=0;
size_t i;
for (j=0; j<oldchar; j++) oc[j]=' ';
for( i=0; i<len; i++)
{
unsigned char c = output[i];
xputc( c, file );
if (intextobject)
{
if (rbdepth==0 && seen2("TD", oc))
{
//Positioning.
//See if a new line has to start or just a tab:
float num = ExtractNumber(oc,oldchar-5);
if (num>1.0)
{
xputc(0x0d, file);
xputc(0x0a, file);
}
if (num<1.0)
{
xputc('\t', file);
}
}
if (rbdepth==0 && seen2("ET", oc))
{
//End of a text object, also go to a new line.
intextobject = FALSE;
xputc(0x0d, file);
xputc(0x0a, file);
}
else if (c=='(' && rbdepth==0 && !nextliteral)
{
int num;
//Start outputting text!
rbdepth=1;
//See if a space or tab (>1000) is called for by looking
//at the number in front of (
num = ExtractNumber(oc,oldchar-1);
if (num>0)
{
if (num>1000.0)
{
xputc('\t', file);
}
else if (num>100.0)
{
xputc(' ', file);
}
}
}
else if (c==')' && rbdepth==1 && !nextliteral)
{
//Stop outputting text
rbdepth=0;
}
else if (rbdepth==1)
{
//Just a normal text character:
if (c=='\\' && !nextliteral)
{
//Only print out next character no matter what. Do not interpret.
nextliteral = TRUE;
}
else
{
nextliteral = FALSE;
if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) )
{
xputc(c, file);
}
}
}
}
//Store the recent characters for when we have to go back for a number:
for (j=0; j<oldchar-1; j++) oc[j]=oc[j+1];
oc[oldchar-1]=c;
if (!intextobject)
{
if (seen2("BT", oc))
{
//Start of a text object:
intextobject = TRUE;
}
}
}
}
HB_FUNC( PROCESSOUTPUT )
{
int iLen = hb_parclen( 2 );
// char * buffer = hb_xgrab( iLen );
ProcessOutput( ( FILE * ) hb_parnl( 1 ), hb_parc( 2 ), iLen );
// hb_storclen( buffer, iLen, 2 );
// hb_xfree( buffer );
}
#pragma ENDDUMP