extractor de texto de un PDF

Post by **Antonio Linares** » Sat Jun 08, 2013 6:18 am

He adaptado este código en C para usarlo desde Harbour:

http://www.codeproject.com/script/Content/ViewAssociatedFile.aspx?rzp=%2FKB%2Fcpp%2FExtractPDFText%2Fextractpdftext_src.zip&zep=cp%2Fpdf.cpp&obid=7056&obtid=2&ovid=1

Aqui esta mi código. No funciona con los PDFs que he probado ó tal vez aún hay algún bug en el código. Aqui lo comparto por si a alguien le apetece probarlo:

Code: Select all | Expand

#include "FiveWin.ch"

function Main()

   local cPDF := MemoRead( "c:\test.pdf" )
   local nStart := At( "stream", cPDF )
   local nEnd := At( "endstream", cPDF )
   local cBuf := Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
   local cText, nResult
   local hFile

   while nStart <= Len( cPDF )
      nStart = At( "stream", cPDF )
      nEnd = At( "endstream", cPDF )
      cBuf = Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
      cText = SubStr( cPDF, nStart + 6, nEnd - nStart )

      if Left( cText, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, 2, 1 ) == Chr( 0x0a )
         nStart += 2
      elseif Left( cText, 1 ) == Chr( 0x0a )
         nStart++
      endif

      if SubStr( cText, nEnd - 2, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd -= 2
      elseif SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd--
      endif

      HB_ZUNCOMPRESS( SubStr( cPDF, nStart + 6, nEnd - nStart ), @cBuf, @nResult )

      cPDF = SubStr( cPDF, nEnd + Len( "endstream" ) + 1 )

      ProcessOutput( hFile := fcreate( "c:\test.out", "wb" ), cBuf )
      FClose( hFile )
      
      if ! Empty( MemoRead( "c:\test.out" ) )
         // MsgInfo( MemoRead( "c:\test.out" ), nResult )
      endif
   end

return nil

#pragma BEGINDUMP

#include <hbapi.h>
#include <wtypes.h>

#define oldchar 15

float ExtractNumber(const char* search, int lastcharoffset)
{
        float flt=-1.0;
        int i = lastcharoffset;
        char buffer[oldchar+5];

        while (i>0 && search[i]==' ') i--;
        while (i>0 && (isdigit(search[i]) || search[i]=='.')) i--;
        ZeroMemory(buffer,sizeof(buffer));
        strncpy(buffer, search+i+1, lastcharoffset-i);
        if (buffer[0] && sscanf(buffer, "%f", &flt))
        {
                return flt;
        }
        return -1.0;
}
BOOL seen2(const char* search, char* recent)
{
if (    recent[oldchar-3]==search[0]
     && recent[oldchar-2]==search[1]
         && (recent[oldchar-1]==' ' || recent[oldchar-1]==0x0d ||
recent[oldchar-1]==0x0a)
         && (recent[oldchar-4]==' ' || recent[oldchar-4]==0x0d ||
recent[oldchar-4]==0x0a)
         )
        {
                return TRUE;
        }
        return FALSE;
}

#include <hbapifs.h>

static int xputc( unsigned char c, FILE * fo )
{
   static int iPos = 0;

   return hb_fsWriteAt( ( HB_FHANDLE ) fo, &c, 1, iPos++ );
}

void ProcessOutput(FILE* file, char* output, size_t len)
{
        //Are we currently inside a text object?
        BOOL intextobject = FALSE;

        //Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
        BOOL nextliteral = FALSE;

        //() Bracket nesting level. Text appears inside ()
        int rbdepth = 0;

        //Keep previous chars to get extract numbers etc.:
        char oc[oldchar];
        int j=0;
        size_t i;

        for (j=0; j<oldchar; j++) oc[j]=' ';

        for( i=0; i<len; i++)
        {
                unsigned char c = output[i];

                xputc( c, file );

                if (intextobject)
                {
                        if (rbdepth==0 && seen2("TD", oc))
                        {
                                //Positioning.
                                //See if a new line has to start or just a tab:
                                float num = ExtractNumber(oc,oldchar-5);
                                if (num>1.0)
                                {
                                        xputc(0x0d, file);
                                        xputc(0x0a, file);
                                }
                                if (num<1.0)
                                {
                                        xputc('\t', file);
                                }
                        }
                        if (rbdepth==0 && seen2("ET", oc))
                        {
                                //End of a text object, also go to a new line.
                                intextobject = FALSE;
                                xputc(0x0d, file);
                                xputc(0x0a, file);
                        }
                        else if (c=='(' && rbdepth==0 && !nextliteral)
                        {
        int num;
                                //Start outputting text!
                                rbdepth=1;
                                //See if a space or tab (>1000) is called for by looking
                                //at the number in front of (
                                num = ExtractNumber(oc,oldchar-1);
                                if (num>0)
                                {
                                        if (num>1000.0)
                                        {
                                                xputc('\t', file);
                                        }
                                        else if (num>100.0)
                                        {
                                                xputc(' ', file);
                                        }
                                }
                        }
                        else if (c==')' && rbdepth==1 && !nextliteral)
                        {
                                //Stop outputting text
                                rbdepth=0;
                        }
                        else if (rbdepth==1)
                        {
                                //Just a normal text character:
                                if (c=='\\' && !nextliteral)
                                {
                                        //Only print out next character no matter what. Do not interpret.
                                        nextliteral = TRUE;
                                }
                                else
                                {
                                        nextliteral = FALSE;
                                        if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) )
                                        {
                                                xputc(c, file);
                                        }
                                }
                        }
                }
                //Store the recent characters for when we have to go back for a number:
                for (j=0; j<oldchar-1; j++) oc[j]=oc[j+1];
                oc[oldchar-1]=c;
                if (!intextobject)
                {
                        if (seen2("BT", oc))
                        {
                                //Start of a text object:
                                intextobject = TRUE;
                        }
                }
        }
}

HB_FUNC( PROCESSOUTPUT )
{
   int iLen = hb_parclen( 2 );
   // char * buffer = hb_xgrab( iLen );

   ProcessOutput( ( FILE * ) hb_parnl( 1 ), hb_parc( 2 ), iLen );
   // hb_storclen( buffer, iLen, 2 );
   // hb_xfree( buffer );
}

#pragma ENDDUMP

AIDA · Post by **AIDA** » Sat Jun 08, 2013 5:07 pm

Hola

en el Xverce me sale lo siguiente

Code: Select all | Expand

Warning W8004 - test.prg 6055: 'j' is assigned a value that is never used in function ProcessOutput

es en este pedasito

Code: Select all | Expand

 //Keep previous chars to get extract numbers etc.:
        char oc[oldchar];
        int j=0;
        size_t i;

espero se pueda lograr seria para sacar los datos de mas de 15000 archivos PDF :shock:

Saluditos

Post by **Antonio Linares** » Sat Jun 08, 2013 6:06 pm

Aida,

j se usa aqui:

for (j=0; j<oldchar; j++) oc[j]=' ';

que compilador de C usas ?

xmanuel · Post by **xmanuel** » Sat Jun 08, 2013 6:16 pm

Aida, Antonio creo que el compilador tiene razón
o haces esto:

int j; // No se asigna el 0
...
for (j=0; j<oldchar; j++) oc[j]=' ';

o haces esto:

for ( int j=0; j<oldchar; j++) oc[j]=' ';

jaja pero las dos cosas juntas "nor" :mrgreen:

Aparezco poco por aquí, pero espero remediarlo pronto.

Saludos a los dos... bueno a "to er" mundo jejeje

Post by **Antonio Linares** » Sat Jun 08, 2013 6:43 pm

Tienes toda la razón, esa inicialización es la que sobraba :-)

Aun asi, seguimos sin saber si el código funciona o no. La funcion HB_ZUNCOMPRESS() devuelve un código de error, cero significa que los datos descomprimidos son correctos, -5 es que estan mal. En mis pruebas suele dar cero pero no aparece el texto.

A ver si entre todos lo hacemos funcionar :-)

AIDA · Post by **AIDA** » Sat Jun 08, 2013 7:19 pm

No soy buena en código importado pero le haré la luchita :mrgreen:

y como queda con la corrección el código ya me perdí :shock:

Saluditos

Post by **Daniel Garcia-Gil** » Sat Jun 08, 2013 9:57 pm

Hola

creo que el error es que se esta creando el archivo cada iteración del while, sacando la creacion/cierre del archivo de salida se avanza un poco mas

de esta forma funciona... pero el archivo de salida se guarda mal... repite las letras

Code: Select all | Expand


function Main()

   local cPDF := MemoRead( "curl.pdf" )
   local nStart := At( "stream", cPDF )
   local nEnd := At( "endstream", cPDF )
   local cBuf := Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
   local cText, nResult
   local hFile

   hFile := fcreate( "curl.out", "wb" )

   while nStart <= Len( cPDF )
      nStart = At( "stream", cPDF )
      nEnd = At( "endstream", cPDF )
      cBuf = Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
      cText = SubStr( cPDF, nStart + 6, nEnd - nStart )

      if Left( cText, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, 2, 1 ) == Chr( 0x0a )
         nStart += 2
      elseif Left( cText, 1 ) == Chr( 0x0a )
         nStart++
      endif

      if SubStr( cText, nEnd - 2, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd -= 2
      elseif SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd--
      endif

      HB_ZUNCOMPRESS( SubStr( cPDF, nStart + 6, nEnd - nStart ), @cBuf, @nResult )

      cPDF = SubStr( cPDF, nEnd + Len( "endstream" ) + 1 )

      ProcessOutput(hFile, cBuf )
      
      if ! Empty( MemoRead( "curl.out" ) )
        //MsgInfo( MemoRead( "curl.out" ), nResult )
      endif
   end

   FClose( hFile )

return nil

Post by **Antonio Linares** » Sun Jun 09, 2013 7:38 am

Daniel,

Me habia dejado una traza en el código. Ahora parece ir mucho mejor :-)

Code: Select all | Expand

#include "FiveWin.ch"

function Main()

   local cPDF := MemoRead( "c:\curl.pdf" )
   local nStart := At( "stream", cPDF )
   local nEnd := At( "endstream", cPDF )
   local cBuf := Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
   local cText, nResult
   local hFile

   if ! File( "c:\curl.out" )
      hFile = fcreate( "c:\curl.out", "wb" )
   else
      hFile = fopen( "c:\curl.out", "wb" )
   endif   

   while nStart < Len( cPDF )
      nStart = At( "stream", cPDF )
      nEnd = At( "endstream", cPDF )
      cBuf = Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
      cText = SubStr( cPDF, nStart + 6, nEnd - nStart )

      if Left( cText, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, 2, 1 ) == Chr( 0x0a )
         nStart += 2
      elseif Left( cText, 1 ) == Chr( 0x0a )
         nStart++
      endif

      if SubStr( cText, nEnd - 2, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd -= 2
      elseif SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd--
      endif

      HB_ZUNCOMPRESS( SubStr( cPDF, nStart + 6, nEnd - nStart ), @cBuf, @nResult )

      cPDF = SubStr( cPDF, nEnd + Len( "endstream" ) + 1 )

      ProcessOutput( hFile, cBuf )
   end

   FClose( hFile )
   MsgInfo( MemoRead( "c:\curl.out" ) )
   // FErase( "c:\curl.out" )

return nil

#pragma BEGINDUMP

#include <hbapi.h>
#include <wtypes.h>

#define oldchar 15

float ExtractNumber(const char* search, int lastcharoffset)
{
        float flt=-1.0;
        int i = lastcharoffset;
        char buffer[oldchar+5];

        while (i>0 && search[i]==' ') i--;
        while (i>0 && (isdigit(search[i]) || search[i]=='.')) i--;
        ZeroMemory(buffer,sizeof(buffer));
        strncpy(buffer, search+i+1, lastcharoffset-i);
        if (buffer[0] && sscanf(buffer, "%f", &flt))
        {
                return flt;
        }
        return -1.0;
}
BOOL seen2(const char* search, char* recent)
{
if (    recent[oldchar-3]==search[0]
     && recent[oldchar-2]==search[1]
         && (recent[oldchar-1]==' ' || recent[oldchar-1]==0x0d ||
recent[oldchar-1]==0x0a)
         && (recent[oldchar-4]==' ' || recent[oldchar-4]==0x0d ||
recent[oldchar-4]==0x0a)
         )
        {
                return TRUE;
        }
        return FALSE;
}

#include <hbapifs.h>

int iPos = 0;

static int xputc( unsigned char c, FILE * fo )
{
   return hb_fsWriteAt( ( HB_FHANDLE ) fo, &c, 1, iPos++ );
}

void ProcessOutput(FILE* file, char* output, size_t len)
{
        //Are we currently inside a text object?
        BOOL intextobject = FALSE;

        //Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
        BOOL nextliteral = FALSE;

        //() Bracket nesting level. Text appears inside ()
        int rbdepth = 0;

        //Keep previous chars to get extract numbers etc.:
        char oc[oldchar];
        int j;
        size_t i;

        for (j=0; j<oldchar; j++) 
           oc[j]=' ';

        for( i=0; i<len; i++)
        {
                unsigned char c = output[i];

                if (intextobject)
                {
                        if (rbdepth==0 && seen2("TD", oc))
                        {
                                //Positioning.
                                //See if a new line has to start or just a tab:
                                float num = ExtractNumber(oc,oldchar-5);
                                if (num>1.0)
                                {
                                        xputc(0x0d, file);
                                        xputc(0x0a, file);
                                }
                                if (num<1.0)
                                {
                                        xputc('\t', file);
                                }
                        }
                        if (rbdepth==0 && seen2("ET", oc))
                        {
                                //End of a text object, also go to a new line.
                                intextobject = FALSE;
                                xputc(0x0d, file);
                                xputc(0x0a, file);
                        }
                        else if (c=='(' && rbdepth==0 && !nextliteral)
                        {
        int num;
                                //Start outputting text!
                                rbdepth=1;
                                //See if a space or tab (>1000) is called for by looking
                                //at the number in front of (
                                num = ExtractNumber(oc,oldchar-1);
                                if (num>0)
                                {
                                        if (num>1000.0)
                                        {
                                                xputc('\t', file);
                                        }
                                        else if (num>100.0)
                                        {
                                                xputc(' ', file);
                                        }
                                }
                        }
                        else if (c==')' && rbdepth==1 && !nextliteral)
                        {
                                //Stop outputting text
                                rbdepth=0;
                        }
                        else if (rbdepth==1)
                        {
                                //Just a normal text character:
                                if (c=='\\' && !nextliteral)
                                {
                                        //Only print out next character no matter what. Do not interpret.
                                        nextliteral = TRUE;
                                }
                                else
                                {
                                        nextliteral = FALSE;
                                        if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) )
                                        {
                                                xputc(c, file);
                                        }
                                }
                        }
                }
                //Store the recent characters for when we have to go back for a number:
                for (j=0; j<oldchar-1; j++) oc[j]=oc[j+1];
                oc[oldchar-1]=c;
                if (!intextobject)
                {
                        if (seen2("BT", oc))
                        {
                                //Start of a text object:
                                intextobject = TRUE;
                        }
                }
        }
}

HB_FUNC( PROCESSOUTPUT )
{
   ProcessOutput( ( FILE * ) hb_parnl( 1 ), hb_parc( 2 ), hb_parclen( 2 ) );
}

#pragma ENDDUMP

Post by **Antonio Linares** » Sun Jun 09, 2013 8:01 am

http://blog.didierstevens.com/2008/05/19/pdf-stream-objects/

http://www.gnupdf.org/Introduction_to_PDF

Post by **Antonio Linares** » Sun Jun 09, 2013 8:10 am

Aida,

Parece que con esta utilidad gratuita:

http://qpdf.sourceforge.net/

puedes convertir todos tus PDFs a texto :-)

AIDA · Post by **AIDA** » Mon Jun 10, 2013 3:23 am

Hola

ya vi sobre ese QPDF y no le entiendo nadita :shock:

creo que si se puede en xharbour y fivewin le entenderé mas al final y aprenderé mas que algo externo

soy muy fiel a Fivewin :mrgreen:

Gracias

Saluditos

Post by **Antonio Linares** » Mon Jun 10, 2013 7:34 am

Aida,

A mi ya me funciona correctamente :-)

Encontré que aunque el formato de compresión de muchos PDFs es FlateDecode, el texto que extrae necesita una nueva conversión que ya he conseguido implementar. Aqui tienes la versión que funciona. Revisa el fichero que genera con extensión TXT que contiene el texto del PDF:

Code: Select all | Expand

#include "FiveWin.ch"

function Main()

   local oVoice := TOleAuto():New( "Sapi.SPVoice" ) 
   local cText := GetText( "c:\curl.pdf" )
   
   oVoice:Speak( "c:\curl.txt", 4 )

return nil

function GetText( cPdfFile )

   local cPDF := MemoRead( cPdfFile )
   local nStart := At( "stream", cPDF )
   local nEnd := At( "endstream", cPDF )
   local cBuf := Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
   local cText, nResult, cTemp
   local hFile, cResult := ""
   local n := 1

   if ! File( hb_CurDrive() + ":\" + cFileNoExt( cPdfFile ) + ".out" )
      hFile = fcreate( hb_CurDrive() + ":\" + cFileNoExt( cPdfFile ) + ".out", "wb" )
   else
      hFile = fopen( hb_CurDrive() + ":\" + cFileNoExt( cPdfFile ) + ".out", "wb" )
   endif   

   while "stream" $ cPDF
      nStart = At( "stream", cPDF )
      nEnd = At( "endstream", cPDF )
      
      if nStart == 0
         cPDF = ""
         exit
      endif
         
      cBuf = Replicate( Chr( 0 ), ( nEnd - nStart ) * 10 )
      cText = SubStr( cPDF, nStart + 6, nEnd - nStart )

      if Left( cText, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, 2, 1 ) == Chr( 0x0a )
         nStart += 2
      elseif Left( cText, 1 ) == Chr( 0x0a )
         nStart++
      endif

      if SubStr( cText, nEnd - 2, 1 ) == Chr( 0x0d ) .and. ;
         SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd -= 2
      elseif SubStr( cText, nEnd - 1, 1 ) == Chr( 0x0a )
         nEnd--
      endif

      HB_ZUNCOMPRESS( SubStr( cPDF, nStart + 6, nEnd - nStart ), @cBuf, @nResult )
      cTemp = cBuf
      while "[" $ cTemp 
         // OutputDebugString( "dentro del bucle Translate" + CRLF )
         if ! "]" $ cTemp
            exit
         endif   
         cResult += Translate( SubStr( cTemp, At( "[", cTemp ), At( "]", cTemp ) - At( "[", cTemp ) + 1 ) )
         cTemp = SubStr( cTemp, At( "]", cTemp ) + 1 )
      end   

      // OutputDebugString( Str( Len( cPDF ) ) ) 
      // OutputDebugString( If( Empty( cPDF ), " * Empty", " * not empty" ) + CRLF )
      ProcessOutput( hFile, cBuf )
      cPDF = SubStr( cPDF, nEnd + Len( "endstream" ) + 1 )
   end

   // OutputDebugString( "done" )
   FClose( hFile )
   MemoWrit( hb_CurDrive() + ":\" + cFileNoExt( cPdfFile ) + ".txt", cResult )

return cResult

function Translate( cText )

   local cCode, cResult := "" 
   local nStart, nEnd, n
   
   cText = SubStr( cText, 2, Len( cText ) - 2 )
   
   while "<" $ cText
      nStart = At( "<", cText )
      nEnd = At( ">", cText )
      cCode = SubStr( cText, nStart + 1, nEnd - nStart - 1 )
      for n = 1 to Len( cCode ) step 4
         cResult += Chr( hb_HextoNum( SubStr( cCode, n, 4 ) ) )
      next   
      if nEnd != 0
         cText = SubStr( cText, nEnd + 1 )
      else 
         cText = ""
      endif      
   end   
   
return cResult + " "     

#pragma BEGINDUMP

#include <hbapi.h>
#include <wtypes.h>

#define oldchar 15

float ExtractNumber(const char* search, int lastcharoffset)
{
        float flt=-1.0;
        int i = lastcharoffset;
        char buffer[oldchar+5];

        while (i>0 && search[i]==' ') i--;
        while (i>0 && (isdigit(search[i]) || search[i]=='.')) i--;
        ZeroMemory(buffer,sizeof(buffer));
        strncpy(buffer, search+i+1, lastcharoffset-i);
        if (buffer[0] && sscanf(buffer, "%f", &flt))
        {
                return flt;
        }
        return -1.0;
}
BOOL seen2(const char* search, char* recent)
{
if (    recent[oldchar-3]==search[0]
     && recent[oldchar-2]==search[1]
         && (recent[oldchar-1]==' ' || recent[oldchar-1]==0x0d ||
recent[oldchar-1]==0x0a)
         && (recent[oldchar-4]==' ' || recent[oldchar-4]==0x0d ||
recent[oldchar-4]==0x0a)
         )
        {
                return TRUE;
        }
        return FALSE;
}

#include <hbapifs.h>

int iPos = 0;

static int xputc( unsigned char c, FILE * fo )
{
   return hb_fsWriteAt( ( HB_FHANDLE ) fo, &c, 1, iPos++ );
}

void ProcessOutput(FILE* file, char* output, size_t len)
{
        //Are we currently inside a text object?
        BOOL intextobject = FALSE;

        //Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
        BOOL nextliteral = FALSE;

        //() Bracket nesting level. Text appears inside ()
        int rbdepth = 0;

        //Keep previous chars to get extract numbers etc.:
        char oc[oldchar];
        int j;
        size_t i;

        for (j=0; j<oldchar; j++) 
           oc[j]=' ';

        for( i=0; i<len; i++)
        {
                unsigned char c = output[i];

                if (intextobject)
                {
                        if (rbdepth==0 && seen2("TD", oc))
                        {
                                //Positioning.
                                //See if a new line has to start or just a tab:
                                float num = ExtractNumber(oc,oldchar-5);
                                if (num>1.0)
                                {
                                        xputc(0x0d, file);
                                        xputc(0x0a, file);
                                }
                                if (num<1.0)
                                {
                                        xputc('\t', file);
                                }
                        }
                        if (rbdepth==0 && seen2("ET", oc))
                        {
                                //End of a text object, also go to a new line.
                                intextobject = FALSE;
                                xputc(0x0d, file);
                                xputc(0x0a, file);
                        }
                        else if (c=='(' && rbdepth==0 && !nextliteral)
                        {
        int num;
                                //Start outputting text!
                                rbdepth=1;
                                //See if a space or tab (>1000) is called for by looking
                                //at the number in front of (
                                num = ExtractNumber(oc,oldchar-1);
                                if (num>0)
                                {
                                        if (num>1000.0)
                                        {
                                                xputc('\t', file);
                                        }
                                        else if (num>100.0)
                                        {
                                                xputc(' ', file);
                                        }
                                }
                        }
                        else if (c==')' && rbdepth==1 && !nextliteral)
                        {
                                //Stop outputting text
                                rbdepth=0;
                        }
                        else if (rbdepth==1)
                        {
                                //Just a normal text character:
                                if (c=='\\' && !nextliteral)
                                {
                                        //Only print out next character no matter what. Do not interpret.
                                        nextliteral = TRUE;
                                }
                                else
                                {
                                        nextliteral = FALSE;
                                        if ( ((c>=' ') && (c<='~')) || ((c>=128) && (c<255)) )
                                        {
                                                xputc(c, file);
                                        }
                                }
                        }
                }
                //Store the recent characters for when we have to go back for a number:
                for (j=0; j<oldchar-1; j++) oc[j]=oc[j+1];
                oc[oldchar-1]=c;
                if (!intextobject)
                {
                        if (seen2("BT", oc))
                        {
                                //Start of a text object:
                                intextobject = TRUE;
                        }
                }
        }
}

HB_FUNC( PROCESSOUTPUT )
{
   ProcessOutput( ( FILE * ) hb_parnl( 1 ), ( char * ) hb_parc( 2 ), hb_parclen( 2 ) );
}

#pragma ENDDUMP

AIDA · Post by **AIDA** » Mon Jun 10, 2013 5:05 pm

Super

en este momento lo copio y haré pruebas :mrgreen:

y te comento

eres mi SUPERMAN

Saluditos

AIDA · Post by **AIDA** » Mon Jun 10, 2013 5:23 pm

Me sale esto

Code: Select all | Expand

Error: Unresolved external '_HB_FUN_HB_CURDRIVE' referenced

Code: Select all | Expand

Error: Unresolved external '_HB_FUN_HB_HEXTONUM' referenced

les quite el HB_ ya no marca error pero el curl.txt sale en blanco sin nada :shock:

sera por quitar el HB_ o me faltara alguna lib del xharbour

Saluditos :wink:

Post by **Antonio Linares** » Mon Jun 10, 2013 7:37 pm

Aida,

Prueba con el pdf que te envio por email

FiveTech Software tech support forums

extractor de texto de un PDF

extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF

Re: extractor de texto de un PDF