Reply
 
LinkBack Thread Tools Display Modes
  #1 (permalink)  
Old 03-20-2009, 12:04 PM
jazz2k8's Avatar
Senior Member
 
Join Date: Apr 2008
Posts: 144
Rep Power: 0
jazz2k8 is on a distinguished road
Default PDF Box issue
Hi

I am working with PDFBox, and I want to convert PDFs to text and to encrypt and decrypt PDFs.

i have downloaded the software from

here is the link: SourceForge.net: PDFBox

i want this:

PDF Box:
http://incubator.apache.o...ineutilities/Encrypt.html

Here is my code snippet for TextExtraction from a PDF:

Code:
package com.pdfBox;
import java.io.File;
import java.io.FileOutputStream ;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer ;
 import java.net.MalformedURLException ;
import java.net.URL ;
import org.pdfbox.util.operator.OperatorProcessor;
import org.pdfbox.util.PDFStreamEngine;

 import org.pdfbox.pdmodel.PDDocument;
 import org.pdfbox.pdmodel.encryption.AccessPermission;
import org.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.pdfbox.util.PDFText2HTML;
 import org.pdfbox.util.PDFTextStripper;

 /**
  * Command-line utility that extracts the text of a PDF document and writes it
  * to a text file, to an HTML file, or to the console.
  *
  * The PDF is loaded either from a URL or, when the argument is not a valid
  * URL, from the local file system. Encrypted documents are opened with the
  * supplied password before extraction.
  */
 public class ExtractText
 {
     /**
      * This is the default encoding of the text to be output.
      * <code>null</code> means no explicit encoding is passed to the writer, so
      * the platform default is used. Other values that may be used here include
      * "ISO-8859-1", "ISO-8859-6" (Arabic), "US-ASCII", "UTF-8", "UTF-16",
      * "UTF-16BE" and "UTF-16LE".
      */
     public static final String DEFAULT_ENCODING = null;

     private static final String PASSWORD = "-password";
     private static final String ENCODING = "-encoding";
     private static final String CONSOLE = "-console";
     private static final String START_PAGE = "-startPage";
     private static final String END_PAGE = "-endPage";
     private static final String SORT = "-sort";
     private static final String HTML = "-html"; // jjb - added simple HTML output

     /**
      * Sample input/output paths used only when the program is started without
      * any command-line arguments.
      */
     private static final String[] DEFAULT_ARGS = { "C:\\101_NY.pdf", "C:\\101_NY.txt" };

     /**
      * Private constructor; this class only exposes static methods.
      */
     private ExtractText()
     {
         //static class
     }

     /**
      * Parses the command line, loads the PDF and writes its text to the
      * requested destination.
      *
      * @param arg Command line arguments: optional switches followed by the PDF
      *            file (or URL) and, optionally, the output text file. When no
      *            arguments are given, the built-in sample paths are used.
      *
      * @throws Exception If there is an error parsing the document.
      */
     public static void main( String [] arg ) throws Exception
     {
         // Honour the real command line; fall back to the sample paths only
         // when nothing was supplied.
         String[] args = ( arg != null && arg.length > 0 ) ? arg : DEFAULT_ARGS;

         boolean toConsole = false;
         boolean toHTML = false;
         boolean sort = false;
         String password = "";
         String encoding = DEFAULT_ENCODING;
         String pdfFile = null;
         String textFile = null;
         int startPage = 1;
         int endPage = Integer.MAX_VALUE;

         for( int i=0; i<args.length; i++ )
         {
             if( args[i].equals( PASSWORD ) )
             {
                 i++;
                 if( i >= args.length )
                 {
                     usage();
                 }
                 password = args[i];
             }
             else if( args[i].equals( ENCODING ) )
             {
                 i++;
                 if( i >= args.length )
                 {
                     usage();
                 }
                 encoding = args[i];
             }
             else if( args[i].equals( START_PAGE ) )
             {
                 i++;
                 if( i >= args.length )
                 {
                     usage();
                 }
                 startPage = Integer.parseInt( args[i] );
             }
             else if( args[i].equals( HTML ) )
             {
                 toHTML = true;
             }
             else if( args[i].equals( SORT ) )
             {
                 sort = true;
             }
             else if( args[i].equals( END_PAGE ) )
             {
                 i++;
                 if( i >= args.length )
                 {
                     usage();
                 }
                 endPage = Integer.parseInt( args[i] );
             }
             else if( args[i].equals( CONSOLE ) )
             {
                 toConsole = true;
             }
             else
             {
                 // The first bare argument is the PDF, the next one the output file.
                 if( pdfFile == null )
                 {
                     pdfFile = args[i];
                 }
                 else
                 {
                     textFile = args[i];
                 }
             }
         }

         if( pdfFile == null )
         {
             usage();
         }
         else
         {
             Writer output = null;
             PDDocument document = null;
             try
             {
                 try
                 {
                     //basically try to load it from a url first and if the URL
                     //is not recognized then try to load it from the file system.
                     URL url = new URL( pdfFile );
                     document = PDDocument.load( url );
                     String fileName = url.getFile();
                     if( textFile == null && fileName.length() >4 )
                     {
                         // Derive "<name>.txt" from the URL's file part by
                         // replacing its last four characters.
                         File outputFile =
                             new File( fileName.substring( 0, fileName.length() -4 ) + ".txt" );
                         textFile = outputFile.getName();
                     }
                 }
                 catch( MalformedURLException e )
                 {
                     document = PDDocument.load( pdfFile );
                     if( textFile == null && pdfFile.length() >4 )
                     {
                         textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
                     }
                 }

                 if( document.isEncrypted() )
                 {
                     StandardDecryptionMaterial sdm = new StandardDecryptionMaterial( password );
                     document.openProtection( sdm );
                     AccessPermission ap = document.getCurrentAccessPermission();

                     if( ! ap.canExtractContent() )
                     {
                         throw new IOException( "You do not have permission to extract text" );
                     }
                 }

                 if( toConsole )
                 {
                     output = new OutputStreamWriter( System.out );
                 }
                 else
                 {
                     if( textFile == null )
                     {
                         // No name was given and none could be derived from the
                         // PDF name; fail clearly instead of with a NullPointerException.
                         throw new IOException( "No output text file specified for " + pdfFile );
                     }
                     if( encoding != null )
                     {
                         output = new OutputStreamWriter(
                             new FileOutputStream( textFile ), encoding );
                     }
                     else
                     {
                         //use default encoding
                         output = new OutputStreamWriter(
                             new FileOutputStream( textFile ) );
                     }
                 }

                 PDFTextStripper stripper = null;
                 if( toHTML )
                 {
                     stripper = new PDFText2HTML();
                 }
                 else
                 {
                     stripper = new PDFTextStripper();
                 }
                 stripper.setSortByPosition( sort );
                 stripper.setStartPage( startPage );
                 stripper.setEndPage( endPage );
                 stripper.writeText( document, output );
             }
             finally
             {
                 // Nested so that a failure while closing the writer cannot
                 // prevent the document from being closed.
                 try
                 {
                     if( output != null )
                     {
                         output.close();
                     }
                 }
                 finally
                 {
                     if( document != null )
                     {
                         document.close();
                     }
                 }
             }
         }
     }

     /**
      * This will print the usage requirements and exit.
      */
     private static void usage()
     {
         System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
             " -password <password> Password to decrypt document\n" +
             " -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
             " -console Send text to console instead of file\n" +
             " -html Output in HTML format instead of raw text\n" +
             " -sort Sort the text before writing\n" +
             " -startPage <number> The first page to start extraction(1 based)\n" +
             " -endPage <number> The last page to extract(inclusive)\n" +
             " <PDF file> The PDF document to use\n" +
             " [Text File] The file to write the text to\n"
             );
         System.exit( 1 );
     }
 }
I got this error:
Exception in thread "main" java.lang.NoSuchMethodError: org.pdfbox.util.operator.OperatorProcessor.setContext(Lorg/pdfbox/util/PDFStreamEngine;)V
at org.pdfbox.util.PDFStreamEngine.registerOperatorProcessor(PDFStreamEngine.java:140)
at org.pdfbox.util.PDFStreamEngine.<init>(PDFStreamEngine.java:123)
at org.pdfbox.util.PDFTextStripper.<init>(PDFTextStripper.java:119)
at com.pdfBox.ExtractText.main(ExtractText.java:204)

----------------------------------------------------------------------------------------

I even tried the executables that are bundled with the software,

but I cannot run those either.

Can you help me, please?
__________________
visit : www.yoteam.co.cc
Bookmark Post in Technorati
Reply With Quote
Reply

Bookmarks

Thread Tools
Display Modes

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

BB code is On
Smilies are On
[IMG] code is On
HTML code is Off
Trackbacks are On
Pingbacks are On
Refbacks are On


Similar Threads
Thread Thread Starter Forum Replies Last Post
Issue jsp with FireFox dimuthunsj JavaServer Pages (JSP) and JSTL 1 10-10-2008 12:08 PM
Threading issue Eku New To Java 2 09-18-2008 11:47 AM
NullPointerException issue fritz1474 AWT / Swing 2 09-03-2008 06:21 PM
Performance issue mathes_n Web Frameworks 8 09-02-2008 06:11 AM
Issue chaitu444 New To Java 2 11-06-2007 08:49 PM


All times are GMT +2. The time now is 03:41 PM.



VBulletin, Copyright ©2000 - 2010, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2009, Crawlability, Inc.
Copyright ©2006 - 2007, www.java-forums.org