Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Questions asking for code must demonstrate a minimal understanding of the problem being solved. Include attempted solutions, why they didn't work, and the expected results. See also: Stack Overflow question checklist
Closed 9 years ago.
Improve this question
How can I generate a thumbnail image of pages in a PDF document, using Java?
I think http://pdfbox.apache.org/ will do what you're looking for since you can create an image from a page and then scale the image
From their example code -
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox;
import java.awt.HeadlessException;
import java.awt.Toolkit;
import java.awt.image.BufferedImage;
import javax.imageio.ImageIO;
import java.util.List;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.util.PDFImageWriter;
/**
 * Command line tool that converts the pages of a PDF document to image files.
 *
 * @author Ben Litchfield
 * @version $Revision: 1.6 $
 */
public class PDFToImage
{
    private static final String PASSWORD = "-password";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String IMAGE_FORMAT = "-imageType";
    private static final String OUTPUT_PREFIX = "-outputPrefix";
    private static final String COLOR = "-color";
    private static final String RESOLUTION = "-resolution";
    private static final String CROPBOX = "-cropbox";

    /**
     * Private constructor: the class only offers static entry points.
     */
    private PDFToImage()
    {
        //static class
    }

    /**
     * Parses the command line, loads the PDF and writes the requested pages as images.
     *
     * @param args Command line arguments: options followed by a reference to a PDF file.
     *
     * @throws Exception If there is an error parsing the document.
     */
    public static void main( String[] args ) throws Exception
    {
        String password = "";
        // remembers whether -password was given, to pick the right decryption error message
        boolean passwordSupplied = false;
        String pdfFile = null;
        String outputPrefix = null;
        String imageFormat = "jpg";
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        String color = "rgb";
        int resolution;
        float cropBoxLowerLeftX = 0;
        float cropBoxLowerLeftY = 0;
        float cropBoxUpperRightX = 0;
        float cropBoxUpperRightY = 0;
        try
        {
            resolution = Toolkit.getDefaultToolkit().getScreenResolution();
        }
        catch( HeadlessException e )
        {
            // no display available: fall back to a fixed resolution
            resolution = 96;
        }
        for( int i = 0; i < args.length; i++ )
        {
            String arg = args[i];
            if( arg.equals( PASSWORD ) )
            {
                password = valueAt( args, ++i );
                passwordSupplied = true;
            }
            else if( arg.equals( START_PAGE ) )
            {
                startPage = Integer.parseInt( valueAt( args, ++i ) );
            }
            else if( arg.equals( END_PAGE ) )
            {
                endPage = Integer.parseInt( valueAt( args, ++i ) );
            }
            else if( arg.equals( IMAGE_FORMAT ) )
            {
                imageFormat = valueAt( args, ++i );
            }
            else if( arg.equals( OUTPUT_PREFIX ) )
            {
                outputPrefix = valueAt( args, ++i );
            }
            else if( arg.equals( COLOR ) )
            {
                color = valueAt( args, ++i );
            }
            else if( arg.equals( RESOLUTION ) )
            {
                resolution = Integer.parseInt( valueAt( args, ++i ) );
            }
            else if( arg.equals( CROPBOX ) )
            {
                // four numbers: lower left x/y followed by upper right x/y
                cropBoxLowerLeftX = Float.parseFloat( valueAt( args, ++i ) );
                cropBoxLowerLeftY = Float.parseFloat( valueAt( args, ++i ) );
                cropBoxUpperRightX = Float.parseFloat( valueAt( args, ++i ) );
                cropBoxUpperRightY = Float.parseFloat( valueAt( args, ++i ) );
            }
            else if( pdfFile == null )
            {
                // first non-option argument is the input file; further ones are ignored
                pdfFile = arg;
            }
        }
        if( pdfFile == null )
        {
            usage();
        }
        else
        {
            if( outputPrefix == null )
            {
                // default prefix: the input name without its extension (if it has one)
                int dot = pdfFile.lastIndexOf( '.' );
                outputPrefix = dot >= 0 ? pdfFile.substring( 0, dot ) : pdfFile;
            }
            PDDocument document = null;
            try
            {
                document = PDDocument.load( pdfFile );
                if( document.isEncrypted() )
                {
                    try
                    {
                        document.decrypt( password );
                    }
                    catch( InvalidPasswordException e )
                    {
                        if( passwordSupplied )
                        {
                            System.err.println( "Error: The supplied password is incorrect." );
                            System.exit( 2 );
                        }
                        else
                        {
                            //they didn't supply a password and the default of "" was wrong.
                            System.err.println( "Error: The document is encrypted." );
                            usage();
                        }
                    }
                }
                int imageType = BufferedImage.TYPE_INT_RGB;
                if( "bilevel".equalsIgnoreCase( color ) )
                {
                    imageType = BufferedImage.TYPE_BYTE_BINARY;
                }
                else if( "indexed".equalsIgnoreCase( color ) )
                {
                    imageType = BufferedImage.TYPE_BYTE_INDEXED;
                }
                else if( "gray".equalsIgnoreCase( color ) )
                {
                    imageType = BufferedImage.TYPE_BYTE_GRAY;
                }
                else if( "rgb".equalsIgnoreCase( color ) )
                {
                    imageType = BufferedImage.TYPE_INT_RGB;
                }
                else if( "rgba".equalsIgnoreCase( color ) )
                {
                    imageType = BufferedImage.TYPE_INT_ARGB;
                }
                else
                {
                    System.err.println( "Error: Invalid color '" + color
                            + "' (valid: bilevel, indexed, gray, rgb, rgba)." );
                    System.exit( 2 );
                }
                // if a crop box was specified, apply it to every page before rendering
                if( cropBoxLowerLeftX != 0 || cropBoxLowerLeftY != 0
                        || cropBoxUpperRightX != 0 || cropBoxUpperRightY != 0 )
                {
                    changeCropBoxes( document, cropBoxLowerLeftX, cropBoxLowerLeftY,
                            cropBoxUpperRightX, cropBoxUpperRightY );
                }
                //Make the call
                PDFImageWriter imageWriter = new PDFImageWriter();
                boolean success = imageWriter.writeImage( document, imageFormat, password,
                        startPage, endPage, outputPrefix, imageType, resolution );
                if( !success )
                {
                    System.err.println( "Error: no writer found for image format '"
                            + imageFormat + "'" );
                    System.exit( 1 );
                }
            }
            catch( Exception e )
            {
                // report the failure on stderr; the document is still closed below
                System.err.println( e );
            }
            finally
            {
                if( document != null )
                {
                    document.close();
                }
            }
        }
    }

    /**
     * Returns the argument at the given position, or prints the usage and exits
     * when the command line ends before that position.
     *
     * @param args the command line arguments.
     * @param index position of the value that an option requires.
     * @return the argument at that position.
     */
    private static String valueAt( String[] args, int index )
    {
        if( index >= args.length )
        {
            usage();
        }
        return args[index];
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        System.err.println( "Usage: java org.apache.pdfbox.PDFToImage [OPTIONS] <PDF file>\n" +
            " -password <password> Password to decrypt document\n" +
            " -imageType <image type> (" + getImageFormats() + ")\n" +
            " -outputPrefix <output prefix> Filename prefix for image files\n" +
            " -startPage <number> The first page to start extraction(1 based)\n" +
            " -endPage <number> The last page to extract(inclusive)\n" +
            " -color <string> The color depth (valid: bilevel, indexed, gray, rgb, rgba)\n" +
            " -resolution <number> The bitmap resolution in dpi\n" +
            " -cropbox <number> <number> <number> <number> The page area to export\n" +
            " <PDF file> The PDF document to use\n"
            );
        System.exit( 1 );
    }

    /**
     * Builds the comma separated list of image formats shown in the usage text.
     * The tool writes images, so the names of the registered ImageIO writers are listed.
     *
     * @return the writer format names joined by commas.
     */
    private static String getImageFormats()
    {
        StringBuilder retval = new StringBuilder();
        String[] formats = ImageIO.getWriterFormatNames();
        for( int i = 0; i < formats.length; i++ )
        {
            if( i > 0 )
            {
                retval.append( "," );
            }
            retval.append( formats[i] );
        }
        return retval.toString();
    }

    /**
     * Sets both the media box and the crop box of every page to the given rectangle.
     *
     * @param document the document whose pages are changed.
     * @param a lower left x of the rectangle.
     * @param b lower left y of the rectangle.
     * @param c upper right x of the rectangle.
     * @param d upper right y of the rectangle.
     */
    private static void changeCropBoxes( PDDocument document, float a, float b, float c, float d )
    {
        List<?> pages = document.getDocumentCatalog().getAllPages();
        for( int i = 0; i < pages.size(); i++ )
        {
            System.out.println( "resizing page" );
            PDPage page = (PDPage) pages.get( i );
            PDRectangle rectangle = new PDRectangle();
            rectangle.setLowerLeftX( a );
            rectangle.setLowerLeftY( b );
            rectangle.setUpperRightX( c );
            rectangle.setUpperRightY( d );
            page.setMediaBox( rectangle );
            page.setCropBox( rectangle );
        }
    }
}
You could also have a look at JPedal (details at http://www.jpedal.org/pdf_thumbnail.php)
IcePdf is the best that I've seen (that's free) for reading pdfs. JPedal is awesome, but not free.
If you're going to be generating images from PDFs that the general public can send you, I assure you (from experience) that you'll get PDFs that will crash the JVM (e.g., many-layered PDFs made up entirely of vector graphics). This pdf is an example that will crash many libraries (but is a perfectly valid PDF without anything funny like JavaScript, etc.).
We've gone down the route of trying to use a multitude of libraries and eventually resorting to delegating the work of creating a thumbnail to ImageMagick, which is a highly optimized C program for image manipulation.
This post covers not only PDF, but also many other file types, such as office documents, images, and text files.
Related
I am using PDFBox to do a simple extraction of words from a PDF file and then insert those words into a database table. From what I have tested, text that is rotated 90 degrees clockwise in the PDF gives gibberish results when I try to extract the words.
For example, database in the file will yield atabase and also database itself as two different words. Obviously, atabase does not exist in the PDF file.
I tried converting the original file to be rotated upright and do the extraction and it works perfectly as expected. I understand this could be a limitation of the PDFBox itself.
So, in the case of someone trying to index a rotated PDF file, is there a way to tackle this?
Code snippet ( just for reference) :
String lines[] = text.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
System.out.println("Line: " + line);
preparedStatement = con1.prepareStatement(sql);
int i=0;
for (String word : words) {
// check if one or more special characters at end of string then remove OR
// check special characters in beginning of the string then remove
// insert every word directly to table db
word = word.replaceAll("([\\W]+$)|(^[\\W]+)", "");
preparedStatement.setString(1, path1);
preparedStatement.setString(2, word);
System.out.println("Token: " +word);
preparedStatement.executeUpdate();
}
}
preparedStatement.close();
}
This is the PDFBox ExtractText command line utility, which can detect rotations since 2.0.13 (PDFBOX-4371). (That release had a bug with type 3 fonts, which was fixed (PDFBOX-4390) in the repository and in this code, and is in 2.0.14). Later code may have been improved since then. The current 2.0.* source can be found here.
To extract text from rotated files, use the "rotationMagic" setting. This setting first detects the angle of every glyph, collects these angles (AngleCollector), and in a second pass it does an extraction for every angle while discarding the rest (FilteredTextStripper). The order of extraction is by angle, which may or may not make sense if there are several different angles in a page.
The PDF is modified while extracting, so don't use this on documents you are saving.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.tools;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
/**
 * This is the main program that simply parses the pdf document and transforms it
 * into text.
 *
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
public final class ExtractText
{
    private static final Log LOG = LogFactory.getLog(ExtractText.class);

    private static final String PASSWORD = "-password";
    private static final String ENCODING = "-encoding";
    private static final String CONSOLE = "-console";
    private static final String START_PAGE = "-startPage";
    private static final String END_PAGE = "-endPage";
    private static final String SORT = "-sort";
    private static final String IGNORE_BEADS = "-ignoreBeads";
    private static final String DEBUG = "-debug";
    private static final String HTML = "-html";
    private static final String ALWAYSNEXT = "-alwaysNext";
    private static final String ROTATION_MAGIC = "-rotationMagic";
    private static final String STD_ENCODING = "UTF-8";

    /*
     * debug flag
     */
    private boolean debug = false;

    /**
     * private constructor.
     */
    private ExtractText()
    {
        //static class
    }

    /**
     * Infamous main method.
     *
     * @param args Command line arguments, should be one and a reference to a file.
     *
     * @throws IOException if there is an error reading the document or extracting the text.
     */
    public static void main( String[] args ) throws IOException
    {
        // suppress the Dock icon on OS X
        System.setProperty("apple.awt.UIElement", "true");

        ExtractText extractor = new ExtractText();
        extractor.startExtraction(args);
    }

    /**
     * Starts the text extraction: parses the options, loads the document and writes
     * the text of the main document and of any embedded PDF files to the output.
     *
     * @param args the commandline arguments.
     * @throws IOException if there is an error reading the document or extracting the text.
     */
    public void startExtraction( String[] args ) throws IOException
    {
        boolean toConsole = false;
        boolean toHTML = false;
        boolean sort = false;
        boolean separateBeads = true;
        boolean alwaysNext = false;
        boolean rotationMagic = false;
        String password = "";
        String encoding = STD_ENCODING;
        String pdfFile = null;
        String outputFile = null;
        // Defaults to text files
        String ext = ".txt";
        int startPage = 1;
        int endPage = Integer.MAX_VALUE;
        for( int i=0; i<args.length; i++ )
        {
            if( args[i].equals( PASSWORD ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                password = args[i];
            }
            else if( args[i].equals( ENCODING ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                encoding = args[i];
            }
            else if( args[i].equals( START_PAGE ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                startPage = Integer.parseInt( args[i] );
            }
            else if( args[i].equals( HTML ) )
            {
                toHTML = true;
                ext = ".html";
            }
            else if( args[i].equals( SORT ) )
            {
                sort = true;
            }
            else if( args[i].equals( IGNORE_BEADS ) )
            {
                separateBeads = false;
            }
            else if (args[i].equals(ALWAYSNEXT))
            {
                alwaysNext = true;
            }
            else if (args[i].equals(ROTATION_MAGIC))
            {
                rotationMagic = true;
            }
            else if( args[i].equals( DEBUG ) )
            {
                debug = true;
            }
            else if( args[i].equals( END_PAGE ) )
            {
                i++;
                if( i >= args.length )
                {
                    usage();
                }
                endPage = Integer.parseInt( args[i] );
            }
            else if( args[i].equals( CONSOLE ) )
            {
                toConsole = true;
            }
            else
            {
                // first free argument is the input file, a later one the output file
                if( pdfFile == null )
                {
                    pdfFile = args[i];
                }
                else
                {
                    outputFile = args[i];
                }
            }
        }

        if( pdfFile == null )
        {
            usage();
        }
        else
        {
            Writer output = null;
            PDDocument document = null;
            try
            {
                long startTime = startProcessing("Loading PDF "+pdfFile);
                if( outputFile == null )
                {
                    outputFile = defaultOutputFile( pdfFile, ext );
                }
                document = PDDocument.load(new File( pdfFile ), password);

                AccessPermission ap = document.getCurrentAccessPermission();
                if( ! ap.canExtractContent() )
                {
                    throw new IOException( "You do not have permission to extract text" );
                }

                stopProcessing("Time for loading: ", startTime);

                if( toConsole )
                {
                    output = new OutputStreamWriter( System.out, encoding );
                }
                else
                {
                    if (toHTML && !STD_ENCODING.equals(encoding))
                    {
                        encoding = STD_ENCODING;
                        System.out.println("The encoding parameter is ignored when writing html output.");
                    }
                    output = openFileWriter( outputFile, encoding );
                }

                startTime = startProcessing("Starting text extraction");
                if (debug)
                {
                    System.err.println("Writing to " + outputFile);
                }

                PDFTextStripper stripper;
                if(toHTML)
                {
                    // HTML stripper can't work page by page because of startDocument() callback
                    stripper = new PDFText2HTML();
                    stripper.setSortByPosition(sort);
                    stripper.setShouldSeparateByBeads(separateBeads);
                    stripper.setStartPage(startPage);
                    stripper.setEndPage(endPage);

                    // Extract text for main document:
                    stripper.writeText(document, output);
                }
                else
                {
                    if (rotationMagic)
                    {
                        stripper = new FilteredTextStripper();
                    }
                    else
                    {
                        stripper = new PDFTextStripper();
                    }
                    stripper.setSortByPosition(sort);
                    stripper.setShouldSeparateByBeads(separateBeads);

                    // Extract text for main document:
                    extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
                                 stripper, document, output, rotationMagic, alwaysNext);
                }

                // ... also for any embedded PDFs:
                extractEmbeddedPdfs(document, stripper, output, toHTML, rotationMagic, alwaysNext);

                stopProcessing("Time for extraction: ", startTime);
            }
            finally
            {
                IOUtils.closeQuietly(output);
                IOUtils.closeQuietly(document);
            }
        }
    }

    /**
     * Derives the default output file from the input file: same directory and base name,
     * with the extension (if the name has one) replaced by the given one.
     *
     * @param pdfFile the input file as given on the command line.
     * @param ext the extension of the output file, including the dot.
     * @return the absolute path of the output file.
     */
    private static String defaultOutputFile(String pdfFile, String ext)
    {
        File input = new File(pdfFile);
        String name = input.getName();
        int dot = name.lastIndexOf('.');
        // dot > 0: a leading dot marks a hidden file, not an extension
        String baseName = dot > 0 ? name.substring(0, dot) : name;
        return new File(input.getParentFile(), baseName + ext).getAbsolutePath();
    }

    /**
     * Opens the output file for writing with the given encoding.
     *
     * @param outputFile path of the file to write.
     * @param encoding name of the character encoding.
     * @return a writer for the file.
     * @throws IOException if the file cannot be opened or the encoding is not supported.
     */
    private static Writer openFileWriter(String outputFile, String encoding) throws IOException
    {
        FileOutputStream stream = new FileOutputStream( outputFile );
        try
        {
            return new OutputStreamWriter( stream, encoding );
        }
        catch (IOException ex)
        {
            // e.g. unsupported encoding name: don't leak the already opened file
            IOUtils.closeQuietly(stream);
            throw ex;
        }
    }

    /**
     * Extracts the text of every embedded file of the document that is marked as
     * "application/pdf" and appends it to the output.
     */
    private void extractEmbeddedPdfs(PDDocument document, PDFTextStripper stripper, Writer output,
            boolean toHTML, boolean rotationMagic, boolean alwaysNext) throws IOException
    {
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDDocumentNameDictionary names = catalog.getNames();
        if (names == null)
        {
            return;
        }
        PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
        if (embeddedFiles == null)
        {
            return;
        }
        Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
        if (embeddedFileNames == null)
        {
            return;
        }
        for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
        {
            if (debug)
            {
                System.err.println("Processing embedded file " + ent.getKey() + ":");
            }
            PDComplexFileSpecification spec = ent.getValue();
            PDEmbeddedFile file = spec.getEmbeddedFile();
            if (file != null && "application/pdf".equals(file.getSubtype()))
            {
                if (debug)
                {
                    System.err.println("  is PDF (size=" + file.getSize() + ")");
                }
                InputStream fis = file.createInputStream();
                PDDocument subDoc = null;
                try
                {
                    subDoc = PDDocument.load(fis);
                    if (toHTML)
                    {
                        // will not really work because of HTML header + footer
                        stripper.writeText( subDoc, output );
                    }
                    else
                    {
                        extractPages(1, subDoc.getNumberOfPages(),
                                     stripper, subDoc, output, rotationMagic, alwaysNext);
                    }
                }
                finally
                {
                    // close the document first so that a failing stream close can't skip it
                    IOUtils.closeQuietly(subDoc);
                    fis.close();
                }
            }
        }
    }

    /**
     * Extracts the text page by page. With rotationMagic, the angles of all glyphs of a
     * page are collected first and the page is then extracted once per angle, with a
     * prepended transformation that rotates this angle to 0. The page is temporarily
     * modified for that and restored afterwards.
     *
     * @throws IOException if a page fails and alwaysNext is not set.
     */
    private void extractPages(int startPage, int endPage,
            PDFTextStripper stripper, PDDocument document, Writer output,
            boolean rotationMagic, boolean alwaysNext) throws IOException
    {
        for (int p = startPage; p <= endPage; ++p)
        {
            stripper.setStartPage(p);
            stripper.setEndPage(p);
            try
            {
                if (rotationMagic)
                {
                    PDPage page = document.getPage(p - 1);
                    int rotation = page.getRotation();
                    page.setRotation(0);
                    try
                    {
                        AngleCollector angleCollector = new AngleCollector();
                        angleCollector.setStartPage(p);
                        angleCollector.setEndPage(p);
                        angleCollector.writeText(document, new NullWriter());
                        // rotation magic
                        for (int angle : angleCollector.getAngles())
                        {
                            // prepend a transformation
                            // (we could skip these parts for angle 0, but it doesn't matter much)
                            PDPageContentStream cs = new PDPageContentStream(document, page,
                                    PDPageContentStream.AppendMode.PREPEND, false);
                            try
                            {
                                try
                                {
                                    cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
                                }
                                finally
                                {
                                    cs.close();
                                }
                                stripper.writeText(document, output);
                            }
                            finally
                            {
                                // remove prepended transformation, also when the extraction failed
                                ((COSArray) page.getCOSObject().getItem(COSName.CONTENTS)).remove(0);
                            }
                        }
                    }
                    finally
                    {
                        // restore the original rotation, also when the extraction failed
                        page.setRotation(rotation);
                    }
                }
                else
                {
                    stripper.writeText(document, output);
                }
            }
            catch (IOException ex)
            {
                if (!alwaysNext)
                {
                    throw ex;
                }
                LOG.error("Failed to process page " + p, ex);
            }
        }
    }

    /**
     * Prints the message in debug mode and returns the current time as start time.
     */
    private long startProcessing(String message)
    {
        if (debug)
        {
            System.err.println(message);
        }
        return System.currentTimeMillis();
    }

    /**
     * In debug mode, prints the message followed by the seconds elapsed since startTime.
     */
    private void stopProcessing(String message, long startTime)
    {
        if (debug)
        {
            long stopTime = System.currentTimeMillis();
            float elapsedTime = ((float)(stopTime - startTime))/1000;
            System.err.println(message + elapsedTime + " seconds");
        }
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage()
    {
        String message = "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] <inputfile> [output-text-file]\n"
            + "\nOptions:\n"
            + " -password <password> : Password to decrypt document\n"
            + " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE,\n"
            + " UTF-16LE, etc.\n"
            + " -console : Send text to console instead of file\n"
            + " -html : Output in HTML format instead of raw text\n"
            + " -sort : Sort the text before writing\n"
            + " -ignoreBeads : Disables the separation by beads\n"
            + " -debug : Enables debug output about the time consumption\n"
            + " of every stage\n"
            + " -alwaysNext : Process next page (if applicable) despite\n"
            + " IOException (ignored when -html)\n"
            + " -rotationMagic : Analyze each page for rotated/skewed text,\n"
            + " rotate to 0° and extract separately\n"
            + " (slower, and ignored when -html)\n"
            + " -startPage <number> : The first page to start extraction (1 based)\n"
            + " -endPage <number> : The last page to extract (1 based, inclusive)\n"
            + " <inputfile> : The PDF document to use\n"
            + " [output-text-file] : The file to write the text to";

        System.err.println(message);
        System.exit( 1 );
    }
}
/**
* Collect all angles while doing text extraction. Angles are in degrees and rounded to the closest
* integer (to avoid slight differences from floating point arithmethic resulting in similarly
* angled glyphs being treated separately). This class must be constructed for each page so that the
* angle set is initialized.
*/
class AngleCollector extends PDFTextStripper
{
private final Set<Integer> angles = new TreeSet<Integer>();
AngleCollector() throws IOException
{
}
Set<Integer> getAngles()
{
return angles;
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
angle = (angle + 360) % 360;
angles.add(angle);
}
}
/**
* TextStripper that only processes glyphs that have angle 0.
*/
class FilteredTextStripper extends PDFTextStripper
{
FilteredTextStripper() throws IOException
{
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
if (angle == 0)
{
super.processTextPosition(text);
}
}
}
/**
 * Dummy output: a writer that discards everything written to it.
 */
class NullWriter extends Writer
{
    /**
     * Discards the given characters.
     */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException
    {
        // do nothing
    }

    /**
     * Nothing is buffered, so there is nothing to flush.
     */
    @Override
    public void flush() throws IOException
    {
        // do nothing
    }

    /**
     * Holds no resources, so there is nothing to close.
     */
    @Override
    public void close() throws IOException
    {
        // do nothing
    }
}
I am using PDFBox to do a simple extraction of words from a PDF file and then insert those words into a database table. From what I have tested, text that is rotated 90 degrees clockwise in the PDF gives gibberish results when I try to extract the words.
For example, database in the file will yield atabase and also database itself as two different words. Obviously, atabase does not exist in the PDF file.
I tried converting the original file to be rotated upright and do the extraction and it works perfectly as expected. I understand this could be a limitation of the PDFBox itself.
So, in the case of someone trying to index a rotated PDF file, is there a way to tackle this?
Code snippet ( just for reference) :
String lines[] = text.split("\\r?\\n");
for (String line : lines) {
String[] words = line.split(" ");
System.out.println("Line: " + line);
preparedStatement = con1.prepareStatement(sql);
int i=0;
for (String word : words) {
// check if one or more special characters at end of string then remove OR
// check special characters in beginning of the string then remove
// insert every word directly to table db
word = word.replaceAll("([\\W]+$)|(^[\\W]+)", "");
preparedStatement.setString(1, path1);
preparedStatement.setString(2, word);
System.out.println("Token: " +word);
preparedStatement.executeUpdate();
}
}
preparedStatement.close();
}
This is the PDFBox ExtractText command line utility, which can detect rotations since 2.0.13 (PDFBOX-4371). (That release had a bug with type 3 fonts, which was fixed (PDFBOX-4390) in the repository and in this code, and is in 2.0.14). Later code may have been improved since then. The current 2.0.* source can be found here.
To extract text from rotated files, use the "rotationMagic" setting. This setting first detects the angle of every glyph, collects these angles (AngleCollector), and in a second pass it does an extraction for every angle while discarding the rest (FilteredTextStripper). The order of extraction is by angle, which may or may not make sense if there are several different angles in a page.
The PDF is modified while extracting, so don't use this on documents you are saving.
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.tools;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.IOUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
import org.apache.pdfbox.pdmodel.PDEmbeddedFilesNameTreeNode;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification;
import org.apache.pdfbox.pdmodel.common.filespecification.PDEmbeddedFile;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
/**
* This is the main program that simply parses the pdf document and transforms it
* into text.
*
* #author Ben Litchfield
* #author Tilman Hausherr
*/
public final class ExtractText
{
private static final Log LOG = LogFactory.getLog(ExtractText.class);
private static final String PASSWORD = "-password";
private static final String ENCODING = "-encoding";
private static final String CONSOLE = "-console";
private static final String START_PAGE = "-startPage";
private static final String END_PAGE = "-endPage";
private static final String SORT = "-sort";
private static final String IGNORE_BEADS = "-ignoreBeads";
private static final String DEBUG = "-debug";
private static final String HTML = "-html";
private static final String ALWAYSNEXT = "-alwaysNext";
private static final String ROTATION_MAGIC = "-rotationMagic";
private static final String STD_ENCODING = "UTF-8";
/*
* debug flag
*/
private boolean debug = false;
/**
* private constructor.
*/
private ExtractText()
{
//static class
}
/**
* Infamous main method.
*
* #param args Command line arguments, should be one and a reference to a file.
*
* #throws IOException if there is an error reading the document or extracting the text.
*/
public static void main( String[] args ) throws IOException
{
// suppress the Dock icon on OS X
System.setProperty("apple.awt.UIElement", "true");
ExtractText extractor = new ExtractText();
extractor.startExtraction(args);
}
/**
* Starts the text extraction.
*
* #param args the commandline arguments.
* #throws IOException if there is an error reading the document or extracting the text.
*/
public void startExtraction( String[] args ) throws IOException
{
boolean toConsole = false;
boolean toHTML = false;
boolean sort = false;
boolean separateBeads = true;
boolean alwaysNext = false;
boolean rotationMagic = false;
String password = "";
String encoding = STD_ENCODING;
String pdfFile = null;
String outputFile = null;
// Defaults to text files
String ext = ".txt";
int startPage = 1;
int endPage = Integer.MAX_VALUE;
for( int i=0; i<args.length; i++ )
{
if( args[i].equals( PASSWORD ) )
{
i++;
if( i >= args.length )
{
usage();
}
password = args[i];
}
else if( args[i].equals( ENCODING ) )
{
i++;
if( i >= args.length )
{
usage();
}
encoding = args[i];
}
else if( args[i].equals( START_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
startPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( HTML ) )
{
toHTML = true;
ext = ".html";
}
else if( args[i].equals( SORT ) )
{
sort = true;
}
else if( args[i].equals( IGNORE_BEADS ) )
{
separateBeads = false;
}
else if (args[i].equals(ALWAYSNEXT))
{
alwaysNext = true;
}
else if (args[i].equals(ROTATION_MAGIC))
{
rotationMagic = true;
}
else if( args[i].equals( DEBUG ) )
{
debug = true;
}
else if( args[i].equals( END_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
endPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( CONSOLE ) )
{
toConsole = true;
}
else
{
if( pdfFile == null )
{
pdfFile = args[i];
}
else
{
outputFile = args[i];
}
}
}
if( pdfFile == null )
{
usage();
}
else
{
Writer output = null;
PDDocument document = null;
try
{
long startTime = startProcessing("Loading PDF "+pdfFile);
if( outputFile == null && pdfFile.length() >4 )
{
outputFile = new File( pdfFile.substring( 0, pdfFile.length() -4 ) + ext ).getAbsolutePath();
}
document = PDDocument.load(new File( pdfFile ), password);
AccessPermission ap = document.getCurrentAccessPermission();
if( ! ap.canExtractContent() )
{
throw new IOException( "You do not have permission to extract text" );
}
stopProcessing("Time for loading: ", startTime);
if( toConsole )
{
output = new OutputStreamWriter( System.out, encoding );
}
else
{
if (toHTML && !STD_ENCODING.equals(encoding))
{
encoding = STD_ENCODING;
System.out.println("The encoding parameter is ignored when writing html output.");
}
output = new OutputStreamWriter( new FileOutputStream( outputFile ), encoding );
}
startTime = startProcessing("Starting text extraction");
if (debug)
{
System.err.println("Writing to " + outputFile);
}
PDFTextStripper stripper;
if(toHTML)
{
// HTML stripper can't work page by page because of startDocument() callback
stripper = new PDFText2HTML();
stripper.setSortByPosition(sort);
stripper.setShouldSeparateByBeads(separateBeads);
stripper.setStartPage(startPage);
stripper.setEndPage(endPage);
// Extract text for main document:
stripper.writeText(document, output);
}
else
{
if (rotationMagic)
{
stripper = new FilteredTextStripper();
}
else
{
stripper = new PDFTextStripper();
}
stripper.setSortByPosition(sort);
stripper.setShouldSeparateByBeads(separateBeads);
// Extract text for main document:
extractPages(startPage, Math.min(endPage, document.getNumberOfPages()),
stripper, document, output, rotationMagic, alwaysNext);
}
// ... also for any embedded PDFs:
PDDocumentCatalog catalog = document.getDocumentCatalog();
PDDocumentNameDictionary names = catalog.getNames();
if (names != null)
{
PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
if (embeddedFiles != null)
{
Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
if (embeddedFileNames != null)
{
for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet())
{
if (debug)
{
System.err.println("Processing embedded file " + ent.getKey() + ":");
}
PDComplexFileSpecification spec = ent.getValue();
PDEmbeddedFile file = spec.getEmbeddedFile();
if (file != null && "application/pdf".equals(file.getSubtype()))
{
if (debug)
{
System.err.println(" is PDF (size=" + file.getSize() + ")");
}
InputStream fis = file.createInputStream();
PDDocument subDoc = null;
try
{
subDoc = PDDocument.load(fis);
if (toHTML)
{
// will not really work because of HTML header + footer
stripper.writeText( subDoc, output );
}
else
{
extractPages(1, subDoc.getNumberOfPages(),
stripper, subDoc, output, rotationMagic, alwaysNext);
}
}
finally
{
fis.close();
IOUtils.closeQuietly(subDoc);
}
}
}
}
}
}
stopProcessing("Time for extraction: ", startTime);
}
finally
{
IOUtils.closeQuietly(output);
IOUtils.closeQuietly(document);
}
}
}
/**
 * Writes the text of pages {@code startPage} to {@code endPage} (both inclusive, 1-based)
 * to {@code output}, configuring the stripper for exactly one page at a time.
 *
 * <p>With {@code rotationMagic} the page rotation is temporarily set to 0, the text angles
 * found on the page are collected with an {@code AngleCollector}, and for every collected
 * angle a rotation by the negative angle is prepended to the page content before the page is
 * written with {@code stripper}; the prepended content is removed again afterwards and the
 * original rotation is put back once all angles are done.
 *
 * @param startPage first page to extract
 * @param endPage last page to extract
 * @param stripper the stripper used to write the text
 * @param document the document the pages belong to
 * @param output destination of the extracted text
 * @param rotationMagic whether to extract once per detected text angle
 * @param alwaysNext if true, an IOException on a page is logged and the next page is processed
 * @throws IOException if a page fails and {@code alwaysNext} is false
 */
private void extractPages(int startPage, int endPage,
        PDFTextStripper stripper, PDDocument document, Writer output,
        boolean rotationMagic, boolean alwaysNext) throws IOException
{
    int pageNumber = startPage;
    while (pageNumber <= endPage)
    {
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);
        try
        {
            if (!rotationMagic)
            {
                stripper.writeText(document, output);
            }
            else
            {
                final PDPage currentPage = document.getPage(pageNumber - 1);
                final int originalRotation = currentPage.getRotation();
                currentPage.setRotation(0);

                // first pass: only find out which text angles occur on this page
                final AngleCollector collector = new AngleCollector();
                collector.setStartPage(pageNumber);
                collector.setEndPage(pageNumber);
                collector.writeText(document, new NullWriter());

                // second pass: one extraction per angle
                // (angle 0 could be skipped, but it doesn't matter much)
                for (int degrees : collector.getAngles())
                {
                    // prepend a transformation that rotates by the negative angle
                    final PDPageContentStream prefix = new PDPageContentStream(document, currentPage,
                            PDPageContentStream.AppendMode.PREPEND, false);
                    prefix.transform(Matrix.getRotateInstance(-Math.toRadians(degrees), 0, 0));
                    prefix.close();

                    stripper.writeText(document, output);

                    // drop the first entry of the page's contents array, i.e. the prepended stream
                    final COSArray pageContents =
                            (COSArray) currentPage.getCOSObject().getItem(COSName.CONTENTS);
                    pageContents.remove(0);
                }
                currentPage.setRotation(originalRotation);
            }
        }
        catch (IOException failure)
        {
            if (alwaysNext)
            {
                LOG.error("Failed to process page " + pageNumber, failure);
            }
            else
            {
                throw failure;
            }
        }
        ++pageNumber;
    }
}
/**
 * Marks the beginning of a timed stage.
 *
 * @param message text printed to System.err when the debug flag is set
 * @return the value of System.currentTimeMillis(), read after the optional message was printed
 */
private long startProcessing(String message)
{
    final boolean verbose = debug;
    if (verbose)
    {
        System.err.println(message);
    }
    final long startedAt = System.currentTimeMillis();
    return startedAt;
}
/**
 * Marks the end of a timed stage; does nothing unless the debug flag is set.
 *
 * @param message prefix of the line printed to System.err
 * @param startTime start of the stage in milliseconds, as returned by startProcessing
 */
private void stopProcessing(String message, long startTime)
{
    if (!debug)
    {
        return;
    }
    final long elapsedMillis = System.currentTimeMillis() - startTime;
    // float division, so fractions of a second are kept
    final float elapsedSeconds = ((float) elapsedMillis) / 1000;
    System.err.println(message + elapsedSeconds + " seconds");
}
/**
 * Prints the command line help to System.err and terminates the JVM with exit status 1.
 */
private static void usage()
{
    // the pieces are concatenated as they are; each one carries its own line break
    final String[] pieces =
    {
        "Usage: java -jar pdfbox-app-x.y.z.jar ExtractText [options] <inputfile> [output-text-file]\n",
        "\nOptions:\n",
        " -password <password> : Password to decrypt document\n",
        " -encoding <output encoding> : UTF-8 (default) or ISO-8859-1, UTF-16BE,\n",
        " UTF-16LE, etc.\n",
        " -console : Send text to console instead of file\n",
        " -html : Output in HTML format instead of raw text\n",
        " -sort : Sort the text before writing\n",
        " -ignoreBeads : Disables the separation by beads\n",
        " -debug : Enables debug output about the time consumption\n",
        " of every stage\n",
        " -alwaysNext : Process next page (if applicable) despite\n",
        " IOException (ignored when -html)\n",
        " -rotationMagic : Analyze each page for rotated/skewed text,\n",
        " rotate to 0° and extract separately\n",
        " (slower, and ignored when -html)\n",
        " -startPage <number> : The first page to start extraction (1 based)\n",
        " -endPage <number> : The last page to extract (1 based, inclusive)\n",
        " <inputfile> : The PDF document to use\n",
        " [output-text-file] : The file to write the text to"
    };
    final StringBuilder help = new StringBuilder();
    for (String piece : pieces)
    {
        help.append(piece);
    }
    System.err.println(help.toString());
    System.exit( 1 );
}
}
/**
* Collect all angles while doing text extraction. Angles are in degrees and rounded to the closest
* integer (to avoid slight differences from floating point arithmethic resulting in similarly
* angled glyphs being treated separately). This class must be constructed for each page so that the
* angle set is initialized.
*/
class AngleCollector extends PDFTextStripper
{
private final Set<Integer> angles = new TreeSet<Integer>();
AngleCollector() throws IOException
{
}
Set<Integer> getAngles()
{
return angles;
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
angle = (angle + 360) % 360;
angles.add(angle);
}
}
/**
* TextStripper that only processes glyphs that have angle 0.
*/
class FilteredTextStripper extends PDFTextStripper
{
FilteredTextStripper() throws IOException
{
}
#Override
protected void processTextPosition(TextPosition text)
{
Matrix m = text.getTextMatrix();
m.concatenate(text.getFont().getFontMatrix());
int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
if (angle == 0)
{
super.processTextPosition(text);
}
}
}
/**
 * Dummy output: a Writer that discards everything written to it.
 */
class NullWriter extends Writer
{
    /** Discards the given characters. */
    @Override
    public void write(char[] cbuf, int off, int len) throws IOException
    {
        // do nothing
    }

    /** Nothing is buffered, so there is nothing to flush. */
    @Override
    public void flush() throws IOException
    {
        // do nothing
    }

    /** Holds no resources; closing has no effect and may be repeated. */
    @Override
    public void close() throws IOException
    {
        // do nothing
    }
}
Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 5 years ago.
Improve this question
I have a java program that creates multiple PNG files (screenshots). I now need to find a way to create an animated GIF from these files and have no idea where to start.
How to convert multiple PNG files to an animated GIF?
Adapted from an old thread on Sun forums. Batteries not included, all warranties null & void.
import java.awt.image.BufferedImage;
import java.io.File;
import org.w3c.dom.Node;
import javax.imageio.*;
import javax.imageio.metadata.*;
import javax.imageio.stream.ImageOutputStream;
/**
* Creates an animated GIF from GIF frames. A thin wrapper to code written by
* other people, as documented on the thread on the Sun forums 'Create animated
* GIF using imageio' http://forums.sun.com/thread.jspa?threadID=5395006 See the
* printUsage() method for details on paramaters required.
*
* #author Andrew Thompson
*/
class WriteAnimatedGif {
/**
* See http://forums.sun.com/thread.jspa?messageID=10755673#10755673
*
* #author Maxideon
* #param delayTime String Frame delay for this frame.
*/
public static void configure(IIOMetadata meta,
String delayTime,
int imageIndex) {
String metaFormat = meta.getNativeMetadataFormatName();
if (!"javax_imageio_gif_image_1.0".equals(metaFormat)) {
throw new IllegalArgumentException(
"Unfamiliar gif metadata format: " + metaFormat);
}
Node root = meta.getAsTree(metaFormat);
//find the GraphicControlExtension node
Node child = root.getFirstChild();
while (child != null) {
if ("GraphicControlExtension".equals(child.getNodeName())) {
break;
}
child = child.getNextSibling();
}
IIOMetadataNode gce = (IIOMetadataNode) child;
gce.setAttribute("userDelay", "FALSE");
gce.setAttribute("delayTime", delayTime);
//only the first node needs the ApplicationExtensions node
if (imageIndex == 0) {
IIOMetadataNode aes
= new IIOMetadataNode("ApplicationExtensions");
IIOMetadataNode ae
= new IIOMetadataNode("ApplicationExtension");
ae.setAttribute("applicationID", "NETSCAPE");
ae.setAttribute("authenticationCode", "2.0");
byte[] uo = new byte[]{
//last two bytes is an unsigned short (little endian) that
//indicates the the number of times to loop.
//0 means loop forever.
0x1, 0x0, 0x0
};
ae.setUserObject(uo);
aes.appendChild(ae);
root.appendChild(aes);
}
try {
meta.setFromTree(metaFormat, root);
} catch (IIOInvalidTreeException e) {
//shouldn't happen
throw new Error(e);
}
}
/**
* See http://forums.sun.com/thread.jspa?messageID=9988198
*
* #author GeoffTitmus
* #param file File A File in which to store the animation.
* #param frames BufferedImage[] Array of BufferedImages, the frames of the
* animation.
* #param delayTimes String[] Array of Strings, representing the frame delay
* times.
*/
public static void saveAnimate(
File file,
BufferedImage[] frames,
String[] delayTimes) throws Exception {
ImageWriter iw = ImageIO.getImageWritersByFormatName("gif").next();
ImageOutputStream ios = ImageIO.createImageOutputStream(file);
iw.setOutput(ios);
iw.prepareWriteSequence(null);
for (int i = 0; i < frames.length; i++) {
BufferedImage src = frames[i];
ImageWriteParam iwp = iw.getDefaultWriteParam();
IIOMetadata metadata = iw.getDefaultImageMetadata(
new ImageTypeSpecifier(src), iwp);
configure(metadata, delayTimes[i], i);
IIOImage ii = new IIOImage(src, null, metadata);
iw.writeToSequence(ii, null);
}
iw.endWriteSequence();
ios.close();
}
/**
* Dump the usage to the System.err stream.
*/
public static void printUsage() {
StringBuffer sb = new StringBuffer();
String eol = System.getProperty("line.separator");
sb.append("Usage: 2 forms each using 3 arguments");
sb.append(eol);
sb.append("1) output (animated GIF) file name");
sb.append(eol);
sb.append("2) input files (animation frames), separated by ','");
sb.append(eol);
sb.append("3) single frame rate, or comma separared list of frame rates");
sb.append(eol);
sb.append("java WriteAnimatedGif animate.gif frm1.gif,frm2.gif,..,frmN.gif 100");
sb.append(eol);
sb.append("java WriteAnimatedGif animate.gif frm1.gif,frm2.gif,..,frmN.gif 100,40,..,N");
sb.append(eol);
sb.append("The 2nd form must have exactly as many integers as there are frames.");
sb.append(eol);
sb.append("Frame rates are specified in increments of 1/100th second, NOT milliseconds.");
sb.append(eol);
System.err.print(sb);
}
/**
* Checks that a String intended as a delayTime is an integer>0. If not,
* dumps a warning message and the usage, then exits. If successful, returns
* the String unaltered.
*/
public static String checkDelay(String delay) {
try {
int val = Integer.parseInt(delay);
if (val < 1) {
System.err.println(
"Animation frame delay '"
+ val
+ "' is < 1!");
printUsage();
System.exit(1);
}
} catch (NumberFormatException nfe) {
System.err.println(
"Could not parse '"
+ delay
+ "' as an integer.");
printUsage();
System.exit(1);
}
return delay;
}
/**
* Parse the arguments and if successful, attempt to write the animated GIF.
*/
public static void main(String[] args) throws Exception {
if (args.length != 3) {
printUsage();
System.exit(1);
}
// deal with the output file name
File f = new File(args[0]);
// deal with the input file names
String[] names = args[1].split(",");
if (names.length < 2) {
System.err.println("An animation requires 2 or more frames!");
printUsage();
System.exit(1);
}
BufferedImage[] frames = new BufferedImage[names.length];
for (int ii = 0; ii < names.length; ii++) {
frames[ii] = ImageIO.read(new File(names[ii]));
}
// deal with the frame rates
String[] delays = args[2].split(",");
// note: length of names, not delays
String[] delayTimes = new String[names.length];
if (delays.length != names.length) {
System.err.println(delays.length
+ " delays specified for "
+ names.length
+ " frames!");
printUsage();
System.exit(1);
} else if (delays.length == 1) {
for (int ii = 0; ii < delayTimes.length; ii++) {
// fill all values with the single delayTime
delayTimes[ii] = checkDelay(delays[0]);
}
} else {
for (int ii = 0; ii < delayTimes.length; ii++) {
delayTimes[ii] = checkDelay(delays[ii]);
}
}
// save an animated GIF
saveAnimate(f, frames, delayTimes);
}
}
Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 8 years ago.
Improve this question
I'm currently learning programming (java) and I've been using a program that runs on command prompt but as an exercise, I've been asked to run it on netbeans and to find out how it can work on it. When I run the program this is the output that I got.
run:
Please specify a path to a pricing catalogue file
Java Result: 3
BUILD SUCCESSFUL (total time: 0 seconds)
I've spent several hours on this issue but have not been able to sort it out. Can anybody help me with it, please?
Here is the code.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * Command line grocery till: loads a pricing catalogue from the file named by the single
 * command line argument, reads "quantity item" pairs from standard input until the user
 * types CHECK OUT, rates every item with the catalogue and prints an itemized bill.
 */
public class Grocery {
    /** Input line that ends item entry. */
    private static final String CHECK_OUT_COMMAND = "CHECK OUT";

    private static Catalogue pricingCatalogue;
    /** Requested quantity per item name, as entered by the user. */
    private static final HashMap<String, Double> inputItemList = new HashMap<String, Double>();
    private static ArrayList<PurchasedProduct> purchasedItemList;

    /**
     * @param args exactly one element: the path of the pricing catalogue file
     */
    public static void main(String[] args) {
        if( args.length != 1 )
        {
            // this message had been reduced to an empty string
            System.err.println( "Please specify a path to a pricing catalogue file" );
            System.exit( 3 );
        }
        generateCatalogue( args[0] );
        gatherUserInput();
        rateItem();
        printItemizedBill();
    }

    /**
     * Builds the pricing catalogue from a file with one rate per line, fields separated by '|':
     * product name, rate name, rate description, effective quantity and then either the
     * effective price or, when the quantity is -1, a tier definition. Blank lines are ignored.
     * Exits with status 3 when the file cannot be read.
     *
     * @param inputFile path of the catalogue file
     */
    private static void generateCatalogue( String inputFile ) {
        List<String> lines;
        try {
            lines = Files.readAllLines(
                    Paths.get(inputFile),
                    Charset.defaultCharset() );
        }
        catch (IOException e) {
            // carrying on without a catalogue only postponed the failure to a NullPointerException
            System.err.println( "Could not read price catalogue [" + inputFile + "]: " + e );
            System.exit( 3 );
            return;
        }

        pricingCatalogue = new Catalogue();
        for (String line : lines) {
            if( line.trim().isEmpty() )
            {
                // e.g. a trailing empty line; it has no fields to read
                continue;
            }
            StringTokenizer token = new StringTokenizer( line, "|" );
            String productName = token.nextToken();
            String rateName = token.nextToken();
            String rateDescr = token.nextToken();
            double effectiveQuantity = Double.parseDouble( token.nextToken() );
            Rate newRate;
            if( effectiveQuantity == -1 )
            {
                // quantity -1 marks a tiered rate; the next field is the tier definition
                newRate = new Rate( rateName, rateDescr, token.nextToken() );
            }
            else
            {
                double effectivePrice = Double.parseDouble( token.nextToken() );
                newRate = new Rate( rateName, rateDescr, effectiveQuantity, effectivePrice );
            }

            if( pricingCatalogue.productIsInCatalogue( productName, false ) )
            {
                pricingCatalogue.addRateToExistingProduct( productName, newRate );
            }
            else
            {
                Product newProduct = new Product( productName );
                newProduct.addRate( newRate );
                pricingCatalogue.addProduct( newProduct );
            }
        } // end reading input file
        //pricingCatalogue.printCatalogue();
        System.out.println( "Price catalogue loaded sucessfully from [" + inputFile + "]\n" );
    }

    /**
     * Reads "quantity item" pairs from standard input into inputItemList, summing the
     * quantities of items entered more than once. Entry ends with the line CHECK OUT, at end
     * of input, or when reading fails.
     */
    private static void gatherUserInput()
    {
        BufferedReader br = new BufferedReader(new InputStreamReader( System.in ) );
        inputItemList.clear();
        System.out.println( "Please enter an item with quantity in a format like '2 apple'" );
        System.out.println( "When you are done entering item(s), type 'CHECK OUT' to get an itemized bill" );
        while( true )
        {
            System.out.print( ">> ");
            String inStr;
            try {
                inStr = br.readLine();
            }catch( IOException ioe ) {
                System.err.println("Failed to read line item");
                // do not loop on a broken stream; bill what has been entered so far
                break;
            }
            if( inStr == null || inStr.equals( CHECK_OUT_COMMAND ) )
            {
                // null means end of input (e.g. redirected stdin), treated like a check out
                break;
            }

            // a line may hold several "quantity item" pairs
            StringTokenizer itemTok = new StringTokenizer( inStr );
            while( itemTok.hasMoreTokens() )
            {
                String quantityText = itemTok.nextToken();
                double itemQuantity;
                try
                {
                    itemQuantity = Double.parseDouble( quantityText );
                }
                catch( NumberFormatException nfe )
                {
                    System.err.println( "[" + quantityText + "] is not something I recognize. Try something like '2 apple'" );
                    break;
                }
                if( !( itemQuantity > 0 ) || Double.isInfinite( itemQuantity ) )
                {
                    // zero, negative, NaN and infinite quantities cannot be rated
                    System.err.println( "[" + quantityText + "] is not a usable quantity. Try something like '2 apple'" );
                    break;
                }
                if( !itemTok.hasMoreTokens() )
                {
                    // a quantity without an item name
                    System.err.println( "Oops I did not understand that. Try something like '2 apple'" );
                    break;
                }
                String itemName = itemTok.nextToken();
                //System.out.println( "--- ITEM [" + itemName + "] QUANTITY [" + itemQuantity + "]" );
                if( !pricingCatalogue.productIsInCatalogue( itemName, false ) )
                {
                    System.err.println( "Item [" + itemName + "] does not exist in the catalogue" );
                    continue;
                }
                Double alreadyEntered = inputItemList.get( itemName );
                double newTotal = ( alreadyEntered == null ) ? itemQuantity : alreadyEntered + itemQuantity;
                inputItemList.put( itemName, newTotal );
            }
        }
        //System.out.println( "inputItemList [" + inputItemList + "]" );
    }

    /**
     * Turns every entry of inputItemList into one or more purchased products: the best rate
     * for the remaining quantity is applied repeatedly until the quantity is used up or no
     * rate fits the remainder.
     */
    private static void rateItem()
    {
        purchasedItemList = new ArrayList<PurchasedProduct>();
        for( Map.Entry<String, Double> entry : inputItemList.entrySet() )
        {
            String prodName = entry.getKey();
            double remaining = entry.getValue();
            Product aProduct = pricingCatalogue.getProduct( prodName );
            if( aProduct == null )
            {
                System.err.println( "Item [" + prodName + "] does not exist in the catalogue" );
                continue;
            }
            /*
             * Keep finding the best rate for the same product until we
             * have filled the quantity
             */
            while( remaining > 0 )
            {
                Rate bestRate = aProduct.getBestRate( remaining );
                if( bestRate == null )
                {
                    // every rate needs more than what is left; this used to be a NullPointerException
                    System.err.println( "No rate of [" + prodName + "] applies to a quantity of " + remaining );
                    break;
                }
                double purchasedQuantity = bestRate.getEffectiveQuantity();
                if( !( purchasedQuantity > 0 ) )
                {
                    // a rate that consumes nothing would keep this loop running forever
                    System.err.println( "Rate [" + bestRate.getRateDescr() + "] of [" + prodName + "] has no usable quantity" );
                    break;
                }
                purchasedItemList.add( new PurchasedProduct( prodName, purchasedQuantity, bestRate ) );
                remaining = remaining - purchasedQuantity;
            }
        }
    }

    /**
     * Prints one line per purchased product followed by the total amount due.
     */
    private static void printItemizedBill()
    {
        double totalDue = 0;
        System.out.println( "\nHere is your invoice:" );
        for( PurchasedProduct pp : purchasedItemList )
        {
            double lineTotal = pp.getPurhcasedCost();
            System.out.format( "%10s%20s%10.2f\n", pp.getPurchasedProductName(), pp.getPurchasedRateDescr(), lineTotal );
            totalDue += lineTotal;
        }
        System.out.format( "\n%10s%20s$%9.2f\n", "TOTAL DUE", "", totalDue );
    }
}
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
/**
 * The pricing catalogue: a list of products, looked up by product name.
 */
public class Catalogue
{
    private final ArrayList<Product> productList;

    Catalogue()
    {
        this.productList = new ArrayList<Product>();
    }

    /*
     * Returns true if a product with the given name exists in the catalogue. Use the second
     * argument to set whether the comparison should ignore case; null counts as false.
     */
    public boolean productIsInCatalogue( String inProdName, Boolean compareIgnoreCase )
    {
        // Boolean.TRUE.equals() also copes with a null flag, which used to fail on unboxing
        boolean ignoreCase = Boolean.TRUE.equals( compareIgnoreCase );
        String wanted = normalize( inProdName, ignoreCase );
        for( Product product : this.productList )
        {
            String candidate = normalize( product.getProductName(), ignoreCase );
            if( wanted == null ? candidate == null : wanted.equals( candidate ) )
            {
                return true;
            }
        }
        return false;
    }

    /*
     * Upper-cases the name for case-insensitive comparison. Locale.ROOT keeps the result
     * independent of the default locale of the machine.
     */
    private static String normalize( String name, boolean ignoreCase )
    {
        if( name == null || !ignoreCase )
        {
            return name;
        }
        return name.toUpperCase( java.util.Locale.ROOT );
    }

    /*
     * Appends a product; no check for an already existing product of the same name is made.
     */
    public void addProduct( Product inProduct )
    {
        this.productList.add( inProduct );
    }

    /*
     * Adds the rate to the first product whose name equals inExistingProdName (case-sensitive).
     * Does nothing if there is no such product.
     */
    public void addRateToExistingProduct( String inExistingProdName, Rate inRate )
    {
        Product target = this.getProduct( inExistingProdName );
        if( target != null )
        {
            target.addRate( inRate );
        }
    }

    /*
     * Returns the first product whose name equals inExistingProdName (case-sensitive),
     * or null if there is none.
     */
    public Product getProduct( String inExistingProdName )
    {
        for( Product aProduct : this.productList )
        {
            String name = aProduct.getProductName();
            if( name != null && name.equals( inExistingProdName ) )
            {
                return aProduct;
            }
        }
        return null;
    }

    /*
     * DEBUG: prints every product of the catalogue.
     */
    public void printCatalogue()
    {
        for( Product aProduct : this.productList )
        {
            aProduct.printProduct();
        }
    }
}
import java.util.HashSet;
import java.util.Iterator;
/**
 * A product of the catalogue: a name plus the set of rates that can be used to price it.
 */
public class Product
{
    private final String productName;
    private final HashSet <Rate> productRate;

    Product()
    {
        this( "N/A" );
    }

    Product( String inProductName )
    {
        this.productName = inProductName;
        this.productRate = new HashSet <Rate>();
    }

    /*
     * Add a rate to this product
     */
    public void addRate( Rate inRate )
    {
        this.productRate.add( inRate );
    }

    /*
     * DEBUG: prints the product name and all of its rates
     */
    public void printProduct()
    {
        System.out.println( "*** PRODUCT NAME [" + this.productName + "] ***\n" );
        if( this.productRate.isEmpty() )
        {
            System.out.println( "This product does not have rates defined");
        }
        else
        {
            for( Rate aRate : this.productRate )
            {
                aRate.printRate();
            }
        }
        System.out.println( "" );
    }

    public String getProductName()
    {
        return this.productName;
    }

    /*
     * Amongst the rates whose effective quantity does not exceed inQuantity, returns the
     * rate with the lowest cost per unit. Returns null when no rate is applicable, so
     * callers have to check the result.
     */
    public Rate getBestRate( Double inQuantity )
    {
        Rate lowestRate = null;
        double lowestCost = 0;
        for( Rate aRate : this.productRate )
        {
            if( inQuantity >= aRate.getEffectiveQuantity() )
            {
                double cost = aRate.getCostPerUnit();
                // the first applicable rate is taken as is; later ones only when strictly cheaper
                if( null == lowestRate || cost < lowestCost )
                {
                    lowestRate = aRate;
                    lowestCost = cost;
                }
            }
        }
        return lowestRate;
    }
}
import java.math.BigDecimal;
import java.math.RoundingMode;
/**
 * One line of the bill: a product, the quantity bought and the rate it was bought at.
 * The cost of the line is computed once, at construction, and rounded to two decimals.
 */
public class PurchasedProduct
{
    private final String purchasedProdName;
    private final double purchasedQuantity;
    private final Rate purchasedRate;
    private final double purchasedCost;

    PurchasedProduct( String inProdName, double inQuantity, Rate inRate )
    {
        //this.purchasedProdName = Character.toUpperCase( inProdName.charAt(0) ) + inProdName.substring( 1 );
        this.purchasedProdName = inProdName;
        this.purchasedQuantity = inQuantity;
        this.purchasedRate = inRate;
        this.purchasedCost = this.getCost();
    }

    public String getPurchasedProductName()
    {
        return this.purchasedProdName;
    }

    public String getPurchasedRateDescr()
    {
        return this.purchasedRate.getRateDescr();
    }

    /*
     * Misspelled name, kept because existing callers use it.
     * Same value as getPurchasedCost().
     */
    public double getPurhcasedCost()
    {
        return this.purchasedCost;
    }

    /*
     * The cost of this line, rounded to two decimals.
     */
    public double getPurchasedCost()
    {
        return this.purchasedCost;
    }

    /*
     * A non-tiered rate costs its effective price; a tiered rate is asked for the price
     * of the purchased quantity.
     */
    private double getCost()
    {
        double lineCost;
        if( this.purchasedRate.isTiered() )
        {
            lineCost = this.purchasedRate.getTierPrice( this.purchasedQuantity );
        }
        else
        {
            lineCost = this.purchasedRate.getEffectivePrice();
        }
        return round( lineCost, 2 );
    }

    /*
     * Rounds half-up to the given number of decimal places.
     */
    private double round( double value, int places )
    {
        if (places < 0) throw new IllegalArgumentException();
        // valueOf() starts from the decimal text of the double. new BigDecimal(double) starts
        // from the exact binary value, which made e.g. 1.005 round down to 1.00.
        BigDecimal bd = BigDecimal.valueOf(value);
        bd = bd.setScale(places, RoundingMode.HALF_UP);
        return bd.doubleValue();
    }
}
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;
/*
 * Rate class represents the detail on how to rate a specific product
 * A product may have many rates
 *
 * A rate is either simple (a total price for a total quantity) or tiered (a list of
 * quantity ranges, each with a unit price and a discount scale). A tiered rate is
 * recognised by an effective price of -1.
 */
public class Rate
{
    private String rateName;
    private String rateDescr; // bill invoice display
    private double effectiveQuantity; // the total quantity applicable to take advantage of this rate
    private double effectivePrice; // the total price applicable
    private double costPerUnit; // average cost per unit
    private ArrayList<Map<String, Double>> tiers; // ONLY applicable if this is tiered pricing

    Rate()
    {
        this( "N/A", "N/A", 0, 0 );
    }

    /*
     * Non-tier rate constructor
     */
    Rate( String inRateName, String inRateDesc, double inEffQ, double inEffPr )
    {
        this.rateName = inRateName;
        this.rateDescr = inRateDesc;
        this.effectiveQuantity = inEffQ;
        this.effectivePrice = inEffPr;
        this.tiers = new ArrayList<Map<String, Double>>();
        this.costPerUnit = this.computeCostPerUnit();
    }

    /*
     * Tier rate constructor
     *
     * Throws IllegalArgumentException if a tier of tier_val is incomplete or not numeric.
     */
    Rate( String inRateName, String inRateDesc, String tier_val )
    {
        this.rateName = inRateName;
        this.rateDescr = inRateDesc;
        this.effectiveQuantity = -1; // this is calculated in computeCostPerUnit
        this.effectivePrice = -1;
        this.tiers = new ArrayList<Map<String, Double>>();
        /*
         * Example tier_val "1-1,0.50,1;2-2,0.50,0.50"
         * Each tier is separated by ';'
         * A token within a tier is separated by ','
         */
        StringTokenizer moreTiers = new StringTokenizer( tier_val, ";" );
        while( moreTiers.hasMoreTokens() )
        {
            this.tiers.add( parseTier( moreTiers.nextToken() ) );
        }
        this.costPerUnit = this.computeCostPerUnit();
    }

    /*
     * Parses one tier such as "1-5,1.50,0.5" into a map with the keys MIN, MAX, PRICE and SCALE.
     */
    private static Map<String, Double> parseTier( String tierSpec )
    {
        StringTokenizer tierDetail = new StringTokenizer( tierSpec, "," );
        if( tierDetail.countTokens() < 3 )
        {
            throw new IllegalArgumentException(
                    "Tier [" + tierSpec + "] needs a quantity range, a price and a scale" );
        }
        /*
         * First token is the min to max quantity e.g. 1-1 or 1-5
         */
        String[] minMaxVal = tierDetail.nextToken().split( "-" );
        if( minMaxVal.length < 2 )
        {
            throw new IllegalArgumentException(
                    "Tier [" + tierSpec + "] needs a quantity range like 1-5" );
        }
        Map<String, Double> tierMap = new HashMap<String, Double>();
        // Double.valueOf reports bad numbers as NumberFormatException, itself an IllegalArgumentException
        tierMap.put( "MIN", Double.valueOf( minMaxVal[0] ) );
        tierMap.put( "MAX", Double.valueOf( minMaxVal[1] ) );
        /*
         * Second token is the quantity price per unit so 1.50 mean charge each unit for 1.50
         */
        tierMap.put( "PRICE", Double.valueOf( tierDetail.nextToken() ) );
        /*
         * Third token is the discount scale, e.g 1 means 100% no discount and 0.5 means 50% discount
         */
        tierMap.put( "SCALE", Double.valueOf( tierDetail.nextToken() ) );
        return tierMap;
    }

    public String getRateDescr()
    {
        return this.rateDescr;
    }

    public double getEffectiveQuantity()
    {
        return this.effectiveQuantity;
    }

    public double getEffectivePrice()
    {
        return this.effectivePrice;
    }

    /*
     * A rate counts as tiered when its effective price is -1
     */
    public Boolean isTiered()
    {
        return ( this.effectivePrice == -1 );
    }

    /*
     * Calculate the total cost with the input quantity
     */
    public double getTierPrice( double inQuantity )
    {
        double totalCost = 0;
        double toRateQuan = inQuantity;
        /*
         * Step through each tier
         */
        for( Map<String, Double> tierMap : this.tiers )
        {
            double price = tierMap.get( "PRICE" );
            double scale = tierMap.get( "SCALE" );
            /*
             * Get the tier applicable units
             */
            double tierMaxQ = tierMap.get( "MAX" ) - tierMap.get( "MIN" ) + 1;
            if( 0 >= toRateQuan )
            {
                break;
            }
            if( toRateQuan >= tierMaxQ )
            {
                /*
                 * The incoming to-be-rated quantity is at least the tier
                 * total units. Rate it with the maximum units in this
                 * tier and carry the rest over to the next tier
                 */
                totalCost = totalCost + ( tierMaxQ * price * scale );
                toRateQuan = toRateQuan - tierMaxQ;
            }
            else
            {
                /*
                 * The incoming to-be-rated quantity is less than
                 * the tier total units. Rate it with the to-be-rated
                 * quantity
                 */
                totalCost = totalCost + ( toRateQuan * price * scale );
                break;
            }
        }
        return totalCost;
    }

    /*
     * The 'average' cost per unit, as calculated when the rate was constructed
     * (the values it is derived from do not change afterwards).
     */
    public double getCostPerUnit()
    {
        return this.costPerUnit;
    }

    /*
     * Calculate the 'average' cost per unit
     *
     * For a non-tiered rate, the average cost is price over quantity
     *
     * For a tiered rate, we calculate each tier cost, add them up and
     * divide by the total quantity to get the average cost. As a side effect the
     * effective quantity is set to that total quantity.
     *
     * Private so that the constructors do not call an overridable method.
     */
    private double computeCostPerUnit()
    {
        if( this.effectivePrice != -1 )
        {
            /*
             * Simple pricing; individual or bulk
             */
            return ( this.effectivePrice / this.effectiveQuantity );
        }
        /*
         * Tier pricing. Calculate the total cost then divide by the quantity to
         * get the average cost
         */
        double totalCost = 0;
        double totalQuan = 0;
        for( Map<String, Double> tierMap : this.tiers )
        {
            double tierUnits = tierMap.get( "MAX" ) - tierMap.get( "MIN" ) + 1;
            if( 0 >= tierUnits )
            {
                // a tier without units ends the calculation; later tiers are not counted
                break;
            }
            totalQuan = totalQuan + tierUnits;
            totalCost = totalCost + ( tierUnits * tierMap.get( "PRICE" ) * tierMap.get( "SCALE" ) );
        }
        this.effectiveQuantity = totalQuan;
        return totalCost / totalQuan;
    }

    /*
     * DEBUG
     */
    public void printRate()
    {
        System.out.println( "\tRATE NAME [" + this.rateName + "]" );
        System.out.println( "\tRATE DESC [" + this.rateDescr + "]" );
        System.out.println( "\tQUANTITY [" + this.effectiveQuantity + "]" );
        System.out.println( "\tCOST PER UNIT [" + this.costPerUnit + "]" );
        if( false == this.isTiered() )
        {
            System.out.println( "\tPRICE [" + this.effectivePrice + "]" );
        }
        else
        {
            int num_tiers = this.tiers.size();
            for( int i = 0; i < num_tiers; ++i )
            {
                Map<String, Double> tierMap = this.tiers.get( i );
                System.out.println( "\t--- TIER [" + ( i + 1 ) + "]" );
                System.out.println( "\t\t --- MIN [" + tierMap.get( "MIN") + "]" );
                System.out.println( "\t\t --- MAX [" + tierMap.get( "MAX") + "]" );
                System.out.println( "\t\t --- PRICE [" + tierMap.get( "PRICE") + "]" );
                System.out.println( "\t\t --- SCALE [" + tierMap.get( "SCALE") + "]" );
            }
        }
        System.out.println( "\n\n");
    }
}
This
System.exit( 3 );
Causes your process to return (in Java parlance it exits) with a value to the operating system. It's equivalent to the int returned by main in C and C++. On Unix-style systems you can access the return value with
echo $?
It looks like you are supposed to call this program with a file-path to some catalogue as the first argument (args[0]), otherwise it exits.
In continuation from this question.
I need help in making my TableToCSV (function that converts .html table to csv), render the code to a database, rather than a .csv. I created a BufferedReader, which converts .csv to database, but I can't get the 2 to connect. Please make the output file of TableToCSV go into my bufferedreader.
TableToCSV
* [TableToCSV.java]
*
* Summary: Extracts rows in CSV tables to CSV form. Extracts data from all tables in the input. Output in xxx.csv.
*
* Copyright: (c) 2011-2014 Roedy Green, Canadian Mind Products, http://mindprod.com
*
* Licence: This software may be copied and used freely for any purpose but military.
* http://mindprod.com/contact/nonmil.html
*
* Requires: JDK 1.6+
*
* Created with: JetBrains IntelliJ IDEA IDE http://www.jetbrains.com/idea/
*
* Version History:
* 1.0 2011-01-23 initial version.
* 1.1 2011-01-25 allow you to specify encoding
*/
package com.mindprod.csv;
import com.mindprod.common11.Misc;
import com.mindprod.entities.DeEntifyStrings;
import com.mindprod.hunkio.HunkIO;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import static java.lang.System.err;
import static java.lang.System.out;
/**
* Extracts rows in CSV tables to CSV form. Extracts data from all tables in the input. Output in xxx.csv.
* <p/>
* Use: java.exe com.mindprod.TableToCSV xxxx.html
* It also strips tags and converts entities back to UTF-8 characters.
*
* #author Roedy Green, Canadian Mind Products
* #version 1.1 2011-01-25 allow you to specify encoding
* #since 2011-01-23
*/
public final class TableToCSV
{
// ------------------------------ CONSTANTS ------------------------------
/**
* how to use the command line
*/
private static final String USAGE = "TableToCSV needs the name of an HTML file on the commandline, " +
"nothing else. Output will be in xxx.csv.";
// -------------------------- PUBLIC INSTANCE METHODS --------------------------
/**
* Constructor to convert an HTML table to CSV. Strips out entities and tags.
*
* #param file CSV file to be packed to remove excess space and quotes.
* #param separatorChar field separator character, usually ',' in North America,
* ';' in Europe and sometimes '\t' for
* tab for the output file. It is tab for the input file.
* Note this is a 'char' not a "string".
* #param quoteChar character used to quote fields containing awkward chars.
* #param commentChar character to treat as comments.
* #param encoding encoding of the input and output file.
*
* #throws java.io.IOException if problems reading/writing file
*/
#SuppressWarnings({ "WeakerAccess" })
public TableToCSV( final File file, final char separatorChar, final char quoteChar, final char commentChar,
final Charset encoding ) throws IOException
{
String outFilename = Misc.getCanOrAbsPath( file );
outFilename = outFilename.substring( 0, outFilename.length() - 5 ) + ".csv";
final File outFile = new File( outFilename );
// writer, quoteLevel, separatorChar, quoteChar, commentChar, trim
final PrintWriter pw = new PrintWriter( new OutputStreamWriter( new BufferedOutputStream( new FileOutputStream(
outFile ), 32768 ), encoding ) );
final CSVWriter w = new CSVWriter( pw, 0 /* minimal */, separatorChar, quoteChar, commentChar, true );
// read the entire html file into RAM.
String big = HunkIO.readEntireFile( file, encoding );
int from = 0;
// our parser is forgiving, works even if </td> </tr> missing.
while ( true )
{
// find <tr
final int trStart = big.indexOf( "<tr", from );
if ( trStart < 0 )
{
break;
}
from = trStart + 3;
final int trEnd = big.indexOf( '>', from );
if ( trEnd < 0 )
{
break;
}
while ( true )
{
// search for <td>...</td>
final int tdStart = big.indexOf( "<td", from );
if ( tdStart < 0 )
{
break;
}
from = tdStart + 3;
final int tdEnd = big.indexOf( '>', from );
if ( tdEnd < 0 )
{
break;
}
from = tdEnd + 1;
final int startField = tdEnd + 1;
final int slashTdStart = big.indexOf( "</td", from );
final int lookaheadTd = big.indexOf( "<td", from );
final int lookaheadSlashTr = big.indexOf( "</tr", from );
final int lookaheadTr = big.indexOf( "<tr", from );
int endField = Integer.MAX_VALUE;
if ( slashTdStart >= 0 && slashTdStart < endField )
{
endField = slashTdStart;
}
if ( lookaheadTd >= 0 && lookaheadTd < endField )
{
endField = lookaheadTd;
}
if ( lookaheadSlashTr >= 0 && lookaheadSlashTr < endField )
{
endField = lookaheadSlashTr;
}
if ( lookaheadTr >= 0 && lookaheadTr < endField )
{
endField = lookaheadTr;
}
if ( endField == Integer.MAX_VALUE )
{
break;
}
from = endField + 3;
final int slashTdEnd = big.indexOf( '>', from );
if ( slashTdEnd < 0 )
{
break;
}
String field = big.substring( startField, endField );
field = DeEntifyStrings.flattenHTML( field, ' ' );
w.put( field );
from = slashTdEnd + 1;
final int lookTd = big.indexOf( "<td", from );
final int lookTr = big.indexOf( "<tr", from );
if ( lookTr >= 0 && lookTr < lookTd || lookTd < 0 )
{
break;
}
}
w.nl();
}
out.println( w.getLineCount() + " rows extracted from table to csv" );
w.close();
}
// --------------------------- main() method ---------------------------
/**
* Simple command line interface to TableToCSV. Converts one HTML file to a CSV file, extracting tables,
* with entities stripped.
* Must have extension .html <br> Use java com.mindprod.TableToCSV somefile.html . You can use TableToCSV
* constructor
* in your own programs.
*
* #param args name of csv file to remove excess quotes and space
*/
public static void main( String[] args )
{
if ( args.length != 1 )
{
throw new IllegalArgumentException( USAGE );
}
String filename = args[ 0 ];
if ( !filename.endsWith( ".html" ) )
{
throw new IllegalArgumentException( "Bad Extension. Input must be a .html file.\n" + USAGE );
}
final File file = new File( filename );
try
{
// file, separatorChar, quoteChar, commentChar, encoding
new TableToCSV( file, ',', '\"', '#', CSV.UTF8Charset );
}
catch ( IOException e )
{
err.println();
e.printStackTrace( err );
err.println( "CSVToTable failed to export" + Misc.getCanOrAbsPath( file ) );
err.println();
}
}// end main
}
And here is my BufferedReader
// Reads the CSV file line by line and inserts each line as one row of table "main".
BufferedReader br=new BufferedReader(new FileReader(newFile));
String line;
while((line=br.readLine())!=null)
{
// Plain split on ',': a quoted field that itself contains a comma is broken into several values.
String[]value = line.split(",");
// NOTE(review): the column list starts with an empty name and uses unquoted names containing
// spaces and '#'; the field values are concatenated into the SQL text rather than bound as parameters.
String sql = "INSERT into main ( , Ticket #, Status, Priority, Department, Account Name) "
+ "values ('"+value[0]+"','"+value[1]+"','"+value[2]+"','"+value[3]+"','"+value[4]+"','"+value[5]+"')";
// NOTE(review): a statement is prepared for every line and is not closed anywhere in this snippet.
PreparedStatement pst = DatabaseConnection.ConnectDB().prepareStatement(sql);
pst.executeUpdate();
}
br.close();
}
catch(Exception e)
{
JOptionPane.showMessageDialog(null, e);
}
}
}
});
Did you test your database code? Does it work? (Hint: your SQL statement is wrong.) Is auto-commit on? If not, aren't you supposed to close() the statement/connection?
I would refactor the code and move the SQL statement and the creation of the prepared statement outside of the loop:
String sql = "INSERT INTO MAIN(\"Ticket #\", \"Status\", \"Priority\", \"Department\", \"Account Name\") VALUES (?, ?, ?, ?, ?);
PreparedStatement pst = DatabaseConnection.ConnectDB().prepareStatement(sql);
Then inside your while loop you just set the parameter values before executing.
// Bind the split CSV fields to the '?' placeholders: placeholder indexes start at 1, array indexes at 0.
pst.setString(1, value[0]);
pst.setString(2, value[1]); //...
And finally, don't forget to close() the statement / connection too!
pst.close();
DatabaseConnection.ConnectDB().close(); ???