I am creating a RESTful web service using Jersey. Some of the resources are binary files that I get from somewhere else on demand; such files are potentially big (hundreds of Mbytes).
I want browsers to GET those resources, so I have a #GET-annotated method returning a StreamingOutput, like in this answer.
I have two questions:
Is StreamingOutput the proper way of returning files?
What should I do in the server side to make it possible for browsers to resume an interrupted file transfer?
Just use the range-relevant HTTP headers, taking care w.r.t. caching. First, advertise that you can resume by setting the Accept-Ranges header. Secondly, check the Range and If-Range headers, and send the appropriate responses.
Note that you will probably need to construct your own Response and set the needed headers and result code by hand.
For the same purpose I've built a ContainerResponseFilter to intercept requests with a Range header and mangle the response accordingly[1].
This is the ContainerResponseFilter which will use encapsulate the output stream in a RangedOutputStream (see below):
public class RangeResponseFilter implements ContainerResponseFilter {
private static final String RANGE = "Range";
private static final String ACCEPT_RANGES = "Accept-Ranges";
#Override
public void filter(ContainerRequestContext requestContext, ContainerResponseContext responseContext)
throws IOException {
if (requestContext.getHeaders().containsKey(RANGE)) {
String rangeHeader = requestContext.getHeaderString(RANGE);
String contentType = responseContext.getMediaType().toString();
OutputStream originOutputStream = responseContext.getEntityStream();
RangedOutputStream rangedOutputStream = new RangedOutputStream(originOutputStream, rangeHeader, contentType, responseContext.getHeaders());
responseContext.setStatus(Status.PARTIAL_CONTENT.getStatusCode());
responseContext.getHeaders().putSingle(ACCEPT_RANGES, rangedOutputStream.getAcceptRanges());
responseContext.setEntityStream(rangedOutputStream);
}
}
}
And here's the RangedOutputStream:
public class RangedOutputStream extends OutputStream {
public class Range extends OutputStream {
private ByteArrayOutputStream outputStream;
private Integer from;
private Integer to;
public Range(Integer from, Integer to) {
this.outputStream = new ByteArrayOutputStream();
this.from = from;
this.to = to;
}
public boolean contains(Integer i) {
if (this.to == null) {
return (this.from <= i);
}
return (this.from <= i && i <= this.to);
}
public byte[] getBytes() {
return this.outputStream.toByteArray();
}
public Integer getFrom() {
return this.from;
}
public Integer getTo(Integer ifNull) {
return this.to == null ? ifNull : this.to;
}
#Override
public void write(int b) throws IOException {
this.outputStream.write(b);
}
}
private final static char[] MULTIPART_CHARS = "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
.toCharArray();
private static final String BOUNDARY_LINE_FORMAT = "--%s";
private static final String CONTENT_TYPE_LINE_FORMAT = "Content-Type: %s";
private static final String CONTENT_RANGE_FORMAT = "%s %d-%d/%d";
private static final String CONTENT_RANGE_LINE_FORMAT = "Content-Range: " + CONTENT_RANGE_FORMAT;
private static final String EMPTY_LINE = "\r\n";
private OutputStream outputStream;
private String boundary;
private String accept;
private String contentType;
private boolean multipart;
private boolean flushed = false;
private int pos = 0;
List<Range> ranges;
MultivaluedMap<String, Object> headers;
public RangedOutputStream(OutputStream outputStream, String ranges, String contentType, MultivaluedMap<String, Object> headers) {
this.outputStream = outputStream;
this.ranges = new ArrayList<>();
String[] acceptRanges = ranges.split("=");
this.accept = acceptRanges[0];
for (String range : acceptRanges[1].split(",")) {
String[] bounds = range.split("-");
this.ranges.add(new Range(Integer.valueOf(bounds[0]), bounds.length == 2 ? Integer.valueOf(bounds[1]) : null ));
}
this.headers = headers;
this.contentType = contentType;
this.multipart = this.ranges.size() > 1;
this.boundary = this.generateBoundary();
}
private String generateBoundary() {
StringBuilder buffer = new StringBuilder();
Random rand = new Random();
int count = rand.nextInt(11) + 30;
for (int i = 0; i < count; i++) {
buffer.append(MULTIPART_CHARS[rand.nextInt(MULTIPART_CHARS.length)]);
}
return buffer.toString();
}
public boolean isMultipart() {
return this.multipart;
}
public String getBoundary() {
return this.boundary;
}
public String getAcceptRanges() {
return this.accept;
}
public String getContentRange(int index) {
Range range = this.ranges.get(index);
return String.format(CONTENT_RANGE_LINE_FORMAT, this.accept, range.getFrom(), range.getTo(this.pos), this.pos);
}
#Override
public void write(int b) throws IOException {
for (Range range : this.ranges) {
if (range.contains(this.pos)) {
range.write(b);
}
}
this.pos++;
}
#Override
public void flush() throws IOException {
if (this.flushed) {
return;
}
if (this.multipart) {
this.headers.putSingle(HttpHeaders.CONTENT_TYPE, String.format("multipart/byteranges; boundary=%s", this.boundary));
for (Range range : this.ranges) {
this.outputStream.write(String.format(BOUNDARY_LINE_FORMAT + EMPTY_LINE, this.boundary).getBytes());
this.outputStream.write(String.format(CONTENT_TYPE_LINE_FORMAT + EMPTY_LINE, this.contentType).getBytes());
this.outputStream.write(
String.format(CONTENT_RANGE_LINE_FORMAT + EMPTY_LINE, this.accept, range.getFrom(), range.getTo(this.pos), this.pos)
.getBytes());
this.outputStream.write(EMPTY_LINE.getBytes());
this.outputStream.write(range.getBytes());
this.outputStream.write(EMPTY_LINE.getBytes());
}
this.outputStream.write(String.format(BOUNDARY_LINE_FORMAT, this.boundary + "--").getBytes());
} else {
Range range = this.ranges.get(0);
this.headers.putSingle("Content-Range", String.format(CONTENT_RANGE_FORMAT, this.accept, range.getFrom(), range.getTo(this.pos), this.pos));
this.outputStream.write(range.getBytes());
}
this.flushed = true;
}
}
[1] https://github.com/heruan/jersey-range-filter
Related
(Java ver. 8)
I need to process the request body in a filter. Using the below code, I read the body.
private static String convertInputStreamToString(InputStream is) throws IOException {
ByteArrayOutputStream result = new ByteArrayOutputStream();
byte[] buffer = new byte[1024 * 50];
int length;
while ((length = is.read(buffer)) != -1) {
result.write(buffer, 0, length);
}
return result.toString("UTF-8");
}
The issue is if there are parameters posted by request body with the content type "application/x-www-form-urlencoded", then the parameters won't be available after reading the body. They are available to get using request.getParameter(), if I don't read the body.
Moreover, I tried using the below code to wrap the request and provide the body, so it would be available to the rest of the solution (e.g. servlets), but the issue with losing the parameters happens yet. code is copied/adopted from this post
public class RequestWrapper extends HttpServletRequestWrapper {
private final String body;
public RequestWrapper(HttpServletRequest request) throws IOException {
super(request);
body = convertInputStreamToString(request.getInputStream());
}
private static String convertInputStreamToString(InputStream is) throws IOException {
ByteArrayOutputStream result = new ByteArrayOutputStream();
byte[] buffer = new byte[1024 * 50];
int length;
while ((length = is.read(buffer)) != -1) {
result.write(buffer, 0, length);
}
return result.toString("UTF-8");
}
#Override
public ServletInputStream getInputStream() throws IOException {
final byte[] myBytes = body.getBytes("UTF-8");
ServletInputStream servletInputStream = new ServletInputStream() {
private int lastIndexRetrieved = -1;
private ReadListener readListener = null;
#Override
public boolean isFinished() {
return (lastIndexRetrieved == myBytes.length - 1);
}
#Override
public boolean isReady() {
return isFinished();
}
#Override
public void setReadListener(ReadListener readListener) {
this.readListener = readListener;
if (!isFinished()) {
try {
readListener.onDataAvailable();
} catch (IOException e) {
readListener.onError(e);
}
} else {
try {
readListener.onAllDataRead();
} catch (IOException e) {
readListener.onError(e);
}
}
}
#Override
public int read() throws IOException {
int i;
if (!isFinished()) {
i = myBytes[lastIndexRetrieved + 1];
lastIndexRetrieved++;
if (isFinished() && (readListener != null)) {
try {
readListener.onAllDataRead();
} catch (IOException ex) {
readListener.onError(ex);
throw ex;
}
}
return i;
} else {
return -1;
}
}
};
return servletInputStream;
}
#Override
public BufferedReader getReader() throws IOException {
return new BufferedReader(new InputStreamReader(this.getInputStream()));
}
}
I tried to run the code you mentioned you're using and I think the accepted answer may not solve your issue as it's quite old. Seems you also need to overwrite the getParameter, getParameterMap and getParameterValues methods. I tried to do that based on this answer from the same post and seems it works. Here is the code:
public class MultiReadHttpServletRequest extends HttpServletRequestWrapper {
private ByteArrayOutputStream cachedBytes;
private String body;
private Map<String, String[]> parameterMap;
public MultiReadHttpServletRequest(HttpServletRequest request) throws IOException {
super(request);
parameterMap = super.getParameterMap();
cacheBodyAsString();
System.out.println("The Body read into a String is: " + body);
}
#Override
public ServletInputStream getInputStream() throws IOException {
if (cachedBytes == null)
cacheInputStream();
return new CachedServletInputStream(cachedBytes.toByteArray());
}
#Override
public BufferedReader getReader() throws IOException {
return new BufferedReader(new InputStreamReader(getInputStream()));
}
#Override
public String getParameter(String key) {
Map<String, String[]> parameterMap = getParameterMap();
String[] values = parameterMap.get(key);
return values != null && values.length > 0 ? values[0] : null;
}
#Override
public String[] getParameterValues(String key) {
Map<String, String[]> parameterMap = getParameterMap();
return parameterMap.get(key);
}
#Override
public Map<String, String[]> getParameterMap() {
return parameterMap;
}
private void cacheInputStream() throws IOException {
// Cache the inputstream in order to read it multiple times
cachedBytes = new ByteArrayOutputStream();
byte[] buffer = new byte[1024 * 50];
int length;
InputStream is = super.getInputStream();
while ((length = is.read(buffer)) != -1) {
cachedBytes.write(buffer, 0, length);
}
}
private void cacheBodyAsString() throws IOException {
ByteArrayOutputStream result = new ByteArrayOutputStream();
byte[] buffer = new byte[1024 * 50];
int length;
InputStream is = getInputStream();
while ((length = is.read(buffer)) != -1) {
result.write(buffer, 0, length);
}
body = result.toString("UTF-8");
}
}
public class CachedServletInputStream extends ServletInputStream {
private final ByteArrayInputStream buffer;
public CachedServletInputStream(byte[] contents) {
this.buffer = new ByteArrayInputStream(contents);
}
#Override
public int read() {
return buffer.read();
}
#Override
public boolean isFinished() {
return buffer.available() == 0;
}
#Override
public boolean isReady() {
return true;
}
#Override
public void setReadListener(ReadListener listener) {
throw new RuntimeException("Not implemented");
}
}
This is just a sample implementation. I highly recommend to follow the steps specified in the answer mentioned above as it seems to be newer and it also ensures that the parameters are being read from both body and query string. My code is just a sample sketch to see if it works as expected.
Thank you #zaerymoghaddam for helping with this.
I was concerning if I am affecting the request object implicitly, so the rest of the solution is lacking something in it.
Moreover, I found that parameterMap = super.getParameterMap(); is not icluding the parameters from body (in case of post with content type of "application/x-www-form-urlencoded")
With a little bit of change of your code I came up with below solution:
public class MyRequestWrapper extends HttpServletRequestWrapper {
private ByteArrayOutputStream cachedBytes;
private String body;
private Map<String, String[]> parameterMap;
private static int bufferLength = 1024 * 50;
public MyRequestWrapper(final HttpServletRequest request) throws IOException {
super(request);
cacheBodyAsString();
parameterMap = new HashMap<>(super.getParameterMap());
addParametersFromBody();
}
#Override
public ServletInputStream getInputStream() throws IOException {
return new CachedServletInputStream(cachedBytes.toByteArray());
}
#Override
public BufferedReader getReader() throws IOException {
return new BufferedReader(new InputStreamReader(this.getInputStream()));
}
public String GetRequestBodyAsString() {
return this.body;
}
#Override
public String getParameter(String key) {
Map<String, String[]> parameterMap = getParameterMap();
String[] values = parameterMap.get(key);
return values != null && values.length > 0 ? values[0] : null;
}
#Override
public String[] getParameterValues(String key) {
Map<String, String[]> parameterMap = getParameterMap();
return parameterMap.get(key);
}
#Override
public Map<String, String[]> getParameterMap() {
return parameterMap;
}
private void cacheInputStream() throws IOException {
cachedBytes = new ByteArrayOutputStream();
byte[] buffer = new byte[bufferLength];
int length;
InputStream is = super.getInputStream();
while ((length = is.read(buffer)) != -1) {
cachedBytes.write(buffer, 0, length);
}
}
private void cacheBodyAsString() throws IOException {
if (cachedBytes == null)
cacheInputStream();
this.body = cachedBytes.toString("UTF-8");
}
private void addParametersFromBody() {
if(this.body == null || this.body.isEmpty())
return;
String[] params = this.body.split("&");
String[] value = new String[1];
for (String param : params) {
String key = param.split("=")[0];
value[0] = param.split("=")[1];
parameterMap.putIfAbsent(key, value);
}
}
class CachedServletInputStream extends ServletInputStream {
private final ByteArrayInputStream buffer;
public CachedServletInputStream(byte[] contents) {
this.buffer = new ByteArrayInputStream(contents);
}
#Override
public int read() {
return buffer.read();
}
#Override
public boolean isFinished() {
return buffer.available() == 0;
}
#Override
public boolean isReady() {
return true;
}
#Override
public void setReadListener(ReadListener listener) {
throw new RuntimeException("Not implemented");
}
}
}
Strangely HttpServletRequest content may only be read once. It comes as a stream so once you read the stream it is gone. So you need some wrapper that allows you multiple reads. Spring actually provides such wrapper. The name of the class is ContentCachingRequestWrapper. Here its Javadoc. Here is the answer that explains how to use it if you work with Spring boot: How to get request body params in spring filter?
I have a class which represents an ArrayList stored in a file, because I need an ArrayList with multiple gigabytes of data in it which is obviously too large to be stored in memory. The data is represented by a class called Field and the function Field.parse() is just for converting the Field into a String and the other way.
The Field class stores a list of (strange) chess pieces and their coordinates.
My class is working fine, but it takes a long time to add an element to the file and I need my program to run as fast as possible. Does anyone know a more efficient/faster way of doing things?
Also, I am not allowed to use external libraries/apis. Please keep that in mind.
This is the class which is responsible for storing Field objects in a temp file:
private File file;
private BufferedReader reader;
private BufferedWriter writer;
public FieldSaver() {
try {
file = File.createTempFile("chess-moves-", ".temp");
System.out.println(file.getAbsolutePath());
} catch (IOException e) {
e.printStackTrace();
}
}
public void add(Field field) {
try {
File temp = File.createTempFile("chess-moves-", ".temp");
writer = new BufferedWriter(new FileWriter(temp));
reader = new BufferedReader(new FileReader(file));
String line;
while((line = reader.readLine()) != null ) {
writer.write(line);
writer.newLine();
}
reader.close();
writer.write(field.parse());
writer.close();
file.delete();
file = new File(temp.getAbsolutePath());
} catch (IOException e) {
e.printStackTrace();
}
}
public Field get(int n) {
try {
reader = new BufferedReader(new FileReader(file));
for (int i = 0; i < n; i++) {
reader.readLine();
}
String line = reader.readLine();
reader.close();
return Field.parse(line);
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
And this is the Field class:
private WildBoar wildBoar;
private HuntingDog[] huntingDogs;
private Hunter hunter;
private int size;
#Override
public String toString() {
String result = "Wildschwein: " + wildBoar.toString();
for (HuntingDog dog : huntingDogs) {
result += "; Hund: " + dog.toString();
}
return result + "; Jäger: " + hunter.toString();
}
#Override
public boolean equals(Object obj) {
if (obj instanceof Field) {
Field field = (Field) obj;
HuntingDog[] dogs = field.getHuntingDogs();
return wildBoar.equals(field.getWildBoar()) && hunter.equals(field.getHunter()) && huntingDogs[0].equals(dogs[0]) && huntingDogs[1].equals(dogs[1]) && huntingDogs[2].equals(dogs[2]);
}
return false;
}
public Field(int size, WildBoar wildBoar, HuntingDog[] huntingDogs, Hunter hunter) {
this.size = size;
this.wildBoar = wildBoar;
this.huntingDogs = huntingDogs;
this.hunter = hunter;
}
public WildBoar getWildBoar() {
return wildBoar;
}
public HuntingDog[] getHuntingDogs() {
return huntingDogs;
}
public Hunter getHunter() {
return hunter;
}
public int getSize() {
return size;
}
public static Field parse(String s) {
String[] arr = s.split(",");
WildBoar boar = WildBoar.parse(arr[0]);
Hunter hunter = Hunter.parse(arr[1]);
HuntingDog[] dogs = new HuntingDog[arr.length - 2];
for(int i = 2; i < arr.length; i++) {
dogs[i - 2] = HuntingDog.parse(arr[i]);
}
return new Field(8, boar, dogs, hunter);
}
public String parse() {
String result = wildBoar.parse() + "," + hunter.parse();
for(HuntingDog dog : huntingDogs) {
result += "," + dog.parse();
}
return result;
}
Here's an MCVE to do what you want, based on the information you provided.
You can run it and see that it can save a Field to the file and get a Field by index very quickly.
The Fields are constant length, so you can get a Field by index by going to byte offset of index times field length in bytes. This would be significantly more difficult if the field were not constant length.
import java.io.Closeable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
public class FieldSaver implements Closeable {
public static void main(String[] args) throws IOException {
File f = File.createTempFile("chess-moves-", ".temp");
try (FieldSaver test = new FieldSaver(f);) {
for (byte i = 0; i < 100; i++) {
test.add(new Field(8, new WildBoar(i, i), new Hunter(i, i), new HuntingDog[] {
new HuntingDog(i, i),
new HuntingDog(i, i),
new HuntingDog(i, i) }));
}
// Get a few Fields by index
System.out.println(test.get(0));
System.out.println(test.get(50));
System.out.println(test.get(99));
// EOF exception, there is no Field 100
// System.out.println(test.get(100));
}
}
private final RandomAccessFile data;
public FieldSaver(File f) throws FileNotFoundException {
data = new RandomAccessFile(f, "rw");
}
public void add(Field field) throws IOException {
data.seek(data.length());
field.write(data);
}
public Field get(int index) throws IOException {
data.seek(index * Field.STORAGE_LENGTH_BYTES);
return Field.read(data);
}
public void close() throws IOException { data.close(); }
static abstract class Piece {
protected byte xPos;
protected byte yPos;
public Piece(DataInput data) throws IOException {
xPos = data.readByte();
yPos = data.readByte();
}
public Piece(byte xPos, byte yPos) {
this.xPos = xPos;
this.yPos = yPos;
}
public void write(DataOutput data) throws IOException {
data.writeByte(xPos);
data.writeByte(yPos);
}
public String toString() { return "[" + xPos + ", " + yPos + "]"; }
}
static class Hunter extends Piece {
public Hunter(byte xPos, byte yPos) { super(xPos, yPos); }
public Hunter(DataInput data) throws IOException { super(data); }
}
static class HuntingDog extends Piece {
public HuntingDog(byte xPos, byte yPos) { super(xPos, yPos); }
public HuntingDog(DataInput data) throws IOException { super(data); }
}
static class WildBoar extends Piece {
public WildBoar(byte xPos, byte yPos) { super(xPos, yPos); }
public WildBoar(DataInput data) throws IOException { super(data); }
}
static class Field {
// size of boar + hunter + 3 dogs
public static final int STORAGE_LENGTH_BYTES = 2 + 2 + (3 * 2);
private int size;
private WildBoar boar;
private Hunter hunter;
private final HuntingDog[] dogs;
public Field(int size, WildBoar wildBoar, Hunter hunter, HuntingDog[] huntingDogs) {
this.size = size;
this.boar = wildBoar;
this.hunter = hunter;
this.dogs = huntingDogs;
}
public String toString() {
String result = "Wildschwein: " + boar.toString();
for (HuntingDog dog : dogs) {
result += "; Hund: " + dog.toString();
}
return result + "; Jäger: " + hunter.toString();
}
public static Field read(DataInput data) throws IOException {
WildBoar boar = new WildBoar(data);
Hunter hunter = new Hunter(data);
HuntingDog[] dogs = new HuntingDog[3];
for (int i = 0; i < 3; i++) {
dogs[i] = new HuntingDog(data);
}
return new Field(8, boar, hunter, dogs);
}
public void write(DataOutput data) throws IOException {
boar.write(data);
hunter.write(data);
for (HuntingDog dog : dogs) {
dog.write(data);
}
}
}
}
Use a Map implementation like Cache from ehcache. This library will optimize for you so you don't have to handle writing and reading to disk and manage when to keep it in memory or on disk. You can just use it as a normal map. You probably want a map instead of a list for faster lookup so the library can optimize even more for you.
http://www.ehcache.org/
CacheManager cacheManager = CacheManagerBuilder.newCacheManagerBuilder()
.withCache("preConfigured",
CacheConfigurationBuilder.newCacheConfigurationBuilder(Long.class, String.class,
ResourcePoolsBuilder.heap(100))
.build())
.build(true);
Cache<Long, String> preConfigured
= cacheManager.getCache("preConfigured", Long.class, String.class);
I create Java Application using HttpServer as bellow:
public class Application
{
public static void main(String args[])
{
HttpServer httpPaymentServer;
httpPaymentServer = HttpServer.create(new InetSocketAddress(Config.portPayment), 0);
httpPaymentServer.createContext("/json", new Payment("json"));
}
public class Payment implements HttpHandler
{
public Payment(String dataType)
{
}
public void handle(HttpExchange httpExchange) throws IOException
{
String body = "";
if(httpExchange.getRequestMethod().equalsIgnoreCase("POST"))
{
try
{
Headers requestHeaders = httpExchange.getRequestHeaders();
Set<Map.Entry<String, List<String>>> entries = requestHeaders.entrySet();
int contentLength = Integer.parseInt(requestHeaders.getFirst("Content-length"));
InputStream inputStream = httpExchange.getRequestBody();
byte[] postData = new byte[contentLength];
int length = inputStream.read(postData, 0, contentLength);
if(length < contentLength)
{
}
else
{
String fullBody = new String(postData);
Map<String, String> query = Utility.splitQuery(fullBody);
body = query.getOrDefault("data", "").toString();
}
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
}
}
On my server (Centos 7), on the first request, it is no problem. But on next request, not all of the request body can be read.
But on my PC (Windows 10) no problem.
What is the problem.
For your InputStream you call read only once - it may not return all the data. That data may even be not received at that time.
Instead you should call read in a loop until you get all the bytes (when you reach end of stream read returns -1). Or use one of the approaches suggested here How to read / convert an InputStream into a String in Java?
Thank you. This work for me
public void handle(HttpExchange httpExchange) throws IOException
{
String body = "";
if(httpExchange.getRequestMethod().equalsIgnoreCase("POST"))
{
try
{
Headers requestHeaders = httpExchange.getRequestHeaders();
Set<Map.Entry<String, List<String>>> entries = requestHeaders.entrySet();
int contentLength = Integer.parseInt(requestHeaders.getFirst("Content-length"));
InputStream inputStream = httpExchange.getRequestBody();
int j;
String fullBody = "";
for(j = 0; j < contentLength; j++)
{
byte b = (byte) httpExchange.getRequestBody().read();
fullBody += String.format("%c", b);
}
Map<String, String> query = Utility.splitQuery(fullBody);
body = query.getOrDefault("data", "").toString();
}
catch (Exception e)
{
e.printStackTrace();
}
}
}
Is there a way to register some progress monitor on JAXB Marshaller and Unmarshaller?
I would like to show some progress information in my GUI while data is de-/serialized.
I see that you can set a Unmarshaller.Listener and Marshaller.Listener, which have a "before" and "after" method. Nevertheless, I do not see any straight forward way to get the total number of elements to serialize.
I would need that obviously to calculate some "percentage done" info.
Is it ok to parse before unmarshalling?
If so, assuming you have a list of objects, you could do something like...
final String tagName = *** name of tag you are counting ***;
InputStream in = *** stream of your xml ***;
SAXParserFactory spf = SAXParserFactory.newInstance();
SAXParser saxParser = spf.newSAXParser();
final AtomicInteger counter = new AtomicInteger();
saxParser.parse(in, new DefaultHandler() {
#Override
public void startElement (String uri, String localName, String qName, Attributes attributes) {
if (localName.equals(tagName))
counter.incrementAndGet();
}
});
Would doing a more low-level approach by leveraging on the InputStream be an acceptable solution?
E.g.
import java.io.IOException;
import java.io.InputStream;
import java.util.function.DoubleConsumer;
public class InputStreamWithProgressDecorator extends InputStream {
/** Input stream to be decorated */ private final InputStream inputStream;
/** Amount of byte read */ private long position = 0L;
/** File size */ private final long length;
/** Mark */ private int mark = 0;
/** Consumer of the progress */ private final DoubleConsumer callBack;
public InputStreamWithProgressDecorator(final InputStream is, final long l, final DoubleConsumer cb) {
inputStream = is;
length = l;
callBack = cb;
}
private void setPosition(final long fp) {
position = fp;
callBack.accept(getProgress());
}
public double getProgress() {
return length == 0L ? 100d : ((double) position) * 100d / ((double) length);
}
public long getPosition() {
return position;
}
#Override
public int read(byte[] b) throws IOException {
final int rc = inputStream.read(b);
setPosition(position + rc);
return rc;
}
#Override
public int read(byte[] b, int off, int len) throws IOException {
final int rc = inputStream.read(b, off, len);
setPosition(position + rc);
return rc;
}
#Override
public byte[] readAllBytes() throws IOException {
final byte[] result = inputStream.readAllBytes();
setPosition(position + result.length);
return result;
}
#Override
public byte[] readNBytes(int len) throws IOException {
final byte[] result = inputStream.readNBytes(len);
setPosition(position + result.length);
return result;
}
#Override
public int readNBytes(byte[] b, int off, int len) throws IOException {
final int rc = inputStream.readNBytes(b, off, len);
setPosition(position + rc);
return rc;
}
#Override
public long skip(long n) throws IOException {
final long rc = inputStream.skip(n);
setPosition(position + rc);
return rc;
}
#Override
public int available() throws IOException {
return inputStream.available();
}
#Override
public void close() throws IOException {
inputStream.close();
}
#Override
public synchronized void mark(int readlimit) {
inputStream.mark(readlimit);
mark = readlimit;
}
#Override
public synchronized void reset() throws IOException {
inputStream.reset();
setPosition(mark);
}
#Override
public boolean markSupported() {
return inputStream.markSupported();
}
#Override
public int read() throws IOException {
final int c = inputStream.read();
setPosition(position + 1);
return c;
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.function.DoubleConsumer;
public class Demo1 {
public static void main(String[] args) throws IOException {
final File file = new File(args[0]);
final DoubleConsumer callBack = p -> System.out.printf("%.0f%%\n", p);
try (final FileInputStream fis = new FileInputStream(file); final InputStreamWithProgressDecorator is = new InputStreamWithProgressDecorator(fis, file.length(), callBack)) {
// Simulating JAXB unmarshaller reads
byte[] buffer = is.readNBytes(1024);
while (buffer.length != 0) buffer = is.readNBytes(1024);
}
}
}
Or if you have a FileInputStream with a separate Thread approach :
public class FileInputStreamReadProgressThread extends Thread implements UncaughtExceptionHandler {
/** Input stream */ private final FileInputStream fileInputStream;
/** File size */ private final long length;
/** Read progress in percents */ private double progress = 0d;
/** Exception from thread */ private Throwable exception = null;
/** Consumer of the progress */ private final DoubleConsumer callBack;
public FileInputStreamReadProgressThread(final FileInputStream fis, final long l, final DoubleConsumer cb) {
fileInputStream = fis;
length = l;
callBack = cb;
setUncaughtExceptionHandler(this);
setName(getClass().getSimpleName());
}
public double getProgress() { return progress; }
public Throwable getException() { return exception; }
#Override public void uncaughtException(final Thread t, final Throwable e) { exception = e; }
#Override
public void run() {
try {
long position = -1L;
final FileChannel channel = fileInputStream.getChannel();
while (!isInterrupted() && channel.isOpen() && position < length) {
position = channel.position();
progress = length == 0L ? 100d : ((double)position) * 100d / ((double)length);
callBack.accept(progress);
sleep(100L);
}
} catch (final IOException e) {
exception = e;
} catch (final InterruptedException e) {
// Do nothing
}
}
}
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.Channels;
import java.util.function.DoubleConsumer;
public class Demo2 {
public static void main(String[] args) throws IOException {
final File file = new File(args[0]);
final DoubleConsumer callBack = p -> System.out.printf("%.0f%%\n", p);
try (final FileInputStream fis = new FileInputStream(file); final InputStream is = Channels.newInputStream(fis.getChannel())) {
final FileInputStreamReadProgressThread readProgressThread = new FileInputStreamReadProgressThread(fis, file.length(), callBack);
readProgressThread.start();
// Simulating JAXB unmarshaller reads
is.readAllBytes();
}
}
}
I am trying to parse pdf file using Apache Tika after upgrading PDFBOX version to 1.6.0... And I started getting this error for few pdf files.
Any suggestions?
java.io.IOException: expected='endstream' actual='' org.apache.pdfbox.io.PushBackInputStream#3a72d4e5
at org.apache.pdfbox.pdfparser.BaseParser.parseCOSStream(BaseParser.java:439)
at org.apache.pdfbox.pdfparser.PDFParser.parseObject(PDFParser.java:552)
at org.apache.pdfbox.pdfparser.PDFParser.parse(PDFParser.java:184)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1088)
at org.apache.pdfbox.pdmodel.PDDocument.load(PDDocument.java:1053)
at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:74)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:197)
at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:135)
at org.apache.tika.Tika.parseToString(Tika.java:357)
at edu.uci.ics.crawler4j.crawler.BinaryParser.parse(BinaryParser.java:37)
at edu.uci.ics.crawler4j.crawler.WebCrawler.handleBinary(WebCrawler.java:223)
at edu.uci.ics.crawler4j.crawler.WebCrawler.processPage(WebCrawler.java:461)
at edu.uci.ics.crawler4j.crawler.WebCrawler.run(WebCrawler.java:129)
at java.lang.Thread.run(Thread.java:662)
WARN [Crawler 2] Did not found XRef object at specified startxref position 0
And this is my code.
if (page.isBinary()) {
handleBinary(page, curURL);
}
-------------------------------------------------------------------------------
public int handleBinary(Page page, WebURL curURL) {
try {
binaryParser.parse(page.getBinaryData());
page.setText(binaryParser.getText());
handleMetaData(page, binaryParser.getMetaData());
//System.out.println(" pdf url " +page.getWebURL().getURL());
//System.out.println("Text" +page.getText());
} catch (Exception e) {
// TODO: handle exception
}
return PROCESS_OK;
}
public class BinaryParser {
private String text;
private Map<String, String> metaData;
private Tika tika;
public BinaryParser() {
tika = new Tika();
}
public void parse(byte[] data) {
InputStream is = null;
try {
is = new ByteArrayInputStream(data);
text = null;
Metadata md = new Metadata();
metaData = new HashMap<String, String>();
text = tika.parseToString(is, md).trim();
processMetaData(md);
} catch (Exception e) {
e.printStackTrace();
} finally {
IOUtils.closeQuietly(is);
}
}
public String getText() {
return text;
}
public void setText(String text) {
this.text = text;
}
private void processMetaData(Metadata md){
if ((getMetaData() == null) || (!getMetaData().isEmpty())) {
setMetaData(new HashMap<String, String>());
}
for (String name : md.names()){
getMetaData().put(name.toLowerCase(), md.get(name));
}
}
public Map<String, String> getMetaData() {
return metaData;
}
public void setMetaData(Map<String, String> metaData) {
this.metaData = metaData;
}
}
public class Page {
private WebURL url;
private String html;
// Data for textual content
private String text;
private String title;
private String keywords;
private String authors;
private String description;
private String contentType;
private String contentEncoding;
// binary data (e.g, image content)
// It's null for html pages
private byte[] binaryData;
private List<WebURL> urls;
private ByteBuffer bBuf;
private final static String defaultEncoding = Configurations
.getStringProperty("crawler.default_encoding", "UTF-8");
public boolean load(final InputStream in, final int totalsize,
final boolean isBinary) {
if (totalsize > 0) {
this.bBuf = ByteBuffer.allocate(totalsize + 1024);
} else {
this.bBuf = ByteBuffer.allocate(PageFetcher.MAX_DOWNLOAD_SIZE);
}
final byte[] b = new byte[1024];
int len;
double finished = 0;
try {
while ((len = in.read(b)) != -1) {
if (finished + b.length > this.bBuf.capacity()) {
break;
}
this.bBuf.put(b, 0, len);
finished += len;
}
} catch (final BufferOverflowException boe) {
System.out.println("Page size exceeds maximum allowed.");
return false;
} catch (final Exception e) {
System.err.println(e.getMessage());
return false;
}
this.bBuf.flip();
if (isBinary) {
binaryData = new byte[bBuf.limit()];
bBuf.get(binaryData);
} else {
this.html = "";
this.html += Charset.forName(defaultEncoding).decode(this.bBuf);
this.bBuf.clear();
if (this.html.length() == 0) {
return false;
}
}
return true;
}
public boolean isBinary() {
return binaryData != null;
}
public byte[] getBinaryData() {
return binaryData;
}
Are you sure that you don't accidentally truncate the PDF document when you load it into the binary buffer in the Page class?
There are multiple potential problems in your Page.load() method. To start with, the finished + b.length > this.bBuf.capacity() should be finished + len > this.bBuf.capacity() since the read() method could have returned fewer than b.length bytes. Also, are you sure that the totalsize argument you give is accurate? Finally, it could be that the given document is larger than the MAX_DOWNLOAD_SIZE limit.