Text Extraction from HTML Java - java

I'm working on a program that downloads HTML pages, selects some of the information, and writes it to another file.
I want to extract the information that is between the paragraph tags, but I can only get one line of the paragraph. My code is as follows:
// Question code: copies every line of `file` that contains "<p>" into the output file.
// Only the line holding the opening tag is written, so the rest of a multi-line paragraph is lost.
FileReader fileReader = new FileReader(file);
BufferedReader buffRd = new BufferedReader(fileReader);
// NOTE(review): newFile.txt is not quoted here; presumably the file name "newFile.txt" was intended.
BufferedWriter out = new BufferedWriter(new FileWriter(newFile.txt));
String s;
// NOTE(review): the reader declared above is named buffRd, but the loop reads from br.
while ((s = br.readLine()) !=null) {
if(s.contains("<p>")) {
try {
out.write(s);
} catch (IOException e) {
// the write error is silently ignored
}
}
}
I was trying to add another while loop, which would tell the program to keep writing to the file until the line contains the </p> tag, like this:
// Second attempt from the question: once a line with "<p>" is found, keep writing until "</p>" is seen.
while ((s = br.readLine()) !=null) {
if(s.contains("<p>")) {
// NOTE(review): this line is missing a closing parenthesis. The inner loop also never reads
// a new line, so s never changes inside it and the same line is written over and over.
while(!s.contains("</p>") {
try {
out.write(s);
} catch (IOException e) {
// the write error is silently ignored
}
}
}
}
But this doesn't work. Could someone please help.

jsoup
Another html parser I really liked using was jsoup. You could get all the <p> elements in 2 lines of code.
// Fetch the page and parse it into a document.
Document doc = Jsoup.connect("http://en.wikipedia.org/").get();
// Select every <p> element of the document.
Elements ps = doc.select("p");
Then write it out to a file in one more line
out.write(ps.text()); // writes the text of all selected p elements appended together as one long string
or if you want them on separate lines you can iterate through the elements and write them out separately.

Jericho is one of several possible HTML parsers that could make this task both easy and safe.

JTidy can represent an HTML document (even a malformed one) as a document model, making the process of extracting the contents of a <p> tag a rather more elegant process than manually thunking through the raw text.

Try this (if you don't want to use an HTML parser library):
FileReader fileReader = new FileReader(file);
BufferedReader buffRd = new BufferedReader(fileReader);
BufferedWriter out = new BufferedWriter(new FileWriter(newFile.txt));
String s;
int writeTo = 0;
while ((s = br.readLine()) !=null)
{
if(s.contains("<p>"))
{
writeTo = 1;
try
{
out.write(s);
}
catch (IOException e)
{
}
}
if(s.contains("</p>"))
{
writeTo = 0;
try
{
out.write(s);
}
catch (IOException e)
{
}
}
else if(writeTo==1)
{
try
{
out.write(s);
}
catch (IOException e)
{
}
}
}

I've had success using TagSoup & XPath to parse HTML.
http://home.ccil.org/~cowan/XML/tagsoup/

Use a ParserCallback. It's a simple class that's included with the JDK. It notifies you every time a new tag is found, and then you can extract the text of the tag. Simple example:
import java.io.*;
import java.net.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;
/**
 * Demonstrates {@link HTMLEditorKit.ParserCallback}: every parser event is printed to
 * standard output, indented by one tab per currently open tag.
 */
public class ParserCallbackTest extends HTMLEditorKit.ParserCallback
{
    /** Number of tabs printed in front of each message; grows with every start tag. */
    private int tabLevel = 1;
    /** Counter printed by {@link #handleEndOfLineString(String)}. */
    private int line = 1;

    /** Prints the text of a comment. */
    @Override
    public void handleComment(char[] data, int pos)
    {
        displayData(new String(data));
    }

    /** Prints the current value of the line counter and increments it. */
    @Override
    public void handleEndOfLineString(String eol)
    {
        System.out.println( line++ );
    }

    /** Decreases the indentation and prints the closing tag as "/tag". */
    @Override
    public void handleEndTag(HTML.Tag tag, int pos)
    {
        tabLevel--;
        displayData("/" + tag);
    }

    /** Prints a parser error as "position:message". */
    @Override
    public void handleError(String errorMsg, int pos)
    {
        displayData(pos + ":" + errorMsg);
    }

    /** Extra helper with the same shape as the tag callbacks; it is not declared by ParserCallback. */
    public void handleMutableTag(HTML.Tag tag, MutableAttributeSet a, int pos)
    {
        displayData("mutable:" + tag + ": " + pos + ": " + a);
    }

    /** Prints a tag that has no separate end tag, together with its attributes. */
    @Override
    public void handleSimpleTag(HTML.Tag tag, MutableAttributeSet a, int pos)
    {
        displayData( tag + "::" + a );
        // tabLevel++;
    }

    /** Prints an opening tag with its attributes and increases the indentation. */
    @Override
    public void handleStartTag(HTML.Tag tag, MutableAttributeSet a, int pos)
    {
        displayData( tag + ":" + a );
        tabLevel++;
    }

    /** Prints a run of text found between tags. */
    @Override
    public void handleText(char[] data, int pos)
    {
        displayData( new String(data) );
    }

    /** Prints {@code text} on its own line, preceded by {@code tabLevel} tab characters. */
    private void displayData(String text)
    {
        for (int i = 0; i < tabLevel; i++)
        {
            System.out.print("\t");
        }
        System.out.println(text);
    }

    /**
     * Parses the HTML file named by {@code args[0]} and prints every parser event.
     *
     * @throws IOException if the file cannot be opened or closed
     */
    public static void main(String[] args)
    throws IOException
    {
        ParserCallbackTest parser = new ParserCallbackTest();
        // args[0] is the file to parse
        // URLConnection conn = new URL(args[0]).openConnection();
        // Reader reader = new InputStreamReader(conn.getInputStream());
        // try-with-resources closes the reader even when parsing fails
        try (Reader reader = new FileReader(args[0]))
        {
            try
            {
                new ParserDelegator().parse(reader, parser, true);
            }
            catch (IOException e)
            {
                // a failure while parsing is reported, not rethrown
                System.out.println(e);
            }
        }
    }
}
So all you need to do is set a boolean flag when the paragraph tag is found. Then in the handleText() method you extract the text.

Try this.
/**
 * Downloads a Wikipedia article and prints the text of every paragraph element,
 * one per line, each prefixed with "* ".
 */
public static void main( String[] args )
{
    String url = "http://en.wikipedia.org/wiki/Big_data";
    try {
        Document document = Jsoup.connect(url).get();
        // Iterating the selection directly visits every paragraph in document order and
        // prints nothing for a page without paragraphs, where first() would have been
        // null and caused a NullPointerException.
        for (Element paragraph : document.select("p")) {
            System.out.println("* " + paragraph.text());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

You may just be using the wrong tool for the job:
perl -ne "print if m|<p>| .. m|</p>|" infile.txt >outfile.txt

Related

JAVA Replacing line or strings in textfile not working

So I have made two methods: one that creates the file (createFile()) and one that fills the text file with empty highscores if none are set.
public class HighscoreList {
static String highscore = null;
static PuzzleModel theModel;
static File file = null;
public static int nom;
public static int tu;
public static int nor;
public static String search = " ";
static String replace = "2";
static String numberOfRows = null;
static String timeUsed = " ";
static String numberOfMoves = " ";
// Entry point: creates the score file if needed, pre-fills it when it is empty,
// then calls getHighscore(0) (that method is not shown in this excerpt).
public static void main(String[] args) {
createFile();
isEmptySetEmptyHighscore();
// checkScore(0);
getHighscore(0);
}
/**
 * Points the shared {@code file} field at the score board file and creates that
 * file on disk when it does not exist yet.
 */
public static void createFile() {
    final String scoreBoardPath =
            "C:\\Users\\Thomas\\Eclipse Workspace\\15Puzzle\\15Puzzle\\src\\FifteenPuzzle\\ScoreBoard.txt";
    file = new File(scoreBoardPath);
    System.out.println("Created file " + file.getName());
    if (file.exists()) {
        // nothing to create
        return;
    }
    System.out.println("File didn't exist creating new file");
    try {
        file.createNewFile();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
/**
 * Fills the score board file with empty entries when it contains no lines.
 * The reader used for the check is closed before the file is rewritten.
 */
public static void isEmptySetEmptyHighscore() {
    boolean empty;
    try (BufferedReader br = new BufferedReader(new FileReader(
            "C:\\Users\\Thomas\\Eclipse Workspace\\15Puzzle\\15Puzzle\\src\\FifteenPuzzle\\ScoreBoard.txt"))) {
        // readLine() returns null only when there is no line at all
        empty = br.readLine() == null;
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    if (empty) {
        setEmptyHighscoreFile();
    }
}
/**
 * Overwrites the score board file with one placeholder line per board size 3..100,
 * in the form {@code rows:moves:time} with the placeholder field values.
 * The writer is closed automatically, even when a write fails.
 */
public static void setEmptyHighscoreFile() {
    try (BufferedWriter bw = new BufferedWriter(new FileWriter(file.getAbsoluteFile()))) {
        System.out.println("File is empty, fills with empty fields");
        for (int i = 3; i < 101; i++) {
            bw.write(i + ":" + numberOfMoves + ":" + timeUsed+"\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
        return;
    }
    // reached only when every line was written and the writer closed cleanly
    System.out.println("Done");
}
I have a getHighscore() that reads the two empty " " fields for moves and timeUsed. It is currently able to read them, but I can't write to those empty spaces in the text file and replace them with the actual numbers that I want.
EDIT: With the replace command it just adds the text to the bottom of the file.
Is there something wrong with my code that erases the text I am trying to replace, or how should I do it?
I tried something like this:
// Question code: tries to replace the placeholder fields of one score line.
public static void writeToFile(int rows) {
try {
// The second argument (true) opens the file in append mode, so everything written
// through bw is added after the existing content instead of replacing a line.
FileWriter fw = new FileWriter(file.getAbsoluteFile(),true);
BufferedWriter bw = new BufferedWriter(fw);
BufferedReader br = new BufferedReader(new FileReader(
"C:\\Users\\Thomas\\Eclipse Workspace\\15Puzzle\\15Puzzle\\src\\FifteenPuzzle\\ScoreBoard.txt"));
// Only the first line of the file is examined here.
if(br.readLine().split(":")[0].equals(Integer.toString(rows+1))){
// This second readLine() call returns the NEXT line, not the one tested above.
bw.write(br.readLine().replaceFirst(rows+2+": : ", "yes"));
System.out.println(" lel");
}
bw.close();
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
Have you tried this?
// Read the line once and reuse it for both the test and the replacement;
// calling readLine() twice would test one line and rewrite the next one.
String line = br.readLine();
// readLine() returns null at end of file, so guard against an empty score file.
if(line != null && line.split(":")[0].equals(Integer.toString(rows+1))){
bw.write(line.replaceFirst(rows+2+": : ", "yes"));
System.out.println(" lel");
}

Java parse csv class

I am making a class in java to parse in CSV's. It will read in the file line by line and parse out each field according to the regex pattern into an array, and then print that array. I put all this together in a main driver below. I looked over everything and it seems to be functional but for some reason whenever I run it, it just gets stuck in an infinite loop and will not cease. I have looked over this many times and can just not find where this would happen. Any help with this issue would be greatly appreciated!
import java.io.FileInputStream;
import java.io.IOException;
import java.util.regex.Pattern;
/**
*
*/
// Question code: reads one line from a file character by character, splits it with a
// regular expression and prints the resulting pieces.
public class Csv {
// Input stream supplied through setFin().
private FileInputStream fin;
// Accumulates the characters of the current line. It is never initialised, so the
// first += starts from the text "null".
private String line;
// Result of the last parseFields() call.
private String[] parsedFields;
// Returns true when n is a line feed or a carriage return.
public boolean isEOL(char n) {
boolean eol;
if (n == '\n' || n == '\r') {
eol = true;
}
else
eol=false;
return eol;
}
// Intended to read characters from fin up to the next end-of-line character.
public String getLine()
{
try
{
char c;
c= (char) fin.read();
// BUG (the subject of the question): c is read once before the loop and never
// re-read inside it, so unless the first character is an end-of-line the loop
// never terminates.
while(!isEOL(c))
{
line+=c;
}
}
catch (IOException e) {
System.out.println("Input Error.");
}
return line;
}
// Compiles the pattern on every call and uses it as the delimiter for split().
public void parseFields(String s)
{
Pattern CSVLine=Pattern.compile("\"([^\"]*)\"|(?<=,|^)([^,]*)(?:,|$)");
parsedFields=CSVLine.split(s);
}
// Reads one line and parses it.
public void execute()
{
String field=getLine();
parseFields(field);
}
// Sets the stream that getLine() reads from.
public void setFin(FileInputStream usrFin)
{
fin=usrFin;
}
// Prints each parsed piece on its own line.
public void outputFields()
{
for(int i=0; i<parsedFields.length;i++)
{
System.out.println(parsedFields[i]);
}
}
// Driver: parses and prints the first line of test.txt. The stream is never closed.
public static void main(String args[])
{
try {
FileInputStream fis;
fis = new FileInputStream("test.txt");
Csv test= new Csv();
test.setFin(fis);
test.execute();
test.outputFields();
}
catch (IOException e) {
e.printStackTrace();
}
}
}
// Excerpt of getLine() from the question: the only read happens before the loop.
c= (char) fin.read();
while(!isEOL(c))
{
line+=c;
}
In this part, you loop, adding c, but you never read again. c never changes during the loop, and probably is stuck there. You need to have the c = fin.read(); inside the loop too.
/**
 * Reads a text file and returns, in file order, the lines in which
 * LINE_PATTERN_REGEXP finds no match.
 *
 * @param filePath path of the file to read
 * @return the non-matching lines; empty when every line matches or the file is empty
 * @throws RuntimeException wrapping the IOException when the file cannot be read
 */
public static List<String> readLine(String filePath){
    List<String> listStr = new ArrayList<String>();
    // Compile before opening the file so that a bad pattern cannot leak an open reader.
    Pattern pat = Pattern.compile(LINE_PATTERN_REGEXP);
    // try-with-resources closes the reader on success and on failure alike.
    try (BufferedReader br = new BufferedReader(new FileReader(filePath))) {
        String line;
        while ((line = br.readLine()) != null) {
            Matcher matcher = pat.matcher(line);
            if (!matcher.find()) {
                listStr.add(line);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return listStr;
}
}

File Structured Error [duplicate]

This question already has an answer here:
StreamCorruptedException: invalid type code: AC
(1 answer)
Closed 5 years ago.
Sorry to bother you; once again I need help with the Java language, more precisely with structured (object) files, as in the title.
The error in question is that after I have stored more than one object (writing in append mode, of course), reading the file reports an error, and it does so even if I do everything in the main program...
My program consists of three classes in three files:
Alunno.java:
import java.io.Serializable;
/**
 * Serializable record of a student. Every field starts as an empty string
 * and is filled in through its setter.
 */
class Alunno implements Serializable {
    private String nome = "";
    private String cognome = "";
    private String data_nascita = "";
    private String indirizzo = "";
    private String residenza = "";
    private String telefono = "";

    /** Creates a student whose fields are all empty strings. */
    public Alunno() {
    }

    /** Sets the first name. */
    public void setNome(String nome) {
        this.nome = nome;
    }

    /** Sets the surname. */
    void setCognome(String cognome) {
        this.cognome = cognome;
    }

    /** Sets the date of birth. */
    void setData_Nascita(String data_nascita) {
        this.data_nascita = data_nascita;
    }

    /** Sets the address. */
    void setIndirizzo(String indirizzo) {
        this.indirizzo = indirizzo;
    }

    /** Sets the place of residence. */
    void setResidenza(String residenza) {
        this.residenza = residenza;
    }

    /** Sets the telephone number. */
    void setTelefono(String telefono) {
        this.telefono = telefono;
    }
}
File.java:
import java.io.*;
// Question code: reads student data from the console and stores/loads Alunno
// objects in the file "istituto.dat" using Java object serialization.
class File {
// Number of objects read so far by leggiFile().
private int dim;
// Prompts for every field on the console and returns the filled-in Alunno.
public Alunno nuovoAlunno() throws IOException {
BufferedReader t = new BufferedReader(new InputStreamReader(System.in));
Alunno a = new Alunno();
System.out.println("***Inserimento nuovo alunno***");
System.out.format("Nome: ");
a.setNome(t.readLine());
System.out.format("Cognome: ");
a.setCognome(t.readLine());
System.out.format("Data di nascita: ");
a.setData_Nascita(t.readLine());
System.out.format("Indirizzo: ");
a.setIndirizzo(t.readLine());
System.out.format("Residenza: ");
a.setResidenza(t.readLine());
System.out.format("Telefono: ");
a.setTelefono(t.readLine());
return a;
}
// Appends one Alunno to istituto.dat.
public void sciviFile(Alunno a) {
try {
// The file is opened in append mode and a NEW ObjectOutputStream is created on
// every call, so each call adds its own stream to the end of the file.
FileOutputStream f = new FileOutputStream("istituto.dat", true);
ObjectOutputStream fOUT = new ObjectOutputStream(f);
fOUT.writeObject(a);
fOUT.flush();
fOUT.close();
} catch (Exception e) {
System.out.println("Eccezione scrittura: " + e.getMessage());
}
}
// Reads objects from istituto.dat until end of file, counting them in dim.
public void leggiFile() {
Alunno a;
try {
FileInputStream f = new FileInputStream("istituto.dat");
// A single ObjectInputStream is used for the whole file, while sciviFile()
// started a fresh ObjectOutputStream for every object it appended.
ObjectInputStream fIN = new ObjectInputStream(f);
while (true) {
try {
a = (Alunno) fIN.readObject();
dim++;
System.out.println("Dimensione file: " + dim);
} catch (EOFException e) {
// end of file reached: stop reading
break;
}
}
f.close();
} catch (Exception e) {
System.out.println("Eccezione lettura: " + e.getMessage());
}
}
}
IstitutoScolastico.java:
import java.io.*;
/** Launcher: reads back every student stored in the archive file. */
public class IstitutoScolastico {
    public static void main(String[] args) throws IOException {
        File archivio = new File();
        // Enable the next line to append a new student before reading:
        //archivio.sciviFile(archivio.nuovoAlunno());
        archivio.leggiFile();
    }
}
OUTPUT:
Dimensione file: 1
Eccezione lettura: invalid type code: AC
I cannot read more than one object if I write in append mode; where did I go wrong?
Anyway, sorry for the grammatical errors, but I'm Italian and I got help from Google Translate!
The problem is that ObjectOutputStream writes a header to the file in its constructor.
Since you call the constructor for each Alunno you append, you write a new header to the file too.
However ObjectInputStream expects only one header(at the start of the file).
If you don't want to change much in your code, you should create a new ObjectInputStream for each Alunno you read, change the code in your File class:
/**
 * Reads every Alunno stored in "istituto.dat", counting them in {@code dim} and
 * printing the running count. A new ObjectInputStream is created for each object
 * because every object in the file was written by its own ObjectOutputStream and
 * therefore carries its own stream header.
 */
public void leggiFile() {
    Alunno a;
    // try-with-resources closes the file on every path, including read failures
    try (FileInputStream f = new FileInputStream("istituto.dat")) {
        while (true) {
            // the header is read in the constructor
            ObjectInputStream fIN = new ObjectInputStream(f);
            a = (Alunno) fIN.readObject();
            dim++;
            System.out.println("Dimensione file: " + dim);
        }
    } catch (EOFException e) {
        // expected: no further header/object in the file, reading is complete
    } catch (Exception e) {
        System.out.println("Eccezione lettura: " + e.getMessage());
    }
}
An alternative would be to skip 2(?) shorts (4(?) bytes) from the FileInputStream, but if the definition of the header should change (although this seems unlikely), you might have to change your code.
Another alternative would be to read all the Alunnos that are already in the file and then write all Alunnos (including the new one) to the File starting at the beginning of the file. But this may not be as fast as you wish.
For detailed information you can read http://docs.oracle.com/javase/7/docs/platform/serialization/spec/output.html and http://docs.oracle.com/javase/7/docs/platform/serialization/spec/input.html
One last tip: If you use Java SE 7 (or higher) consider using try-with-resources for your streams.

read and write text and changing at only one column

my text
is like that
SEHiR;iL;iLCE;Tip;22356
S SI n;ISTA;ANK;A:S;22356
K K n;IS:TA;BB;B:S;22356
A A b;IS.TA;CC;DK;22356
G S b;ISTA;DD;O:P;22356
I want to change the Tip column: I want to put "." instead of ":" only in the Tip column, which contains values such as A:S, B:S, etc. I also want to write each line to the CSV both before and after the change. How can I do that? I wrote something, but it has a problem at the
if(eD.tip.contains(":")) part, because it does not continue to hS.add(eD).
endeks.put("", hS); I don't want to use the "" string as the key.
I do not have to use a HashMap. I could not write the output that I want.
I expected this output:
S SI n;ISTA;ANK;A:S;22356
S SI n;ISTA;ANK;A.S;22356
K K n;IS:TA;BB;B:S;22356
K K n;IS:TA;BB;B.S;22356
G S b;ISTA;DD;O:P;22356
G S b;ISTA;DD;O.P;22356
// Question code: reads a semicolon-separated file and tries to write the lines whose
// "tip" column contains ":" together with a corrected copy.
public class MaliyeVknmDegil {
// One parsed input line.
static class EndeksDegeri {
String sirket ;
String sehir;
String ilce;
String tip;
int numara;
}
static HashMap<String,HashSet<EndeksDegeri>> endeks = new HashMap<String, HashSet<EndeksDegeri>>();
static PrintWriter pW;
static EndeksDegeri eD = new EndeksDegeri();
static String satır;
private static PrintWriter pW2;
public static void main(String[] args) {
FileInputStream fIS;
FileOutputStream fOS;
try {
fIS = new FileInputStream("C:\\deneme\\DENEME.csv");
Reader r = new InputStreamReader(fIS, "UTF-8");
BufferedReader bR = new BufferedReader(r);
fOS = new FileOutputStream("c:\\yazdirilan\\deneme.csv");
Writer w = new OutputStreamWriter(fOS, "UTF-8");
pW2 = (new PrintWriter(w));
// This local variable hides the static field of the same name.
String satır;
String[] eleman;
while ((satır = bR.readLine()) != null) {
eleman = satır.split(";");
// Writes the raw line when ANY column contains ":", without a line separator.
if(satır.contains(":")){
pW2.write(satır);
}
HashSet<EndeksDegeri> hS = new HashSet<EndeksDegeri>();
// Runs once per column, building the same record each time.
for (int i = 0; i < eleman.length; i++) {
// alteleman=eleman[i].split(" ");
EndeksDegeri eD = new EndeksDegeri();
eD.sirket = eleman[0];
eD.sehir = eleman[1];
eD.ilce = eleman[2];
// BUG (the subject of the question): tip has not been assigned yet, so it is
// null here and contains() throws a NullPointerException.
if(eD.tip.contains(":")){
// The result of replaceAll() is discarded, and tip is then set to the
// unmodified column value.
eD.tip.replaceAll(":", ".");
eD.tip = eleman[3];
}
eD.numara = Integer.parseInt(eleman[4]);
hS.add(eD);
}
// Every line is stored under the same key "", replacing the previous entry.
endeks.put("", hS);
}
bR.close();
// print ("yazdir"): the loop below only fetches each set, nothing is written
HashSet<EndeksDegeri> hS;
for (String s : endeks.keySet()) {
hS = endeks.get(s);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ArrayIndexOutOfBoundsException e) {
e.printStackTrace();
}
}// main end
}// class end
package stackoverflow;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Writer;
/**
 * Copies every record of input.csv whose "tip" column contains ':' to output.csv twice:
 * first unchanged, then with each ':' in that column replaced by '.'.
 * Records are semicolon-separated: sirket;sehir;ilce;tip;numara.
 */
public class TipChange {
    private static final String INPUT_PATH = "input.csv";
    private static final String OUTPUT_PATH = "output.csv";
    private static final String FIELD_SEPARATOR = ";";
    private static final int FIELD_COUNT = 5;

    /** Runs the conversion; I/O failures are reported on standard error. */
    public static void main(String[] args) {
        // try-with-resources closes both streams on every path and cannot hit the
        // null reader/writer that a finally block sees when opening a file failed.
        try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(new FileInputStream(INPUT_PATH), "UTF-8"));
             PrintWriter writer = new PrintWriter(
                     new OutputStreamWriter(new FileOutputStream(OUTPUT_PATH), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                EndeksDegeri eD = lineToClass(line);
                if (eD != null && shouldOutput(eD)) {
                    writer.append(classToLine(eD, true));
                    writer.append(classToLine(eD, false));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Returns true when the record's tip column contains a colon. */
    private static boolean shouldOutput(EndeksDegeri eD) {
        return eD.tip.contains(":");
    }

    /**
     * Formats a record as one CRLF-terminated output line.
     *
     * @param original true to keep the tip column as read, false to replace ':' with '.'
     */
    private static String classToLine(EndeksDegeri eD, boolean original) {
        // Work on a local copy so that the record itself is never modified.
        String tip = original ? eD.tip : eD.tip.replace(":", ".");
        return eD.sirket + FIELD_SEPARATOR
                + eD.sehir + FIELD_SEPARATOR
                + eD.ilce + FIELD_SEPARATOR
                + tip + FIELD_SEPARATOR
                + eD.numara + "\r\n";
    }

    /**
     * Parses one input line.
     *
     * @return the parsed record, or null when the line has fewer than five columns
     *         or a non-numeric fifth column (such lines are skipped with a warning)
     */
    private static EndeksDegeri lineToClass(String line) {
        String[] element = line.split(FIELD_SEPARATOR);
        if (element.length < FIELD_COUNT) {
            if (!line.isEmpty()) {
                System.err.println("Skipping malformed line: " + line);
            }
            return null;
        }
        EndeksDegeri endeksDegeri = new EndeksDegeri();
        endeksDegeri.sirket = element[0];
        endeksDegeri.sehir = element[1];
        endeksDegeri.ilce = element[2];
        endeksDegeri.tip = element[3];
        try {
            endeksDegeri.numara = Integer.parseInt(element[4]);
        } catch (NumberFormatException e) {
            System.err.println("Skipping line with non-numeric number column: " + line);
            return null;
        }
        return endeksDegeri;
    }

    /** One parsed input record. */
    private static class EndeksDegeri {
        String sirket;
        String sehir;
        String ilce;
        String tip;
        int numara;
    }
}
Sample input:
SSI;ISTA;ANK;A:S;22356
KK;IS:TA;BB;B:S;22356
AA;IS.TA;CC;DK;22356
GS;ISTA;DD;O:P;22356
Generated Output:
SSI;ISTA;ANK;A:S;22356
SSI;ISTA;ANK;A.S;22356
KK;IS:TA;BB;B:S;22356
KK;IS:TA;BB;B.S;22356
GS;ISTA;DD;O:P;22356
GS;ISTA;DD;O.P;22356
Your code will produce a NullPointerException in the line: if(eD.tip.contains(":")){
That is because when a new EndeksDegeri instance is created, all its fields are null, and you cannot call contains() on a null string.
Check the example code below (It writes to the console but it should get you going)
// One parsed input record.
static class EndeksDegeri {
    String sirket;
    String sehir;
    String ilce;
    String tip;
    int numara;

    // Convenience for output: the five fields joined with ':' in declaration order.
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(sirket).append(":");
        text.append(sehir).append(":");
        text.append(ilce).append(":");
        text.append(tip).append(":");
        text.append(numara);
        return text.toString();
    }
}
// For every input line whose tip column contains ':', print the record as read
// and then again with each ':' in that column replaced by '.'.
while ((satir = bR.readLine()) != null) {
    eleman = satir.split(";");
    EndeksDegeri eD = new EndeksDegeri();
    // fill in every field before any of them is inspected
    eD.sirket = eleman[0];
    eD.sehir = eleman[1];
    eD.ilce = eleman[2];
    eD.tip = eleman[3];
    eD.numara = Integer.parseInt(eleman[4]);
    if (!eD.tip.contains(":")) {
        // nothing to correct on this line
        continue;
    }
    // original record first
    System.out.println(eD.toString());
    // then the corrected record
    eD.tip = eD.tip.replaceAll(":", ".");
    System.out.println(eD.toString());
}
// Will print only the lines with ":" and its correct version
SSI:ISTA:ANK:A:S:22356
SSI:ISTA:ANK:A.S:22356
KK:IS:TA:BB:B:S:22356
KK:IS:TA:BB:B.S:22356
GS:ISTA:DD:O:P:22356
GS:ISTA:DD:O.P:22356

Sorting lines in a file by 2 fields with JAVA

I work at a printing company that has many programs in COBOL and I have been tasked to
convert the COBOL programs into JAVA programs. I've run into a snag in the one conversion. I need to take a file that each line is a record and on each line the data is blocked.
Example of a line is
60000003448595072410013 FFFFFFFFFFV 80 0001438001000014530020120808060134
I need to sort data by a 5 digit number at the 19-23 characters and then by the very first character on a line.
BufferedReader input;
BufferedWriter output;
String[] sort, sorted, style, accountNumber, customerNumber;
String holder;
int lineCount;
/**
 * Counts the lines of the input file.
 *
 * @return the number of lines, or the count reached so far when reading fails
 *         (the error is printed to standard output)
 */
int lineCounter() {
    int result = 0;
    // try-with-resources closes the file once counting is done, so the second
    // pass in chemSort() does not leave this reader open as well.
    try (BufferedReader reader = new BufferedReader(new FileReader(
            "C:\\Users\\cbook\\Desktop\\Chemical\\" + "LB26529.fil"))) {
        input = reader; // keep the shared field pointing at the reader in use, as before
        while ((holder = reader.readLine()) != null) {
            result++;
        }
    } catch (IOException e) {
        System.out.println("Error - " + e.toString());
    }
    return result;
}
/**
 * Loads the input file into the working arrays: the whole line into {@code sort},
 * its first character into {@code style} and characters 253-257 into
 * {@code customerNumber}.
 */
chemSort(){
    lineCount = this.lineCounter();
    sort = new String[lineCount];
    sorted = new String[lineCount];
    style = new String[lineCount];
    accountNumber = new String[lineCount];
    customerNumber = new String[lineCount];
    // try-with-resources closes the file when loading ends, normally or with an error
    try (BufferedReader reader = new BufferedReader(new FileReader(
            "C:\\Users\\cbook\\Desktop\\Chemical\\" + "LB26529.fil"))) {
        input = reader; // keep the shared field pointing at the reader in use, as before
        // Stop at lineCount so the arrays cannot be overrun if the file grew
        // between the counting pass and this pass.
        for (int i = 0; i < lineCount && (holder = reader.readLine()) != null; i++) {
            sort[i] = holder;
            style[i] = holder.substring(0, 1);
            // NOTE(review): the question text describes the key as characters 19-23,
            // while this reads 253-257 - confirm which record layout is intended.
            customerNumber[i] = holder.substring(252, 257);
        }
    } catch (IOException e) {
        System.out.println("Error - " + e.toString());
    }
}
This what I have so far and I'm not really sure where to go from here or even if this is the correct way
to go about sorting the file. After the file is sorted it will be stored into another file and processed
again with another program for it to be ready for printing.
// Read every line, sort by characters 19-23 and then by the first character, and print.
List<String> linesAsList = new ArrayList<String>();
String line;
while ((line = reader.readLine()) != null) {
    linesAsList.add(line);
}
Collections.sort(linesAsList, new Comparator<String>() {
    @Override
    public int compare(String o1, String o2) {
        // primary key: the five characters at positions 19-23 (indices 18..22)
        int byNumber = o1.substring(18, 23).compareTo(o2.substring(18, 23));
        if (byNumber != 0) {
            return byNumber;
        }
        // secondary key: the very first character of the line
        return o1.substring(0, 1).compareTo(o2.substring(0, 1));
    }
});
// The loop variable needs its own name: `line` is already declared above.
for (String sortedLine : linesAsList) {
    System.out.println(sortedLine); // or whatever output stream you want
}
This phone's autocorrect is messing up my answer
Read the file into an ArrayList (instead of an array). Use the following methods:
// to declare the arraylist
ArrayList<String> lines = new ArrayList<String>();
// to add a new line to it (within your reading-lines loop)
// NOTE(review): readLine() returns null at end of input, so the surrounding loop
// should check for null before adding.
lines.add(input.readLine());
Then, sort it using a custom Comparator:
// Orders the lines by their five-digit field first; ties are broken by the single digit.
Collections.sort(lines, new Comparator<String>() {
    public int compare(String a, String b) {
        int byFiveDigits = theFiveNumbersOf(a).compareTo(theFiveNumbersOf(b));
        if (byFiveDigits == 0) {
            // same five-digit value: fall back to the single digit
            return theDigitOf(a).compareTo(theDigitOf(b));
        }
        return byFiveDigits;
    }
});
(It is unclear what 5 digits or what digit you want to compare; I've left them as functions for you to fill in).
Finally, write it to the output file:
// BufferedWriter wraps a Writer (not an OutputStream) and has no println():
// open the file through java.io.FileWriter and use write() followed by newLine().
BufferedWriter ow = new BufferedWriter(new FileWriter("filename.extension"));
try {
    for (String line : lines) {
        ow.write(line);
        ow.newLine();
    }
} finally {
    // close (and thereby flush) the file even when a write fails
    ow.close();
}
(adding imports and try/catch as needed)
This code will sort a file based on mainframe sort parameters.
You pass 3 parameters to the main method of the Sort class.
The input file path.
The output file path.
The sort parameters in mainframe sort format. In your case, this string would be 19,5,CH,A,1,1,CH,A
This first class, the SortParameter class, holds instances of the sort parameters. There's one instance for every group of 4 parameters in the sort parameters string. This class is a basic getter / setter class, except for the getDifference method. The getDifference method brings some of the sort comparator code into the SortParameter class to simplify the comparator code in the Sort class.
/**
 * One sort key in mainframe sort notation: a 1-based start byte, a length, a field
 * type and a direction ("D" reverses the order; any other value sorts ascending).
 * Only the character field type "CH" is compared; other types never distinguish
 * two records.
 */
public class SortParameter {
    protected int fieldStartByte;
    protected int fieldLength;
    protected String fieldType;
    protected String sortDirection;

    /**
     * @param fieldStartByte 1-based position of the first character of the field
     * @param fieldLength    number of characters in the field
     * @param fieldType      field type, e.g. "CH"
     * @param sortDirection  "A" for ascending or "D" for descending
     * @throws IllegalArgumentException if the start byte is below 1 or the length is negative
     */
    public SortParameter(int fieldStartByte, int fieldLength, String fieldType,
            String sortDirection) {
        if (fieldStartByte < 1) {
            throw new IllegalArgumentException(
                    "fieldStartByte is 1-based and must be >= 1: " + fieldStartByte);
        }
        if (fieldLength < 0) {
            throw new IllegalArgumentException(
                    "fieldLength must not be negative: " + fieldLength);
        }
        this.fieldStartByte = fieldStartByte;
        this.fieldLength = fieldLength;
        this.fieldType = fieldType;
        this.sortDirection = sortDirection;
    }

    /** Returns the 0-based index of the first character of the field. */
    public int getFieldStartPosition() {
        return fieldStartByte - 1;
    }

    /** Returns the 0-based index just past the last character of the field. */
    public int getFieldEndPosition() {
        return getFieldStartPosition() + fieldLength;
    }

    public String getFieldType() {
        return fieldType;
    }

    public String getSortDirection() {
        return sortDirection;
    }

    /**
     * Compares two records on this key.
     *
     * @return a negative, zero or positive number when {@code a} sorts before, equal
     *         to or after {@code b}; always zero for field types other than "CH"
     */
    public int getDifference(String a, String b) {
        // constant-first equals() also copes with a null type or direction
        if (!"CH".equals(getFieldType())) {
            return 0;
        }
        int difference = extractField(a).compareTo(extractField(b));
        return "D".equals(getSortDirection()) ? -difference : difference;
    }

    /**
     * Returns the key field of a record. A record that ends before or inside the
     * field yields only the part that exists, instead of failing the whole sort.
     */
    private String extractField(String record) {
        int start = Math.min(getFieldStartPosition(), record.length());
        int end = Math.min(getFieldEndPosition(), record.length());
        return record.substring(start, end);
    }
}
The Sort class contains the code to read the input file, sort the input file, and write the output file. This class could probably use some more error checking.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
public class Sort implements Runnable {
protected List<String> lines;
protected String inputFilePath;
protected String outputFilePath;
protected String sortParameters;
public Sort(String inputFilePath, String outputFilePath,
String sortParameters) {
this.inputFilePath = inputFilePath;
this.outputFilePath = outputFilePath;
this.sortParameters = sortParameters;
}
#Override
public void run() {
List<SortParameter> parameters = parseParameters(sortParameters);
lines = read(inputFilePath);
lines = sort(lines, parameters);
write(outputFilePath, lines);
}
protected List<SortParameter> parseParameters(String sortParameters) {
List<SortParameter> parameters = new ArrayList<SortParameter>();
String[] field = sortParameters.split(",");
for (int i = 0; i < field.length; i += 4) {
SortParameter parameter = new SortParameter(
Integer.parseInt(field[i]), Integer.parseInt(field[i + 1]),
field[i + 2], field[i + 3]);
parameters.add(parameter);
}
return parameters;
}
protected List<String> sort(List<String> lines,
final List<SortParameter> parameters) {
Collections.sort(lines, new Comparator<String>() {
#Override
public int compare(String a, String b) {
for (SortParameter parameter : parameters) {
int difference = parameter.getDifference(a, b);
if (difference != 0) {
return difference;
}
}
return 0;
}
});
return lines;
}
protected List<String> read(String filePath) {
List<String> lines = new ArrayList<String>();
BufferedReader reader = null;
try {
String line;
reader = new BufferedReader(new FileReader(filePath));
while ((line = reader.readLine()) != null) {
lines.add(line);
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (reader != null) {
reader.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
return lines;
}
protected void write(String filePath, List<String> lines) {
BufferedWriter writer = null;
try {
writer = new BufferedWriter(new FileWriter(filePath));
for (String line : lines) {
writer.write(line);
writer.newLine();
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (writer != null) {
writer.flush();
writer.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) {
if (args.length < 3) {
System.err.println("The sort process requires 3 parameters.");
System.err.println(" 1. The input file path.");
System.err.println(" 2. The output file path.");
System.err.print (" 3. The sort parameters in mainframe ");
System.err.println("sort format. Example: 15,5,CH,A");
} else {
new Sort(args[0], args[1], args[2]).run();
}
}
}

Categories

Resources