Multiline strings concatenation in Java - java

I'm looking for some help. What is the easiest way to concatenate multiline strings in Java and print it after ?
For example : I've got two strings :
String turtle1 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r";
String turtle2 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r";
And I want to get this result in the Java Eclipse console :
_ _
.-./*) .-./*)
_/___\/ _/___\/
U U U U
I've already try some algorithms to divide my strings in differents parts and after re-concatenate it. But it was without success.
I know there are StringBuffer class and StringBuilder class but after some research, I didn't found something that correspond to my need.
Thanks in advance for your help.

See my example below, should be self explaining.
public class Turtle {
private static final String returnpattern = "\r\n";
public static void main(String[] args) {
// the data to run through
String turtle1 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r\n";
String turtle2 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r\n";
// split the data into individual parts
String[] one = turtle1.split(returnpattern);
String[] two = turtle2.split(returnpattern);
// find out the longest String in data set one
int longestString = 0;
for (String s : one) {
if (longestString < s.length()) {
longestString = s.length();
}
}
// loop through parts and build new string
StringBuilder b = new StringBuilder();
for (int i = 0; i < one.length; i++) {
String stringTwo = String.format("%1$" + longestString + "s", two[i]); // left pad the dataset two to match
// length
b.append(one[i]).append(stringTwo).append(returnpattern);
}
// output
System.out.println(b);
}
}

Just for fun, here is another solution using streams, prepared for more than two turtles to be shown side-by-side:
public static void main(String[] args) {
String turtle1 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r";
String turtle2 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r";
// split lines into fragments
List<List<String>> fragments = Stream.of(turtle1, turtle2)
.map(x -> Stream.of(x.split("\\r\\n?|\\n")).collect(Collectors.toList()))
.collect(Collectors.toList());
// make all lists same length by adding empty lines as needed
int lines = fragments.stream().mapToInt(List::size).max().orElse(0);
fragments.forEach(x -> x.addAll(Collections.nCopies(lines - x.size(), "")));
// pad all fragments to maximum width (per list)
List<List<String>> padded = fragments.stream().map(x -> {
int width = x.stream().mapToInt(String::length).max().orElse(0);
return x.stream().map(y -> String.format("%-" + width + "s", y)).collect(Collectors.toList());
}).collect(Collectors.toList());
// join corresponding fragments to result lines, and join result lines
String result = IntStream.range(0, lines)
.mapToObj(i -> padded.stream().map(x -> x.get(i)).collect(Collectors.joining()))
.collect(Collectors.joining(System.lineSeparator()));
System.out.println(result);
}

Not so pretty but works:
String turtle1 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r\n";
String turtle2 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r\n";
String[] turtle1Lines = turtle1.split("\r\n");
String[] turtle2Lines = turtle2.split("\r\n");
StringBuilder sb = new StringBuilder();
int turtle1Width = 0;
for (int i = 0; i < 4; i++) {
if (turtle1Lines[i].length() > turtle1Width) {
turtle1Width = turtle1Lines[i].length();
}
}
for (int i = 0; i < 4; i++) {
sb.append(turtle1Lines[i]);
for (int j = turtle1Width - turtle1Lines[i].length(); j > 0; j--) {
sb.append(' ');
}
sb.append(turtle2Lines[i]);
sb.append("\r\n");
}
String turtles = sb.toString();

I'm here too ;)
public class Test {
static String turtle1 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r".replace("\r", "");
static String turtle2 = " _\r\n .-./*)\r\n _/___\\/\r\n U U\r".replace("\r", "");
public static int countRows(String string){
return string.length() - string.replace("\n", "").length() + 1;
}
public static int getMaxLength(String string){
int maxLength = 0;
int currentLength = 0;
char[] data = string.toCharArray();
for(Character c : data){
if(c != '\n'){
if(++currentLength > maxLength) {
maxLength = currentLength;
}
}else{
currentLength = 0;
}
}
return maxLength;
}
public static String[] toStringArray(String string){
int length = getMaxLength(string);
int rows = countRows(string);
String[] result = new String[rows];
int last = 0;
for(int i = 0; i < rows; i++){
int temp = string.indexOf("\n", last);
String str;
if(temp != -1) {
str = string.substring(last, temp);
}else{
str = string.substring(last);
}
while(str.length() < length){
str += " ";
}
result[i] = str;
last = temp + 1;
}
return result;
}
public static String concatMultilineStrings(String first, String second){
StringBuilder sb = new StringBuilder();
String[] arrayFirst = toStringArray(first);
String[] arraySecond = toStringArray(second);
if(arrayFirst.length != arraySecond.length){
System.exit(69);
}
for(int i = 0; i < arrayFirst.length; i++){
sb.append(arrayFirst[i]);
sb.append(arraySecond[i]);
sb.append("\n");
}
return sb.toString();
}
public static void main(String[] args) {
System.out.println(concatMultilineStrings(turtle1, turtle2));
}
}

Related

How to replace a word based on its length?

All words having the given length wordLength in the string sentence must be replaced with the word myWord. All parameters come from user input and may vary. I have tried this way but it only returns the initial string with the initial words.
Here is my source code:
package main;
import java.io.BufferedReader;
import java.io.InputStreamReader;
public class Main {
public static void main(String[] args) throws Exception {
String sentence = "";
int wordLength = 0;
String myWord = "";
InputStreamReader is = new InputStreamReader(System.in);
BufferedReader bis = new BufferedReader(is);
System.out.println("Text input: ");
sentence = bis.readLine();
System.out.println("Word lenth to replace");
wordLength = Integer.parseInt(bis.readLine());
System.out.println("Word to replace to");
myWord = bis.readLine();
Text myText = new Text(myWord, sentence, wordLength);
myText.changeSentence();
System.out.println("New string" + myText.getSentence());
}
}
class Text {
private String mySentence;
private int charNumber;
private String wordToChange;
private String newSentence = "1.";
public Text(String wordToChange, String mySentece, int charNumber) {
this.mySentence = mySentece;
this.wordToChange = wordToChange;
this.charNumber = charNumber;
}
public String getSentence() {
return newSentence;
}
public void changeSentence() {
int firstPos = 0;
int i;
for (i = 0; i < mySentence.length(); i++) {
if (mySentence.charAt(i) == ' ') {
if (i - firstPos == charNumber) {
newSentence = newSentence.concat(wordToChange + " ");
firstPos = i + 1;
} else {
newSentence = newSentence.concat(mySentence.substring(firstPos, i + 1));
firstPos = i + 1;
}
} else if (i == mySentence.length() - 1) {
if (i - firstPos == charNumber) {
newSentence = newSentence.concat(wordToChange + " ");
firstPos = i + 1;
} else {
newSentence = newSentence.concat(mySentence.substring(firstPos, i + 1));
firstPos = i + 1;
}
}
}
}
}
I changed your code a little bit:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
public class Main {
public static void main(String[] args) {
String sentence = "";
int wordLenght = 0;
String myWord = "";
InputStreamReader is = new InputStreamReader(System.in);
BufferedReader bis = new BufferedReader(is);
try {
System.out.println("Text input: ");
sentence = bis.readLine();
System.out.println("Word lenth to replace");
wordLenght = Integer.parseInt(bis.readLine());
System.out.println("Word to replace to");
myWord = bis.readLine();
} catch (IOException e) {
e.printStackTrace();
}
Text myText = new Text(myWord, sentence, wordLenght);
System.out.println(myText.getChangeSentence());
}
}
class Text {
private String mySentence;
private int charNumber;
private String wordToChange;
private String newSentence = "1.";
public Text(String wordToChange, String mySentece, int charNumber) {
this.mySentence = mySentece;
this.wordToChange = wordToChange;
this.charNumber = charNumber;
}
public String getChangeSentence() {
String[] words = mySentence.split(" ");
for(int i = 0 ; i < words.length ; i++) {
if(words[i].length() == charNumber) {
words[i] = wordToChange;
}
}
for (String word : words) {
newSentence += word + " ";
}
return newSentence;
}
}
Input : This is a test
word length : 2
word to replace : ii
output: This ii a test
As I can see the only separator of words that is currently considered to appear in the input text is a single white space " ". If that's true, then the changeSentence method can be quite short. There is no need to do parse the sentence character by characted. Having in mind that the white space is a separator, you can simply split the sentence by the characted " " and collect them as words. After that you can just iterate through words and replace ones that lenght matches given input characters number. After that, you can just join words together with the previously used separator and that's it.
Examples if you want to try with loops
public void changeSentence() {
final String[] words = mySentence.split(" ");
for (int i = 0; i < words.length; i++) {
if (words[i].length() == charNumber) {
words[i] = wordToChange;
}
}
newSentence = String.join(" ", words);
}
or with regular expressions
public void changeSentence() {
String regex = "\\b\\w{" + charNumber+ "}\\b";
newSentence = mySentence.replaceAll(regex, wordToChange);
}
or with the stream API
public void changeSentence() {
newSentence = Arrays.stream(mySentence.split(" "))
.map(s -> s.length() == charNumber ? wordToChange : s)
.collect(Collectors.joining(" "));
}

How to divide a sentence to words and compare with another string?

I have saved the units in .txt file. These I am getting in an array list. Now I want to check if any of the units present in the string.
List contains :
"units", "kg", "kilogms", "kilo", "literes",
"Liter", "packets", "packet", "gms", "grams", "half kg"
Like, if I have a string - 1kg rice, I want to get numbers from this string and I want to divide this sentence to words and want to compare with each item from array list of units. If it is present I want to save it. So I want to store 1kg and rice separately. string may contain any spaces I want to trim all those spaces and check compare.
Getting text file in an array list.
public class ReadTextFiles {
public static List<String> readItemNamesFile(Context context) {
String sText = null;
List<String> stringList;
try{
InputStream is = context.getResources().openRawResource(R.raw.item_names);
//Use one of the above as per your file existing folder
int size = is.available();
byte[] buffer = new byte[size];
is.read(buffer);
is.close();
sText = new String(buffer, "UTF-8");
String[] sTextArray = sText.replace("\"", "").split(",");
stringList = new ArrayList<String>(Arrays.asList(sTextArray));
System.out.print(stringList);
} catch (IOException ex) {
ex.printStackTrace();
return null;
}
return stringList;
}
}
public void getUnits()
{
List<String> units = new ArrayList<>();
units = ReadTextFiles.readUnitsFile(getActivity());
System.out.print(units.size());
}
Now I want to compare string suppose its "1 kg potato".Then should find potato from the array list. Also it should be case insensitive.
This is the full solution of your requirement as I understood:
String measuring = "\"units\", \"kg\", \"kilogms\", \"kilo\", \"literes\", \"Liter\", \"packets\", \"packet\", \"gms\", \"grams\", \"half kg\"";
String items = "\"Potato\", \"rice\", \"Eggs\", \"Maggi\", \"Dryfruits\", \"Maza\", \"cold drink\", \"sauce\", \"catchup\", \"coconut oil\"";
String matching = "Kg500 Potato";//"Potato 1 kg";
String item = "", measuringUnit = "", quantity = "";
private void findOut() {
String[] sMeasuringArray = measuring.replace("\"", "").split(", ");
ArrayList<String> measuringList = new ArrayList<String>(Arrays.asList(sMeasuringArray));
String[] sItemsArray = items.replace("\"", "").split(", ");
ArrayList<String> itemsList = new ArrayList<String>(Arrays.asList(sItemsArray));
String[] sMatchingArray = matching.split(" ");
matching = matching.toUpperCase();
for (int i = 0; i < measuringList.size(); i++) {
if (matching.contains(measuringList.get(i).toUpperCase())) {
measuringUnit = measuringList.get(i).trim();
break;
}
}
for (int i = 0; i < itemsList.size(); i++) {
if (matching.contains(itemsList.get(i).toUpperCase())) {
item = itemsList.get(i).trim();
break;
}
}
if (matching!= null) {
String[] part = matching.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)");
for (int k = 0; k < part.length; k++) {
try {
Integer.parseInt(part[k]);
quantity = part[k];
break;
} catch (Exception ex) {
continue;
}
}
}
/*if (sMatchingArray != null) {
if (sMatchingArray.length == 3) {
for (int j = 0; j < sMatchingArray.length; j++) {
if (measuringUnit.trim().equals(sMatchingArray[j].trim())) {
quantity = sMatchingArray[j - 1].trim();
break;
}
}
} else if (sMatchingArray.length == 2) {
String[] part = matching.split("(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)");
for (int k = 0; k < part.length; k++) {
try {
Integer.parseInt(part[k]);
quantity = part[k];
break;
} catch (Exception ex) {
continue;
}
}
}
}*/
Log.e("Solution: ", "item = " + item + ", measuringUnit = " + measuringUnit + ", quantity = " + quantity);
}
I'm gonna be using algorithmic approach for the answer. So here it goes:
strItem = "1kg rice";
//Run a loop through the list of units and for each unit check this
if (strItem.contains(list.get(index)))
//Do the needful and break

Split 2 string and join

I have 2 string which I want to join as per my requirements. Say I have
String sa = {"as,asd,asdf"};
String qw = {"12,123,1234"};
String[] separated = ItemSumm.split(",");
String[] separateds = Itemumm.split(",");
StringBuffer sb = new StringBuffer();
for (int i = 0; i < separateds.length; i++)
{
if (separated.length == i + 1)
{
sb.append(separated[i] + "(" + separateds[i] + ")");
} else
{
sb.append(separated[i] + "(" + separateds[i] + "),");
}
}
deleteListItem.list_summ.setText(sb.toString());
it gives as(12),asd(123),asdf(1234)
But problem is , it can be like
String sa = {"as,asdf"};
String qw = {"12,123,1234"};
So in this I want like
as(12),asdf(123),1234
Try this code :
String sa = {"as,asd"};
String qw = {"12,123,1234"};
String[] separated = ItemSumm.split(",");
String[] separateds = Itemumm.split(",");
StringBuffer sb = new StringBuffer();
for (int i = 0; i < separateds.length; i++) {
if (separated.length == i + 1) {
if(separated.length == i) {
sb.append(separateds[i] + "");
} else {
sb.append(separated[i] + "(" + separateds[i] + ")");
}
} else {
if(separated.length == i) {
sb.append("," + separateds[i]);
} else {
sb.append(separated[i] + "(" + separateds[i] + "),");
}
}
}
deleteListItem.list_summ.setText(sb.toString());
// Answer : as(12),asd(123),1234
String sa = {"as,asd,asdf"};
String qw = {"12,123,1234"};
String[] separated = ItemSumm.split(",");
String[] separateds = Itemumm.split(",");
StringBuffer sb = new StringBuffer();
// first loop through separated, starting with a comma
for (int i = 0; i < separated.length; i++) {
sb.append(",").append(separated[i]).append("(").append(separateds[i]).append(")"));
}
// append remaining items in separateds
for (int i = separated.length; i < separateds.length; i++) {
sb.append(",").append(separateds[i]);
}
deleteListItem.list_summ.setText(sb.toString().substring(1)); // remove starting comma
if the lenghts of the strings are the sa, do the join
if (separated.length == i + 1 && (separated[i].lenght == separateds[i].lenght))

The program throws NullPointerException [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 8 years ago.
Improve this question
Not sure why it gives me the NullPointerException. Please help.
I am pretty sure all the arrays are full, and i restricted all the loops not to go passed empty spaces.
import java.util.;
import java.io.;
public class TextAnalysis {
public static void main (String [] args) throws IOException {
String fileName = args[0];
File file = new File(fileName);
Scanner fileScanner = new Scanner(file);
int MAX_WORDS = 10000;
String[] words = new String[MAX_WORDS];
int unique = 0;
System.out.println("TEXT FILE STATISTICS");
System.out.println("--------------------");
System.out.println("Length of the longest word: " + longestWord(fileScanner));
read(words, fileName);
System.out.println("Number of words in file wordlist: " + wordList(words));
System.out.println("Number of words in file: " + countWords(fileName) + "\n");
System.out.println("Word-frequency statistics");
lengthFrequency(words);
System.out.println();
System.out.println("Wordlist dump:");
wordFrequency(words,fileName);
}
public static void wordFrequency(String[] words, String fileName) throws IOException{
File file = new File(fileName);
Scanner s = new Scanner(file);
int [] array = new int [words.length];
while(s.hasNext()) {
String w = s.next();
if(w!=null){
for(int i = 0; i < words.length; i++){
if(w.equals(words[i])){
array[i]++;
}
}
for(int i = 0; i < words.length; i++){
System.out.println(words[i] + ":" + array[i]);
}
}
}
}
public static void lengthFrequency (String [] words) {
int [] lengthTimes = new int[10];
for(int i = 0; i < words.length; i++) {
String w = words[i];
if(w!=null){
if(w.length() >= 10) {
lengthTimes[9]++;
} else {
lengthTimes[w.length()-1]++;
}
}
}
for(int j = 0; j < 10; j++) {
System.out.println("Word-length " + (j+1) + ": " + lengthTimes[j]);
}
}
public static String longestWord (Scanner s) {
String longest = "";
while (s.hasNext()) {
String word = s.next();
if (word.length() > longest.length()) {
longest = word;
}
}
return (longest.length() + " " + "(\"" + longest + "\")");
}
public static int countWords (String fileName) throws IOException {
File file = new File(fileName);
Scanner fileScanner = new Scanner(file);
int count = 0;
while(fileScanner.hasNext()) {
String word = fileScanner.next();
count++;
}
return count;
}
public static void read(String[] words, String fileName) throws IOException{
File file = new File(fileName);
Scanner s = new Scanner(file);
while (s.hasNext()) {
String word = s.next();
int i;
for ( i=0; i < words.length && words[i] != null; i++ ) {
words[i]=words[i].toLowerCase();
if (words[i].equals(word)) {
break;
}
}
words[i] = word;
}
}
public static int wordList(String[] words) {
int count = 0;
while (words[count] != null) {
count++;
}
return count;
}
}
There are two problems with this code
1.You didn't do null check,although the array contains null values
2.Your array index from 0-8,if you wan't to get element at 9th index it will throw ArrayIndexOutOfBound Exception.
Your code should be like that
public static void lengthFrequency (String [] words) {
int [] lengthTimes = new int [9];
for(int i = 0; i < words.length; i++) {
String w = words[i];
if(null!=w) //This one added for null check
{
/* if(w.length() >= 10) {
lengthTimes[9]++;
} else {
lengthTimes[w.length()-1]++;
}
}*/
//Don't need to check like that ...u can do like below
for(int i = 0; i < words.length; i++) {
String w = words[i];
if(null!=w)
{
lengthTimes[i] =w.length();
}
}
}
//here we should traverse upto length of the array.
for(int i = 0; i < lengthTimes.length; i++) {
System.out.println("Word-length " + (i+1) + ": " + lengthTimes[i]);
}
}
Your String Array String[] words = new String[MAX_WORDS]; is not initialized,you are just declaring it.All its content is null,calling length method in line 31 will give you null pointer exception.
`
Simple mistake. When you declare an array, it is from size 0 to n-1. This array only has indexes from 0 to 8.
int [] lengthTimes = new int [9];
//some code here
lengthTimes[9]++; // <- this is an error (this is line 29)
for(int i = 0; i < 10; i++) {
System.out.println("Word-length " + (i+1) + ": " + lengthTimes[i]); // <- same error when i is 9. This is line 37
When you declare:
String[] words = new String[MAX_WORDS];
You're creating an array with MAX_WORDS of nulls, if your input file don't fill them all, you'll get a NullPointerException at what I think is line 37 in your original file:
if(w.length() >= 10) { // if w is null this would throw Npe
To fix it you may use a List instead:
List<String> words = new ArrayList<String>();
...
words.add( aWord );
Or perhaps you can use a Set if you don't want to have repeated words.

Sentence comparison with NLP

I used lingpipe for sentence detection but I don't have any idea if there is a better tool. As far as I have understood, there is no way to compare two sentences and see if they mean the same thing.
Is there anyother good source where I can have a pre-built method for comparing two sentences and see if they are similar?
My requirement is as below:
String sent1 = "Mary and Meera are my classmates.";
String sent2 = "Meera and Mary are my classmates.";
String sent3 = "I am in Meera and Mary's class.";
// several sentences will be formed and basically what I need to do is
// this
boolean bothAreEqual = compareOf(sent1, sent2);
sop(bothAreEqual); // should print true
boolean bothAreEqual = compareOf(sent2, sent3);
sop(bothAreEqual);// should print true
How to test if the meaning of two sentences are the same: this would be a too open-ended question.
However, there are methods for comparing two sentences and see if they are similar. There are many possible definition for similarity that can be tested with pre-built methods.
See for example http://en.wikipedia.org/wiki/Levenshtein_distance
Distance between
'Mary and Meera are my classmates.'
and 'Meera and Mary are my classmates.':
6
Distance between
'Mary and Meera are my classmates.'
and 'Alice and Bobe are not my classmates.':
14
Distance between
'Mary and Meera are my classmates.'
and 'Some totally different sentence.':
29
code:
public class LevenshteinDistance {
private static int minimum(int a, int b, int c) {
return Math.min(Math.min(a, b), c);
}
public static int computeDistance(CharSequence str1,
CharSequence str2) {
int[][] distance = new int[str1.length() + 1][str2.length() + 1];
for (int i = 0; i <= str1.length(); i++){
distance[i][0] = i;
}
for (int j = 0; j <= str2.length(); j++){
distance[0][j] = j;
}
for (int i = 1; i <= str1.length(); i++){
for (int j = 1; j <= str2.length(); j++){
distance[i][j] = minimum(
distance[i - 1][j] + 1,
distance[i][j - 1] + 1,
distance[i - 1][j - 1]
+ ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1));
}
}
int result = distance[str1.length()][str2.length()];
//log.debug("distance:"+result);
return result;
}
public static void main(String[] args) {
String sent1="Mary and Meera are my classmates.";
String sent2="Meera and Mary are my classmates.";
String sent3="Alice and Bobe are not my classmates.";
String sent4="Some totally different sentence.";
System.out.println("Distance between \n'"+sent1+"' \nand '"+sent2+"': \n"+computeDistance(sent1, sent2));
System.out.println("Distance between \n'"+sent1+"' \nand '"+sent3+"': \n"+computeDistance(sent1, sent3));
System.out.println("Distance between \n'"+sent1+"' \nand '"+sent4+"': \n"+computeDistance(sent1, sent4));
}
}
Here is wat i have come up with. this is just a substitute till i get to the real thing but it might be of some help to people out there..
package com.examples;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import com.aliasi.sentences.MedlineSentenceModel;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;
import com.aliasi.util.Files;
import com.sun.accessibility.internal.resources.accessibility;
public class SentenceWordAnalysisAndLevenshteinDistance {
private static int minimum(int a, int b, int c) {
return Math.min(Math.min(a, b), c);
}
public static int computeDistance(CharSequence str1, CharSequence str2) {
int[][] distance = new int[str1.length() + 1][str2.length() + 1];
for (int i = 0; i <= str1.length(); i++) {
distance[i][0] = i;
}
for (int j = 0; j <= str2.length(); j++) {
distance[0][j] = j;
}
for (int i = 1; i <= str1.length(); i++) {
for (int j = 1; j <= str2.length(); j++) {
distance[i][j] = minimum(
distance[i - 1][j] + 1,
distance[i][j - 1] + 1,
distance[i - 1][j - 1]
+ ((str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0
: 1));
}
}
int result = distance[str1.length()][str2.length()];
return result;
}
static final TokenizerFactory TOKENIZER_FACTORY = IndoEuropeanTokenizerFactory.INSTANCE;
static final SentenceModel SENTENCE_MODEL = new MedlineSentenceModel();
public static void main(String[] args) {
try {
ArrayList<String> sentences = null;
sentences = new ArrayList<String>();
// Reading from text file
// sentences = readSentencesInFile("D:\\sam.txt");
// Giving sentences
// ArrayList<String> sentences = new ArrayList<String>();
sentences.add("Mary and Meera are my classmates.");
sentences.add("Mary and Meera are my classmates.");
sentences.add("Meera and Mary are my classmates.");
sentences.add("Alice and Bobe are not my classmates.");
sentences.add("Some totally different sentence.");
// Self-implemented
wordAnalyser(sentences);
// Internet referred
// levenshteinDistance(sentences);
} catch (Exception e) {
// TODO: handle exception
e.printStackTrace();
}
}
private static ArrayList<String> readSentencesInFile(String path) {
ArrayList<String> sentencesList = new ArrayList<String>();
try {
System.out.println("Reading file from : " + path);
File file = new File(path);
String text = Files.readFromFile(file, "ISO-8859-1");
System.out.println("INPUT TEXT: ");
System.out.println(text);
List<String> tokenList = new ArrayList<String>();
List<String> whiteList = new ArrayList<String>();
Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(
text.toCharArray(), 0, text.length());
tokenizer.tokenize(tokenList, whiteList);
System.out.println(tokenList.size() + " TOKENS");
System.out.println(whiteList.size() + " WHITESPACES");
String[] tokens = new String[tokenList.size()];
String[] whites = new String[whiteList.size()];
tokenList.toArray(tokens);
whiteList.toArray(whites);
int[] sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens,
whites);
System.out.println(sentenceBoundaries.length
+ " SENTENCE END TOKEN OFFSETS");
if (sentenceBoundaries.length < 1) {
System.out.println("No sentence boundaries found.");
return new ArrayList<String>();
}
int sentStartTok = 0;
int sentEndTok = 0;
for (int i = 0; i < sentenceBoundaries.length; ++i) {
sentEndTok = sentenceBoundaries[i];
System.out.println("SENTENCE " + (i + 1) + ": ");
StringBuffer sentenceString = new StringBuffer();
for (int j = sentStartTok; j <= sentEndTok; j++) {
sentenceString.append(tokens[j] + whites[j + 1]);
}
System.out.println(sentenceString.toString());
sentencesList.add(sentenceString.toString());
sentStartTok = sentEndTok + 1;
}
} catch (IOException e) {
// TODO: handle exception
e.printStackTrace();
}
return sentencesList;
}
private static void levenshteinDistance(ArrayList<String> sentences) {
System.out.println("\nLevenshteinDistance");
for (int i = 0; i < sentences.size(); i++) {
System.out.println("Distance between \n'" + sentences.get(0)
+ "' \nand '" + sentences.get(i) + "': \n"
+ computeDistance(sentences.get(0),
sentences.get(i)));
}
}
private static void wordAnalyser(ArrayList<String> sentences) {
System.out.println("No.of Sentences : " + sentences.size());
List<String> stopWordsList = getStopWords();
List<String> tokenList = new ArrayList<String>();
ArrayList<List<String>> filteredSentences = new ArrayList<List<String>>();
for (int i = 0; i < sentences.size(); i++) {
tokenList = new ArrayList<String>();
List<String> whiteList = new ArrayList<String>();
Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(sentences.get(i)
.toCharArray(), 0, sentences.get(i).length());
tokenizer.tokenize(tokenList, whiteList);
System.out.print("Sentence " + (i + 1) + ": " + tokenList.size()
+ " TOKENS, ");
System.out.println(whiteList.size() + " WHITESPACES");
filteredSentences.add(filterStopWords(tokenList, stopWordsList));
}
for (int i = 0; i < sentences.size(); i++) {
System.out.println("\n" + (i + 1) + ". Comparing\n '"
+ sentences.get(0) + "' \nwith\n '" +
sentences.get(i)
+ "' : \n");
System.out.println(filteredSentences.get(0) + "\n and \n"
+ filteredSentences.get(i));
System.out.println("Percentage of similarity: "
+ calculateSimilarity(filteredSentences.get(0),
filteredSentences.get(i))
+ "%");
}
}
private static double calculateSimilarity(List<String> list1,
List<String> list2) {
int length1 = list1.size();
int length2 = list2.size();
int count1 = 0;
int count2 = 0;
double result1 = 0.0;
double result2 = 0.0;
int least, highest;
if (length2 > length1) {
least = length1;
highest = length2;
} else {
least = length2;
highest = length1;
}
// computing result1
for (String string1 : list1) {
if (list2.contains(string1))
count1++;
}
result1 = (count1 * 100) / length1;
// computing result2
for (String string2 : list2) {
if (list1.contains(string2))
count2++;
}
result2 = (count2 * 100) / length2;
double avg = (result1 + result2) / 2;
return avg;
}
private static List<String> getStopWords() {
String stopWordsString = ".,a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your";
List<String> stopWordsList = new ArrayList<String>();
List<String> stopWordTokenList = new ArrayList<String>();
List<String> whiteList = new ArrayList<String>();
Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(
stopWordsString.toCharArray(), 0, stopWordsString.length());
tokenizer.tokenize(stopWordTokenList, whiteList);
for (int i = 0; i < stopWordTokenList.size(); i++) {
// System.out.println((i + 1) + ":" + tokenList.get(i));
if (!stopWordTokenList.get(i).equals(",")) {
stopWordsList.add(stopWordTokenList.get(i));
}
}
System.out.println("No.of stop words: " + stopWordsList.size());
return stopWordsList;
}
private static List<String> filterStopWords(List<String> tokenList,
List<String> stopWordsList) {
List<String> filteredSentenceWords = new ArrayList<String>();
for (String sentenceToken : tokenList) {
if (!stopWordsList.contains(sentenceToken)) {
filteredSentenceWords.add(sentenceToken);
}
}
return filteredSentenceWords;
}
}

Categories

Resources