Classify and Predict Multiple Attributes with Weka - java
I need to input 6 attributes and classify/predict 3 attributes from that input using Java/Weka programmatically. I've figured out how to predict 1 (the last) attribute, but how can I change this to train and predict the last 3 at the same time?
The numbers in the .arff files correspond to movie objects in a database.
Here is my Java code:
import java.io.BufferedReader;
import java.io.FileReader;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.trees.DecisionStump;
import weka.classifiers.trees.J48;
import weka.classifiers.trees.RandomForest;
import weka.classifiers.trees.RandomTree;
import weka.core.Instances;
import weka.filters.unsupervised.attribute.Remove;
public class WekaTrial {

    /**
     * Trains a J48 decision tree (wrapped in a FilteredClassifier that removes
     * the first attribute) on the training ARFF file, classifies every instance
     * of the testing ARFF file, and prints the expected/predicted labels
     * together with the number and percentage of correct predictions.
     *
     * @param args unused
     * @throws Exception if an ARFF file cannot be read or the classifier fails
     */
    public static void main(String[] args) throws Exception {
        // Load both data sets; the last attribute of each is the class.
        Instances training_data = loadArff(
                "C:/Users/Me/Desktop/File_Project/src/movie_training.arff");
        Instances testing_data = loadArff(
                "C:/Users/Me/Desktop/FileProject/src/movie_testing.arff");

        // Print initial data summary
        String summary = training_data.toSummaryString();
        int number_samples = training_data.numInstances();
        int number_attributes_per_sample = training_data.numAttributes();
        System.out.println("Number of attributes in model = "
                + number_attributes_per_sample);
        System.out.println("Number of samples = " + number_samples);
        System.out.println("Summary: " + summary);
        System.out.println();

        // a classifier for decision trees:
        J48 j48 = new J48();

        // filter for removing attributes:
        Remove rm = new Remove();
        rm.setAttributeIndices("1"); // remove 1st attribute

        // filtered classifier: applies the filter, then the tree
        FilteredClassifier fc = new FilteredClassifier();
        fc.setFilter(rm);
        fc.setClassifier(j48);

        // Whole-number counters (floats would print as "5.0" and lose
        // precision for very large test sets).
        int correct = 0;
        int incorrect = 0;

        // train using movie_training.arff:
        fc.buildClassifier(training_data);

        // test using movie_testing.arff:
        for (int i = 0; i < testing_data.numInstances(); i++) {
            int predictedIndex = (int) fc.classifyInstance(testing_data.instance(i));
            int expectedIndex = (int) testing_data.instance(i).classValue();

            System.out.print("Expected values: "
                    + testing_data.classAttribute().value(expectedIndex));
            System.out.println(", Predicted values: "
                    + testing_data.classAttribute().value(predictedIndex));

            // Compare the nominal value indices rather than the label strings:
            // '==' on String objects tests reference identity, not equality.
            if (expectedIndex == predictedIndex) {
                correct += 1;
            } else {
                incorrect += 1;
            }
        }

        // Print correct/incorrect; guard against an empty test set so the
        // percentage is not NaN.
        int total = correct + incorrect;
        float percent_correct = total == 0 ? 0f : (float) correct / total * 100;
        System.out.println("Number correct: " + correct + "\nNumber incorrect: " + incorrect + "\nPercent correct: " +
                percent_correct + "%");
    }

    /**
     * Reads an ARFF file into memory and marks its last attribute as the class.
     * The reader is closed as soon as the data set has been constructed.
     *
     * @param path location of the ARFF file
     * @return the loaded data set with its class index set to the last attribute
     * @throws Exception if the file cannot be opened or parsed
     */
    private static Instances loadArff(String path) throws Exception {
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            Instances data = new Instances(reader);
            data.setClassIndex(data.numAttributes() - 1);
            return data;
        }
    }
}
This is my .arff training file (with excess rows removed):
@relation movie_data
@attribute movie1_one {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie1_two {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie1_three {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie2_one {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie2_two {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie2_three {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute decision_one {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute decision_two {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute decision_three {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@data
18,18,18,18,18,18,18,18,18
28,18,36,18,53,10769,18,53,10769
37,37,37,28,12,14,28,12,14
27,53,27,18,10749,10769,27,53,27
12,12,12,35,10751,35,12,12,12
35,18,10749,18,18,18,35,18,10749
28,12,878,53,53,53,53,53,53
18,18,18,28,37,10769,18,18,18
18,53,18,28,12,35,18,53,18
28,80,53,80,18,10749,28,80,53
18,10749,18,18,10756,18,18,10756,18
18,10749,10769,28,12,878,18,10749,10769
18,10756,18,16,35,10751,16,35,10751
35,18,10751,35,18,10752,35,18,10751
And the .arff testing file:
@relation movie_data
@attribute movie1_one {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie1_two {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie1_three {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie2_one {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie2_two {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute movie2_three {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute decision_one {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute decision_two {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@attribute decision_three {28,12,16,35,80,105,99,18,82,2916,10751,10750,14,10753,10769,36,10595,27,10756,10402,22,9648,10754,1115,10749,878,10755,9805,10758,10757,10748,10770,53,10752,37}
@data
18,27,53,18,53,10756,18,27,53
35,18,10749,18,10769,18,18,10769,18
16,878,53,16,18,16,16,18,16
35,10749,10757,18,18,18,18,18,18
80,18,10748,18,10749,18,18,10749,18
28,18,36,35,18,10751,28,18,36
18,10749,10769,35,18,10402,35,18,10402
28,12,878,18,10749,10769,18,10749,10769
35,10749,35,14,10402,10751,14,10402,10751
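If I understood you correctly, you have a "Multi-Target" problem (sometimes loosely called "Multi-Class", although that term normally refers to a single class attribute with more than two values): several class attributes have to be predicted from the same inputs.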
You have several simple options to solve the problem:
Create a new target class which incorporates all 3 (concatenation of decision_one, decision_two and decision_three)
Train each target separately.
I think the simplest approach would be, as Bella said, to train three separate models, one for each class, possibly removing the rest of the class attribs (depending on whether or not you want the other two classes to influence your classification).
Related
Sorting strings via stream
I am doing a coding exercise where I take the the raw data from a csv file and I print it in order of lowest to highest ranked literacy rates. For example: Adult literacy rate, population 15+ years, female (%),United Republic of Tanzania,2015,76.08978 Adult literacy rate, population 15+ years, female (%),Zimbabwe,2015,85.28513 Adult literacy rate, population 15+ years, male (%),Honduras,2014,87.39595 Adult literacy rate, population 15+ years, male (%),Honduras,2015,88.32135 Adult literacy rate, population 15+ years, male (%),Angola,2014,82.15105 Turns into: Niger (2015), female, 11.01572 Mali (2015), female, 22.19578 Guinea (2015), female, 22.87104 Afghanistan (2015), female, 23.87385 Central African Republic (2015), female, 24.35549 My code: import java.io.IOException; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.Scanner; public class LiteracyComparison { public static void main(String[] args) throws IOException { List<String> literacy = new ArrayList<>(); try (Scanner scanner = new Scanner(Paths.get("literacy.csv"))) { while(scanner.hasNextLine()){ String row = scanner.nextLine(); String[] line = row.split(","); line[2] = line[2].trim().substring(0, line[2].length() - 5); line[3] = line[3].trim(); line[4] = line[4].trim(); line[5] = line[5].trim(); String l = line[3] + " (" + line[4] + "), " + line[2] + ", " + line[5]; literacy.add(l); } } // right about where I get lost literacy.stream().sorted(); } } Now I have converted the raw data into the correct format, it's just I am lost on how to sort it. I am also wondering if there is a more efficient way to do this via the streams method. Please and thank you.
I took a few liberties while refactoring your code, but the idea is the same. This could be further improved but it is not intended to be a perfect solution, just something to answer your question and put you on the right track. The main idea here is to create a nested class called LiteracyData, which stores the summary you had before as a String. However, we also want to store the literacy rate so we have something to sort by. Then you can use a Java Comparator to define your own method for comparing custom classes, in this case LiteracyData. Finally, tie it all together by calling the sort function on your List, while passing in the custom Comparator as an argument. That will sort your list. You can then print it to view the results. import java.io.IOException; import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.Scanner; import java.util.Comparator; public class LiteracyComparison { // Define a class that stores your data public class LiteracyData { private String summary; private float rate; public LiteracyData(String summary, float rate) { super(); this.summary = summary; this.rate = rate; } } // This is a custom Comparator we defined for sorting LiteracyData public class LiteracySorter implements Comparator<LiteracyData> { #Override public int compare(LiteracyData d1, LiteracyData d2) { return Float.compare(d1.rate, d2.rate); } } public void run() { List<LiteracyData> literacy = new ArrayList<>(); try (Scanner scanner = new Scanner(Paths.get("literacy.csv"))) { while(scanner.hasNextLine()){ String row = scanner.nextLine(); String[] line = row.split(","); line[2] = line[2].trim().substring(0, line[2].length() - 5); line[3] = line[3].trim(); line[4] = line[4].trim(); line[5] = line[5].trim(); String l = line[3] + " (" + line[4] + "), " + line[2] + ", " + line[5]; LiteracyData data = new LiteracyData(l, Float.parseFloat(line[5])); literacy.add(data); } } catch (Exception e) { System.out.println(e.getMessage()); } // 
Sort the list using your custom LiteracyData comparator literacy.sort(new LiteracySorter()); // Iterate through the list and print each item to ensure it is sorted for(LiteracyData data : literacy) { System.out.println(data.summary); } } public static void main(String[] args) throws IOException { LiteracyComparison comparison = new LiteracyComparison(); comparison.run(); } }
Java Hash map / Array List Count distinct values
I am pretty new into programming and I have an assignment to make, but I got stuck. I have to implement a program which will read a CSV file (1 million+ lines) and count how many clients ordered "x" distinct products on a specific day. The CSV looks like this: Product Name | Product ID | Client ID | Date Name 544 86 10/12/2017 Name 545 86 10/12/2017 Name 644 87 10/12/2017 Name 644 87 10/12/2017 Name 9857 801 10/12/2017 Name 3022 801 10/12/2017 Name 3021 801 10/12/2017 The result from my code is: 801: 2 - incorrect 86: 2 - correct 87: 2 - incorrect Desired output is: Client 1 (801): 3 distinct products Client 2 (86): 2 distinct products Client 3 (87): 1 distinct product Additionally, If I want to know how many clients ordered 2 distinct products I would like a result to look like this: Total: 1 client ordered 2 distinct products If I want to know the maximum number of distinct products ordered in a day, I would like the result to look like this: The maximum number of distinct products ordered is: 3 I tried to use a Hash Map and Multimap by Google Guava (my best guess here), but I couldn't wrap my head around it. My code looks like this: package Test; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.HashMultimap; public class Test { public static void main(String[] args) { //HashMultimap<String, String> myMultimap = HashMultimap.create(); Map<String, MutableInteger> map = new HashMap<String, MutableInteger>(); ArrayList<String> linesList = new ArrayList<>(); // Input of file which needs to be parsed String csvFile = "file.csv"; BufferedReader csvReader; // Data split by 'TAB' in CSV file String csvSplitBy = "\t"; try { // Read the CSV file into an ArrayList array for easy processing. 
String line; csvReader = new BufferedReader(new FileReader(csvFile)); while ((line = csvReader.readLine()) !=null) { linesList.add(line); } csvReader.close(); } catch (IOException e) { e.printStackTrace(); } // Process each CSV file line which is now contained within // the linesList list Array for (int i = 0; i < linesList.size(); i++) { String[] data = linesList.get(i).split(csvSplitBy); String col2 = data[1]; String col3 = data[2]; String col4 = data[3]; // Determine if Column 4 has the desired date // and count the values if (col4.contains("10/12/2017")) { String key = col3; if (map.containsKey(key)) { MutableInteger count = map.get(key); count.set(count.get() + 1); } else { map.put(key, new MutableInteger(1)); } } } for (final String k : map.keySet()) { if (map.get(k).get() == 2) { System.out.println(k + ": " + map.get(k).get()); } } } } Any advise or suggestion on how this can be implemented would be greatly appreciated. Thank you in advance guys.
You could store a Setof productIds per clientId, and just take the size of that. As a Set does not allow duplicate values, this will effectively give you the distinct number of productIds. Also, I recommend that you give your variables meaningful name instead of col2, k, map... This will make your code more readable. Map<String, Set<String>> distinctProductsPerClient = new HashMap<String, Set<String>>(); // Process each CSV file line which is now contained within // the linesList list Array // Start from 1 to skip the first line for (int i = 1; i < linesList.size(); i++) { String line = linesList.get(i); String[] data = line.split(csvSplitBy); String productId = data[1]; String clientId = data[2]; String date = data[3]; // Determine if Column 4 has the desired date // and count the values if (date.contains("10/12/2017")) { if (!distinctProductsPerClient.containsKey(clientId)) { distinctProductsPerClient.put(clientId, new HashSet<>()); } distinctProductsPerClient.get(clientId).add(productId); } } for (final String clientId : distinctProductsPerClient.keySet()) { System.out.println(clientId + ": " + distinctProductsPerClient.get(clientId).size()); } More advanced solution using Stream API (requires Java 9) If you introduce the class OrderData(that represents a single line in the CSV) like this: private static class OrderData { private final String productName; private final String productId; private final String clientId; private final String date; public OrderData(String csvLine) { String[] data = csvLine.split("\t"); this.productName = data[0]; this.productId = data[1]; this.clientId = data[2]; this.date = data[3]; } public String getProductName() { return productName; } public String getProductId() { return productId; } public String getClientId() { return clientId; } public String getDate() { return date; } } you can replace the for loop with this: Map<String, Set<String>> distinctProductsPerClient2 = linesList.stream() .skip(1) .map(OrderData::new) 
.collect(groupingBy(OrderData::getClientId, mapping(OrderData::getProductId, toSet()))); But I reckon this might be a little too complex if you're new to programming (although it would be a good exercise to try to understand what the above code does).
Print actual and predicted class labels using Random Forest in Java
I have a large datasets with 10000 records such that 5000 belong to class 1 and remaining 5000 to class -1. I used Random Forest and obtained a good accuracy over 90%. Now if I have an arff file #relation cds_orf #attribute start numeric #attribute end numeric #attribute score numeric #attribute orf_coverage numeric #attribute class {1,-1} #data (suppose this contains 5 records) my output should be something like this No Actual_class Predicted class 1 1 1 2 1 1 3 -1 -1 4 1 -1 5 1 1 I want the Java code to print this output. Thanks. (Note: I have used classifier.classifyInstance() but it gives NullPointerException)
Well I found the answer myself after a lot of research. The following code does the same and writes the output to anther file orf_out. import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileReader; import java.io.FileWriter; import java.io.PrintWriter; import java.util.Random; import weka.classifiers.Evaluation; import weka.classifiers.trees.RandomForest; import weka.core.Instances; /** * * #author samy */ public class WekaTest { /** * #throws java.lang.Exception */ public static void rfnew() throws Exception { BufferedReader br; int numFolds = 10; br = new BufferedReader(new FileReader("orf_arff")); Instances trainData = new Instances(br); trainData.setClassIndex(trainData.numAttributes() - 1); br.close(); RandomForest rf = new RandomForest(); rf.setNumTrees(100); Evaluation evaluation = new Evaluation(trainData); evaluation.crossValidateModel(rf, trainData, numFolds, new Random(1)); rf.buildClassifier(trainData); PrintWriter out = new PrintWriter("orf_out"); out.println("No.\tTrue\tPredicted"); for (int i = 0; i < trainData.numInstances(); i++) { String trueClassLabel; trueClassLabel = trainData.instance(i).toString(trainData.classIndex()); // Discreet prediction double predictionIndex = rf.classifyInstance(trainData.instance(i)); // Get the predicted class label from the predictionIndex. 
String predictedClassLabel; predictedClassLabel = trainData.classAttribute().value((int) predictionIndex); out.println((i+1)+"\t"+trueClassLabel+"\t"+predictedClassLabel); } out.println(evaluation.toSummaryString("\nResults\n======\n", true)); out.println(evaluation.toClassDetailsString()); out.println("Results For Class -1- "); out.println("Precision= " + evaluation.precision(0)); out.println("Recall= " + evaluation.recall(0)); out.println("F-measure= " + evaluation.fMeasure(0)); out.println("Results For Class -2- "); out.println("Precision= " + evaluation.precision(1)); out.println("Recall= " + evaluation.recall(1)); out.println("F-measure= " + evaluation.fMeasure(1)); out.close(); } } I needed to use buildClassifier in my code.
Print out prediction with WEKA in Java
I am trying to make a prediction with Weka in Java, using the Naive Bayes Classifier, with the following code: JAVA public class Run { public static void main(String[] args) throws Exception { ConverterUtils.DataSource source1 = new ConverterUtils.DataSource("./data/train.arff"); Instances train = source1.getDataSet(); // setting class attribute if the data format does not provide this information // For example, the XRFF format saves the class attribute information as well if (train.classIndex() == -1) train.setClassIndex(train.numAttributes() - 1); ConverterUtils.DataSource source2 = new ConverterUtils.DataSource("./data/test.arff"); Instances test = source2.getDataSet(); // setting class attribute if the data format does not provide this information // For example, the XRFF format saves the class attribute information as well if (test.classIndex() == -1) test.setClassIndex(train.numAttributes() - 1); // model NaiveBayes naiveBayes = new NaiveBayes(); naiveBayes.buildClassifier(train); Evaluation evaluation = new Evaluation(train); evaluation.evaluateModel(naiveBayes, test); } } TRAIN #relation weather #attribute outlook {sunny, overcast, rainy} #attribute temperature real #attribute humidity real #attribute windy {TRUE, FALSE} #attribute play {yes, no} #data sunny,85,85,FALSE,no sunny,80,90,TRUE,no ... PREDICT #relation weather #attribute outlook {sunny, overcast, rainy} #attribute temperature real #attribute humidity real #attribute windy {TRUE, FALSE} #attribute play {yes, no} #data sunny,85,85,FALSE,? In the GUI the predicted output is === Predictions on test split === inst#, actual, predicted, error, probability distribution 1 ? 2:no + 0.145 *0.855 How can I get this output with Java? Which method do I need to use to get this?
public class Run { public static void main(String[] args) throws Exception { ConverterUtils.DataSource source1 = new ConverterUtils.DataSource("./data/train.arff"); Instances train = source1.getDataSet(); // setting class attribute if the data format does not provide this information // For example, the XRFF format saves the class attribute information as well if (train.classIndex() == -1) train.setClassIndex(train.numAttributes() - 1); ConverterUtils.DataSource source2 = new ConverterUtils.DataSource("./data/test.arff"); Instances test = source2.getDataSet(); // setting class attribute if the data format does not provide this information // For example, the XRFF format saves the class attribute information as well if (test.classIndex() == -1) test.setClassIndex(train.numAttributes() - 1); // model NaiveBayes naiveBayes = new NaiveBayes(); naiveBayes.buildClassifier(train); // this does the trick double label = naiveBayes.classifyInstance(test.instance(0)); test.instance(0).setClassValue(label); System.out.println(test.instance(0).stringValue(4)); } }
weka: add new instance to dataset
I have a weka dataset: #attribute uid numeric #attribute itemid numeric #attribute rating numeric #attribute timestamp numeric #data 196 242 3 881250949 186 302 3 891717742 22 377 1 878887116 196 51 5 881250949 244 51 2 880606923 if I want to add a new instance like this: 244 59 2 880606923 how can I do it ? something like this ? Instances newData = arffLoader.getDataSet(); for (int i = 0; i < newData.numInstances(); i++) { Instance one = newData.instance(i); one.setDataset(data); data.add(one); }
try following code. What you need to do create a double array for your new values. Use DenseInstance class to add them to your Instances object. public static void main(String[] args) { String dataSetFileName = "stackoverflowQuestion.arff"; Instances data = MyUtilsForWekaInstanceHelper.getInstanceFromFile(dataSetFileName); System.out.println("Before adding"); System.out.println(data); double[] instanceValue1 = new double[data.numAttributes()]; instanceValue1[0] = 244; instanceValue1[1] = 59; instanceValue1[2] = 2; instanceValue1[3] = 880606923; DenseInstance denseInstance1 = new DenseInstance(1.0, instanceValue1); data.add(denseInstance1); System.out.println("-----------------------------------------------------------"); System.out.println("After adding"); System.out.println(data); public class MyUtilsForWekaInstanceHelper { public static Instances getInstanceFromFile(String pFileName) { Instances data = null; try { BufferedReader reader = new BufferedReader(new FileReader(pFileName)); data = new Instances(reader); reader.close(); // setting class attribute data.setClassIndex(data.numAttributes() - 1); } catch (Exception e) { throw new RuntimeException(e); } return data; } } output is following. Before adding #relation stackoverflowQuestion #attribute uid numeric #attribute itemid numeric #attribute rating numeric #attribute timestamp numeric #data 196,242,3,881250949 186,302,3,891717742 22,377,1,878887116 196,51,5,881250949 244,51,2,880606923 --------------------------------------------------------------------------------- After adding #relation stackoverflowQuestion #attribute uid numeric #attribute itemid numeric #attribute rating numeric #attribute timestamp numeric #data 196,242,3,881250949 186,302,3,891717742 22,377,1,878887116 196,51,5,881250949 244,51,2,880606923 244,59,2,880606923
// You can simply append the new line to your ARFF file like this:
String filename = "MyDataset.arff";
// Passing true opens the file in append mode, so existing content is kept.
// try-with-resources closes the writer even if the write fails.
try (FileWriter fwriter = new FileWriter(filename, true)) {
    fwriter.write("244 59 2 880606923\n"); // appends the new instance to the file
}
New instances can be easily added to any existing dataset as follows: //assuming we already have arff loaded in a variable called dataset Instance newInstance = new Instance(); for(int i = 0 ; i < dataset.numAttributes() ; i++) { newInstance.setValue(i , value); //i is the index of attribute //value is the value that you want to set } //add the new instance to the main dataset at the last position dataset.add(newInstance); //repeat as necessary