Java - Stanford NLP - Process all files in directory

I am using Stanford NLP to do some NER analysis on txt files. The problem so far is that I have been unable to read all the files in a directory; I have only been able to read simple Strings. What should be the next step to read several files? I tried with an Iterator but it did not work.
Please see my code below:
import java.io.IOException;
import java.util.List;
import java.util.Properties;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.util.CoreMap;
public class NLPtest2 {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, parse, ner, dcoref, sentiment");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        // how can we read all documents in a directory instead of just a String??
        String text = "I work at Lalalala Ltd. It is awesome";
        Annotation annotation = new Annotation(text);
        pipeline.annotate(annotation);
        // Annotation annotation = pipeline.process(text);
        List<CoreMap> sentences = annotation.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            String sentiment = sentence.get(SentimentCoreAnnotations.SentimentClass.class);
            System.out.println(sentiment + "\t" + sentence);
            // System.out.println(annotation.get(CoreAnnotations.QuotationsAnnotation.class)); // don't need it
            // traversing the words in the current sentence;
            // a CoreLabel is a CoreMap with additional token-specific methods
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                // this is the text of the token
                String word = token.get(TextAnnotation.class);
                // this is the POS tag of the token
                String pos = token.get(PartOfSpeechAnnotation.class);
                // this is the NER label of the token
                String ne = token.get(NamedEntityTagAnnotation.class);
                System.out.println("Text:" + word + "//" + "Part of Speech:" + pos + "//" + "Entity Recognition:" + ne);
            }
        }
    }
}

import edu.stanford.nlp.io.*;
import edu.stanford.nlp.util.*;
import java.util.*;

public class ReadFiles {
    public static void main(String[] args) {
        // args[0] is a file listing one input-file path per line
        List<String> filePaths = IOUtils.linesFromFile(args[0]);
        for (String filePath : filePaths) {
            String fileContents = IOUtils.stringFromFile(filePath);
            // fileContents is what you feed to the pipeline, e.g. new Annotation(fileContents)
        }
    }
}
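If you would rather point the program at a directory than at a file of paths, here is a minimal sketch (the class name AnnotateDirectory and the .txt filter are assumptions; the pipeline setup is the same as in the question, trimmed to the NER-relevant annotators):

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

public class AnnotateDirectory {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        File dir = new File(args[0]);
        File[] files = dir.listFiles((d, name) -> name.endsWith(".txt")); // only txt files
        if (files == null) {
            return; // args[0] was not a directory
        }
        for (File f : files) {
            // read the whole file into a String, then annotate it exactly as with the hard-coded String
            String text = new String(Files.readAllBytes(Paths.get(f.getPath())));
            Annotation annotation = new Annotation(text);
            pipeline.annotate(annotation);
            System.out.println("Processed " + f.getName());
        }
    }
}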

Related

Converting Shape File to RDF document, in Java

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFDataMgr;
import org.geotools.data.FileDataStore;
import org.geotools.data.FileDataStoreFinder;
import org.geotools.data.Query;
import org.geotools.data.simple.SimpleFeatureSource;
import org.geotools.feature.FeatureCollection;
import org.geotools.feature.FeatureIterator;
import org.geotools.swing.data.JFileDataStoreChooser;
import org.opengis.feature.simple.SimpleFeature;
import org.opengis.feature.simple.SimpleFeatureType;
public class ShpToRdf {
    public static void main(String[] args) throws IOException {
        ArrayList<String> names = new ArrayList<String>();
        ArrayList<String> values = new ArrayList<String>();
        File file = JFileDataStoreChooser.showOpenFile("shp", null);
        if (file == null) {
            return;
        }
        FileDataStore myData = FileDataStoreFinder.getDataStore(file);
        SimpleFeatureSource source = myData.getFeatureSource();
        SimpleFeatureType schema = source.getSchema();
        Query query = new Query(schema.getTypeName());
        query.setMaxFeatures(100);
        Model model = ModelFactory.createDefaultModel();
        String shpURI = "http://www.shp.fake/";
        Resource shapeFile = model.createResource(shpURI);
        FeatureCollection<SimpleFeatureType, SimpleFeature> collection = source.getFeatures(query);
        try (FeatureIterator<SimpleFeature> features = collection.features()) {
            while (features.hasNext()) {
                SimpleFeature feature = features.next();
                model.setNsPrefix("shp", shpURI);
                for (org.opengis.feature.Property attribute : feature.getProperties()) {
                    names.add(attribute.getName().toString());
                    values.add(attribute.getValue().toString());
                }
            }
        }
        ArrayList<Integer> ids = new ArrayList<Integer>();
        for (int i = 0; i < names.size(); i++) {
            if (names.get(i).equals("Id")) {
                ids.add(i);
            }
        }
        Property features = model.createProperty(shpURI, "features");
        for (int i = 0; i < ids.size(); i++) {
            Property id = model.createProperty(shpURI, names.get(ids.get(i)));
            shapeFile = model.createResource(shpURI)
                    .addProperty(features, model.createResource()
                            .addProperty(id, model.createResource()
                                    .addProperty(id, values.get(ids.get(i)))
                                    .addProperty(features, "feature1")
                                    .addProperty(features, "feature2")
                                    .addProperty(features, "feature3")));
        }
        RDFDataMgr.write(System.out, model, Lang.RDFXML);
    }
}
I am trying to create an application that converts a Shapefile (shp) to RDF.
The problem is that all I can get from the shp are two ArrayLists: one has the names of the attributes (id, name, geometry, etc.), and the other has the values.
To create the RDF, I have to match each Id with its corresponding values (e.g. Id = 1 has name = road 1, geometry = line, etc.).
Could you help me with this?
Thank you!
I think you should be able to do this by tweaking the following bit of logic:
for (org.opengis.feature.Property attribute : feature.getProperties()) {
    names.add(attribute.getName().toString());
    values.add(attribute.getValue().toString());
}
Instead of putting them in two lists, you can put them in a list of pairs. That way, when you iterate over the list, you know the mapping between the subject and the object.
It should look something like this:
// Pair here could be e.g. javafx.util.Pair or any simple key/value holder
List<Pair<String, String>> contentList = new ArrayList<Pair<String, String>>();
for (org.opengis.feature.Property attribute : feature.getProperties()) {
    Pair<String, String> subjectObjectPairs = new Pair<String, String>(attribute.getName().toString(), attribute.getValue().toString());
    contentList.add(subjectObjectPairs);
}
I'm not sure what the ids ArrayList is for, but you could move that logic into the for loop above to make sure you're only getting identifiers.
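Going one step further, here is a sketch of how the pairing could feed the RDF model (untested; it reuses model, shpURI, and collection from the question, assumes java.util.Map and java.util.LinkedHashMap imports, and the "feature/" URI segment is made up):

// one name -> value map per feature keeps the attributes of each feature together
List<Map<String, String>> featureRows = new ArrayList<>();
try (FeatureIterator<SimpleFeature> features = collection.features()) {
    while (features.hasNext()) {
        SimpleFeature feature = features.next();
        Map<String, String> row = new LinkedHashMap<>();
        for (org.opengis.feature.Property attribute : feature.getProperties()) {
            row.put(attribute.getName().toString(), String.valueOf(attribute.getValue()));
        }
        featureRows.add(row);
    }
}
// one resource per feature: "Id" names the subject, every attribute becomes a property
for (Map<String, String> row : featureRows) {
    Resource featureRes = model.createResource(shpURI + "feature/" + row.get("Id"));
    for (Map.Entry<String, String> e : row.entrySet()) {
        featureRes.addProperty(model.createProperty(shpURI, e.getKey()), e.getValue());
    }
}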

DataSource cannot be resolved - Weka

I have the following class to perform PCA on an ARFF file. I have added the Weka jar to my project, but I am still getting an error saying DataSource cannot be resolved, and I don't know what to do to resolve it. Can anyone suggest what could be wrong?
package project;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import weka.attributeSelection.PrincipalComponents;
public class PCA {
    public static void main(String[] args) {
        try {
            // Load the data.
            DataSource source = new DataSource("../data/ingredients.arff");
            Instances data = source.getDataSet();
            // Perform PCA.
            PrincipalComponents pca = new PrincipalComponents();
            pca.setVarianceCovered(1.0);
            // pca.setCenterData(true);
            pca.setNormalize(true);
            pca.setTransformBackToOriginal(false);
            pca.buildEvaluator(data);
            // Show the data transformed into the eigenvector basis.
            Instances transformedData = pca.transformedData();
            System.out.println(transformedData);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Kafka to HDFS with Confluent source code

For my project, I need to build a class from the Confluent Java code that writes data from a Kafka topic to the HDFS filesystem.
It actually works from the CLI with connect-standalone, but I need to do the same thing from the source code, which I built successfully.
I have a problem with the SinkTask and HdfsConnector classes: an exception shows up in the put method.
Here is my class code:
package io.confluent.connect.hdfs;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.errors.ConnectException;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTaskContext;
import io.confluent.connect.hdfs.avro.AvroFormat;
import io.confluent.connect.storage.common.StorageCommonConfig;
public class main {
    private static Map<String, String> props = new HashMap<>();
    // TOPIC and PARTITION were undefined in the original; defined here so the class compiles
    protected static final String TOPIC = "test";
    protected static final int PARTITION = 0;
    protected static final TopicPartition TOPIC_PARTITION = new TopicPartition(TOPIC, PARTITION);
    protected static String url = "hdfs://localhost:9000";
    protected static SinkTaskContext context;

    public static void main(String[] args) {
        HdfsSinkConnector hk = new HdfsSinkConnector();
        HdfsSinkTask h = new HdfsSinkTask();
        props.put(StorageCommonConfig.STORE_URL_CONFIG, url);
        props.put(HdfsSinkConnectorConfig.HDFS_URL_CONFIG, url);
        props.put(HdfsSinkConnectorConfig.FLUSH_SIZE_CONFIG, "3");
        props.put(HdfsSinkConnectorConfig.FORMAT_CLASS_CONFIG, AvroFormat.class.getName());
        try {
            hk.start(props);
            Collection<SinkRecord> sinkRecords = new ArrayList<>();
            SinkRecord record = new SinkRecord("test", 0, null, null, null, null, 0);
            sinkRecords.add(record);
            // note: context is never assigned, and h.start(props) is never called,
            // which is a likely cause of the exception in put()
            h.initialize(context);
            h.put(sinkRecords);
            hk.stop();
        } catch (Exception e) {
            throw new ConnectException("Couldn't start HdfsSinkConnector due to configuration error", e);
        }
    }
}
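If the failure in put does come from the task lifecycle, the order the Connect framework uses when it drives a SinkTask is initialize, then start, then put. A sketch under that assumption (context would have to be a real SinkTaskContext rather than a null field):

HdfsSinkTask h = new HdfsSinkTask();
h.initialize(context); // context must be a real SinkTaskContext, not an unassigned field
h.start(props);        // without start(), put() has no writers or configuration to work with
h.put(sinkRecords);
h.stop();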

Java - Can't pass Directory variable as an argument to IndexReader.open() in Apache Lucene 6.4.2

I'm trying to use the open method defined in the Lucene documentation here: https://lucene.apache.org/core/3_5_0/api/core/org/apache/lucene/index/IndexReader.html (search the page for 'open'; note that the linked Javadoc is for Lucene 3.5.0). However, NetBeans 8.1 with Apache Lucene 6.4.2 flags an in-line error at the statement reader = IndexReader.open(indexDirectory);. Here are the error and the code.
Cannot find symbol
symbol: method open(Directory)
location: class IndexReader
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class Indexing_Searching
{
    public static final String FIELD_CONTENTS = "contents";

    public int searchIndex(String instring, String Index_Dir_Path)
    {
        int numDocs = 0;
        try
        {
            Path path = Paths.get(Index_Dir_Path);
            Directory indexDirectory = FSDirectory.open(path);
            IndexReader reader;
            reader = IndexReader.open(indexDirectory); // this is the line NetBeans flags
            // note: the field name "content" below does not match FIELD_CONTENTS ("contents")
            Term term = new Term("content", instring);
            numDocs = reader.docFreq(term);
            // System.out.println("Number of documents for given key " + instring + " # docs " + numDocs);
        }
        catch (CorruptIndexException e)
        {
            e.printStackTrace();
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        return numDocs;
    } // end of one-word searching function
}
According to the current IndexReader Javadoc for Lucene 6.4.2, you should use DirectoryReader.open: the static open methods were moved from IndexReader to DirectoryReader in Lucene 4.0, which is why the 3.5.0 documentation you linked no longer matches.
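A minimal sketch of the fixed body of searchIndex under that change (same variables as in the question; it also needs import org.apache.lucene.index.DirectoryReader;):

Path path = Paths.get(Index_Dir_Path);
Directory indexDirectory = FSDirectory.open(path);
// DirectoryReader.open(Directory) is the Lucene 4.0+ replacement for IndexReader.open
IndexReader reader = DirectoryReader.open(indexDirectory);
Term term = new Term("content", instring);
numDocs = reader.docFreq(term);
reader.close(); // release the index files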

Remove user from Active Directory

I have an email distribution list "CTW DEV". I would like to remove the user 'rakeshdw' from this list in Active Directory using Java. Please find my code below.
It throws an exception and the user is not removed. Please suggest the required changes. Thanks!
import java.util.Properties;
import javax.naming.Context;
import javax.naming.directory.BasicAttribute;
import javax.naming.directory.DirContext;
import javax.naming.directory.InitialDirContext;
import javax.naming.directory.ModificationItem;
public class DeleteFromADGroup {
    private String adGroup, dn;
    private DirContext ctx;
    private String adminName = "intranet\\patilume";

    DeleteFromADGroup() {
        try {
            this.adGroup = "CN=CTW_DEV";
            this.dn = "OU=DistributionLists,OU=Messaging,DC=INTRANET,DC=INFOSYSINT,DC=com";
            Properties pr = new Properties();
            pr.setProperty(Context.INITIAL_CONTEXT_FACTORY, "com.sun.jndi.ldap.LdapCtxFactory");
            pr.setProperty(Context.PROVIDER_URL, "ldap://intranet.infosysint.com");
            pr.setProperty(Context.SECURITY_AUTHENTICATION, "simple");
            pr.setProperty(Context.SECURITY_CREDENTIALS, "myPassword"); // the password
            pr.setProperty(Context.SECURITY_PRINCIPAL, this.adminName);
            pr.setProperty(Context.REFERRAL, "ignore");
            this.ctx = new InitialDirContext(pr);
        } catch (Exception e) {
            System.out.println("Exception in constructor: " + e);
        }
    }

    public static void main(String[] args) {
        DeleteFromADGroup dadg = new DeleteFromADGroup();
        dadg.deleteUser("CTW_DEV", "rakeshdw");
    }

    private void deleteUser(String ADGroup, String username) {
        String groupName = "CN=" + ADGroup + ",OU=DistributionLists,OU=Messaging,DC=INTRANET,DC=INFOSYSINT,DC=com";
        try {
            ModificationItem mods[] = new ModificationItem[1];
            mods[0] = new ModificationItem(DirContext.REMOVE_ATTRIBUTE, new BasicAttribute("member", username));
            // update the group
            ctx.modifyAttributes(groupName, mods);
            ctx.close();
        } catch (Exception e) {
            System.out.println("Exception while removing user from DL: " + e);
        }
    }
}
The exception I get is shown below:
javax.naming.OperationNotSupportedException: [LDAP: error code 53 - 0000054F: SvcErr: DSID-031A0FC0, problem 5003 (WILL_NOT_PERFORM), data 0
You need to pass the DN of the user as the value of the member attribute in your deleteUser method. For example, it might be something like cn=rakeshdw,ou=people,dc=contoso,dc=com.
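A minimal sketch of that change (the OU path in the example DN is an assumption; look the real DN up in your directory):

// the member attribute of an AD group holds full DNs, not bare usernames
private void deleteUser(String adGroup, String userDn) {
    String groupDn = "CN=" + adGroup + ",OU=DistributionLists,OU=Messaging,DC=INTRANET,DC=INFOSYSINT,DC=com";
    try {
        ModificationItem[] mods = new ModificationItem[1];
        mods[0] = new ModificationItem(DirContext.REMOVE_ATTRIBUTE, new BasicAttribute("member", userDn));
        ctx.modifyAttributes(groupDn, mods);
        ctx.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

The call then passes the user's full DN (the OU here is a guess):

dadg.deleteUser("CTW_DEV", "CN=rakeshdw,OU=Users,DC=INTRANET,DC=INFOSYSINT,DC=com");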
