Java LightSIDE - How to categorize data with LightSIDE?
I have set up the LightSIDE plugin and it runs properly, but I don't know why I can't save my data to an empty file. This is the simple structure I made.
Activity is the list of data that needs to be categorized.
I have 3 categories, and each of them has its own type.
I have already defined each category with a specific list of words. For example: Food ({Sushi, Food, Japan}, {Cap Jay, Food, Chinese}, {Jog, Sport, Running}, ...)
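To make this concrete, here is a minimal sketch of how I picture these labeled examples as a LightSIDE DocumentList. It only reuses the DocumentList calls that already appear in my LightSideService class below; the "Activity" and "Code" column names are just illustrative, and the import paths are my best guess at the LightSIDE source layout.
import edu.cmu.side.model.data.DocumentList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
public class TrainingDataSketch {
    public static DocumentList buildTrainingDocs() {
        // each activity text and the category (Code) it belongs to
        List<String> activities = new ArrayList<>(Arrays.asList("Sushi", "Cap Jay", "Jog"));
        List<String> codes = new ArrayList<>(Arrays.asList("Food", "Food", "Sport"));
        DocumentList trainingDocs = new DocumentList();
        trainingDocs.setName("TrainingData.csv");
        trainingDocs.addAnnotation("Activity", activities, false);
        trainingDocs.addAnnotation("Code", codes, false);
        // mark which column holds the text to classify
        trainingDocs.setTextColumns(new HashSet<>(Arrays.asList("Activity")));
        return trainingDocs;
    }
}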
And this is how I save my prediction with LightSIDE.
public void predictSectionType(String[] sections, List<String> activityList) {
LightSideService currentLightsideHelper = new LightSideService();
Recipe newRecipe;
// Initialize SIDEPlugin
currentLightsideHelper.initSIDEPlugin();
try {
// Load Recipe with Extracted Features & Trained Models
ClassLoader myClassLoader = getClass().getClassLoader();
newRecipe = ConverterControl.readFromXML(new InputStreamReader(myClassLoader.getResourceAsStream("static/lightsideTrainingResult/trainingData.xml")));
// Predict Result Data
Recipe recipeToPredict = currentLightsideHelper.loadNewDocumentsFromCSV(sections); // DocumentList & Recipe Created
currentLightsideHelper.predictLabels(recipeToPredict, newRecipe);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
I have a LightSideService class as a summary class for the LightSIDE functions.
public class LightSideService {
// Extract Features Parameters
final String featureTableName = "1Grams";
final int featureThreshold = 2;
final String featureAnnotation = "Code";
final Type featureType = Type.NOMINAL;
// Build Models Parameters
final String trainingResultName = "Bayes_1Grams";
// Predict Labels Parameters
final String predictionColumnName = featureAnnotation + "_Prediction";
final boolean showMaxScore = false;
final boolean showDists = true;
final boolean overwrite = false;
final boolean useEvaluation = false;
public DocumentListTableModel model = new DocumentListTableModel(null);
public Map<String, Serializable> validationSettings = new TreeMap<String, Serializable>();
public Map<FeaturePlugin, Boolean> featurePlugins = new HashMap<FeaturePlugin, Boolean>();
public Map<LearningPlugin, Boolean> learningPlugins = new HashMap<LearningPlugin, Boolean>();
public Collection<ModelMetricPlugin> modelEvaluationPlugins = new ArrayList<ModelMetricPlugin>();
public Map<WrapperPlugin, Boolean> wrapperPlugins = new HashMap<WrapperPlugin, Boolean>();
// Initialize Data ==================================================
public void initSIDEPlugin() {
SIDEPlugin[] featureExtractors = PluginManager.getSIDEPluginArrayByType("feature_hit_extractor");
boolean selected = true;
for (SIDEPlugin fe : featureExtractors) {
featurePlugins.put((FeaturePlugin) fe, selected);
selected = false;
}
SIDEPlugin[] learners = PluginManager.getSIDEPluginArrayByType("model_builder");
for (SIDEPlugin le : learners) {
learningPlugins.put((LearningPlugin) le, true);
}
SIDEPlugin[] tableEvaluations = PluginManager.getSIDEPluginArrayByType("model_evaluation");
for (SIDEPlugin fe : tableEvaluations) {
modelEvaluationPlugins.add((ModelMetricPlugin) fe);
}
SIDEPlugin[] wrappers = PluginManager.getSIDEPluginArrayByType("learning_wrapper");
for (SIDEPlugin wr : wrappers) {
wrapperPlugins.put((WrapperPlugin) wr, false);
}
}
//Used to Train Models, adjust parameters according to model
public void initValidationSettings(Recipe currentRecipe) {
validationSettings.put("testRecipe", currentRecipe);
validationSettings.put("testSet", currentRecipe.getDocumentList());
validationSettings.put("annotation", "Age");
validationSettings.put("type", "CV");
validationSettings.put("foldMethod", "AUTO");
validationSettings.put("numFolds", 10);
validationSettings.put("source", "RANDOM");
validationSettings.put("test", "true");
}
// Load CSV Doc ==================================================
public Recipe loadNewDocumentsFromCSV(String filePath) {
DocumentList testDocs;
testDocs = chooseDocumentList(filePath);
if (testDocs != null) {
testDocs.guessTextAndAnnotationColumns();
Recipe currentRecipe = Recipe.fetchRecipe();
currentRecipe.setDocumentList(testDocs);
return currentRecipe;
}
return null;
}
public Recipe loadNewDocumentsFromCSV(String[] rootCauseList) {
DocumentList testDocs;
testDocs = chooseDocumentList(rootCauseList);
if (testDocs != null) {
testDocs.guessTextAndAnnotationColumns();
Recipe currentRecipe = Recipe.fetchRecipe();
currentRecipe.setDocumentList(testDocs);
return currentRecipe;
}
return null;
}
protected DocumentList chooseDocumentList(String filePath) {
TreeSet<String> docNames = new TreeSet<String>();
docNames.add(filePath);
try {
DocumentList testDocs;
Charset encoding = Charset.forName("UTF-8");
testDocs = ImportController.makeDocumentList(docNames, encoding);
return testDocs;
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
protected DocumentList chooseDocumentList(String[] rootCauseList) {
try {
DocumentList testDocs;
testDocs = new DocumentList();
testDocs.setName("TestData.csv");
List<String> codes = new ArrayList<>();
List<String> roots = new ArrayList<>();
for (String s : rootCauseList) {
codes.add("");
roots.add((s != null) ? s : "");
}
testDocs.addAnnotation("Code", codes, false);
testDocs.addAnnotation("Root Cause Failure Description", roots, false);
return testDocs;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
// Save/Load XML ==================================================
public void saveRecipeToXml(Recipe currentRecipe, String filePath) {
File f = new File(filePath);
try {
ConverterControl.writeToXML(f, currentRecipe);
} catch (Exception e) {
e.printStackTrace();
}
}
public Recipe loadRecipeFromXml(String filePath) throws FileNotFoundException, IOException {
Recipe currentRecipe = ConverterControl.loadRecipe(filePath);
return currentRecipe;
}
// Extract Features ==================================================
public Recipe prepareBuildFeatureTable(Recipe currentRecipe) {
// Add Feature Plugins
Collection<FeaturePlugin> plugins = new TreeSet<FeaturePlugin>();
for (FeaturePlugin plugin : featurePlugins.keySet()) {
String pluginString = plugin.toString();
if (pluginString == "Basic Features" || pluginString == "Character N-Grams") {
plugins.add(plugin);
}
}
// Generate Plugin into Recipe
currentRecipe = Recipe.addPluginsToRecipe(currentRecipe, plugins);
// Setup Plugin configurations
OrderedPluginMap currentOrderedPluginMap = currentRecipe.getExtractors();
for (SIDEPlugin plugin : currentOrderedPluginMap.keySet()) {
String pluginString = plugin.toString();
Map<String, String> currentConfigurations = currentOrderedPluginMap.get(plugin);
if (pluginString == "Basic Features") {
for (String s : currentConfigurations.keySet()) {
if (s == "Unigrams" || s == "Bigrams" || s == "Trigrams" ||
s == "Count Occurences" || s == "Normalize N-Gram Counts" ||
s == "Stem N-Grams" || s == "Skip Stopwords in N-Grams") {
currentConfigurations.put(s, "true");
} else {
currentConfigurations.put(s, "false");
}
}
} else if (pluginString == "Character N-Grams") {
for (String s : currentConfigurations.keySet()) {
if (s == "Include Punctuation") {
currentConfigurations.put(s, "true");
} else if (s == "minGram") {
currentConfigurations.put(s, "3");
} else if (s == "maxGram") {
currentConfigurations.put(s, "4");
}
}
currentConfigurations.put("Extract Only Within Words", "true");
}
}
// Build FeatureTable
currentRecipe = buildFeatureTable(currentRecipe, featureTableName, featureThreshold, featureAnnotation, featureType);
return currentRecipe;
}
protected Recipe buildFeatureTable(Recipe currentRecipe, String name, int threshold, String annotation, Type type) {
FeaturePlugin activeExtractor = null;
try {
Collection<FeatureHit> hits = new HashSet<FeatureHit>();
for (SIDEPlugin plug : currentRecipe.getExtractors().keySet()) {
activeExtractor = (FeaturePlugin) plug;
hits.addAll(activeExtractor.extractFeatureHits(currentRecipe.getDocumentList(), currentRecipe.getExtractors().get(plug)));
}
FeatureTable ft = new FeatureTable(currentRecipe.getDocumentList(), hits, threshold, annotation, type);
ft.setName(name);
currentRecipe.setFeatureTable(ft);
} catch (Exception e) {
System.err.println("Feature Extraction Failed");
e.printStackTrace();
}
return currentRecipe;
}
// Build Models ==================================================
public Recipe prepareBuildModel(Recipe currentRecipe) {
try {
// Get Learner Plugins
LearningPlugin learner = null;
for (LearningPlugin plugin : learningPlugins.keySet()) {
/* if (plugin.toString().equals("Naive Bayes")) */
if (plugin.toString().equals("Logistic Regression")) {
learner = plugin;
}
}
if (Boolean.TRUE.toString().equals(validationSettings.get("test"))) {
if (validationSettings.get("type").equals("CV")) {
validationSettings.put("testSet", currentRecipe.getDocumentList());
}
}
Map<String, String> settings = learner.generateConfigurationSettings();
currentRecipe = Recipe.addLearnerToRecipe(currentRecipe, learner, settings);
currentRecipe.setValidationSettings(new TreeMap<String, Serializable>(validationSettings));
for (WrapperPlugin wrap : wrapperPlugins.keySet()) {
if (wrapperPlugins.get(wrap)) {
currentRecipe.addWrapper(wrap, wrap.generateConfigurationSettings());
}
}
buildModel(currentRecipe, validationSettings);
} catch (Exception e) {
e.printStackTrace();
}
return currentRecipe;
}
protected void buildModel(Recipe currentRecipe,
Map<String, Serializable> validationSettings) {
try {
if (currentRecipe != null) {
// fetch the training table only after the recipe null-check
FeatureTable currentFeatureTable = currentRecipe.getTrainingTable();
TrainingResult results = null;
/*
* if (validationSettings.get("type").equals("SUPPLY")) {
* DocumentList test = (DocumentList)
* validationSettings.get("testSet"); FeatureTable
* extractTestFeatures = prepareTestFeatureTable(currentRecipe,
* validationSettings, test);
* validationSettings.put("testFeatureTable",
* extractTestFeatures);
*
* // if we've already trained the exact same model, don't // do
* it again. Just evaluate. Recipe cached =
* checkForCachedModel(); if (cached != null) { results =
* evaluateUsingCachedModel(currentFeatureTable,
* extractTestFeatures, cached, currentRecipe); } }
*/
if (results == null) {
results = currentRecipe.getLearner().train(currentFeatureTable, currentRecipe.getLearnerSettings(), validationSettings, currentRecipe.getWrappers());
}
if (results != null) {
currentRecipe.setTrainingResult(results);
results.setName(trainingResultName);
currentRecipe.setLearnerSettings(currentRecipe.getLearner().generateConfigurationSettings());
currentRecipe.setValidationSettings(new TreeMap<String, Serializable>(validationSettings));
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
protected static FeatureTable prepareTestFeatureTable(Recipe recipe, Map<String, Serializable> validationSettings, DocumentList test) {
prepareDocuments(recipe, validationSettings, test); // assigns classes, annotations.
Collection<FeatureHit> hits = new TreeSet<FeatureHit>();
OrderedPluginMap extractors = recipe.getExtractors();
for (SIDEPlugin plug : extractors.keySet()) {
Collection<FeatureHit> extractorHits = ((FeaturePlugin) plug).extractFeatureHits(test, extractors.get(plug));
hits.addAll(extractorHits);
}
FeatureTable originalTable = recipe.getTrainingTable();
FeatureTable ft = new FeatureTable(test, hits, 0, originalTable.getAnnotation(), originalTable.getClassValueType());
for (SIDEPlugin plug : recipe.getFilters().keySet()) {
ft = ((RestructurePlugin) plug).filterTestSet(originalTable, ft, recipe.getFilters().get(plug), recipe.getFilteredTable().getThreshold());
}
ft.reconcileFeatures(originalTable.getFeatureSet());
return ft;
}
protected static Map<String, Serializable> prepareDocuments(Recipe currentRecipe, Map<String, Serializable> validationSettings, DocumentList test) throws IllegalStateException {
DocumentList train = currentRecipe.getDocumentList();
try {
test.setCurrentAnnotation(currentRecipe.getTrainingTable().getAnnotation(), currentRecipe.getTrainingTable().getClassValueType());
test.setTextColumns(new HashSet<String>(train.getTextColumns()));
test.setDifferentiateTextColumns(train.getTextColumnsAreDifferentiated());
Collection<String> trainColumns = train.allAnnotations().keySet();
Collection<String> testColumns = test.allAnnotations().keySet();
if (!testColumns.containsAll(trainColumns)) {
ArrayList<String> missing = new ArrayList<String>(trainColumns);
missing.removeAll(testColumns);
throw new java.lang.IllegalStateException("Test set annotations do not match training set.\nMissing columns: " + missing);
}
validationSettings.put("testSet", test);
} catch (Exception e) {
e.printStackTrace();
throw new java.lang.IllegalStateException("Could not prepare test set.\n" + e.getMessage(), e);
}
return validationSettings;
}
//Predict Labels ==================================================
public void predictLabels(Recipe recipeToPredict, Recipe currentRecipe) {
DocumentList newDocs = null;
DocumentList originalDocs;
if (useEvaluation) {
originalDocs = recipeToPredict.getTrainingResult().getEvaluationTable().getDocumentList();
TrainingResult results = currentRecipe.getTrainingResult();
List<String> predictions = (List<String>) results.getPredictions();
newDocs = addLabelsToDocs(predictionColumnName, showDists, overwrite, originalDocs, results, predictions, currentRecipe.getTrainingTable());
} else {
originalDocs = recipeToPredict.getDocumentList();
Predictor predictor = new Predictor(currentRecipe, predictionColumnName);
newDocs = predictor.predict(originalDocs, predictionColumnName, showDists, overwrite);
}
// Predict Labels result
model.setDocumentList(newDocs);
}
protected DocumentList addLabelsToDocs(final String name, final boolean showDists, final boolean overwrite, DocumentList docs, TrainingResult results, List<String> predictions, FeatureTable currentFeatureTable) {
Map<String, List<Double>> distributions = results.getDistributions();
DocumentList newDocs = docs.clone();
newDocs.addAnnotation(name, predictions, overwrite);
if (distributions != null) {
if (showDists) {
for (String label : currentFeatureTable.getLabelArray()) {
List<String> dist = new ArrayList<String>();
for (int i = 0; i < predictions.size(); i++) {
dist.add(String.format("%.3f", distributions.get(label).get(i)));
}
newDocs.addAnnotation(name + "_" + label + "_score", dist, overwrite);
}
}
}
return newDocs;
}
// ==================================================
}
David, it looks like the above replicates a lot of the functionality from the edu.cmu.side.recipe package. However, it doesn't look like your predictSectionType() method actually outputs the model's predictions anywhere.
If what you're trying to do is indeed to save predictions on new data using a trained model, check out the edu.cmu.side.recipe.Predictor class. It takes a trained model path as input. It's used by the scripts/predict.sh convenience script, but you could repurpose its main method if you need to call it programmatically.
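For example, something along these lines could work as a rough, untested sketch. It reuses the Predictor constructor and predict() call from your predictLabels() method; the output path, the "Code_Prediction" column name, and the assumption that allAnnotations() maps a column name to its per-document values are mine, so adjust them to your setup.
import edu.cmu.side.model.Recipe;
import edu.cmu.side.model.data.DocumentList;
import edu.cmu.side.recipe.Predictor;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
public class PredictionOutputSketch {
    // Runs a trained recipe over new documents and writes one predicted label per line.
    // trainedRecipe is the recipe you load from trainingData.xml; newDocs is the DocumentList
    // you build for the sections to classify (as in predictSectionType()).
    public static void writePredictions(Recipe trainedRecipe, DocumentList newDocs, String outPath) throws IOException {
        String predictionColumn = "Code_Prediction"; // same name as in LightSideService
        // same constructor and predict() call as in your predictLabels() method
        Predictor predictor = new Predictor(trainedRecipe, predictionColumn);
        DocumentList labeled = predictor.predict(newDocs, predictionColumn, true, false);
        // assumption: allAnnotations() maps a column name to its list of per-document values,
        // mirroring the addAnnotation(name, values, overwrite) calls in your code
        List<String> predictions = labeled.allAnnotations().get(predictionColumn);
        try (FileWriter out = new FileWriter(outPath)) {
            for (String label : predictions) {
                out.write(label + System.lineSeparator());
            }
        }
    }
}
That would at least persist the labels somewhere you can inspect them, instead of leaving them only in the table model.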
I hope this helps!