Commit a6def7da authored by mihaidascalu

#16 Fixed problems while processing empty conversations

parent 8a584138
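
The substance of the fix is a guard for empty conversations: AbstractDocument.computeAll (first Java hunk below) now runs discourse analysis and complexity computation only when the document actually contains blocks, and LanguageRhythmicCoefficient falls back to ComplexityIndices.IDENTITY when there is nothing to measure. A minimal sketch of the guard, using the names from that hunk:

    public void computeAll(boolean computeDialogism, boolean useBigrams) {
        // An empty conversation produces no blocks; skipping the whole pipeline
        // avoids downstream failures in the discourse and complexity computations.
        if (!blocks.isEmpty()) {
            computeDiscourseAnalysis(computeDialogism, useBigrams);
            ComplexityIndices.computeComplexityFactors(this);
        }
    }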
......@@ -6,7 +6,7 @@
<groupId>com.readerbench</groupId>
<artifactId>${artifactory.id}</artifactId>
<version>3.0.1</version>
<version>3.0.2</version>
<packaging>jar</packaging>
<properties>
......@@ -414,7 +414,6 @@
<version>${hibernate.version}</version>
</dependency>
<!-- /Hibernate SQLITE -->
</dependencies>
<build>
......@@ -423,12 +422,9 @@
<resource>
<directory>src/main/config</directory>
</resource>
<resource>
<directory>src/main/resources</directory>
</resource>
</resources>
</build>
</project>
\ No newline at end of file
......@@ -166,8 +166,10 @@ public abstract class AbstractDocument extends AnalysisElement {
}
public void computeAll(boolean computeDialogism, boolean useBigrams) {
computeDiscourseAnalysis(computeDialogism, useBigrams);
ComplexityIndices.computeComplexityFactors(this);
if (!blocks.isEmpty()) {
computeDiscourseAnalysis(computeDialogism, useBigrams);
ComplexityIndices.computeComplexityFactors(this);
}
}
public void save(SaveType saveOutput) {
......@@ -197,50 +199,43 @@ public abstract class AbstractDocument extends AnalysisElement {
public void computeDiscourseAnalysis(boolean computeDialogism, boolean useBigrams) {
if (computeDialogism) {
// build disambiguisation graph and lexical chains
LOGGER.info("Build disambiguation graph");
LOGGER.info("Build disambiguation graph...");
DisambiguisationGraphAndLexicalChains.buildDisambiguationGraph(this);
LOGGER.info("Prune disambiguation graph");
LOGGER.info("Prune disambiguation graph...");
DisambiguisationGraphAndLexicalChains.pruneDisambiguationGraph(this);
// System.out.println(d.disambiguationGraph);
LOGGER.info("Build lexical chains");
LOGGER.info("Build lexical chains...");
DisambiguisationGraphAndLexicalChains.buildLexicalChains(this);
// for (LexicalChain chain : lexicalChains) {
// System.out.println(chain);
// }
// determine semantic chains / voices
LOGGER.info("Determine semantic chains / voices");
LOGGER.info("Determine semantic chains / voices...");
DialogismComputations.determineVoices(this);
DialogismComputations.determineExtendedVoices(this);
// DialogismComputations.findSentimentUsingContext(this);
// DialogismComputations.findSentimentUsingContext(this);
// determine voice distributions & importance
LOGGER.info("Determine voice distributions & importance");
LOGGER.info("Determine voice distributions & importance...");
DialogismComputations.determineVoiceDistributions(this);
// DialogismComputations.determineExtendedVoiceDistributions(this);
// DialogismComputations.determineExtendedVoiceDistributions(this);
}
// build coherence graph
LOGGER.info("Build coherence graph");
LOGGER.info("Build coherence graph...");
CohesionGraph.buildCohesionGraph(this);
// t1 = System.currentTimeMillis();
// // build coherence graph
// CohesionGraph.buildCohesionGraphOld(this);
// t2 = System.currentTimeMillis();
// System.out.println("old cohesion time: " + ((t2 - t1) / 1000.) + " sec");
// determine topics
LOGGER.info("Determine topics");
LOGGER.info("Determine topics...");
KeywordModeling.determineKeywords(this, useBigrams);
// TopicModel.determineTopicsLDA(this);
Scoring.score(this);
// assign sentiment values
LOGGER.info("Assign sentiment values");
LOGGER.info("Assign sentiment values...");
SentimentAnalysis.weightSemanticValences(this);
LOGGER.info("Finished all discourse analysis processes ...");
LOGGER.info("Finished all discourse analysis processes");
}
public void setDocumentTitle(String title, List<ISemanticModel> models, Lang lang, boolean usePOSTagging) {
......@@ -265,9 +260,9 @@ public abstract class AbstractDocument extends AnalysisElement {
}
public static AbstractDocument loadGenericDocument(String pathToDoc,
Map<SimilarityType, String> modelPaths, Lang lang,
boolean usePOSTagging, boolean computeDialogism, boolean useBigrams, String pathToComplexityModel,
int[] selectedComplexityFactors, boolean cleanInput, SaveType saveOutput) {
Map<SimilarityType, String> modelPaths, Lang lang,
boolean usePOSTagging, boolean computeDialogism, boolean useBigrams, String pathToComplexityModel,
int[] selectedComplexityFactors, boolean cleanInput, SaveType saveOutput) {
List<ISemanticModel> models = SimilarityType.loadVectorModels(modelPaths, lang);
return loadGenericDocument(new File(pathToDoc), models, lang, usePOSTagging, computeDialogism, useBigrams,
pathToComplexityModel, selectedComplexityFactors, cleanInput, saveOutput);
......@@ -299,9 +294,9 @@ public abstract class AbstractDocument extends AnalysisElement {
}
public static AbstractDocument loadGenericDocument(File docFile, List<ISemanticModel> models,
Lang lang, boolean usePOSTagging, boolean computeDialogism, boolean useBigrams,
String pathToComplexityModel, int[] selectedComplexityFactors,
boolean cleanInput, SaveType saveOutput) {
Lang lang, boolean usePOSTagging, boolean computeDialogism, boolean useBigrams,
String pathToComplexityModel, int[] selectedComplexityFactors,
boolean cleanInput, SaveType saveOutput) {
// parse the XML file
LOGGER.info("Loading {} file for processing", docFile.getPath());
boolean isDocument = checkTagsDocument(docFile, "p");
......@@ -555,13 +550,13 @@ public abstract class AbstractDocument extends AnalysisElement {
out.write("\nOverlap between annotated collaboration zones and Social KB model\n" + "P=,"
+ results[0] + "\nR=," + results[1] + "\nF1 score=," + results[2] + "\nr=," + VectorAlgebra
.pearsonCorrelation(c.getAnnotatedCollabEvolution(), c.getSocialKBEvolution()));
.pearsonCorrelation(c.getAnnotatedCollabEvolution(), c.getSocialKBEvolution()));
results = Collaboration.overlapCollaborationZones(c, c.getAnnotatedCollabZones(),
c.getIntenseCollabZonesVoice());
out.write("\nOverlap between annotated collaboration zones and Voice PMI model\n" + "P=,"
+ results[0] + "\nR=," + results[1] + "\nF1 score=," + results[2] + "\nr=," + VectorAlgebra
.pearsonCorrelation(c.getAnnotatedCollabEvolution(), c.getVoicePMIEvolution()));
.pearsonCorrelation(c.getAnnotatedCollabEvolution(), c.getVoicePMIEvolution()));
}
results = Collaboration.overlapCollaborationZones(c, c.getIntenseCollabZonesSocialKB(),
c.getIntenseCollabZonesVoice());
......
/*
/*
* Copyright 2016 ReaderBench.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,59 +15,58 @@
*/
package com.readerbench.services.complexity;
import com.readerbench.data.AbstractDocument;
import com.readerbench.data.Lang;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.readerbench.data.AbstractDocument;
import com.readerbench.data.Lang;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* Class used to define all factors to be used within the complexity evaluation
* model
*
* @author Mihai Dascalu
*/
public class ComplexityIndices {
/**
* Class used to define all factors to be used within the complexity evaluation
* model
*
* @author Mihai Dascalu
*/
public class ComplexityIndices {
private static final Logger LOGGER = LoggerFactory.getLogger(ComplexityIndices.class);
private static final Logger LOGGER = LoggerFactory.getLogger(ComplexityIndices.class);
public static final int IDENTITY = -1;
public static final int IDENTITY = -1;
public static void computeComplexityFactors(AbstractDocument d) {
LOGGER.info(d.getPath() + " " +d.getText());
d.setComplexityIndices(
Arrays.stream(ComplexityIndexType.values()).parallel()
.filter(t -> t.getFactory() != null)
.map(cat -> cat.getFactory())
.flatMap(f -> f.build(d.getLanguage()).stream())
.collect(Collectors.toMap(Function.identity(), f -> f.compute(d))));
}
public static void computeComplexityFactors(AbstractDocument d) {
d.setComplexityIndices(
Arrays.stream(ComplexityIndexType.values()).parallel()
.filter(t -> t.getFactory() != null)
.map(cat -> cat.getFactory())
.flatMap(f -> f.build(d.getLanguage()).stream())
.collect(Collectors.toMap(Function.identity(), f -> f.compute(d))));
}
public static List<ComplexityIndex> getIndices(Lang lang) {
return Arrays.stream(ComplexityIndexType.values())
.filter(cat -> cat.getFactory() != null)
.map(cat -> cat.getFactory())
.flatMap(f -> f.build(lang).stream())
.collect(Collectors.toList());
}
public static List<ComplexityIndex> getIndices(Lang lang) {
return Arrays.stream(ComplexityIndexType.values())
.filter(cat -> cat.getFactory() != null)
.map(cat -> cat.getFactory())
.flatMap(f -> f.build(lang).stream())
.collect(Collectors.toList());
}
public static double[] getComplexityIndicesArray(AbstractDocument d) {
return ComplexityIndices.getIndices(d.getLanguage()).stream()
.mapToDouble(index -> d.getComplexityIndices().get(index))
.toArray();
}
public static double[] getComplexityIndicesArray(AbstractDocument d) {
return ComplexityIndices.getIndices(d.getLanguage()).stream()
.mapToDouble(index -> d.getComplexityIndices().get(index))
.toArray();
}
public static void main(String[] args) {
List<ComplexityIndex> factors = getIndices(Lang.en);
factors.stream().forEachOrdered(f -> {
System.out.println(f.getCategoryName() + "\t" + f.getAcronym() + "\t"
+ f.getDescription());
});
public static void main(String[] args) {
List<ComplexityIndex> factors = getIndices(Lang.en);
factors.stream().forEachOrdered(f -> {
System.out.println(f.getCategoryName() + "\t" + f.getAcronym() + "\t"
+ f.getDescription());
});
System.out.println("TOTAL:" + factors.size() + " factors");
}
}
System.out.println("TOTAL:" + factors.size() + " factors");
}
}
......@@ -9,6 +9,7 @@ import com.readerbench.data.AbstractDocument;
import com.readerbench.data.Block;
import com.readerbench.data.Sentence;
import com.readerbench.services.complexity.ComplexityIndex;
import com.readerbench.services.complexity.ComplexityIndices;
import com.readerbench.services.complexity.ComplexityIndicesEnum;
import com.readerbench.services.complexity.rhythm.tools.RhythmTool;
......@@ -30,60 +31,48 @@ public class LanguageRhythmicCoefficient extends ComplexityIndex {
@Override
public double compute(AbstractDocument d) {
if (d.getBlocks().isEmpty()) {
return ComplexityIndices.IDENTITY;
}
Map<Integer, Integer> cntSyllables = new TreeMap<>();
int deviations = 0;
for (Block b : d.getBlocks()) {
if (null == b) {
continue;
}
for (Sentence s : b.getSentences()) {
// System.out.println("Sentence: " + s.getText());
String[] units = s.getText().split("[\\p{Punct}]+");
for (String str : units) {
List<String> unit = Arrays.asList(str.trim().split("\\s+"));
// System.out.println(u + " " + u.size());
// List<Word> unit = s.getAllWords();
// List<Integer> repr = RhythmTool.getNumericalRepresentation(unit);
List<Integer> repr = RhythmTool.testNewUnitDefinition(unit);
if (repr.isEmpty()) {
continue;
}
for (Integer nr : repr) {
if (nr == 0) continue;
cntSyllables.put(nr,
cntSyllables.containsKey(nr) ? cntSyllables.get(nr)+1 : 1);
}
repr.stream().filter((nr) -> !(nr == 0)).forEachOrdered((nr) -> {
cntSyllables.put(nr, cntSyllables.containsKey(nr) ? cntSyllables.get(nr) + 1 : 1);
});
deviations += RhythmTool.calcDeviations(repr);
// System.out.println("Deviations: " + deviations);
// System.out.println();
}
}
}
// DecimalFormat df = new DecimalFormat("#.##");
int totalNumber = cntSyllables.values().stream().reduce(0, Integer::sum);
// for (Map.Entry<Integer, Integer> entry : cntSyllables.entrySet()) {
// double syllFreq = 1.0 * entry.getValue() / totalNumber;
// System.out.println(entry.getKey() + "\t" + totalNumber +
// "\t" + entry.getValue() +
// "\t" + df.format(syllFreq));
// }
// Integer keyOfMaxVal = Collections.max(cntSyllables.entrySet(), Map.Entry.comparingByValue()).getKey();
if (totalNumber == 0) {
return ComplexityIndices.IDENTITY;
}
int dominantInd = RhythmTool.getDominantIndex(cntSyllables.values().stream()
.collect(Collectors.toList()));
// System.out.println("Dominant ind: " + dominantInd);
if (dominantInd == -1) {
return ComplexityIndices.IDENTITY;
}
int keyOfMaxVal = cntSyllables.keySet().stream()
.collect(Collectors.toList()).get(dominantInd);
// System.out.println("Key of max val: " + keyOfMaxVal);
int sum = cntSyllables.get(keyOfMaxVal);
sum += (cntSyllables.containsKey(keyOfMaxVal-1)) ? cntSyllables.get(keyOfMaxVal-1) : 0;
sum += (cntSyllables.containsKey(keyOfMaxVal+1)) ? cntSyllables.get(keyOfMaxVal+1) : 0;
sum += (cntSyllables.containsKey(keyOfMaxVal - 1)) ? cntSyllables.get(keyOfMaxVal - 1) : 0;
sum += (cntSyllables.containsKey(keyOfMaxVal + 1)) ? cntSyllables.get(keyOfMaxVal + 1) : 0;
double coeff = 1.0 * (deviations + totalNumber - sum) / totalNumber;
// System.out.println("Deviations: " + deviations);
// System.out.println("Coefficient: " + df.format(coeff));
return coeff;
}
}
......@@ -32,7 +32,7 @@ import java.util.Map;
*/
public class SentimentAnalysis {
private static final Logger LOGGER = LoggerFactory.getLogger(SentimentAnalysis.class);
private static final Logger LOGGER = LoggerFactory.getLogger(SentimentAnalysis.class);
public static void weightSemanticValences(Sentence s) {
if (s.getAllWords().isEmpty()) {
......@@ -54,76 +54,66 @@ public class SentimentAnalysis {
}
}
public static void weightSemanticValences(Block b) {
SentimentEntity se = new SentimentEntity();
se.init();
b.setSentimentEntity(se);
Map<SentimentValence, Double> avgBlock = new HashMap<>();
Map<SentimentValence, Double> sumWeightsBlock = new HashMap<>();
// Map<SentimentValence, Double> elemValences =
// b.getSentimentEntity().getAll();
// double avgBlock = 0, sumWeightsBlock = 0;
// logger.info("[Weighting] Block " + b.getIndex() + " has " +
// b.getSentences().size() + " sentences.");
for (int i = 0; i < b.getSentences().size(); i++) {
Sentence s = b.getSentences().get(i);
weightSemanticValences(s);
// logger.info("[Weighting] There are " +
// s.getSentimentEntity().getAll().size() + " sentiments set
// for this sentence.");
for (Map.Entry<SentimentValence, Double> pair : s.getSentimentEntity().getAll().entrySet()) {
SentimentValence sv = pair.getKey();
Double value = pair.getValue();
// logger.info(" Sentence s (sentiment " + sv.getName()
// + " = " + value + ")");
if (value != null) {
avgBlock.put(sv, (avgBlock.get(sv) == null ? 0 : avgBlock.get(sv))
+ b.getSentenceBlockDistances()[i].getCohesion() * value);
sumWeightsBlock.put(sv, (sumWeightsBlock.get(sv) == null ? 0 : sumWeightsBlock.get(sv))
+ b.getSentenceBlockDistances()[i].getCohesion());
}
}
}
avgBlock.entrySet().stream().forEach(e -> {
b.getSentimentEntity().add(e.getKey(), e.getValue() / sumWeightsBlock.get(e.getKey()));
});
public static void weightSemanticValences(Block b) {
SentimentEntity se = new SentimentEntity();
se.init();
b.setSentimentEntity(se);
Map<SentimentValence, Double> avgBlock = new HashMap<>();
Map<SentimentValence, Double> sumWeightsBlock = new HashMap<>();
for (int i = 0; i < b.getSentences().size(); i++) {
Sentence s = b.getSentences().get(i);
weightSemanticValences(s);
for (Map.Entry<SentimentValence, Double> pair : s.getSentimentEntity().getAll().entrySet()) {
SentimentValence sv = pair.getKey();
Double value = pair.getValue();
if (value != null) {
avgBlock.put(sv, (avgBlock.get(sv) == null ? 0 : avgBlock.get(sv))
+ b.getSentenceBlockDistances()[i].getCohesion() * value);
sumWeightsBlock.put(sv, (sumWeightsBlock.get(sv) == null ? 0 : sumWeightsBlock.get(sv))
+ b.getSentenceBlockDistances()[i].getCohesion());
}
}
}
avgBlock.entrySet().stream().forEach(e -> {
b.getSentimentEntity().add(e.getKey(), e.getValue() / sumWeightsBlock.get(e.getKey()));
});
}
}
public static void weightSemanticValences(AbstractDocument d) {
LOGGER.info("Weighting sentiment valences ...");
public static void weightSemanticValences(AbstractDocument d) {
LOGGER.info("Weighting sentiment valences ...");
// initialize sentiment valence map for document
SentimentEntity se = new SentimentEntity();
se.init();
d.setSentimentEntity(se);
// initialize sentiment valence map for document
SentimentEntity se = new SentimentEntity();
se.init();
d.setSentimentEntity(se);
Map<SentimentValence, Double> avgDoc = new HashMap<>();
Map<SentimentValence, Double> sumWeightsDoc = new HashMap<>();
// perform weighted sentiment per block and per document
Map<SentimentValence, Double> avgDoc = new HashMap<>();
Map<SentimentValence, Double> sumWeightsDoc = new HashMap<>();
// perform weighted sentiment per block and per document
for (int i = 0; i < d.getBlocks().size(); i++) {
Block b = d.getBlocks().get(i);
if (b != null) {
weightSemanticValences(b);
for (int i = 0; i < d.getBlocks().size(); i++) {
Block b = d.getBlocks().get(i);
if (b != null) {
weightSemanticValences(b);
for (Map.Entry<SentimentValence, Double> pair : b.getSentimentEntity().getAll().entrySet()) {
SentimentValence sv = pair.getKey();
Double value = pair.getValue();
avgDoc.put(sv, (avgDoc.get(sv) == null ? 0 : avgDoc.get(sv))
+ value * d.getBlockDocDistances()[i].getCohesion());
sumWeightsDoc.put(sv, (sumWeightsDoc.get(sv) == null ? 0 : sumWeightsDoc.get(sv))
+ d.getBlockDocDistances()[i].getCohesion());
}
for (Map.Entry<SentimentValence, Double> pair : b.getSentimentEntity().getAll().entrySet()) {
SentimentValence sv = pair.getKey();
Double value = pair.getValue();
avgDoc.put(sv, (avgDoc.get(sv) == null ? 0 : avgDoc.get(sv))
+ value * d.getBlockDocDistances()[i].getCohesion());
sumWeightsDoc.put(sv, (sumWeightsDoc.get(sv) == null ? 0 : sumWeightsDoc.get(sv))
+ d.getBlockDocDistances()[i].getCohesion());
}
}
}
}
}
for (Map.Entry<SentimentValence, Double> pair : d.getSentimentEntity().getAll().entrySet()) {
SentimentValence sv = pair.getKey();
if (sumWeightsDoc.get(sv) != null) {
d.getSentimentEntity().add(sv, avgDoc.get(sv) / sumWeightsDoc.get(sv));
}
}
}
for (Map.Entry<SentimentValence, Double> pair : d.getSentimentEntity().getAll().entrySet()) {
SentimentValence sv = pair.getKey();
if (sumWeightsDoc.get(sv) != null) {
d.getSentimentEntity().add(sv, avgDoc.get(sv) / sumWeightsDoc.get(sv));
}
}
}
}
......@@ -112,8 +112,7 @@ public class DialogismComputations {
public static void determineExtendedVoices(AbstractDocument d) {
List<SemanticChain> extendedVoices = new ArrayList<>();
Map<String, Integer> auxiliaryVoices = new HashMap<String, Integer>();
System.out.println("-------Number of voices: " + d.getVoices().size());
Map<String, Integer> auxiliaryVoices = new HashMap<>();
for (SemanticChain chain : d.getVoices()) {
int noNouns = 0;
int noVerbs = 0;
......@@ -125,8 +124,11 @@ public class DialogismComputations {
extendedChain.getWords().add(w);
if (!auxiliaryVoices.containsKey(w.getText())) {
auxiliaryVoices.put(w.getText(), 1);
if (w.isNoun()) noNouns++;
else if (w.isVerb()) noVerbs++;
if (w.isNoun()) {
noNouns++;
} else if (w.isVerb()) {
noVerbs++;
}
}
}
}
......@@ -140,11 +142,6 @@ public class DialogismComputations {
}
d.setExtendedVoices(extendedVoices);
System.out.println("-------Number of voices: " + d.getVoices().size());
System.out.println("-------Number of perspectives: " + d.getNoPerspectives());
System.out.println("-------Number of nouns in perspectives: " + d.getNoNounsInPerspectives());
System.out.println("-------Number of verbs in perspectives: " + d.getNoVerbsInPerspectives());
}
public static void determineVoiceDistribution(AnalysisElement e, AbstractDocument d) {
......@@ -292,8 +289,6 @@ public class DialogismComputations {
}
}
public static void determineExtendedVoiceDistributions(AbstractDocument d) {
LOGGER.info("Identifying extended voice distributions...");
// determine distribution of each lexical chain
......@@ -308,7 +303,6 @@ public class DialogismComputations {
}
}
// determine spread
if (d.getExtendedVoices() != null) {
......@@ -341,7 +335,6 @@ public class DialogismComputations {
chain.getExtendedBlockDistribution()[blockIndex] += valence;
// build cumulative importance in terms of sentences in which occurrences have been spotted
if (voiceOccurrences.containsKey(blockIndex + "_" + sentenceIndex)) {
voiceOccurrences.put(blockIndex + "_" + sentenceIndex,
......@@ -392,25 +385,25 @@ public class DialogismComputations {
}
}
/**
* @param d
*
* Build for every sentence a context map with all the voices and the associated context Tree with its valence
* Build for every sentence a context map with all the voices and the
* associated context Tree with its valence
*/
public static void findSentimentUsingContext(AbstractDocument d) {
LOGGER.info("Searching context for every voice in every sentence");
LOGGER.info("Searching context for every voice in every sentence...");
Context ctx = new Context();
//for every sentence make a map which has key voice and value a list of pair(Tree, valence)
for (Block b: d.getBlocks()) {
for (Sentence sentence: b.getSentences()) {
for (Block b : d.getBlocks()) {
for (Sentence sentence : b.getSentences()) {
List<Word> words = sentence.getWords();
Map<Word, List<ContextSentiment>> contextMap = new HashMap<>();
for (SemanticChain chain: d.getVoices()) {
for (Word w: chain.getWords()) {
for (SemanticChain chain : d.getVoices()) {
for (Word w : chain.getWords()) {
//the context for this word was already computed
if (contextMap.containsKey(w)) {
continue;
......@@ -420,7 +413,7 @@ public class DialogismComputations {
continue;
}
List<ContextSentiment> contextTrees = new ArrayList<ContextSentiment>();
List<ContextSentiment> contextTrees = new ArrayList<>();
//check if the word from voice is in sentence
for (Word aux : words) {
if (aux.getText().equals(w.getText())) {
......@@ -428,7 +421,7 @@ public class DialogismComputations {
Tree tree = sentence.getTree();
List<Tree> subTrees = ctx.findContextTree(tree, w, w.isNoun());
//for every contextSubtree compute the valence
for (Tree subTree:subTrees) {
for (Tree subTree : subTrees) {
valence = RNNCoreAnnotations.getPredictedClass(subTree) - 2;
contextTrees.add(new ContextSentiment(subTree, valence));
}
......