Commit f279ead4 authored by Dorinela Dascalu

Merged with master

parents 2e007574 4db018ee
@@ -47,7 +47,14 @@ Thumbs.db
!/src/main/resources/
/db.properties
/tmp/
/nbactions.xml
/nbactions*.xml
/nb-configuration.xml
/ReaderBenchServer.log*
/ReaderBench.log*
/.idea/
/ReaderBench.iml
/dragos-*
/J
/C
/H
/I
This diff is collapsed.
@@ -20,6 +20,7 @@ import data.pojo.Language;
import java.util.List;
import javax.persistence.TypedQuery;
import data.pojo.SentimentValence;
import java.util.EnumMap;
/**
*
@@ -28,7 +29,8 @@ import data.pojo.SentimentValence;
public class ValenceDAO extends AbstractDAO<SentimentValence> {
private static ValenceDAO instance = null;
private EnumMap<Lang, List<SentimentValence>> cache = new EnumMap<>(Lang.class);
private ValenceDAO() {
}
@@ -49,14 +51,17 @@ public class ValenceDAO extends AbstractDAO<SentimentValence> {
}
public List<SentimentValence> findByLang(Lang lang) {
final Language language = Language.fromLang(lang);
return dao.executeQuery(em -> {
TypedQuery<SentimentValence> query = em.createNamedQuery(
"SentimentValence.findByLang",
SentimentValence.class);
query.setParameter("lang", language);
return query.getResultList();
});
if (!cache.containsKey(lang)) {
final Language language = Language.fromLang(lang);
cache.put(lang, dao.executeQuery(em -> {
TypedQuery<SentimentValence> query = em.createNamedQuery(
"SentimentValence.findByLang",
SentimentValence.class);
query.setParameter("lang", language);
return query.getResultList();
}));
}
return cache.get(lang);
}
}
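Note: the new EnumMap cache is not synchronized. If ValenceDAO is ever hit from multiple threads, a minimal sketch of the same memoization with a thread-safe map (reusing the dao.executeQuery helper shown above) could look like this:

```java
// Sketch only, not part of the commit: thread-safe memoization of the query.
// ConcurrentHashMap.computeIfAbsent runs the loader at most once per key.
private final Map<Lang, List<SentimentValence>> cache = new ConcurrentHashMap<>();

public List<SentimentValence> findByLang(Lang lang) {
    return cache.computeIfAbsent(lang, l -> {
        final Language language = Language.fromLang(l);
        return dao.executeQuery(em -> {
            TypedQuery<SentimentValence> query = em.createNamedQuery(
                    "SentimentValence.findByLang", SentimentValence.class);
            query.setParameter("lang", language);
            return query.getResultList();
        });
    });
}
```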
@@ -56,6 +56,7 @@ import java.io.FileNotFoundException;
import java.util.EnumMap;
import java.util.StringJoiner;
import java.util.logging.Level;
import java.util.stream.Collectors;
import javax.xml.parsers.ParserConfigurationException;
import org.openide.util.Exceptions;
import org.xml.sax.SAXException;
@@ -160,8 +161,8 @@ public abstract class AbstractDocument extends AnalysisElement {
}
}
public void computeAll(boolean computeDialogism) {
computeDiscourseAnalysis(computeDialogism);
public void computeAll(boolean computeDialogism, boolean useBigrams) {
computeDiscourseAnalysis(computeDialogism, useBigrams);
ComplexityIndices.computeComplexityFactors(this);
}
@@ -187,8 +188,9 @@
/**
 * Runs the discourse analysis pipeline on this document.
 *
 * @param computeDialogism whether to build the disambiguation graph, lexical chains and voices
 * @param useBigrams whether keyword modeling should also consider bigrams
 */
public void computeDiscourseAnalysis(boolean computeDialogism) {
public void computeDiscourseAnalysis(boolean computeDialogism, boolean useBigrams) {
if (computeDialogism) {
// build disambiguisation graph and lexical chains
DisambiguisationGraphAndLexicalChains.buildDisambiguationGraph(this);
@@ -219,7 +221,7 @@
// t2 = System.currentTimeMillis();
// System.out.println("old cohesion time: " + ((t2 - t1) / 1000.) + " sec");
// determine topics
KeywordModeling.determineKeywords(this);
KeywordModeling.determineKeywords(this, useBigrams);
// TopicModel.determineTopicsLDA(this);
Scoring.score(this);
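The useBigrams flag is threaded from computeAll through computeDiscourseAnalysis into KeywordModeling.determineKeywords, so every call site gains one argument. A hypothetical caller (names taken from the diff, values illustrative):

```java
// Hypothetical call site, not from the commit: false keeps the previous
// unigram-only keyword modeling, true also feeds bigrams into determineKeywords.
Document d = Document.load(docFile, models, lang, usePOSTagging);
d.computeAll(computeDialogism, /* useBigrams */ false);
```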
@@ -252,10 +254,10 @@
public static AbstractDocument loadGenericDocument(String pathToDoc,
Map<SimilarityType, String> modelPaths, Lang lang,
boolean usePOSTagging, boolean computeDialogism, String pathToComplexityModel,
boolean usePOSTagging, boolean computeDialogism, boolean useBigrams, String pathToComplexityModel,
int[] selectedComplexityFactors, boolean cleanInput, SaveType saveOutput) {
List<ISemanticModel> models = SimilarityType.loadVectorModels(modelPaths, lang);
return loadGenericDocument(new File(pathToDoc), models, lang, usePOSTagging, computeDialogism,
return loadGenericDocument(new File(pathToDoc), models, lang, usePOSTagging, computeDialogism, useBigrams,
pathToComplexityModel, selectedComplexityFactors, cleanInput, saveOutput);
}
@@ -285,7 +287,7 @@
}
public static AbstractDocument loadGenericDocument(File docFile, List<ISemanticModel> models,
Lang lang, boolean usePOSTagging, boolean computeDialogism,
Lang lang, boolean usePOSTagging, boolean computeDialogism, boolean useBigrams,
String pathToComplexityModel, int[] selectedComplexityFactors,
boolean cleanInput, SaveType saveOutput) {
// parse the XML file
@@ -303,13 +305,13 @@
if (isDocument) {
Document d = Document.load(docFile, models, lang, usePOSTagging);
d.computeAll(computeDialogism);
d.computeAll(computeDialogism, useBigrams);
d.save(saveOutput);
return d;
}
if (isChat) {
Conversation c = Conversation.load(docFile, models, lang, usePOSTagging);
c.computeAll(computeDialogism);
c.computeAll(computeDialogism, useBigrams);
c.save(saveOutput);
return c;
}
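Both loadGenericDocument overloads take the new flag in the middle of the parameter list, so callers must be updated positionally. An illustrative call, using only the diff's own parameter names:

```java
// Illustrative only: useBigrams now sits between computeDialogism and
// pathToComplexityModel in both overloads.
AbstractDocument doc = AbstractDocument.loadGenericDocument(docFile, models, lang,
        usePOSTagging, computeDialogism, useBigrams,
        pathToComplexityModel, selectedComplexityFactors, cleanInput, saveOutput);
```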
@@ -382,6 +384,7 @@
LOGGER.info("Writing document export");
File output = new File(path.replace(".xml", ".csv"));
try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), "UTF-8"), 32768)) {
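// the "SEP=," first line is a hint that makes Microsoft Excel split columns on commas when opening the CSV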
out.write("SEP=,\n");
if (titleText != null) {
out.write(titleText.replaceAll(",", "").replaceAll("\\s+", " ") + "\n");
}
@@ -417,7 +420,21 @@
out.write("\nTopics - Relevance\n");
out.write("Keyword, Relevance,Tf,Average semantic similarity\n");
for (Keyword t : this.getTopics()) {
out.write(t.getWord().getLemma() + " (" + t.getWord().getPOS() + "),"
out.write(t.getWord().getLemma() + " (");
if (t.getElement() instanceof Word) {
out.write(t.getWord().getPOS());
}
else {
NGram nGram = (NGram) t.getElement();
StringBuilder sb = new StringBuilder();
for (Word word : nGram.getWords()) {
sb.append(word.getPOS()).append("_");
}
String nGramLemmas = sb.toString();
sb.setLength(0);
out.write(nGramLemmas.substring(0,nGramLemmas.length()-1));
}
out.write ("),"
+ Formatting.formatNumber(t.getRelevance()) + ","
+ Formatting.formatNumber(t.getTermFrequency()) + "," + Formatting.formatNumber(t.getSemanticSimilarity()) + "\n");
}
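The POS concatenation above could be expressed more compactly. A sketch with the same output (posLabel is a hypothetical helper name; it relies only on NGram.getWords() and Word.getPOS(), and Collectors is already imported in this file):

```java
// Hypothetical helper, equivalent to the StringBuilder loop above.
private static String posLabel(NGram nGram) {
    return nGram.getWords().stream()
            .map(Word::getPOS)
            .collect(Collectors.joining("_"));
}
```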
@@ -427,7 +444,7 @@
out.write("\nTopics - Clusters\n");
Map<Integer, List<Keyword>> topicClusters = new TreeMap<>();
this.getTopics().stream().forEach((t) -> {
Integer probClass = LDA.findMaxResemblance(t.getWord().getModelRepresentation(SimilarityType.LDA), this.getModelRepresentation(SimilarityType.LDA));
Integer probClass = LDA.findMaxResemblance(t.getModelRepresentation(SimilarityType.LDA), this.getModelRepresentation(SimilarityType.LDA));
if (!topicClusters.containsKey(probClass)) {
topicClusters.put(probClass, new ArrayList<>());
}
@@ -522,13 +539,13 @@
out.write("\nOverlap between annotated collaboration zones and Social KB model\n" + "P=,"
+ results[0] + "\nR=," + results[1] + "\nF1 score=," + results[2] + "\nr=," + VectorAlgebra
.pearsonCorrelation(c.getAnnotatedCollabEvolution(), c.getSocialKBEvolution()));
results = Collaboration.overlapCollaborationZones(c, c.getAnnotatedCollabZones(),
c.getIntenseCollabZonesVoice());
out.write("\nOverlap between annotated collaboration zones and Voice PMI model\n" + "P=,"
+ results[0] + "\nR=," + results[1] + "\nF1 score=," + results[2] + "\nr=," + VectorAlgebra
.pearsonCorrelation(c.getAnnotatedCollabEvolution(), c.getVoicePMIEvolution()));
}
results = Collaboration.overlapCollaborationZones(c, c.getIntenseCollabZonesSocialKB(),
c.getIntenseCollabZonesVoice());
@@ -538,7 +555,7 @@
}
// print semantic chains
if (voices.size() > 0) {
if (voices != null && voices.size() > 0) {
out.write("\nVoices - Semantic chains\n");
for (SemanticChain voice : voices) {
out.write(voice.toStringAllWords() + "\n");
@@ -804,4 +821,18 @@
public boolean canUseSimType(SimilarityType simType) {
return !simType.isLoadable() || getModelVectors().keySet().contains(simType);
}
@Override
public List<NGram> getBiGrams() {
return blocks.stream()
.flatMap(s -> s.getBiGrams().stream())
.collect(Collectors.toList());
}
@Override
public List<NGram> getNGrams(int n) {
return blocks.stream()
.flatMap(s -> s.getNGrams(n).stream())
.collect(Collectors.toList());
}
}
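The new accessors cascade down the hierarchy: the document flattens its blocks' n-grams and, further below, each Block flattens its sentences'. Illustrative use (doc stands for any loaded AbstractDocument):

```java
// Illustrative only: both accessors return flat lists aggregated from the
// document's blocks (and, transitively, their sentences).
List<NGram> bigrams = doc.getBiGrams();
List<NGram> trigrams = doc.getNGrams(3);
```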
@@ -37,6 +37,7 @@ public class AbstractDocumentTemplate implements Serializable {
new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ENGLISH),
new SimpleDateFormat("dd MMMMMMMM yyyy HH:mm", Locale.FRANCE),
new SimpleDateFormat("HH:mm:ss"),
new SimpleDateFormat("hh:mm a", Locale.ENGLISH),
new SimpleDateFormat("EEE MMM dd HH:mm:ss z yyyy", Locale.ENGLISH)
};
@@ -87,10 +88,10 @@
} catch (ParseException e) {
}
}
if (time == null) {
if (aux == null) {
try {
Long longTime = Long.parseLong(time);
aux = new Date(longTime * 1000);
aux = new Date(longTime);
} catch (NumberFormatException e) {
LOGGER.log(Level.SEVERE, "Unparsable date: {0}", time);
}
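The old guard tested time == null, so the numeric fallback never ran for a real timestamp string; testing aux == null runs it exactly when no date pattern matched, and the value is now read as epoch milliseconds (new Date(longTime)) rather than seconds. A condensed sketch of the resulting logic (parseDate is a hypothetical helper, not in the commit):

```java
// Condensed sketch of the parsing fallback after this change.
private static Date parseDate(String time, DateFormat[] formats) {
    for (DateFormat format : formats) {
        try {
            return format.parse(time);
        } catch (ParseException e) {
            // fall through to the next known pattern
        }
    }
    try {
        return new Date(Long.parseLong(time)); // epoch milliseconds
    } catch (NumberFormatException e) {
        return null; // caller logs "Unparsable date"
    }
}
```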
......
@@ -24,6 +24,7 @@ import data.discourse.Keyword;
import data.sentiment.SentimentEntity;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.Objects;
import java.util.logging.Logger;
import services.semanticModels.ISemanticModel;
import services.semanticModels.SimilarityType;
@@ -363,4 +364,42 @@ public abstract class AnalysisElement implements Serializable {
public ISemanticModel getSemanticModel(SimilarityType type) {
return semanticModels.get(type);
}
public List<NGram> getBiGrams() {
return new ArrayList<>();
}
public List<NGram> getNGrams(int n) {
return new ArrayList<>();
}
@Override
public int hashCode() {
int hash = 7;
hash = 71 * hash + this.index;
hash = 71 * hash + Objects.hashCode(this.text);
return hash;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
final AnalysisElement other = (AnalysisElement) obj;
if (this.index != other.index) {
return false;
}
if (!Objects.equals(this.text, other.text)) {
return false;
}
return true;
}
}
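hashCode and equals are now derived from the same (index, text) pair, keeping the two consistent as the contract requires. A hypothetical consequence:

```java
// Hypothetical illustration: elements of the same concrete class with equal
// index and text now collapse to a single entry in hash-based collections.
Set<AnalysisElement> unique = new HashSet<>(analysisElements);
```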
@@ -26,6 +26,7 @@ import data.discourse.SemanticRelatedness;
import edu.stanford.nlp.coref.data.CorefChain;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.util.CoreMap;
import java.util.stream.Collectors;
import services.semanticModels.ISemanticModel;
/**
@@ -108,7 +109,7 @@ public class Block extends AnalysisElement implements Serializable {
// if (b.getIndex() != -1) {
// while (d.getBlocks().size() < b.getIndex()) {
// d.getBlocks().add(null);
// }
// }
// d.getBlocks().add(b.getIndex(), b);
// } else {
d.getBlocks().add(b);
@@ -310,6 +311,20 @@
this.nextSentenceBlockDistance = nextSentenceBlockDistance;
}
@Override
public List<NGram> getBiGrams() {
return sentences.stream()
.flatMap(s -> s.getBiGrams().stream())
.collect(Collectors.toList());
}
@Override
public List<NGram> getNGrams(int n) {
return sentences.stream()
.flatMap(s -> s.getNGrams(n).stream())
.collect(Collectors.toList());
}
@Override
public String toString() {
String s = "";
......
package data;
import edu.stanford.nlp.util.Triple;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import javafx.util.Pair;
//import org.datavec.api.berkeley.Triple;
/**
 * Heuristics for recovering the paragraph structure of a CV from PDF-extracted
 * lines, based on each line's y coordinate, font size, and font name. The
 * linesLeft array marks lines already assigned to a detected paragraph chain.
 *
 * Created by Gabriel Cristian on 6/14/2017.
 */
public class CVStructure {
AbstractDocument document;
private final int MIN_CHAIN_NR = 2;
private final int MAX_CHAIN_NR = 5;
private final float CHAIN_PROPORTION = 2;
public boolean[] linesLeft;
private ArrayList<Triple<Float, Float, String>> yCoords = new ArrayList<>();
public CVStructure () {
yCoords.add(new Triple<>(0f, 0f, " "));
}
public CVStructure (AbstractDocument document) {
this.document = document;
}
public float getLastYCoord() {
return yCoords.get(yCoords.size() - 1).first;
}
public void addYCoord(Triple<Float, Float, String> y) {
yCoords.add(y);
}
public ArrayList<Triple<Float, Float, String>> getYCoords() {
return yCoords;
}
public float getCoord (Triple<Float, Float, String> info) {
return info.first;
}
public float getFontSize (Triple<Float, Float, String> info) {
return info.second;
}
public String getFontName (Triple<Float, Float, String> info) {
return info.third;
}
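// Paragraph detection by font pattern: a repeating (font size, font name)
// chain is treated as a paragraph template; the lines it covers are flagged
// in linesLeft so getBySpacing does not count them twice, and the number of
// repetitions is accumulated in paragraphsNoSize.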
public int getByFontSize() {
int firstAlready = -1;
int paragraphsNoSize = 0;
//trying to get 2-4 length periodical occurrences that may indicate a paragraph
for (int p = MIN_CHAIN_NR; p < MAX_CHAIN_NR; p++) {
// count the lines not yet claimed by a previously detected chain
int linesLeftNo = linesLeft.length;
for (boolean line : linesLeft) {
if (line) {
linesLeftNo--;
}
}
// stop once most lines are claimed; use float division (the original integer
// division truncated the ratio) and guard against a zero divisor
if (linesLeftNo == 0 || (float) linesLeft.length / linesLeftNo > CHAIN_PROPORTION) {
break;
}
ArrayList<Pair<Triple<Float, Float, String>,Integer>> chain = new ArrayList<>();
for (int i = 0; i < yCoords.size() - p; i++ ) {
if(chain.size() != p) {
chain.add(new Pair<>(yCoords.get(i),i));
}
else {
int stop = 0;
int sndIterator = i;
int sameLines = 0;
while (stop != 1 && sndIterator < yCoords.size()) {
for(int j = 0; j < p && sndIterator + j < yCoords.size(); j++) {
while(chain.get((p + j -1) % p).getKey().second.equals(yCoords.get(sndIterator + j).second)
&& chain.get((p + j -1) % p).getKey().third.equals(yCoords.get(sndIterator + j).third)
&& sndIterator + j + 1< yCoords.size()) {
sameLines++;
sndIterator++;
}
if(!chain.get(j).getKey().second.equals(yCoords.get(sndIterator+j).second) ||
!chain.get(j).getKey().third.equals(yCoords.get(sndIterator+j).third)) {
stop = 1;
break;
}
}
if(stop != 1) {
// System.out.println("am mai gasit un paragraf cu final in " + sndIterator);
if(sndIterator + p > yCoords.size()) {
break;
}
sndIterator += p;
}
}
if(i + sameLines != sndIterator) {
paragraphsNoSize += (sndIterator - sameLines - i) / p + 1;
if(sndIterator > yCoords.size()) {
i = sndIterator - p;
}
else {
i = sndIterator;
}
while( sndIterator < yCoords.size() &&
chain.get(chain.size() - 1).getKey().second.equals(yCoords.get(sndIterator).second)
&& chain.get(chain.size() -1 ).getKey().third.equals(yCoords.get(sndIterator).third)
) {
sndIterator++;
}
i = sndIterator - 1;
for(int it = chain.get(0).getValue(); it < sndIterator; it++) {
linesLeft[it] = true;
}
chain.clear();
}
else {
for(int j = 0; j < p -1 ; j++) {
chain.set(j,chain.get(j+1));
}
while (chain.get(p-1).getKey().second.equals(yCoords.get(i).second)
&& chain.get(p-1).getKey().third.equals(yCoords.get(i).third)) {
if(i + 1 < yCoords.size()) {
i++;
}
else {
break;
}
}
chain.set(p-1,new Pair<>(yCoords.get(i), i));
}
}
}
}
return paragraphsNoSize;
}
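// Paragraph detection by spacing: consecutive y-coordinate differences are
// clustered with a +/-0.5 tolerance, the most frequent difference is taken as
// the regular line spacing, and any larger gap (beyond spacing + 1) counts as
// a paragraph break unless the surrounding lines were already claimed by
// getByFontSize via linesLeft.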
public int getBySpacing () {
int paragraphsNo = 0;
Map<Float,Integer> diffOccurrence = new HashMap<>();
float error = 0.5f; // tolerance used to group similar line spacings
boolean isAlready = false;
Float[] diffs = new Float[yCoords.size() - 1];
for (int i = 0 ; i < yCoords.size() - 1; i++) {
diffs[i] = yCoords.get(i + 1).first - yCoords.get(i).first;
}
for( int i = 0; i < diffs.length; i++) {
if(diffs[i] > 0) {
isAlready = false;
for (Map.Entry<Float, Integer> entry : diffOccurrence.entrySet()) {
if (entry.getKey() - error < diffs[i]
&& entry.getKey() + error > diffs[i]) {
entry.setValue(entry.getValue() + 1);
isAlready = true;
break;
}
}
if (!isAlready) {
diffOccurrence.put(diffs[i], 1);
}
}
}
float maxdiff = 0f;
int maxVal = 0;
for(Map.Entry<Float, Integer> entry : diffOccurrence.entrySet()) {
if(entry.getValue() > maxVal) {
maxVal = entry.getValue();
maxdiff = entry.getKey();
}
}
maxdiff += 1; // margin above the dominant line spacing
for (int i = 1 ; i < yCoords.size() - 1; i++) {
float diff = yCoords.get(i + 1).first - yCoords.get(i).first;
if (diff > maxdiff) {
if(i + 2 < yCoords.size()) {
if (!linesLeft[i + 1] && linesLeft[i + 2]) {
i++;
continue;
}
}
if(i - 1 > 0) {
if (linesLeft[i - 1] && !linesLeft[i] && linesLeft[i + 1]) {
continue;
}
}
if (linesLeft[i] && linesLeft[i + 1]) {
continue;
}
// System.out.println("Se schimba paragraful pe linia " + (i+1));
paragraphsNo++;
}
}
return paragraphsNo;
}
public int getParagraphs () {
linesLeft = new boolean[yCoords.size()];