Commit 46434d1a authored by Dorinela Dascalu's avatar Dorinela Dascalu 🌺

Merged with master. Resolved conflicts

parents 69d4988c c7e6b485
This diff is collapsed.
......@@ -20,6 +20,7 @@ import data.pojo.Language;
import java.util.List;
import javax.persistence.TypedQuery;
import data.pojo.SentimentValence;
import java.util.EnumMap;
/**
*
......@@ -28,7 +29,8 @@ import data.pojo.SentimentValence;
public class ValenceDAO extends AbstractDAO<SentimentValence> {
private static ValenceDAO instance = null;
private EnumMap<Lang, List<SentimentValence>> cache = new EnumMap<>(Lang.class);
private ValenceDAO() {
}
......@@ -49,14 +51,17 @@ public class ValenceDAO extends AbstractDAO<SentimentValence> {
}
public List<SentimentValence> findByLang(Lang lang) {
final Language language = Language.fromLang(lang);
return dao.executeQuery(em -> {
TypedQuery<SentimentValence> query = em.createNamedQuery(
"SentimentValence.findByLang",
SentimentValence.class);
query.setParameter("lang", language);
return query.getResultList();
});
if (!cache.containsKey(lang)) {
final Language language = Language.fromLang(lang);
cache.put(lang, dao.executeQuery(em -> {
TypedQuery<SentimentValence> query = em.createNamedQuery(
"SentimentValence.findByLang",
SentimentValence.class);
query.setParameter("lang", language);
return query.getResultList();
}));
}
return cache.get(lang);
}
}
......@@ -161,8 +161,8 @@ public abstract class AbstractDocument extends AnalysisElement {
}
}
public void computeAll(boolean computeDialogism) {
computeDiscourseAnalysis(computeDialogism);
public void computeAll(boolean computeDialogism, boolean useBigrams) {
computeDiscourseAnalysis(computeDialogism, useBigrams);
ComplexityIndices.computeComplexityFactors(this);
}
......@@ -188,8 +188,9 @@ public abstract class AbstractDocument extends AnalysisElement {
/**
*
* @param computeDialogism
* @param useBigrams
*/
public void computeDiscourseAnalysis(boolean computeDialogism) {
public void computeDiscourseAnalysis(boolean computeDialogism, boolean useBigrams) {
if (computeDialogism) {
// build disambiguisation graph and lexical chains
DisambiguisationGraphAndLexicalChains.buildDisambiguationGraph(this);
......@@ -220,7 +221,7 @@ public abstract class AbstractDocument extends AnalysisElement {
// t2 = System.currentTimeMillis();
// System.out.println("old cohesion time: " + ((t2 - t1) / 1000.) + " sec");
// determine topics
KeywordModeling.determineKeywords(this);
KeywordModeling.determineKeywords(this, useBigrams);
// TopicModel.determineTopicsLDA(this);
Scoring.score(this);
......@@ -253,10 +254,10 @@ public abstract class AbstractDocument extends AnalysisElement {
public static AbstractDocument loadGenericDocument(String pathToDoc,
Map<SimilarityType, String> modelPaths, Lang lang,
boolean usePOSTagging, boolean computeDialogism, String pathToComplexityModel,
boolean usePOSTagging, boolean computeDialogism, boolean useBigrams, String pathToComplexityModel,
int[] selectedComplexityFactors, boolean cleanInput, SaveType saveOutput) {
List<ISemanticModel> models = SimilarityType.loadVectorModels(modelPaths, lang);
return loadGenericDocument(new File(pathToDoc), models, lang, usePOSTagging, computeDialogism,
return loadGenericDocument(new File(pathToDoc), models, lang, usePOSTagging, computeDialogism, useBigrams,
pathToComplexityModel, selectedComplexityFactors, cleanInput, saveOutput);
}
......@@ -286,7 +287,7 @@ public abstract class AbstractDocument extends AnalysisElement {
}
public static AbstractDocument loadGenericDocument(File docFile, List<ISemanticModel> models,
Lang lang, boolean usePOSTagging, boolean computeDialogism,
Lang lang, boolean usePOSTagging, boolean computeDialogism, boolean useBigrams,
String pathToComplexityModel, int[] selectedComplexityFactors,
boolean cleanInput, SaveType saveOutput) {
// parse the XML file
......@@ -304,13 +305,13 @@ public abstract class AbstractDocument extends AnalysisElement {
if (isDocument) {
Document d = Document.load(docFile, models, lang, usePOSTagging);
d.computeAll(computeDialogism);
d.computeAll(computeDialogism, useBigrams);
d.save(saveOutput);
return d;
}
if (isChat) {
Conversation c = Conversation.load(docFile, models, lang, usePOSTagging);
c.computeAll(computeDialogism);
c.computeAll(computeDialogism, useBigrams);
c.save(saveOutput);
return c;
}
......
package data;
import edu.stanford.nlp.util.Triple;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import javafx.util.Pair;
//import org.datavec.api.berkeley.Triple;
/**
* Created by Gabriel Cristian on 6/14/2017.
*/
/**
 * Heuristically estimates the structure (paragraph and sentence counts) of a CV
 * from per-line layout information: each text line is recorded as a
 * Triple of (y coordinate, font size, font name).
 *
 * Created by Gabriel Cristian on 6/14/2017.
 */
public class CVStructure {

    AbstractDocument document;

    // Minimum (inclusive) / maximum (exclusive) length of a repeating
    // font-size/font-name pattern treated as a paragraph "chain" in getByFontSize().
    private final int MIN_CHAIN_NR = 2;
    private final int MAX_CHAIN_NR = 5;
    // Stop chain detection once total/remaining lines exceed this ratio.
    private final float CHAIN_PROPORTION = 2;

    // Marks lines already attributed to a paragraph by getByFontSize() so that
    // getBySpacing() does not count them again. Allocated by getParagraphs();
    // NOTE(review): calling getByFontSize()/getBySpacing() directly before
    // getParagraphs() would hit a null array — confirm callers never do that.
    public boolean[] linesLeft;

    // One entry per text line: (y coordinate, font size, font name).
    private ArrayList<Triple<Float, Float, String>> yCoords = new ArrayList<>();

    public CVStructure () {
        // Sentinel entry so getLastYCoord() is safe before any real line is added.
        yCoords.add(new Triple<>(0f, 0f, " "));
    }

    // NOTE(review): unlike the no-arg constructor, this one does not add the
    // (0, 0, " ") sentinel — confirm the two construction paths are never mixed.
    public CVStructure (AbstractDocument document) {
        this.document = document;
    }

    /** Returns the y coordinate of the most recently added line. */
    public float getLastYCoord() {
        return yCoords.get(yCoords.size() - 1).first;
    }

    /** Appends one line's (y, fontSize, fontName) record. */
    public void addYCoord(Triple<Float, Float, String> y) {
        yCoords.add(y);
    }

    /** Returns the internal list of line records (not a copy). */
    public ArrayList<Triple<Float, Float, String>> getYCoords() {
        return yCoords;
    }

    public float getCoord (Triple<Float, Float, String> info) {
        return info.first;
    }

    public float getFontSize (Triple<Float, Float, String> info) {
        return info.second;
    }

    public String getFontName (Triple<Float, Float, String> info) {
        return info.third;
    }

    /**
     * Counts paragraphs by detecting short periodic repetitions (length
     * MIN_CHAIN_NR..MAX_CHAIN_NR-1) of (font size, font name) patterns across
     * consecutive lines. Lines attributed to a detected paragraph are flagged
     * in {@link #linesLeft} so getBySpacing() skips them.
     *
     * @return number of paragraphs detected from font patterns
     */
    public int getByFontSize() {
        int paragraphsNoSize = 0;
        //trying to get 2-4 length periodical occurrences that may indicate a paragraph
        for (int p = MIN_CHAIN_NR; p < MAX_CHAIN_NR; p++) {
            // Count lines not yet claimed by a previously detected chain.
            int linesLeftNo = linesLeft.length;
            for (boolean line : linesLeft) {
                if (line == true) {
                    linesLeftNo--;
                }
            }
            // FIX: the original computed (float)(length / linesLeftNo), an integer
            // division truncated before the cast, and threw ArithmeticException
            // when linesLeftNo reached 0. Cast the numerator first and guard zero.
            if (linesLeftNo == 0
                    || (float) linesLeft.length / linesLeftNo > CHAIN_PROPORTION) {
                break;
            }
            // Sliding window of the last p lines: (line record, line index).
            ArrayList<Pair<Triple<Float, Float, String>, Integer>> chain = new ArrayList<>();
            for (int i = 0; i < yCoords.size() - p; i++) {
                if (chain.size() != p) {
                    chain.add(new Pair<>(yCoords.get(i), i));
                } else {
                    int stop = 0;
                    int sndIterator = i;
                    int sameLines = 0;
                    // Walk forward while the next lines keep repeating the chain's
                    // (font size, font name) pattern with period p.
                    while (stop != 1 && sndIterator < yCoords.size()) {
                        for (int j = 0; j < p && sndIterator + j < yCoords.size(); j++) {
                            // Skip runs of identical consecutive lines inside the pattern.
                            while (chain.get((p + j - 1) % p).getKey().second.equals(yCoords.get(sndIterator + j).second)
                                    && chain.get((p + j - 1) % p).getKey().third.equals(yCoords.get(sndIterator + j).third)
                                    && sndIterator + j + 1 < yCoords.size()) {
                                sameLines++;
                                sndIterator++;
                            }
                            if (!chain.get(j).getKey().second.equals(yCoords.get(sndIterator + j).second)
                                    || !chain.get(j).getKey().third.equals(yCoords.get(sndIterator + j).third)) {
                                stop = 1;
                                break;
                            }
                        }
                        if (stop != 1) {
                            if (sndIterator + p > yCoords.size()) {
                                break;
                            }
                            sndIterator += p;
                        }
                    }
                    if (i + sameLines != sndIterator) {
                        // Pattern repeated: count the covered repetitions as paragraphs.
                        paragraphsNoSize += (sndIterator - sameLines - i) / p + 1;
                        if (sndIterator > yCoords.size()) {
                            i = sndIterator - p;
                        } else {
                            i = sndIterator;
                        }
                        // Absorb trailing lines identical to the chain's last entry.
                        while (sndIterator < yCoords.size()
                                && chain.get(chain.size() - 1).getKey().second.equals(yCoords.get(sndIterator).second)
                                && chain.get(chain.size() - 1).getKey().third.equals(yCoords.get(sndIterator).third)) {
                            sndIterator++;
                        }
                        i = sndIterator - 1;
                        // Claim every line of the detected paragraph block.
                        for (int it = chain.get(0).getValue(); it < sndIterator; it++) {
                            linesLeft[it] = true;
                        }
                        chain.clear();
                    } else {
                        // No repetition: slide the window forward by one line.
                        for (int j = 0; j < p - 1; j++) {
                            chain.set(j, chain.get(j + 1));
                        }
                        // Skip over lines identical to the window's last entry.
                        while (chain.get(p - 1).getKey().second.equals(yCoords.get(i).second)
                                && chain.get(p - 1).getKey().third.equals(yCoords.get(i).third)) {
                            if (i + 1 < yCoords.size()) {
                                i++;
                            } else {
                                break;
                            }
                        }
                        chain.set(p - 1, new Pair<>(yCoords.get(i), i));
                    }
                }
            }
        }
        return paragraphsNoSize;
    }

    /**
     * Counts paragraph breaks by finding vertical gaps between consecutive
     * lines larger than the dominant line spacing (+1 unit of tolerance).
     * Lines already claimed in {@link #linesLeft} are skipped.
     *
     * @return number of paragraph breaks detected from spacing
     */
    public int getBySpacing () {
        int paragraphsNo = 0;
        Map<Float, Integer> diffOccurrence = new HashMap<>();
        float error = (float) 0.5;
        boolean isAlready = false;
        // Vertical distance between each pair of consecutive lines.
        Float[] diffs = new Float[yCoords.size() - 1];
        for (int i = 0; i < yCoords.size() - 1; i++) {
            diffs[i] = yCoords.get(i + 1).first - yCoords.get(i).first;
        }
        // Histogram of spacings, bucketing values within +/- error of an existing key.
        for (int i = 0; i < diffs.length; i++) {
            if (diffs[i] > 0) {
                isAlready = false;
                for (Map.Entry<Float, Integer> entry : diffOccurrence.entrySet()) {
                    if (entry.getKey() - error < diffs[i]
                            && entry.getKey() + error > diffs[i]) {
                        entry.setValue(entry.getValue() + 1);
                        isAlready = true;
                        break;
                    }
                }
                if (isAlready == false) {
                    diffOccurrence.put(diffs[i], 1);
                }
            }
        }
        // The most frequent spacing is assumed to be the regular in-paragraph spacing.
        Float maxdiff = (float) (0);
        int maxVal = 0;
        for (Map.Entry<Float, Integer> entry : diffOccurrence.entrySet()) {
            if (entry.getValue() > maxVal) {
                maxVal = entry.getValue();
                maxdiff = entry.getKey();
            }
        }
        maxdiff += 1;
        for (int i = 1; i < yCoords.size() - 1; i++) {
            float diff = yCoords.get(i + 1).first - yCoords.get(i).first;
            if (diff > maxdiff) {
                // Skip gaps adjacent to blocks already claimed by getByFontSize().
                if (i + 2 < yCoords.size()) {
                    if (linesLeft[i + 1] == false && linesLeft[i + 2] == true) {
                        i++;
                        continue;
                    }
                }
                if (i - 1 > 0) {
                    if (linesLeft[i - 1] == true && linesLeft[i] == false && linesLeft[i + 1] == true) {
                        continue;
                    }
                }
                if (linesLeft[i] == true && linesLeft[i + 1] == true) {
                    continue;
                }
                paragraphsNo++;
            }
        }
        return paragraphsNo;
    }

    /**
     * Total paragraph estimate: font-pattern paragraphs plus spacing paragraphs.
     *
     * @return combined paragraph count
     */
    public int getParagraphs () {
        // A freshly allocated boolean[] is already all-false; the original's
        // reset loop assigned to the foreach variable and was a no-op anyway.
        linesLeft = new boolean[yCoords.size()];
        int paragraphs = getByFontSize();
        paragraphs += getBySpacing();
        return paragraphs;
    }

    /**
     * Rough sentence count: one sentence per physical line, plus one for every
     * ". " (period followed by a space) occurring inside a line.
     *
     * @param parsedText text with lines separated by \r and/or \n
     * @return estimated number of sentences
     */
    public int getSentences (String parsedText) {
        int sentencesNo = 0;
        String[] lines = parsedText.split("[\\r\\n]+");
        for (int i = 0; i < lines.length; i++) {
            sentencesNo++;
            for (int j = 0; j < lines[i].length() - 1; j++) {
                if (lines[i].charAt(j) == '.' &&
                        lines[i].charAt(j + 1) == ' ') {
                    sentencesNo++;
                }
            }
        }
        return sentencesNo;
    }
}
......@@ -23,7 +23,7 @@ public enum Lang implements Serializable {
fr("French", Locale.FRENCH),
it("Italian", Locale.ENGLISH),
ro("Romanian", Locale.ENGLISH),
es("Spanish", Locale.ENGLISH),
es("Spanish", new Locale("es", "ES")),
de("German", Locale.ENGLISH),
nl("Dutch", Locale.ENGLISH),
la("Latin", Locale.ENGLISH);
......
......@@ -27,16 +27,26 @@ public enum SemanticCorpora implements Serializable {
tasa_en_lsa("TASA", Lang.en, SimilarityType.LSA),
tasa_en_lda("TASA", Lang.en, SimilarityType.LDA),
tasa_en_word2vec("TASA", Lang.en, SimilarityType.WORD2VEC),
tasa_lak_en_lsa("TASA_LAK", Lang.en, SimilarityType.LSA),
tasa_lak_en_lda("TASA_LAK", Lang.en, SimilarityType.LDA),
sciref_en_lsa("SciRef", Lang.en, SimilarityType.LSA),
enea_tasa_en_lsa("ENEA_TASA", Lang.en, SimilarityType.LSA),
enea_tasa_en_lda("ENEA_TASA", Lang.en, SimilarityType.LDA),
enea_tasa_en_word2vec("ENEA_TASA", Lang.en, SimilarityType.WORD2VEC),
le_monde_fr_lsa("Le_Monde", Lang.fr, SimilarityType.LSA),
le_monde_fr_lda("Le_Monde", Lang.fr, SimilarityType.LDA),
le_monde_fr_word2vec("Le_Monde", Lang.fr, SimilarityType.WORD2VEC),
euro_parlamentean_nl_lda("Euro_Parlamentean", Lang.nl, SimilarityType.LDA),
inl_nl_lda("INL", Lang.nl, SimilarityType.LDA);
inl_nl_lda("INL", Lang.nl, SimilarityType.LDA),
jose_antonio_es_lsa("Jose_Antonio", Lang.es, SimilarityType.LSA),
jose_antonio_es_lda("Jose_Antonio", Lang.es, SimilarityType.LDA),
jose_antonio_es_word2vec("Jose_Antonio", Lang.es, SimilarityType.WORD2VEC);
private final static String SEMANTIC_CORPORA_ROOT = "resources/config/";
private final String corpora;
......
......@@ -28,6 +28,7 @@ import java.util.ArrayList;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import services.commons.TextPreprocessing;
import services.semanticModels.ISemanticModel;
/**
......@@ -42,6 +43,7 @@ public class Sentence extends AnalysisElement implements Comparable<Sentence> {
private List<Word> allWords;
private transient SemanticGraph dependencies;
private final Map<Word, Word> pronimialReplacementMap;
private transient String cleanedText = null;
public Sentence(Block b, int index, String text, List<ISemanticModel> models, Lang lang) {
super(b, index, text.replaceAll("\\s", " ").trim(), models, lang);
......@@ -93,6 +95,19 @@ public class Sentence extends AnalysisElement implements Comparable<Sentence> {
public void setAllWords(List<Word> allWords) {
this.allWords = allWords;
}
public String getCleanedText() {
if (cleanedText == null) {
cleanedText = TextPreprocessing.cleanText(getText(), getLanguage());
}
return cleanedText;
}
public void setCleanedText(String cleanedText) {
this.cleanedText = cleanedText;
}
private Word getWordByIndex(IndexedWord iw) {
int index = iw.get(CoreAnnotations.IndexAnnotation.class) - 1;
......
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package data;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
/**
*
* @author admin_licenta
*/
/**
 * One syllable of a phonetic transcription: the raw text plus its
 * whitespace-separated phoneme symbols. Stress is read from the digits
 * embedded in the symbols: "1" marks primary stress, "2" secondary.
 */
public class Syllable implements Serializable, Comparable<Syllable> {

    // Raw input string, kept verbatim (including surrounding whitespace).
    private final String text;
    // Phoneme symbols obtained by splitting the trimmed input on whitespace.
    private final List<String> symbols;
    private final boolean primaryStressed;
    private final boolean secondaryStressed;

    /**
     * @param syll whitespace-separated phoneme symbols, e.g. "B AH0 "
     */
    public Syllable(String syll) {
        this.text = syll;
        String[] parts = syll.trim().split("\\s+");
        this.symbols = new ArrayList<>(Arrays.asList(parts));
        this.primaryStressed = syll.contains("1");
        this.secondaryStressed = syll.contains("2");
    }

    /** @return the original, unmodified input string */
    public String getText() {
        return text;
    }

    /** @return the phoneme symbols of this syllable */
    public List<String> getSymbols() {
        return symbols;
    }

    public boolean isPrimaryStressed() {
        return primaryStressed;
    }

    public boolean isSecondaryStressed() {
        return secondaryStressed;
    }

    /** @return the symbols re-joined with single spaces */
    @Override
    public String toString() {
        return String.join(" ", symbols).trim();
    }

    /** Orders syllables lexicographically by their raw text. */
    @Override
    public int compareTo(Syllable s) {
        return this.text.compareTo(s.text);
    }

    /** Small manual smoke test. */
    public static void main(String[] args) {
        Syllable s1 = new Syllable("AE1 ");
        System.out.println(s1.getSymbols());
        System.out.println(s1.toString().length());
        Syllable s2 = new Syllable("B AH0 ");
        System.out.println(s2.getSymbols());
        System.out.println(s2.toString().length());
        Syllable s3 = new Syllable("K AH0 S");
        System.out.println(s3.getSymbols().size());
        System.out.println(s3.isPrimaryStressed());
    }
}
This diff is collapsed.
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package data.article;
/**
*
* @author Cosmin
*/
/**
 * Keys for statistics computed over an article's references — presumably over
 * their publication years (inferred from the constant names; confirm against
 * the code that populates these values).
 *
 * Declaration order is preserved from the original: callers may rely on
 * {@code ordinal()} and {@code values()} ordering.
 */
public enum ReferencesYearsInfo {
    MAX_YEAR,
    MIN_YEAR,
    SLOPE,
    AVG,
    STD_DEV,
    NORMALITY,
    NOVELTY,
    AVG_NOVELTY,
    STARTING_FROM,
    COUNT,
    NO_MAX_YEAR;
}
/*
* Copyright 2016 ReaderBench.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package data.cscl;
import java.util.ResourceBundle;
import services.commons.VectorAlgebra;
public enum CSCLCriteria {
AVERAGE, STDEV, SLOPE, ENTROPY, UNIFORMITY, LOCAL_EXTREME, RECURRENCE_AVERAGE, RECURRENCE_STDEV;
public String getDescription() {
return ResourceBundle.getBundle("utils.localization.CSCL_criteria").getString(this.name());
}
/**
* Apply a certain criteria on the input vector
*
* @param crit
* @param v
* @return
*/
public static double getValue(CSCLCriteria crit, double[] v) {
switch (crit) {
case AVERAGE:
return VectorAlgebra.avg(v);
case STDEV:
return VectorAlgebra.stdev(v);
case SLOPE:
return VectorAlgebra.slope(v);
case ENTROPY:
return VectorAlgebra.entropy(v);
case UNIFORMITY:
return VectorAlgebra.uniformity(v); </