Commit 6d3656bd authored by Gabriel Gutu-Robu's avatar Gabriel Gutu-Robu

Merge branch 'master' into 'cercetare_analiza_CV'

# Conflicts:
#   src/main/resources/utils/localization/cv_errors.properties
parents 6c86c747 a0dacc3e
This diff is collapsed.
......@@ -20,6 +20,7 @@ import data.pojo.Language;
import java.util.List;
import javax.persistence.TypedQuery;
import data.pojo.SentimentValence;
import java.util.EnumMap;
/**
*
......@@ -28,7 +29,8 @@ import data.pojo.SentimentValence;
public class ValenceDAO extends AbstractDAO<SentimentValence> {
private static ValenceDAO instance = null;
private EnumMap<Lang, List<SentimentValence>> cache = new EnumMap<>(Lang.class);
private ValenceDAO() {
}
......@@ -49,14 +51,17 @@ public class ValenceDAO extends AbstractDAO<SentimentValence> {
}
public List<SentimentValence> findByLang(Lang lang) {
final Language language = Language.fromLang(lang);
return dao.executeQuery(em -> {
TypedQuery<SentimentValence> query = em.createNamedQuery(
"SentimentValence.findByLang",
SentimentValence.class);
query.setParameter("lang", language);
return query.getResultList();
});
if (!cache.containsKey(lang)) {
final Language language = Language.fromLang(lang);
cache.put(lang, dao.executeQuery(em -> {
TypedQuery<SentimentValence> query = em.createNamedQuery(
"SentimentValence.findByLang",
SentimentValence.class);
query.setParameter("lang", language);
return query.getResultList();
}));
}
return cache.get(lang);
}
}
package data;
import edu.stanford.nlp.util.Triple;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import javafx.util.Pair;
import org.datavec.api.berkeley.Triple;
//import org.datavec.api.berkeley.Triple;
/**
* Created by Gabriel Cristian on 6/14/2017.
......@@ -20,7 +21,7 @@ public class CVStructure {
private ArrayList<Triple<Float, Float, String>> yCoords = new ArrayList<>();
public CVStructure () {
yCoords.add(new Triple<>(new Float(0),new Float(0), " "));
yCoords.add(new Triple<>(0f, 0f, " "));
}
public CVStructure (AbstractDocument document) {
......@@ -28,10 +29,10 @@ public class CVStructure {
}
public float getLastYCoord() {
return yCoords.get(yCoords.size() - 1).getFirst();
return yCoords.get(yCoords.size() - 1).first;
}
public void addYCoord(Triple y) {
public void addYCoord(Triple<Float, Float, String> y) {
yCoords.add(y);
}
......@@ -40,15 +41,15 @@ public class CVStructure {
}
public float getCoord (Triple<Float, Float, String> info) {
return info.getFirst();
return info.first;
}
public float getFontSize (Triple<Float, Float, String> info) {
return info.getSecond();
return info.second;
}
public String getFontName (Triple<Float, Float, String> info) {
return info.getThird();
return info.third;
}
......@@ -85,15 +86,15 @@ public class CVStructure {
for(int j = 0; j < p && sndIterator + j < yCoords.size(); j++) {
while(chain.get((p + j -1) % p).getKey().getSecond().equals(yCoords.get(sndIterator + j).getSecond())
&& chain.get((p + j -1) % p).getKey().getThird().equals(yCoords.get(sndIterator + j).getThird())
while(chain.get((p + j -1) % p).getKey().second.equals(yCoords.get(sndIterator + j).second)
&& chain.get((p + j -1) % p).getKey().third.equals(yCoords.get(sndIterator + j).third)
&& sndIterator + j + 1< yCoords.size()) {
sameLines++;
sndIterator++;
}
if(!chain.get(j).getKey().getSecond().equals(yCoords.get(sndIterator+j).getSecond()) ||
!chain.get(j).getKey().getThird().equals(yCoords.get(sndIterator+j).getThird())) {
if(!chain.get(j).getKey().second.equals(yCoords.get(sndIterator+j).second) ||
!chain.get(j).getKey().third.equals(yCoords.get(sndIterator+j).third)) {
stop = 1;
break;
......@@ -120,8 +121,8 @@ public class CVStructure {
while( sndIterator < yCoords.size() &&
chain.get(chain.size() - 1).getKey().getSecond().equals(yCoords.get(sndIterator).getSecond())
&& chain.get(chain.size() -1 ).getKey().getThird().equals(yCoords.get(sndIterator).getThird())
chain.get(chain.size() - 1).getKey().second.equals(yCoords.get(sndIterator).second)
&& chain.get(chain.size() -1 ).getKey().third.equals(yCoords.get(sndIterator).third)
) {
sndIterator++;
......@@ -139,8 +140,8 @@ public class CVStructure {
for(int j = 0; j < p -1 ; j++) {
chain.set(j,chain.get(j+1));
}
while (chain.get(p-1).getKey().getSecond().equals(yCoords.get(i).getSecond())
&& chain.get(p-1).getKey().getThird().equals(yCoords.get(i).getThird())) {
while (chain.get(p-1).getKey().second.equals(yCoords.get(i).second)
&& chain.get(p-1).getKey().third.equals(yCoords.get(i).third)) {
if(i + 1 < yCoords.size()) {
i++;
}
......@@ -168,7 +169,7 @@ public class CVStructure {
Float[] diffs = new Float[yCoords.size() - 1];
for (int i = 0 ; i < yCoords.size() - 1; i++) {
diffs[i] = yCoords.get(i + 1).getFirst() - yCoords.get(i).getFirst();
diffs[i] = yCoords.get(i + 1).first - yCoords.get(i).first;
}
for( int i = 0; i < diffs.length; i++) {
......@@ -199,7 +200,7 @@ public class CVStructure {
maxdiff += 1;
for (int i = 1 ; i < yCoords.size() - 1; i++) {
float diff = yCoords.get(i + 1).getFirst() - yCoords.get(i).getFirst();
float diff = yCoords.get(i + 1).first - yCoords.get(i).first;
if (diff > maxdiff) {
if(i + 2 < yCoords.size()) {
......
......@@ -23,7 +23,7 @@ public enum Lang implements Serializable {
fr("French", Locale.FRENCH),
it("Italian", Locale.ENGLISH),
ro("Romanian", Locale.ENGLISH),
es("Spanish", Locale.ENGLISH),
es("Spanish", new Locale("es", "ES")),
de("German", Locale.ENGLISH),
nl("Dutch", Locale.ENGLISH),
la("Latin", Locale.ENGLISH);
......
......@@ -27,16 +27,26 @@ public enum SemanticCorpora implements Serializable {
tasa_en_lsa("TASA", Lang.en, SimilarityType.LSA),
tasa_en_lda("TASA", Lang.en, SimilarityType.LDA),
tasa_en_word2vec("TASA", Lang.en, SimilarityType.WORD2VEC),
tasa_lak_en_lsa("TASA_LAK", Lang.en, SimilarityType.LSA),
tasa_lak_en_lda("TASA_LAK", Lang.en, SimilarityType.LDA),
sciref_en_lsa("SciRef", Lang.en, SimilarityType.LSA),
enea_tasa_en_lsa("ENEA_TASA", Lang.en, SimilarityType.LSA),
enea_tasa_en_lda("ENEA_TASA", Lang.en, SimilarityType.LDA),
enea_tasa_en_word2vec("ENEA_TASA", Lang.en, SimilarityType.WORD2VEC),
le_monde_fr_lsa("Le_Monde", Lang.fr, SimilarityType.LSA),
le_monde_fr_lda("Le_Monde", Lang.fr, SimilarityType.LDA),
le_monde_fr_word2vec("Le_Monde", Lang.fr, SimilarityType.WORD2VEC),
euro_parlamentean_nl_lda("Euro_Parlamentean", Lang.nl, SimilarityType.LDA),
inl_nl_lda("INL", Lang.nl, SimilarityType.LDA);
inl_nl_lda("INL", Lang.nl, SimilarityType.LDA),
jose_antonio_es_lsa("Jose_Antonio", Lang.es, SimilarityType.LSA),
jose_antonio_es_lda("Jose_Antonio", Lang.es, SimilarityType.LDA),
jose_antonio_es_word2vec("Jose_Antonio", Lang.es, SimilarityType.WORD2VEC);
private final static String SEMANTIC_CORPORA_ROOT = "resources/config/";
private final String corpora;
......
......@@ -28,6 +28,7 @@ import java.util.ArrayList;
import java.util.Objects;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import services.commons.TextPreprocessing;
import services.semanticModels.ISemanticModel;
/**
......@@ -42,6 +43,7 @@ public class Sentence extends AnalysisElement implements Comparable<Sentence> {
private List<Word> allWords;
private transient SemanticGraph dependencies;
private final Map<Word, Word> pronimialReplacementMap;
private transient String cleanedText = null;
public Sentence(Block b, int index, String text, List<ISemanticModel> models, Lang lang) {
super(b, index, text.replaceAll("\\s", " ").trim(), models, lang);
......@@ -93,6 +95,19 @@ public class Sentence extends AnalysisElement implements Comparable<Sentence> {
public void setAllWords(List<Word> allWords) {
this.allWords = allWords;
}
public String getCleanedText() {
if (cleanedText == null) {
cleanedText = TextPreprocessing.cleanText(getText(), getLanguage());
}
return cleanedText;
}
public void setCleanedText(String cleanedText) {
this.cleanedText = cleanedText;
}
private Word getWordByIndex(IndexedWord iw) {
int index = iw.get(CoreAnnotations.IndexAnnotation.class) - 1;
......
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package data.article;
/**
*
* @author Cosmin
*/
public enum ReferencesYearsInfo {
    // Statistics computed over the publication years of an article's
    // reference list. Constant order is preserved: callers may depend on
    // ordinal() values.
    MAX_YEAR,
    MIN_YEAR,
    SLOPE,
    AVG,
    STD_DEV,
    NORMALITY,
    NOVELTY,
    AVG_NOVELTY,
    STARTING_FROM,
    COUNT,
    NO_MAX_YEAR;
}
......@@ -51,6 +51,6 @@ public enum CSCLIndices {
}
public String getAcronym() {
return ResourceBundle.getBundle("utils.localization.CSCL_indices_acro").getString(this.name());
return ResourceBundle.getBundle("utils.localization.CSCL_indices_acronyms").getString(this.name());
}
}
......@@ -602,7 +602,7 @@ public class Community extends AnalysisElement {
}
public static void processAllFolders(String folder, Lang lang, String prefix, boolean needsAnonymization,
boolean restartProcessing, String pathToLSA, String pathToLDA, boolean usePOSTagging,
boolean restartProcessing, String pathToLSA, String pathToLDA, String pathToWord2Vec, boolean usePOSTagging,
boolean useTextualComplexity, boolean exportIntoCsv, boolean generateParticipantView, boolean generateParticipantViewD3, boolean generateParticipantViewSubCommunities,
boolean generateConceptView, Date startDate, Date endDate, int monthIncrement, int dayIncrement) {
File dir = new File(folder);
......@@ -618,7 +618,7 @@ public class Community extends AnalysisElement {
checkpoint.delete();
}
}
SerialProcessing.processCorpus(f.getAbsolutePath(), pathToLSA, pathToLDA, lang, usePOSTagging,
SerialProcessing.processCorpus(f.getAbsolutePath(), pathToLSA, pathToLDA, pathToWord2Vec, lang, usePOSTagging,
true, true, SaveType.SERIALIZED_AND_CSV_EXPORT);
Community.processDocumentCollection(f.getAbsolutePath(), lang, needsAnonymization, useTextualComplexity,
exportIntoCsv, generateParticipantView, generateParticipantViewD3, generateParticipantViewSubCommunities, generateConceptView,
......
......@@ -90,13 +90,18 @@ public class Document extends AbstractDocument implements Comparable<Document> {
}
public Document(AbstractDocumentTemplate docTmp, List<ISemanticModel> semModels, Lang lang, boolean usePOSTagging) {
this(docTmp, semModels, lang, usePOSTagging, Parsing.getParser(lang));
}
public Document(AbstractDocumentTemplate docTmp, List<ISemanticModel> semModels,
Lang lang, boolean usePOSTagging, Parsing parser) {
setLanguage(lang);
super.setSemanticModels(semModels);
setDisambiguationGraph(new DisambiguationGraph(lang));
setText(docTmp.getText());
setDocTmp(docTmp);
Parsing.getParser(lang).parseDoc(docTmp, this, usePOSTagging);
parser.parseDoc(docTmp, this, usePOSTagging);
}
public static Document load(String pathToDoc, Map<SimilarityType, String> modelPaths, Lang lang, boolean usePOSTagging) {
......
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package runtime.converters;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* @author Cosmin
*/
public class APARefsEntriesSeparator {

    // "Surname, I." at the start of a line (Romanian diacritics allowed).
    // Compiled once: Pattern.compile is expensive and patterns are immutable.
    private static final Pattern ENTRY_START = Pattern.compile(
            "^\\s*[A-ZĂÎÂȘȚ][a-zA-Z-ăîâșțĂÎÂȘȚ']+,\\s[A-ZăîâșțĂÎÂȘȚ]\\.");
    // A sentence-ending period (not an initial, i.e. not preceded by an
    // uppercase letter) followed by a new "Surname, I." author name — marks
    // an entry boundary in the middle of a line.
    private static final Pattern ENTRY_BREAK = Pattern.compile(
            "(?<![A-Z])\\.\\s+[A-ZĂÎÂȘȚ][a-zA-Z-ăîâșțĂÎÂȘȚ']+,\\s[A-ZăîâșțĂÎÂȘȚ]\\.");

    /**
     * Splits an APA-style references section into individual entries.
     * <p>
     * An entry starts either at a line beginning with "Surname, I." or, within
     * a line, right before a ". Surname, I." boundary. Note that lines are
     * concatenated without re-inserting the newline separator (original
     * behavior, preserved).
     *
     * @param refSection raw text of the references section, entries possibly
     *                   wrapped over several lines
     * @return the individual reference entries, in order; never null
     */
    public static List<String> separate(String refSection) {
        List<String> sections = new ArrayList<>();
        String currentSection = "";
        for (String line : refSection.split("\n")) {
            Matcher startMatcher = ENTRY_START.matcher(line);
            Matcher breakMatcher = ENTRY_BREAK.matcher(line);
            // Original code compared with `currentSection != ""`, which tests
            // reference identity and let empty-but-distinct strings through,
            // emitting spurious empty entries; isEmpty() tests content.
            if (startMatcher.find() && !currentSection.isEmpty()) {
                // A new entry starts on this line: flush the previous one.
                sections.add(currentSection);
                currentSection = "";
            }
            if (breakMatcher.find()) {
                // One or more entries end mid-line; cut at each boundary. The
                // matched ". Surname, I." prefix stays with the NEXT entry
                // (lastPos moves to the match start, not its end) — preserved
                // from the original implementation.
                int lastPos = 0;
                do {
                    currentSection += line.substring(lastPos, breakMatcher.start());
                    sections.add(currentSection);
                    currentSection = "";
                    lastPos = breakMatcher.start();
                } while (breakMatcher.find());
                currentSection += line.substring(lastPos);
            } else {
                currentSection += line;
            }
        }
        if (!currentSection.isEmpty()) {
            sections.add(currentSection);
        }
        return sections;
    }
}
package runtime.converters;
import java.util.List;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* @author Cosmin
*/
public class IEEERefsEntriesSeparator {

    // An IEEE-style entry label at the start of a line: "12. " or "[12] ".
    // Compiled once: Pattern.compile is expensive and patterns are immutable.
    private static final Pattern ENTRY_START = Pattern.compile(
            "^([0-9]+\\.\\s|\\[[0-9]+\\]\\s)");
    // A label appearing mid-line ("... 12. " or "... [12] "), excluding volume
    // numbers ("vol. 12"). NOTE(review): the '.' in "(?<!vol.)" is unescaped,
    // so it matches ANY character after "vol" — probably intended "vol\\.";
    // kept as-is to preserve behavior, confirm before tightening.
    private static final Pattern ENTRY_BREAK = Pattern.compile(
            "(?<!vol.)\\s([0-9]{1,2}\\.|\\[[0-9]{1,2}\\])\\s");

    /**
     * Splits an IEEE-style references section into individual entries.
     * <p>
     * An entry starts either at a line beginning with a numeric label or,
     * within a line, right before a " [n] " / " n. " boundary. Lines are
     * concatenated without re-inserting the newline separator (original
     * behavior, preserved).
     *
     * @param refSection raw text of the references section
     * @return the individual reference entries, in order; never null
     */
    public static List<String> separate(String refSection) {
        List<String> sections = new ArrayList<>();
        String currentSection = "";
        for (String line : refSection.split("\n")) {
            Matcher startMatcher = ENTRY_START.matcher(line);
            Matcher breakMatcher = ENTRY_BREAK.matcher(line);
            // Original code compared with `currentSection != ""`, which tests
            // reference identity and let empty-but-distinct strings through,
            // emitting spurious empty entries; isEmpty() tests content.
            if (startMatcher.find() && !currentSection.isEmpty()) {
                // A new entry starts on this line: flush the previous one.
                sections.add(currentSection);
                currentSection = "";
            }
            if (breakMatcher.find()) {
                // One or more entries end mid-line; cut at each boundary. The
                // matched label stays with the NEXT entry (lastPos moves to the
                // match start, not its end) — preserved from the original.
                int lastPos = 0;
                do {
                    currentSection += line.substring(lastPos, breakMatcher.start());
                    sections.add(currentSection);
                    currentSection = "";
                    lastPos = breakMatcher.start();
                } while (breakMatcher.find());
                currentSection += line.substring(lastPos);
            } else {
                currentSection += line;
            }
        }
        if (!currentSection.isEmpty()) {
            sections.add(currentSection);
        }
        return sections;
    }
}
/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package runtime.converters;
import data.AbstractDocumentTemplate;
import data.Lang;
import data.document.Document;
import edu.stanford.nlp.util.Pair;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.openide.util.Exceptions;
import services.complexity.CAF.CAFFactory;
import services.complexity.ComplexityIndex;
import services.complexity.ComplexityIndexType;
import services.complexity.ComplexityIndices;
import services.complexity.cohesion.CohesionFactory;
import services.complexity.connectives.ConnectivesFactory;
import services.complexity.coreference.CoreferenceFactory;
import services.complexity.dependencies.SyntacticDependenciesFactory;
import services.complexity.dialogism.DialogismFactory;
import services.complexity.entityDensity.EntityDensityFactory;
import services.complexity.readability.ReadabilityFactory;
import services.complexity.surface.SurfaceFactory;
import services.complexity.syntax.SyntaxFactory;
import services.complexity.wordComplexity.WordComplexityFactory;
import services.complexity.wordLists.WordListsIndicesFactory;
import webService.ReaderBenchServer;
/**
*
* @author stefan
*/
public class TextToIndices {

    /** Complexity indices computed for each text, built once in the constructor. */
    private final List<ComplexityIndex> indices;
    /** Language the indices were built for; also used when parsing documents. */
    private final Lang lang;

    /**
     * Builds the index set for the given language, restricted to the index
     * categories relevant to this experiment.
     *
     * @param lang language used both to build the indices and to parse texts
     */
    public TextToIndices(Lang lang) {
        this.lang = lang;
        ComplexityIndexType[] types = {
            ComplexityIndexType.READABILITY,
            ComplexityIndexType.SURFACE,
            ComplexityIndexType.SYNTAX,
            ComplexityIndexType.WORD_COMPLEXITY,
            ComplexityIndexType.ENTITY_DENSITY,
            ComplexityIndexType.CONNECTIVES,
            ComplexityIndexType.SEMANTIC_DEPENDENCIES,
        };
        indices = Arrays.stream(types)
                .filter(cat -> cat.getFactory() != null)
                .map(ComplexityIndexType::getFactory)
                .flatMap(f -> f.build(lang).stream())
                .collect(Collectors.toList());
    }

    /**
     * Computes all configured complexity indices for the given raw text.
     *
     * @param text plain text to analyze
     * @return one value per index, in the order the indices were built
     */
    public double[] computeIndices(String text) {
        AbstractDocumentTemplate adt = AbstractDocumentTemplate.getDocumentModel(text);
        // Use the language this instance was configured with. The original
        // hard-coded Lang.en here, which disagreed with the indices built in
        // the constructor whenever another language was requested.
        Document doc = new Document(adt, new ArrayList<>(), lang, false);
        return indices.stream()
                .mapToDouble(ind -> ind.compute(doc))
                .toArray();
    }

    /**
     * Reads a ';'-separated question corpus and writes one CSV row per
     * question: the target label followed by all computed index values.
     *
     * @param inputFile  path to the input CSV (column 6 = question text,
     *                   column 7 = target label)
     * @param outputFile path of the CSV to write
     */
    public static void textToCSV(String inputFile, String outputFile) {
        TextToIndices t2i = new TextToIndices(Lang.en);
        try (BufferedReader in = new BufferedReader(new FileReader(inputFile));
                PrintWriter out = new PrintWriter(outputFile)) {
            String sep = ";";
            out.println("sep=" + sep);
            out.println("target" + sep + t2i.indices.stream()
                    .map(ComplexityIndex::getAcronym)
                    .collect(Collectors.joining(sep)));
            // The first two input lines are skipped, matching the original
            // code (presumably header rows — TODO confirm the CSV layout).
            in.readLine();
            in.readLine();
            List<Pair<String, String>> texts = new ArrayList<>();
            String line;
            while ((line = in.readLine()) != null) {
                String[] split = line.split(";");
                if (split.length < 8) {
                    continue; // malformed / short row: skip silently
                }
                // Column 7 = target label, column 6 = question text.
                texts.add(new Pair<>(split[7], split[6]));
            }
            // Index computation is CPU-bound and per-text independent, so a
            // parallel stream is safe here.
            List<Pair<String, double[]>> processed = texts.parallelStream()
                    .map(pair -> new Pair<>(pair.first, t2i.computeIndices(pair.second)))
                    .collect(Collectors.toList());
            for (Pair<String, double[]> pair : processed) {
                out.println(pair.first + sep + Arrays.stream(pair.second)
                        .mapToObj(String::valueOf)
                        .collect(Collectors.joining(sep)));
            }
        } catch (IOException ex) {
            // FileNotFoundException is a subclass of IOException, so a single
            // handler replaces the original's two identical catch blocks.
            Exceptions.printStackTrace(ex);
        }
    }

    public static void main(String[] args) {
        ReaderBenchServer.initializeDB();
        textToCSV("resources/in/Q&A/Question corpus/train.csv",
                "resources/in/Q&A/Question corpus/train-indices.csv");
    }
}
......@@ -37,7 +37,7 @@ public class CSCLCommunityTest {
// String path = "resources/in/MOOC/forum_posts&comments";
String path = "resources/in/1 year/";
SerialProcessing.processCorpus(path, "resources/config/EN/LSA/TASA_LAK", "resources/config/EN/LDA/TASA_LAK", Lang.en, true, true, true, AbstractDocument.SaveType.SERIALIZED_AND_CSV_EXPORT);
SerialProcessing.processCorpus(path, "resources/config/EN/LSA/TASA_LAK", "resources/config/EN/LDA/TASA_LAK", "resources/config/EN/word2vec/TASA_LAK", Lang.en, true, true, true, AbstractDocument.SaveType.SERIALIZED_AND_CSV_EXPORT);
Community.processDocumentCollection(path, Lang.en, false, false, true, false, false, false, false, null, null, 0, 7);
......@@ -52,7 +52,7 @@ public class CSCLCommunityTest {
}
public static void computeCommunity(String path){
SerialProcessing.processCorpus(path, "resources/config/EN/LSA/TASA", "resources/config/EN/LDA/TASA", Lang.en, true, true, true, AbstractDocument.SaveType.SERIALIZED_AND_CSV_EXPORT);
SerialProcessing.processCorpus(path, "resources/config/EN/LSA/TASA", "resources/config/EN/LDA/TASA", "resources/config/EN/word2vec/TASA_LAK", Lang.en, true, true, true, AbstractDocument.SaveType.SERIALIZED_AND_CSV_EXPORT);
CommunityUtils.processDocumentCollectionForClustering(path, false, false, new DateTime(2012, 8, 1, 0, 0).toDate(),
new DateTime(2013, 7, 11, 0, 0).toDate(), 0, 7);
}
......
......@@ -104,7 +104,7 @@ public class CreativityTest {
}
}
public static void processFolder(String folder, boolean restartProcessing, String pathToLSA, String pathToLDA, Lang lang, boolean usePOSTagging) {
public static void processFolder(String folder, boolean restartProcessing, String pathToLSA, String pathToLDA, String pathToWord2Vec, Lang lang, boolean usePOSTagging) {
File dir = new File(folder);
if (dir.isDirectory()) {
......@@ -120,7 +120,7 @@ public class CreativityTest {
checkpoint.delete();
}
}
SerialProcessing.processCorpus(dir.getAbsolutePath(), pathToLSA, pathToLDA, lang, usePOSTagging,
SerialProcessing.processCorpus(dir.getAbsolutePath(), pathToLSA, pathToLDA, pathToWord2Vec, lang, usePOSTagging,
true, true, AbstractDocument.SaveType.SERIALIZED_AND_CSV_EXPORT);
processConversations(dir.getAbsolutePath());
}
......@@ -179,6 +179,6 @@ public class CreativityTest {
public static void main(String[] args) {
ReaderBenchServer.initializeDB();
CreativityTest.processFolder("resources/in/creativity/separated tasks", true, "resources/config/EN/LSA/TASA_LAK", "resources/config/EN/LDA/TASA_LAK", Lang.en, true);
CreativityTest.processFolder("resources/in/creativity/separated tasks", true, "resources/config/EN/LSA/TASA_LAK", "resources/config/EN/LDA/TASA_LAK", "resources/config/EN/word2vec/TASA_LAK", Lang.en, true);
}
}
......@@ -373,7 +373,7 @@ public class CVAnalyzer {
CVAnalyzer frenchCVAnalyzer = new CVAnalyzer(lang, models