Commit c5317b93 authored by Teodor Mihai Cotet's avatar Teodor Mihai Cotet

first

parents
Pipeline #67 canceled with stages
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# dotenv
.env
# virtualenv
.venv/
venv/
ENV/
# Spyder project settings
.spyderproject
# Rope project settings
.ropeproject
# Angular
node_modules
dist
# See http://help.github.com/ignore-files/ for more about ignoring files.
# compiled output
/dist
/tmp
/out-tsc
# dependencies
/node_modules
# IDEs and editors
/.idea
.project
.classpath
.c9/
*.launch
.settings/
*.sublime-workspace
# IDE - VSCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
# misc
/.sass-cache
/connect.lock
/coverage
/libpeerconnection.log
npm-debug.log
yarn-error.log
testem.log
/typings
/corpus
/spacy-src-old
/RO
/models
/resources
# System Files
.DS_Store
Thumbs.db
.vscode/settings.json
.keras/
.pylint.d/
debug.log
errors.log
info.log
*.tmp
*.html
.neuralcoref_cache/*
\ No newline at end of file
This diff is collapsed.
recursive-include rb *.json
# ReaderBench Python
## Install
`pip3 install --user rbpy-rb`
./install.sh
You may also need some spacy models which are downloaded through spacy.
You have to download these spacy models by yourself, using the command:
`python3 -m spacy download name_of_the_model`
The logger will also write instructions on which models you need, and how to download them.
## Usage
For tokenization, lemmatization and POS tagging, use:
```python
from rb.parser.spacy_parser import SpacyParser
from rb.core.lang import Lang
from rb.core.document import Document
nlp_ro = SpacyParser.get_instance().get_model(Lang.RO)
test_text_ro = "Am mers repede la magazinul frumos."
# tokenize
docs_ro = nlp_ro(test_text_ro)
# print all attributes of token objects
print(dir(docs_ro[0]))
for token in docs_ro:
print(token.lemma_, token.is_stop, token.tag_, token.pos_)
```
For indices use:
```python
from rb.core.lang import Lang
from rb.core.document import Document
doc = Document(Lang.EN, 'This is a sample document. It can contain multiple sentences and paragraphs')
```
See `examples.py` for usage examples.
## Dev instructions
## How to use the logger
In each file you have to initialize the logger:
```python
from rb.utils.rblogger import Logger
logger = Logger.get_logger()
logger.info("info msg")
logger.warning("warning msg")
logger.error("error msg")
```
from rb.parser.spacy_parser import SpacyParser
from rb.core.lang import Lang
from rb.core.document import Document
nlp_ro = SpacyParser.get_instance().get_model(Lang.EN)
test_text_ro = "I was going to the store, when an elephant appeared in 23.2 seconds. I was going to shop."
# tokenize
docs_ro = nlp_ro(test_text_ro)
# print all attributes of token objects
print(dir(docs_ro[0]))
for token in docs_ro:
print(token.lemma_, token.is_stop, token.tag_, token.pos_)
# # eng
# docs_en = Document(Lang.EN, 'This is a sample document. It can contain multiple sentences and paragraphs')
# docs_en.indices.
\ No newline at end of file
#!/bin/bash
# Fetch the runtime language resources needed by rb: the NLTK corpora
# (via nltk_download.py) and the multilingual spaCy NER model.
# Run once after installing the package.
set -e  # abort immediately if any download fails

python3 nltk_download.py
python3 -m spacy download xx_ent_wiki_sm
\ No newline at end of file
{
"version": 1,
"disable_existing_loggers": false,
"formatters": {
"simple": {
"format": "%(asctime)s %(levelname)-8s [%(pathname)s:%(lineno)d] %(message)s",
"datefmt": "%d-%b-%y %H:%M:%S"
}
},
"handlers": {
"console": {
"class": "logging.StreamHandler",
"level": "DEBUG",
"formatter": "simple",
"stream": "ext://sys.stdout"
},
"debug_file_handler": {
"class": "logging.handlers.RotatingFileHandler",
"level": "DEBUG",
"formatter": "simple",
"filename": "debug.log",
"maxBytes": 10485760,
"backupCount": 20,
"encoding": "utf8"
},
"info_file_handler": {
"class": "logging.handlers.RotatingFileHandler",
"level": "INFO",
"formatter": "simple",
"filename": "info.log",
"maxBytes": 10485760,
"backupCount": 20,
"encoding": "utf8"
},
"error_file_handler": {
"class": "logging.handlers.RotatingFileHandler",
"level": "ERROR",
"formatter": "simple",
"filename": "errors.log",
"maxBytes": 10485760,
"backupCount": 20,
"encoding": "utf8"
}
},
"root": {
"level": "DEBUG",
"handlers": ["console", "info_file_handler", "error_file_handler", "debug_file_handler"]
}
}
\ No newline at end of file
from os import getenv, path
import nltk
from rb.core.lang import Lang
from rb.utils.downloader import download_wordnet
# Download the NLTK resources required by rb (tokenizer + WordNet data),
# then fetch the Romanian Open Multilingual Wordnet into the NLTK data dir.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw')

nltk_path = getenv('NLTK_DATA')
if nltk_path is None:
    nltk_path = path.expanduser('~/nltk_data/')
# path.join handles a missing trailing separator in $NLTK_DATA; plain string
# concatenation produced e.g. "/data/nltkcorpora/omw" when the env var had
# no trailing slash.
download_wordnet(Lang.RO, path.join(nltk_path, "corpora", "omw"))
from rb.complexity.complexity_index import ComplexityIndex
from rb.core.lang import Lang
from rb.core.text_element import TextElement
from rb.complexity.index_category import IndexCategory
from rb.complexity.measure_function import MeasureFunction
from rb.core.text_element_type import TextElementType
from typing import List, Callable
from rb.similarity.vector_model import VectorModel
from rb.utils.rblogger import Logger
logger = Logger.get_logger()
class AdjCohesion(ComplexityIndex):
    """Adjacent-cohesion index (stub).

    compute_below currently contributes 0 for every sentence — the actual
    cohesion measure is not implemented yet; compute_above aggregates those
    per-sentence values up the text-element tree with reduce_function.
    """

    def __init__(self, lang: Lang,
                 reduce_depth: int, reduce_function: MeasureFunction):
        ComplexityIndex.__init__(self, lang=lang, category=IndexCategory.COHESION,
                                 abbr="AdjCoh", reduce_depth=reduce_depth,
                                 reduce_function=reduce_function)

    def process(self, element: TextElement) -> float:
        return self.reduce_function(self.compute_above(element))

    def compute_below(self, element: TextElement) -> float:
        # Placeholder per-sentence value (see class docstring).
        if element.is_sentence():
            return 0
        if element.depth <= self.reduce_depth:
            return sum(self.compute_below(child) for child in element.components)
        # Previously this fell through and implicitly returned None, which
        # would break the caller's sum; report and contribute nothing.
        logger.error('wrong depth in compute_below.')
        return 0

    def compute_above(self, element: TextElement) -> List[float]:
        values = []  # initialized so the error branch cannot leave it unbound
        if element.depth > self.reduce_depth:
            for child in element.components:
                values += self.compute_above(child)
            element.indices[self] = self.reduce_function(values)
        elif element.depth == self.reduce_depth:
            values = [self.compute_below(element)]
        else:
            logger.error('wrong reduce depth value.')
        return values
from typing import List
from typing import List
from rb.core.lang import Lang
from rb.complexity.measure_function import MeasureFunction
from rb.core.text_element_type import TextElementType
from rb.complexity.measure_function import MeasureFunction
# create all indices
# index classes are imported inside the function, otherwise circular imports occur
def create(lang: Lang) -> List["ComplexityIndex"]:
    """Build the cohesion indices available for *lang*."""
    # Imported lazily to break the circular dependency between the
    # factory and the index modules.
    from rb.complexity.cohesion.adj_cohesion import AdjCohesion

    adj_cohesion = AdjCohesion(lang=lang,
                               reduce_depth=TextElementType.SENT.value,
                               reduce_function=MeasureFunction.AVG)
    return [adj_cohesion]
\ No newline at end of file
from multiprocessing import Pool, cpu_count
from typing import Callable, Iterable, List, Tuple
from rb.complexity.index_category import IndexCategory
from rb.complexity.measure_function import MeasureFunction
from rb.core.lang import Lang
from rb.core.text_element import TextElement
from rb.core.text_element_type import TextElementType
from rb.utils.rblogger import Logger
from rb.similarity.lda import LDA
from rb.similarity.lsa import LSA
from rb.similarity.word2vec import Word2Vec
from joblib import Parallel, delayed
logger = Logger.get_logger()
class ComplexityIndex():
    """Abstract parent for all complexity indices.

    Parameters
    ----------
    lang : Lang
        language the index is computed for
    category : IndexCategory
        family of the index (e.g. SURFACE, SYNTAX)
    abbr : str
        short abbreviation identifying the index when printed
    reduce_depth : int
        document depth (TextElementType value) at which reduce_function
        aggregates the raw values; above that depth the index is applied
        recursively over the document tree
    reduce_function : Callable[[List], float]
        aggregation applied to the collected values (mean / stdev)
    """

    def __init__(self, lang: Lang, category: IndexCategory, abbr: str, reduce_depth: int, reduce_function: MeasureFunction):
        self.lang = lang
        self.abbr = abbr
        self.category = category
        self.reduce_depth = reduce_depth
        self.reduce_function = reduce_function

    def process(self, element: TextElement) -> float:
        """Compute the index value; every concrete index overrides this."""
        pass

    def __repr__(self):
        # Concrete indices may extend this with their parametrization.
        return self.abbr
def compute_index(index: ComplexityIndex, element: TextElement) -> float:
    """Evaluate a single complexity index on *element* (parallel-map helper)."""
    result = index.process(element)
    return result
# Computes every registered index for the element; each index stores its
# aggregated result into the TextElement's ``indices`` dict (inside process).
def compute_indices(element: TextElement):
    """Compute all complexity indices for *element*, one job per index."""
    logger.info('Starting computing all indices for {0} type element'.format(type(element).__name__))
    num_cores = cpu_count()
    # Threads (not processes) so the parsed document tree is shared rather
    # than pickled per worker.
    # NOTE(review): the original also built `LDA('coca', Lang.EN)` here and
    # discarded the result — removed as a dead leftover; confirm no index
    # relied on that side-effectful preload.
    Parallel(n_jobs=num_cores, prefer="threads")(
        delayed(compute_index)(index, element)
        for cat in IndexCategory
        for index in cat.create(element.lang))
\ No newline at end of file
from enum import Enum
from typing import List
from rb.complexity.morphology.factory import create as morphology
from rb.complexity.surface.factory import create as surface
from rb.complexity.syntax.factory import create as syntax
from rb.complexity.word.factory import create as word
from rb.complexity.cohesion.factory import create as cohesion
from rb.core.lang import Lang
class IndexCategory(Enum):
    """Families of complexity indices; each knows how to build its indices."""
    SURFACE = 0
    MORPHOLOGY = 1
    SYNTAX = 2
    WORD = 3
    COHESION = 4

    def create(self, lang: Lang) -> List["ComplexityIndex"]:
        """Instantiate every index of this category for *lang*."""
        # Dispatch each category to its factory module's `create`.
        factories = {
            IndexCategory.SURFACE: surface,
            IndexCategory.MORPHOLOGY: morphology,
            IndexCategory.SYNTAX: syntax,
            IndexCategory.WORD: word,
            IndexCategory.COHESION: cohesion,
        }
        return factories[self](lang)
from enum import Enum
from functools import partial
from statistics import mean, stdev
from typing import List
def average(elements: List[float]) -> float:
    """Arithmetic mean of *elements* (raises StatisticsError when empty)."""
    return mean(elements)


def standard_deviation(elements: List[float]) -> float:
    """Sample standard deviation of *elements* (needs >= 2 data points)."""
    return stdev(elements)


class MeasureFunction(Enum):
    """Reduce functions used to aggregate per-element index values.

    Plain functions assigned in an Enum body are treated as methods, not
    members, so AVG/STDEV were silently not enum members (the module already
    imports functools.partial, hinting at the intended fix). Wrapping the
    callables in partial makes them real, iterable members, and __call__
    keeps them usable as ``reduce_function(values)``.
    """
    AVG = partial(average)
    STDEV = partial(standard_deviation)

    def __call__(self, values: List[float]) -> float:
        # Delegate to the wrapped reduce function.
        return self.value(values)
\ No newline at end of file
from typing import List
from rb.core.lang import Lang
from rb.complexity.measure_function import MeasureFunction
from rb.core.text_element_type import TextElementType
from rb.complexity.measure_function import MeasureFunction
from rb.core.pos import POS as PosEum
# create all indices
# index classes are imported inside the function, otherwise circular imports occur
def create(lang: Lang) -> List["ComplexityIndex"]:
    """Build the morphology indices available for *lang*."""
    # Imported lazily to break the circular dependency with the index
    # modules. (Removed the unused DepIndex/DepEnum imports that only
    # served dead commented-out code.)
    from rb.complexity.morphology.pos_main import PosMain
    from rb.complexity.morphology.unq_pos_main import UnqPosMain

    indices = [
        PosMain(lang, PosEum.VERB, TextElementType.SENT.value, MeasureFunction.AVG),
        PosMain(lang, PosEum.NOUN, TextElementType.SENT.value, MeasureFunction.AVG),
        UnqPosMain(lang, PosEum.NOUN, TextElementType.SENT.value, MeasureFunction.AVG),
    ]
    return indices
\ No newline at end of file
from rb.complexity.complexity_index import ComplexityIndex
from rb.core.lang import Lang
from rb.core.text_element import TextElement
from rb.complexity.index_category import IndexCategory
from rb.complexity.measure_function import MeasureFunction
from rb.core.text_element_type import TextElementType
from typing import List, Callable
from rb.core.pos import POS as PosEum
from rb.utils.rblogger import Logger
logger = Logger.get_logger()
class PosMain(ComplexityIndex):
    """Counts the words with a given part of speech, aggregated per sentence."""

    def __init__(self, lang: Lang, pos_type: PosEum,
                 reduce_depth: int, reduce_function: MeasureFunction):
        ComplexityIndex.__init__(self, lang=lang, category=IndexCategory.MORPHOLOGY,
                                 abbr="POSMain", reduce_depth=reduce_depth,
                                 reduce_function=reduce_function)
        self.pos_type = pos_type

    def process(self, element: TextElement) -> float:
        return self.reduce_function(self.compute_above(element))

    def compute_below(self, element: TextElement) -> float:
        if element.is_sentence():
            # word.pos appears to be a POS enum member (UnqPosMain compares
            # pos.name); comparing the enum itself to a string was always
            # False, so the count was constantly 0 — TODO confirm against
            # the Word class.
            return sum(1 for word in element.components
                       if word.pos.name == self.pos_type.name)
        if element.depth <= self.reduce_depth:
            return sum(self.compute_below(child) for child in element.components)
        # Previously fell through returning None, which broke callers' sums.
        logger.error('wrong depth in compute_below.')
        return 0

    def compute_above(self, element: TextElement) -> List[float]:
        values = []  # initialized so the error branch cannot leave it unbound
        if element.depth > self.reduce_depth:
            for child in element.components:
                values += self.compute_above(child)
            element.indices[self] = self.reduce_function(values)
        elif element.depth == self.reduce_depth:
            values = [self.compute_below(element)]
        else:
            logger.error('wrong reduce depth value.')
        return values

    def __repr__(self):
        return self.abbr + "_" + self.pos_type.name.lower()
from rb.complexity.complexity_index import ComplexityIndex
from rb.core.lang import Lang
from rb.core.text_element import TextElement
from rb.complexity.index_category import IndexCategory
from rb.complexity.measure_function import MeasureFunction
from rb.core.text_element_type import TextElementType
from typing import List, Callable, Set
from rb.core.pos import POS as PosEum
from rb.utils.rblogger import Logger
logger = Logger.get_logger()
class UnqPosMain(ComplexityIndex):
    """Counts unique word forms with a given part of speech, per sentence."""

    def __init__(self, lang: Lang, pos_type: PosEum,
                 reduce_depth: int, reduce_function: MeasureFunction):
        ComplexityIndex.__init__(self, lang=lang, category=IndexCategory.MORPHOLOGY,
                                 abbr="UnqPOSMain", reduce_depth=reduce_depth,
                                 reduce_function=reduce_function)
        self.pos_type = pos_type

    def process(self, element: TextElement) -> float:
        return self.reduce_function(self.compute_above(element))

    def compute_below(self, element: TextElement) -> Set[str]:
        # A word contributes its surface form iff it carries the target POS.
        if element.is_word():
            if element.pos.name == self.pos_type.name:
                return {element.text}
            return set()
        if element.depth <= self.reduce_depth:
            res = set()
            for child in element.components:
                res.update(self.compute_below(child))
            return res
        # Previously fell through returning None, which broke set.update.
        logger.error('wrong depth in compute_below.')
        return set()

    def compute_above(self, element: TextElement) -> List[float]:
        values = []  # initialized so the error branch cannot leave it unbound
        if element.depth > self.reduce_depth:
            for child in element.components:
                values += self.compute_above(child)
            element.indices[self] = self.reduce_function(values)
        elif element.depth == self.reduce_depth:
            values = [len(self.compute_below(element))]
        else:
            logger.error('wrong reduce depth value.')
        return values

    def __repr__(self):
        return self.abbr + "_" + self.pos_type.name.lower()
from rb.core.lang import Lang
from rb.core.text_element import TextElement
from rb.core.word import Word
from rb.complexity.complexity_index import ComplexityIndex
from rb.complexity.index_category import IndexCategory
from rb.core.text_element_type import TextElementType
from rb.complexity.measure_function import MeasureFunction
from typing import Callable, List
from collections import Counter
from rb.utils.rblogger import Logger
import math
logger = Logger.get_logger()
class ChEntropy(ComplexityIndex):
    """Shannon entropy (natural log, i.e. nats) of each word's character
    distribution, aggregated up the document tree."""

    def __init__(self, lang: Lang, reduce_depth: int,
                 reduce_function: MeasureFunction):
        ComplexityIndex.__init__(self, lang=lang, category=IndexCategory.SURFACE,
                                 abbr="ChEntropy", reduce_depth=reduce_depth,
                                 reduce_function=reduce_function)

    def process(self, element: TextElement) -> float:
        return self.reduce_function(self.compute_above(element))

    def compute_below(self, element: TextElement) -> List[float]:
        if element.is_word():
            counts = Counter(element.text)
            total = len(element.text)
            # H = -sum(p * ln(p)) over character frequencies.
            entropy = -sum((c / total) * math.log(c / total)
                           for c in counts.values())
            return [entropy]
        if element.depth <= self.reduce_depth:
            res = []
            for child in element.components:
                res += self.compute_below(child)
            return res
        # Previously returned None here, which crashed the caller's list
        # concatenation; report and contribute nothing instead.
        logger.error('wrong reduce depth value.')
        return []

    def compute_above(self, element: TextElement) -> List[float]:
        values = []  # initialized so the error branch cannot leave it unbound
        if element.depth > self.reduce_depth:
            for child in element.components:
                values += self.compute_above(child)
            element.indices[self] = self.reduce_function(values)
        elif element.depth == self.reduce_depth:
            values = self.compute_below(element)
        else:
            logger.error('wrong reduce depth value.')
        return values
\ No newline at end of file
from rb.core.lang import Lang
from rb.core.text_element import TextElement
from rb.core.word import Word
from rb.complexity.complexity_index import ComplexityIndex
from rb.complexity.index_category import IndexCategory
from rb.core.text_element_type import TextElementType
from rb.complexity.measure_function import MeasureFunction
from typing import Callable, List