# Copyright 2018 The NLP Odyssey Authors.
# Copyright 2018 Marco Nicola <marconicola@disroot.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module providing the :class:`.ConlluLexerBuilder` class and related
exception classes.
"""
from ply.lex import LexToken, TOKEN, Lexer, lex # type: ignore
from colonel.upostag import UposTag
class LexerError(Exception):
    """Generic error class for :class:`.ConlluLexerBuilder`.

    Base class of all exceptions raised by the lexer machinery, so callers
    can catch this single type.
    """
class IllegalCharacterError(LexerError):
    """Exception raised by :class:`.ConlluLexerBuilder` when a lexer error
    caused by invalid input is encountered.

    An exception instance must be initialized with the :class:`.LexToken`
    which the lexer was not able to process, so that :attr:`line_number` and
    :attr:`column_number` can be extracted; a short error message is also
    generated by the constructor.
    """

    def __init__(self, token: LexToken) -> None:
        """Build the error from *token*, the token PLY failed to process."""
        #: Line number containing the illegal character, or the start of an
        #: illegal sequence.
        self.line_number: int = token.lexer.lineno
        #: Column position, associated with :attr:`line_number`, containing
        #: the illegal character, or the start of an illegal sequence.
        self.column_number: int = ConlluLexerBuilder.find_column(token)
        # token.value holds the remaining unmatched input; its first
        # character is the offending one.
        super().__init__(
            "Illegal character %s at (or sequence from) %s:%s" %
            (repr(token.value[0]), self.line_number, self.column_number)
        )
# We disable pylint invalid names complaints due to PLY lexer naming convention
# pylint: disable=invalid-name
class ConlluLexerBuilder:
    """Class containing *PLY Lex* rules for processing the *CoNLL-U* format and
    for creating new related *PLY* :class:`.Lexer` instances.

    Usually you can simply invoke the class method :meth:`build` which returns
    a *PLY* :class:`.Lexer`; such lexer instance is ready to process your
    input, making use of the rules provided by the :class:`ConlluLexerBuilder`
    class itself.

    .. note::
        (review) Rules producing the ``COMMENT`` and ``FORM`` tokens (and
        handling the ``c1`` state) are declared in :attr:`tokens` and
        :attr:`states` but are not visible in this chunk — presumably they
        are defined elsewhere in the module; confirm against the full source.
    """

    # Exclusive lexer states. After consuming the content of column N, the
    # lexer sits in state ``vN`` (value read, a separator is expected);
    # reading a TAB then moves it to ``cN+1`` (content of the next column
    # is expected).
    states = (
        ('v0', 'exclusive'), ('v1', 'exclusive'), ('v2', 'exclusive'),
        ('v3', 'exclusive'), ('v4', 'exclusive'), ('v5', 'exclusive'),
        ('v6', 'exclusive'), ('v7', 'exclusive'), ('v8', 'exclusive'),
        ('v9', 'exclusive'),
        ('c1', 'exclusive'), ('c2', 'exclusive'), ('c3', 'exclusive'),
        ('c4', 'exclusive'), ('c5', 'exclusive'), ('c6', 'exclusive'),
        ('c7', 'exclusive'), ('c8', 'exclusive'), ('c9', 'exclusive'),
    )

    # Token names emitted by the lexer: one per *CoNLL-U* word-line field,
    # plus the structural NEWLINE/TAB/COMMENT tokens.
    tokens = (
        'NEWLINE', 'TAB', 'COMMENT', 'INTEGER_ID', 'RANGE_ID', 'DECIMAL_ID',
        'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS',
        'MISC',
    )

    #: Pattern for the name of a morphological feature (left part)
    _feat_name = r'[A-Z0-9][A-Z0-9a-z]*(\[[a-z0-9]+\])?'

    #: Pattern for a single value of a morphological feature (right part)
    _feat_value = r'[A-Z0-9][a-zA-Z0-9]*'

    #: Pattern for a list of values of a morphological feature (right part)
    _feat_values = r'{0}(,{0})*'.format(_feat_value)

    #: Pattern for a single morphological feature name+values pair
    _feat_pair = r'{0}={1}'.format(_feat_name, _feat_values)

    #: Pattern for a nullable list of morphological features
    _feats = r'({0}([|]{0})*)|_'.format(_feat_pair)

    #: Pattern for the head part of a head+deprel pair (left part)
    _dep_head = r'([1-9][0-9]+|[0-9])'

    #: Pattern for the deprel part of a head+deprel pair (right part)
    _dep_deprel = r'[^\n\t ]+'

    #: Pattern for a head+deprel pair
    _dep_pair = r'{0}:{1}'.format(_dep_head, _dep_deprel)

    #: Pattern for a nullable list of head+deprel pairs
    _deps = r'({0}([|]{0})*)|_'.format(_dep_pair)

    #: Pattern for a nullable Universal part-of-speech tag
    _upos = r'({0}|_)'.format('|'.join(tag.name for tag in UposTag))

    # NOTE: PLY rule functions use their docstring as the token regex, so
    # documentation for the rules below must live in ``#`` comments only.

    # A TAB after the value of column N moves the lexer into the state that
    # expects the content of column N+1; the per-line TAB counter tracks
    # which column that is.
    def t_v0_v1_v2_v3_v4_v5_v6_v7_v8_TAB(self, token: LexToken) -> LexToken:
        r'\t'
        self._tab_count += 1
        token.lexer.begin(f'c{self._tab_count}')
        return token

    # Multiword-token range ID, e.g. "3-4"; value becomes a (start, end)
    # tuple of ints.
    @staticmethod
    def t_RANGE_ID(token: LexToken) -> LexToken:
        r'[1-9][0-9]*-[1-9][0-9]*'
        token.value = tuple(map(int, token.value.split('-')))
        token.lexer.begin('v0')
        return token

    # Empty-node decimal ID, e.g. "5.1"; value becomes a (major, minor)
    # tuple of ints.
    @staticmethod
    def t_DECIMAL_ID(token: LexToken) -> LexToken:
        r'([1-9][0-9]+|[0-9])\.[1-9][0-9]*'
        token.value = tuple(map(int, token.value.split('.')))
        token.lexer.begin('v0')
        return token

    # Plain word index, e.g. "7"; value becomes an int.
    @staticmethod
    def t_INTEGER_ID(token: LexToken) -> LexToken:
        r'[1-9][0-9]*'
        token.value = int(token.value)
        token.lexer.begin('v0')
        return token

    # LEMMA column: any non-TAB, non-newline run, kept verbatim.
    @staticmethod
    def t_c2_LEMMA(token: LexToken) -> LexToken:
        r'[^\n\t]+'
        token.lexer.begin('v2')
        return token

    # UPOS column: one of the UposTag names, or "_" which maps to None.
    @staticmethod
    @TOKEN(_upos)
    def t_c3_UPOS(token: LexToken) -> LexToken:
        # pylint: disable=missing-docstring
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v3')
        return token

    # XPOS column: free-form tag, "_" maps to None.
    @staticmethod
    def t_c4_XPOS(token: LexToken) -> LexToken:
        r'[^\n\t ]+'
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v4')
        return token

    # FEATS column: "_" maps to None, otherwise a tuple of
    # (name, (value, ...)) pairs parsed from "Name=V1,V2|Name2=V3".
    @staticmethod
    @TOKEN(_feats)
    def t_c5_FEATS(token: LexToken) -> LexToken:
        # pylint: disable=missing-docstring
        token.value = None if token.value == '_' else tuple(
            (x[:x.index('=')], tuple(x[x.index('=')+1:].split(',')))
            for x in token.value.split('|')
        )
        token.lexer.begin('v5')
        return token

    # HEAD column: "_" maps to None, otherwise an int word index.
    @staticmethod
    def t_c6_HEAD(token: LexToken) -> LexToken:
        r'([1-9][0-9]+|[0-9])|_'
        token.value = None if token.value == '_' else int(token.value)
        token.lexer.begin('v6')
        return token

    # DEPREL column: free-form relation name, "_" maps to None.
    @staticmethod
    def t_c7_DEPREL(token: LexToken) -> LexToken:
        r'[^\n\t ]+'
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v7')
        return token

    # DEPS column: "_" maps to None, otherwise a tuple of
    # (head, deprel) pairs parsed from "H1:rel1|H2:rel2".
    @staticmethod
    @TOKEN(_deps)
    def t_c8_DEPS(token: LexToken) -> LexToken:
        # pylint: disable=missing-docstring
        token.value = None if token.value == '_' else tuple(
            (int(x[:x.index(':')]), x[x.index(':')+1:])
            for x in token.value.split('|')
        )
        token.lexer.begin('v8')
        return token

    # MISC column: free-form content, "_" maps to None.
    @staticmethod
    def t_c9_MISC(token: LexToken) -> LexToken:
        r'[^\n\t ]+'
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v9')
        return token

    # End of line: bump the line counter, reset the per-line TAB counter and
    # go back to the INITIAL state, ready for the next line.
    def t_INITIAL_v9_NEWLINE(self, token: LexToken) -> LexToken:
        r'\n'
        token.lexer.lineno += 1
        self._tab_count = 0
        token.lexer.begin('INITIAL')
        return token

    # Any unmatched input, in any state, is a fatal lexing error.
    @staticmethod
    def t_ANY_error(token: LexToken) -> None:
        # pylint: disable=missing-docstring
        raise IllegalCharacterError(token)

    @staticmethod
    def find_column(token: LexToken) -> int:
        """Given a :class:`.LexToken`, it returns the related column number.

        The column is 1-based and computed from the distance between the
        token position and the last newline preceding it in the input.
        """
        line_start = token.lexer.lexdata.rfind('\n', 0, token.lexpos) + 1
        return (token.lexpos - line_start) + 1

    def __init__(self) -> None:
        """Create a new builder holding a fresh *PLY* lexer instance."""
        #: The *PLY* lexer built from this class's rules.
        self.lexer: Lexer = lex(module=self)
        #: Number of TABs seen so far on the current line (i.e. the index of
        #: the *CoNLL-U* column currently being read).
        self._tab_count = 0

    @classmethod
    def build(cls) -> Lexer:
        """Returns a *PLY* :class:`Lexer` instance for *CoNLL-U* processing.

        The returned lexer makes use of the rules defined by
        :class:`ConlluLexerBuilder`.
        """
        return cls().lexer