# Source code for colonel.conllu.lexer

# Copyright 2018 The NLP Odyssey Authors.
# Copyright 2018 Marco Nicola <marconicola@disroot.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module providing the :class:`.ConlluLexerBuilder` class and related
exception classes.
"""

from ply.lex import LexToken, TOKEN, Lexer, lex  # type: ignore
from colonel.upostag import UposTag


class LexerError(Exception):
    """Generic error class for :class:`.ConlluLexerBuilder`."""


class IllegalCharacterError(LexerError):
    """Exception raised by :class:`.ConlluLexerBuilder` when a lexer error
    caused by invalid input is encountered.

    An exception instance must be initialized with the :class:`.LexToken`
    which the lexer was not able to process, so that :attr:`line_number` and
    :attr:`column_number` can be extracted; a short error message is also
    generated by the constructor.
    """

    def __init__(self, token: LexToken) -> None:
        #: Line number containing the illegal character, or the start of an
        #: illegal sequence.
        self.line_number: int = token.lexer.lineno

        #: Column position, associated with :attr:`line_number`, containing the
        #: illegal character, or the start of an illegal sequence.
        # ConlluLexerBuilder is defined further down in this module; the name
        # is only looked up when an exception is actually instantiated.
        self.column_number: int = ConlluLexerBuilder.find_column(token)

        # Only the first character of the unprocessed value is reported.
        super().__init__(
            "Illegal character %s at (or sequence from) %s:%s" %
            (repr(token.value[0]), self.line_number, self.column_number)
        )


# We disable pylint invalid names complaints due to PLY lexer naming convention
# pylint: disable=invalid-name
class ConlluLexerBuilder:
    """Class containing *PLY Lex* rules for processing the *CoNLL-U* format
    and for creating new related *PLY* :class:`.Lexer` instances.

    Usually you can simply invoke the class method :meth:`build` which returns
    a *PLY* :class:`.Lexer`; such lexer instance is ready to process your
    input, making use of the rules provided by the
    :class:`ConlluLexerBuilder` class itself.
    """

    # Lexer states, as driven by the rules below:
    #   * ``v0`` is entered after an ID token has been read;
    #   * a TAB read in ``v<N-1>`` switches to ``c<N>``, where the rule for
    #     the N-th field following the ID is the only one available;
    #   * reading that field switches to ``v<N>``;
    #   * a NEWLINE (accepted only in ``INITIAL`` and ``v9``) switches back
    #     to ``INITIAL``.
    states = (
        ('v0', 'exclusive'),
        ('v1', 'exclusive'),
        ('v2', 'exclusive'),
        ('v3', 'exclusive'),
        ('v4', 'exclusive'),
        ('v5', 'exclusive'),
        ('v6', 'exclusive'),
        ('v7', 'exclusive'),
        ('v8', 'exclusive'),
        ('v9', 'exclusive'),

        ('c1', 'exclusive'),
        ('c2', 'exclusive'),
        ('c3', 'exclusive'),
        ('c4', 'exclusive'),
        ('c5', 'exclusive'),
        ('c6', 'exclusive'),
        ('c7', 'exclusive'),
        ('c8', 'exclusive'),
        ('c9', 'exclusive'),
    )

    tokens = (
        'NEWLINE',
        'TAB',
        'COMMENT',
        'INTEGER_ID',
        'RANGE_ID',
        'DECIMAL_ID',
        'FORM',
        'LEMMA',
        'UPOS',
        'XPOS',
        'FEATS',
        'HEAD',
        'DEPREL',
        'DEPS',
        'MISC',
    )

    #: Pattern for the name of a morphological feature (left part)
    _feat_name = r'[A-Z0-9][A-Z0-9a-z]*(\[[a-z0-9]+\])?'

    #: Pattern for a single value of a morphological feature (right part)
    _feat_value = r'[A-Z0-9][a-zA-Z0-9]*'

    #: Pattern for a list of values of a morphological feature (right part)
    _feat_values = r'{0}(,{0})*'.format(_feat_value)

    #: Pattern for a single morphological feature name+values pair
    _feat_pair = r'{0}={1}'.format(_feat_name, _feat_values)

    #: Pattern for a nullable list of morphological features
    _feats = r'({0}([|]{0})*)|_'.format(_feat_pair)

    #: Pattern for the head part of a head+deprel pair (left part)
    _dep_head = r'([1-9][0-9]+|[0-9])'

    #: Pattern for the deprel part of a head+deprel pair (right part)
    _dep_deprel = r'[^\n\t ]+'

    #: Pattern for a head+deprel pair
    _dep_pair = r'{0}:{1}'.format(_dep_head, _dep_deprel)

    #: Pattern for a nullable list of head+deprel pairs
    _deps = r'({0}([|]{0})*)|_'.format(_dep_pair)

    #: Pattern for a nullable Universal part-of-speech tag
    _upos = r'({0}|_)'.format('|'.join(tag.name for tag in UposTag))

    # NOTE: in the rule functions below the leading raw string is NOT
    # documentation: PLY reads it as the regular expression of the rule, so
    # it must stay the first statement and must not be edited as prose.
    # Rule functions are also kept in their original order, because PLY
    # matches function rules in the order they are defined.

    # Field separator: counts the tabs read so far on the current line and
    # moves the lexer to the state expecting the content of the next field.
    def t_v0_v1_v2_v3_v4_v5_v6_v7_v8_TAB(self, token: LexToken) -> LexToken:
        r'\t'
        self._tab_count += 1
        token.lexer.begin(f'c{self._tab_count}')
        return token

    # Comment line: the value is the text following the leading '#', with
    # surrounding whitespace stripped.
    @staticmethod
    def t_COMMENT(token: LexToken) -> LexToken:
        r'[#][^\n]*'
        token.value = token.value[1:].strip()
        return token

    # Range ID (e.g. "1-2"): the value becomes a tuple of two integers.
    @staticmethod
    def t_RANGE_ID(token: LexToken) -> LexToken:
        r'[1-9][0-9]*-[1-9][0-9]*'
        token.value = tuple(map(int, token.value.split('-')))
        token.lexer.begin('v0')
        return token

    # Decimal ID (e.g. "1.1"): the value becomes a tuple of two integers.
    @staticmethod
    def t_DECIMAL_ID(token: LexToken) -> LexToken:
        r'([1-9][0-9]+|[0-9])\.[1-9][0-9]*'
        token.value = tuple(map(int, token.value.split('.')))
        token.lexer.begin('v0')
        return token

    # Plain integer ID: the value becomes an integer.
    @staticmethod
    def t_INTEGER_ID(token: LexToken) -> LexToken:
        r'[1-9][0-9]*'
        token.value = int(token.value)
        token.lexer.begin('v0')
        return token

    # FORM field: any non-empty text without newlines or tabs, kept as is.
    @staticmethod
    def t_c1_FORM(token: LexToken) -> LexToken:
        r'[^\n\t]+'
        token.lexer.begin('v1')
        return token

    # LEMMA field: any non-empty text without newlines or tabs, kept as is.
    @staticmethod
    def t_c2_LEMMA(token: LexToken) -> LexToken:
        r'[^\n\t]+'
        token.lexer.begin('v2')
        return token

    # UPOS field: one of the UposTag names; '_' is turned into None.
    @staticmethod
    @TOKEN(_upos)
    def t_c3_UPOS(token: LexToken) -> LexToken:
        # pylint: disable=missing-docstring
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v3')
        return token

    # XPOS field: text without newlines, tabs or spaces; '_' becomes None.
    @staticmethod
    def t_c4_XPOS(token: LexToken) -> LexToken:
        r'[^\n\t ]+'
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v4')
        return token

    # FEATS field: '_' becomes None, otherwise the value becomes a tuple of
    # (name, (value, ...)) pairs, one for each '|'-separated feature.
    @staticmethod
    @TOKEN(_feats)
    def t_c5_FEATS(token: LexToken) -> LexToken:
        # pylint: disable=missing-docstring
        if token.value == '_':
            token.value = None
        else:
            # The pattern guarantees one '=' per feature, so partition()
            # always yields the name on the left and the values on the right.
            token.value = tuple(
                (name, tuple(values.split(',')))
                for name, _, values in (
                    feature.partition('=')
                    for feature in token.value.split('|')
                )
            )
        token.lexer.begin('v5')
        return token

    # HEAD field: '_' becomes None, otherwise the value becomes an integer.
    @staticmethod
    def t_c6_HEAD(token: LexToken) -> LexToken:
        r'([1-9][0-9]+|[0-9])|_'
        token.value = None if token.value == '_' else int(token.value)
        token.lexer.begin('v6')
        return token

    # DEPREL field: text without newlines, tabs or spaces; '_' becomes None.
    @staticmethod
    def t_c7_DEPREL(token: LexToken) -> LexToken:
        r'[^\n\t ]+'
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v7')
        return token

    # DEPS field: '_' becomes None, otherwise the value becomes a tuple of
    # (head, deprel) pairs, one for each '|'-separated item.
    @staticmethod
    @TOKEN(_deps)
    def t_c8_DEPS(token: LexToken) -> LexToken:
        # pylint: disable=missing-docstring
        if token.value == '_':
            token.value = None
        else:
            # Split on the FIRST ':' only: the deprel part may itself
            # contain further ':' characters.
            token.value = tuple(
                (int(head), deprel)
                for head, _, deprel in (
                    item.partition(':')
                    for item in token.value.split('|')
                )
            )
        token.lexer.begin('v8')
        return token

    # MISC field: text without newlines, tabs or spaces; '_' becomes None.
    @staticmethod
    def t_c9_MISC(token: LexToken) -> LexToken:
        r'[^\n\t ]+'
        token.value = None if token.value == '_' else token.value
        token.lexer.begin('v9')
        return token

    # End of line: accepted only at the start of a line / after a comment
    # (INITIAL) or after the last field (v9). It advances the line counter
    # and resets both the tab counter and the lexer state.
    def t_INITIAL_v9_NEWLINE(self, token: LexToken) -> LexToken:
        r'\n'
        token.lexer.lineno += 1
        self._tab_count = 0
        token.lexer.begin('INITIAL')
        return token

    # Error handler shared by all states: unmatched input is always fatal.
    @staticmethod
    def t_ANY_error(token: LexToken) -> None:
        # pylint: disable=missing-docstring
        raise IllegalCharacterError(token)

    @staticmethod
    def find_column(token: LexToken) -> int:
        """Given a :class:`.LexToken`, it returns the related column number.

        The column is 1-based and is computed from the position of the token
        within the whole input of its lexer, relative to the character
        following the closest preceding newline.
        """
        # rfind() returns -1 when no newline precedes the token, so adding 1
        # makes the first line start at offset 0.
        line_start = token.lexer.lexdata.rfind('\n', 0, token.lexpos) + 1
        return (token.lexpos - line_start) + 1

    def __init__(self) -> None:
        # The instance itself is handed to PLY as the module providing the
        # lexer rules, states and tokens.
        self.lexer: Lexer = lex(module=self)
        # Number of TAB tokens read so far on the current line.
        self._tab_count: int = 0

    @classmethod
    def build(cls) -> Lexer:
        """Returns a *PLY* :class:`Lexer` instance for *CoNLL-U* processing.

        The returned lexer makes use of the rules defined by
        :class:`ConlluLexerBuilder`.
        """
        return cls().lexer