Source code for colonel.conllu.parser

# Copyright 2018 The NLP Odyssey Authors.
# Copyright 2018 Marco Nicola <marconicola@disroot.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module providing the :class:`.ConlluParserBuilder` class and related
exception classes.
"""

from typing import Optional
from ply.yacc import yacc, LRParser, YaccProduction  # type: ignore
from ply.lex import LexToken  # type: ignore
from colonel.conllu.lexer import ConlluLexerBuilder
from colonel.sentence import Sentence
from colonel.word import Word
from colonel.emptynode import EmptyNode
from colonel.multiword import Multiword
from colonel.upostag import UposTag


class ParserError(Exception):
    """Generic error class for :class:`.ConlluParserBuilder`.

    The more specific parser exceptions defined in this module derive from
    this class.
    """
class IllegalTokenError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when parsing fails
    because of an invalid token.

    The constructor takes the :class:`.LexToken` that the parser could not
    process: every attribute of the exception is extracted from it, and a
    short error message is generated as well.
    """

    def __init__(self, t: LexToken) -> None:
        #: Type of the illegal token, or of the first token of an illegal
        #: tokens sequence.
        self.type = t.type

        #: Value of the illegal token, or of the first token of an illegal
        #: tokens sequence.
        self.value = t.value

        #: Line number of the illegal token, or of the first token of an
        #: illegal tokens sequence (read from the ``lineno`` of the lexer
        #: attached to the token).
        self.line_number = t.lexer.lineno

        #: Column position, relative to :attr:`line_number`, of the illegal
        #: token, or of the first token of an illegal tokens sequence (as
        #: computed by :meth:`.ConlluLexerBuilder.find_column`).
        self.column_number = ConlluLexerBuilder.find_column(t)

        message = "Illegal token %s %s at (or tokens from) %s:%s." % (
            self.type,
            repr(self.value),
            self.line_number,
            self.column_number,
        )
        super().__init__(message)
class IllegalEofError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when parsing fails
    because of an invalid *end-of-file*.

    It signals that the input data ended while the parser was still expecting
    further tokens in order for the data to be valid *CoNLL-U*.
    """

    def __init__(self) -> None:
        message = 'Illegal EOF reached, but token expected.'
        super().__init__(message)
class IllegalMultiwordError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when a word line has
    been parsed correctly and recognised as a *multiword token* line, but its
    data is not valid for that kind of element.

    The constructor takes the :class:`.YaccProduction` of the word line
    holding the illegal data; the :attr:`line_number` is extracted from it
    and a short error message is generated as well.
    """

    def __init__(self, prod: YaccProduction) -> None:
        #: Line number of the first symbol of the offending production.
        self.line_number = prod.lineno(1)

        message = f'Illegal multiword data at line {self.line_number}.'
        super().__init__(message)
class IllegalEmptyNodeError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when a word line has
    been parsed correctly and recognised as an *empty node* line, but its
    data is not valid for that kind of element.

    The constructor takes the :class:`.YaccProduction` of the word line
    holding the illegal data; the :attr:`line_number` is extracted from it
    and a short error message is generated as well.
    """

    def __init__(self, prod: YaccProduction) -> None:
        #: Line number of the first symbol of the offending production.
        self.line_number = prod.lineno(1)

        message = f'Illegal empty-node data at line {self.line_number}.'
        super().__init__(message)
class ConlluParserBuilder:
    """Collection of *PLY Yacc* grammar rules for the *CoNLL-U* format,
    together with a factory for related *PLY* :class:`.LRParser` instances.

    In most cases it is enough to invoke the class method :meth:`build`: it
    returns a *PLY* :class:`.LRParser` which is ready to process your input
    according to the rules provided by :class:`ConlluParserBuilder` itself.

    As usual, the parser is paired with an associated lexer, which in this
    case is served by :class:`.ConlluLexerBuilder`.
    """

    # NOTE: the docstring of every ``p_*`` method below is the grammar
    # production handed to PLY, not documentation: it must stay untouched.
    # Explanations are therefore given as comments. The declaration order is
    # kept as is, since PLY takes the first rule as the start symbol.

    # Appends one more sentence to the sentences collected so far.
    @staticmethod
    def p_sentences_many(prod: YaccProduction) -> None:
        'sentences : sentences sentence'
        collected, latest = prod[1], prod[2]
        prod[0] = collected + [latest]

    # Starts the list of sentences from the first sentence.
    @staticmethod
    def p_sentences_one(prod: YaccProduction) -> None:
        'sentences : sentence'
        first = prod[1]
        prod[0] = [first]

    # Builds a Sentence from its word lines and the preceding comments.
    @staticmethod
    def p_sentence_with_comments(prod: YaccProduction) -> None:
        'sentence : comments wordlines NEWLINE'
        comments, wordlines = prod[1], prod[2]
        prod[0] = Sentence(wordlines, comments)

    # Builds a Sentence from its word lines only.
    @staticmethod
    def p_sentence_without_comments(prod: YaccProduction) -> None:
        'sentence : wordlines NEWLINE'
        wordlines = prod[1]
        prod[0] = Sentence(wordlines)

    # Appends one more comment to the comments collected so far.
    @staticmethod
    def p_comments_many(prod: YaccProduction) -> None:
        'comments : comments comment'
        collected, latest = prod[1], prod[2]
        prod[0] = collected + [latest]

    # Starts the list of comments from the first comment.
    @staticmethod
    def p_comments_one(prod: YaccProduction) -> None:
        'comments : comment'
        first = prod[1]
        prod[0] = [first]

    # A comment line evaluates to the value of its COMMENT token.
    @staticmethod
    def p_comment(prod: YaccProduction) -> None:
        'comment : COMMENT NEWLINE'
        comment_value = prod[1]
        prod[0] = comment_value

    # Appends one more word line to the word lines collected so far.
    @staticmethod
    def p_wordlines_many(prod: YaccProduction) -> None:
        'wordlines : wordlines wordline'
        collected, latest = prod[1], prod[2]
        prod[0] = collected + [latest]

    # Starts the list of word lines from the first word line.
    @staticmethod
    def p_wordlines_one(prod: YaccProduction) -> None:
        'wordlines : wordline'
        first = prod[1]
        prod[0] = [first]

    # Word line with an integer ID: builds a Word from the ten fields, which
    # sit at the odd positions of the production (TAB tokens in between).
    @staticmethod
    def p_wordline_word(prod: YaccProduction) -> None:
        'wordline : INTEGER_ID TAB FORM TAB LEMMA TAB UPOS TAB XPOS TAB ' \
            'FEATS TAB HEAD TAB DEPREL TAB DEPS TAB MISC NEWLINE'
        # A falsy UPOS value means "no tag"; anything else is looked up by
        # name in the UposTag enumeration.
        if prod[7]:
            upos = UposTag[prod[7]]
        else:
            upos = None

        prod[0] = Word(
            index=prod[1],
            form=prod[3],
            lemma=prod[5],
            upos=upos,
            xpos=prod[9],
            feats=prod[11],
            head=prod[13],
            deprel=prod[15],
            deps=prod[17],
            misc=prod[19]
        )

    # Word line with a range ID: builds a Multiword, provided that LEMMA is
    # '_' and that UPOS, XPOS, FEATS, HEAD, DEPREL and DEPS are all None;
    # IllegalMultiwordError is raised otherwise.
    # NOTE(review): presumably the lexer keeps LEMMA as raw text while
    # turning unspecified values of the other fields into None -- confirm
    # against ConlluLexerBuilder.
    @staticmethod
    def p_wordline_multiword(prod: YaccProduction) -> None:
        'wordline : RANGE_ID TAB FORM TAB LEMMA TAB UPOS TAB XPOS TAB FEATS ' \
            'TAB HEAD TAB DEPREL TAB DEPS TAB MISC NEWLINE'
        # Positions 7, 9, ..., 17: the fields from UPOS to DEPS.
        must_be_unset = (prod[i] for i in range(7, 18, 2))
        if prod[5] != '_' or not all(item is None for item in must_be_unset):
            raise IllegalMultiwordError(prod)

        id_range = prod[1]
        prod[0] = Multiword(
            first_index=id_range[0],
            last_index=id_range[1],
            form=prod[3],
            misc=prod[19]
        )

    # Word line with a decimal ID: builds an EmptyNode, provided that both
    # HEAD and DEPREL are None; IllegalEmptyNodeError is raised otherwise.
    @staticmethod
    def p_wordline_emptynode(prod: YaccProduction) -> None:
        'wordline : DECIMAL_ID TAB FORM TAB LEMMA TAB UPOS TAB XPOS TAB ' \
            'FEATS TAB HEAD TAB DEPREL TAB DEPS TAB MISC NEWLINE'
        head, deprel = prod[13], prod[15]
        if not (head is None and deprel is None):
            raise IllegalEmptyNodeError(prod)

        if prod[7]:
            upos = UposTag[prod[7]]
        else:
            upos = None

        decimal_id = prod[1]
        prod[0] = EmptyNode(
            main_index=decimal_id[0],
            sub_index=decimal_id[1],
            form=prod[3],
            lemma=prod[5],
            upos=upos,
            xpos=prod[9],
            feats=prod[11],
            deps=prod[17],
            misc=prod[19]
        )

    # Error rule: a falsy token (None) is reported as an unexpected end of
    # the input, any other token as an illegal token.
    @staticmethod
    def p_error(token: Optional[LexToken]) -> None:
        # pylint: disable=missing-docstring
        if not token:
            raise IllegalEofError()
        raise IllegalTokenError(token)

    def __init__(self) -> None:
        # The token names and the lexer both come from ConlluLexerBuilder.
        self.tokens = ConlluLexerBuilder.tokens
        self.lexer = ConlluLexerBuilder.build()
        # The parser is generated last, from this very instance, so the
        # attributes assigned above are already in place at that point.
        self.parser = yacc(module=self)

    @classmethod
    def build(cls) -> LRParser:
        """Creates and returns a *PLY* :class:`LRParser` instance for
        *CoNLL-U* processing.

        The returned parser is the one generated from the rules defined by
        :class:`ConlluParserBuilder`.
        """
        builder = cls()
        return builder.parser