# Copyright 2018 The NLP Odyssey Authors.
# Copyright 2018 Marco Nicola <marconicola@disroot.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module providing the :class:`.ConlluParserBuilder` class and related
exception classes.
"""
from typing import Optional
from ply.yacc import yacc, LRParser, YaccProduction # type: ignore
from ply.lex import LexToken # type: ignore
from colonel.conllu.lexer import ConlluLexerBuilder
from colonel.sentence import Sentence
from colonel.word import Word
from colonel.emptynode import EmptyNode
from colonel.multiword import Multiword
from colonel.upostag import UposTag
class ParserError(Exception):
    """Generic error class for :class:`.ConlluParserBuilder`.

    The more specific parser exceptions defined in this module derive from
    this class, so it can be caught to handle any of them at once.
    """
class IllegalTokenError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when a parser error
    caused by invalid token is encountered.

    An exception instance must be initialized with the :class:`.LexToken` which
    the parser was not able to process, so that all the exception attributes
    can be extracted; a short error message is also generated by the
    constructor.
    """

    def __init__(self, t: LexToken) -> None:
        #: The type of the illegal token encountered, or of the first token
        #: of an illegal tokens sequence.
        self.type = t.type

        #: The value of the illegal token encountered, or of the first token
        #: of an illegal tokens sequence.
        self.value = t.value

        #: Line number related to the illegal token encountered, or to the
        #: first token of an illegal tokens sequence.
        # NOTE(review): this reads the lexer's current line counter, not the
        # token's own ``lineno`` attribute - confirm this is the intended
        # source when the lexer has already advanced past the token.
        self.line_number = t.lexer.lineno

        #: Column position, associated with :attr:`line_number`, related to the
        #: illegal token encountered, or to the first token of an illegal
        #: tokens sequence.
        self.column_number = ConlluLexerBuilder.find_column(t)

        # The value is rendered with repr() so that whitespace or otherwise
        # invisible characters remain recognisable in the message.
        super().__init__(
            f'Illegal token {self.type} {self.value!r} at (or tokens from) '
            f'{self.line_number}:{self.column_number}.'
        )
class IllegalEofError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when a parser error
    caused by invalid *end-of-file* is encountered.

    When this exception is raised, it means that the end of the input data
    has been reached, but some additional tokens were expected in order to
    be valid *CoNLL-U*.
    """

    def __init__(self) -> None:
        # Takes no arguments: the message is fixed.
        super().__init__('Illegal EOF reached, but token expected.')
class IllegalMultiwordError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when a word line
    was parsed correctly and has been recognised as a *multiword token*
    line, however the data is not valid for this kind of element.

    An exception instance must be initialized with the :class:`.YaccProduction`
    related to the word line containing illegal data, so that the
    :attr:`line_number` can be extracted; a short error message is also
    generated by the constructor.
    """

    def __init__(self, prod: YaccProduction) -> None:
        #: Line number of the offending word line, taken from the first
        #: symbol of the production (``prod.lineno(1)``).
        self.line_number = prod.lineno(1)

        super().__init__(
            f'Illegal multiword data at line {self.line_number}.'
        )
class IllegalEmptyNodeError(ParserError):
    """Exception raised by :class:`.ConlluParserBuilder` when a word line
    was parsed correctly and has been recognised as an *empty node*
    line, however the data is not valid for this kind of element.

    An exception instance must be initialized with the :class:`.YaccProduction`
    related to the word line containing illegal data, so that the
    :attr:`line_number` can be extracted; a short error message is also
    generated by the constructor.
    """

    def __init__(self, prod: YaccProduction) -> None:
        #: Line number of the offending word line, taken from the first
        #: symbol of the production (``prod.lineno(1)``).
        self.line_number = prod.lineno(1)

        super().__init__(
            f'Illegal empty-node data at line {self.line_number}.'
        )
class ConlluParserBuilder:
    """Class containing *PLY Yacc* rules for processing the *CoNLL-U* format
    and for creating new related *PLY* :class:`.LRParser` instances.

    Usually you can simply invoke the class method :meth:`build` which returns
    a *PLY* :class:`.LRParser`; such parser instance is ready to process your
    input, making use of the rules provided by the :class:`ConlluParserBuilder`
    class itself.

    As usual, this class is paired with an associated lexer, which in this
    case is served by :class:`.ConlluLexerBuilder`.
    """

    # IMPORTANT: the string placed as first statement of every ``p_*`` method
    # is NOT free-form documentation: PLY reads it as the grammar production
    # handled by that method. Explanations therefore live in ``#`` comments
    # and the grammar strings must not be edited for cosmetic reasons.
    #
    # NOTE(review): the rules below reference a ``sentence`` nonterminal, yet
    # no production defining it is visible in this class, and ``wordlines``
    # is not consumed by any visible rule - confirm against the full grammar.

    # Appends one more sentence to the list collected so far; a new list is
    # created instead of mutating the one carried by prod[1].
    @staticmethod
    def p_sentences_many(prod: YaccProduction) -> None:
        'sentences : sentences sentence'
        prod[0] = prod[1] + [prod[2]]

    # Starts the sentences list from a single sentence.
    @staticmethod
    def p_sentences_one(prod: YaccProduction) -> None:
        'sentences : sentence'
        prod[0] = [prod[1]]

    # Appends one more parsed word line to the list collected so far.
    @staticmethod
    def p_wordlines_many(prod: YaccProduction) -> None:
        'wordlines : wordlines wordline'
        prod[0] = prod[1] + [prod[2]]

    # Starts the word lines list from a single word line.
    @staticmethod
    def p_wordlines_one(prod: YaccProduction) -> None:
        'wordlines : wordline'
        prod[0] = [prod[1]]

    # Builds a Word from a line whose ID is a plain integer. Field values sit
    # at the odd positions of the production (1, 3, ..., 19); the even ones
    # are the TAB separators and the trailing NEWLINE.
    @staticmethod
    def p_wordline_word(prod: YaccProduction) -> None:
        'wordline : INTEGER_ID TAB FORM TAB LEMMA TAB UPOS TAB XPOS TAB ' \
            'FEATS TAB HEAD TAB DEPREL TAB DEPS TAB MISC NEWLINE'
        prod[0] = Word(
            index=prod[1],
            form=prod[3],
            lemma=prod[5],
            # A falsy UPOS value is mapped to None; anything else is looked
            # up by name in the UposTag enumeration.
            upos=UposTag[prod[7]] if prod[7] else None,
            xpos=prod[9],
            feats=prod[11],
            head=prod[13],
            deprel=prod[15],
            deps=prod[17],
            misc=prod[19]
        )

    # Builds a Multiword from a line whose ID is a range. Only FORM and MISC
    # are carried over to the resulting element.
    #
    # Raises IllegalMultiwordError when LEMMA is not the literal '_' or when
    # any field from UPOS to DEPS (positions 7, 9, ..., 17) is not None.
    @staticmethod
    def p_wordline_multiword(prod: YaccProduction) -> None:
        'wordline : RANGE_ID TAB FORM TAB LEMMA TAB UPOS TAB XPOS TAB FEATS ' \
            'TAB HEAD TAB DEPREL TAB DEPS TAB MISC NEWLINE'
        if prod[5] != '_' or any(prod[i] is not None for i in range(7, 18, 2)):
            raise IllegalMultiwordError(prod)

        # The RANGE_ID value is indexed as a (first, last) pair.
        prod[0] = Multiword(
            first_index=prod[1][0],
            last_index=prod[1][1],
            form=prod[3],
            misc=prod[19]
        )

    # Builds an EmptyNode from a line whose ID is a decimal number.
    #
    # Raises IllegalEmptyNodeError when HEAD or DEPREL is not None.
    @staticmethod
    def p_wordline_emptynode(prod: YaccProduction) -> None:
        'wordline : DECIMAL_ID TAB FORM TAB LEMMA TAB UPOS TAB XPOS TAB ' \
            'FEATS TAB HEAD TAB DEPREL TAB DEPS TAB MISC NEWLINE'
        if prod[13] is not None or prod[15] is not None:
            raise IllegalEmptyNodeError(prod)

        # The DECIMAL_ID value is indexed as a (main, sub) pair.
        prod[0] = EmptyNode(
            main_index=prod[1][0],
            sub_index=prod[1][1],
            form=prod[3],
            lemma=prod[5],
            upos=UposTag[prod[7]] if prod[7] else None,
            xpos=prod[9],
            feats=prod[11],
            deps=prod[17],
            misc=prod[19]
        )

    # Error hook: never returns normally. Raises IllegalEofError when no
    # token is supplied, IllegalTokenError for the offending token otherwise.
    @staticmethod
    def p_error(token: Optional[LexToken]) -> None:
        # pylint: disable=missing-docstring
        if not token:
            raise IllegalEofError()
        raise IllegalTokenError(token)

    def __init__(self) -> None:
        # The token names come from the paired lexer builder; the instance
        # itself is then handed to yacc() as the rules module.
        self.tokens = ConlluLexerBuilder.tokens
        # A lexer is built and kept on the instance; it is not passed to
        # yacc() explicitly here.
        self.lexer = ConlluLexerBuilder.build()
        self.parser = yacc(module=self)

    @classmethod
    def build(cls) -> LRParser:
        """Returns a *PLY* :class:`LRParser` instance for *CoNLL-U* processing.

        The returned parser makes use of the rules defined by
        :class:`ConlluParserBuilder`.
        """
        return cls().parser