Source code for colonel.sentence

# Copyright 2018 The NLP Odyssey Authors.
# Copyright 2018 Marco Nicola <marconicola@disroot.org>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Module providing the :class:`.Sentence` class."""

from typing import Optional, List, Iterator, Union
from colonel.base_sentence_element import BaseSentenceElement
from colonel.word import Word
from colonel.emptynode import EmptyNode
from colonel.multiword import Multiword

__all__ = ['Sentence']


[docs]class Sentence:
    """Representation of a *sentence*.

    This class is modeled starting from the *CoNLL-U Format* specification,
    which states that *sentences consist of one or more word lines*. Each
    *word line* contains a series of fields, first of all an *ID*, the
    value of which determines the *kind* of the whole line: a *single word*,
    a *(multiword) token* or an *empty node*.

    Analogously, here a :class:`Sentence` mostly consists of an ordered list of
    :attr:`elements`, which can be object of any
    :class:`.BaseSentenceElement`'s subclass, commonly a :class:`.Word`, a
    :class:`.Multiword` or an :class:`.EmptyNode`.

    Since the *CoNLL-U* format allows the presence of comment lines before a
    sentence, the :attr:`comments` attribute is made available here as a simple
    list of strings.
    """

    __slots__ = ('elements', 'comments')

    def __init__(
            self,
            elements: Optional[List[BaseSentenceElement]] = None,
            comments: Optional[List[str]] = None
    ) -> None:

        #: Ordered list of words, tokens and nodes which form the sentence.
        #:
        #: Usually this list can be freely and directly manipulated, since the
        #: methods of the class always recompute their returned value
        #: accordingly; just pay particular attention performing changes while
        #: in the context of iterations (see for example :meth:`words` and
        #: :meth:`raw_tokens` methods).
        self.elements: List[BaseSentenceElement] = \
            [] if elements is None else elements

        #: Miscellaneous comments related to the sentence.
        #:
        #: For the time being, in the context of this project no particular
        #: meaning is given to the values of this attribute, however the
        #: following guidelines *should* be followed in order to facilitate
        #: possible future usages and processing:
        #:
        #: - the presence of the leading ``#`` character (which denotes the
        #:   start of a comment line in *CoNLL-U* format) is discouraged, in
        #:   order to keep comments more format-independent;
        #: - each comment should be always stripped from leading/trailing
        #:   spaces or newline characters.
        self.comments: List[str] = [] if comments is None else comments

[docs]    def words(self) -> Iterator[Word]:
        """Extracts the sequence of words.

        Iterates through :attr:`elements` and yields :class:`.Word` elements
        only. This can be especially handy in many dependency parsing contexts,
        where the focus mostly resides among simple words and their relations,
        ignoring the additional information carried by *empty nodes* and
        *(multiword) tokens*.

        This method do not perform any validity check among the elements, so if
        you want to ensure valid and meaningful results, please refer to
        :meth:`is_valid`; unless you really know what you are doing, iterating
        an invalid sentence could lead to wrong or incoherent results or
        unexpected behaviours.
        """
        for element in self.elements:
            if isinstance(element, Word):
                yield element

[docs]    def raw_tokens(self) -> Iterator[Union[Word, Multiword]]:
        """Extracts the raw token sequence.

        Iterates through :attr:`elements` and yields the only elements which
        represent the raw sequence of tokens in the sentence. The result
        includes :class:`.Word` and :class:`.Multiword` elements,
        skipping all :class:`.Word` items which indexes are included in the
        range of a preceding :class:`.MultiWord`.

        Empty nodes are ignored.

        This method do not perform any validity check among the elements, so if
        you want to ensure valid and meaningful results, please refer to
        :meth:`is_valid`; unless you really know what you are doing, iterating
        an invalid sentence could lead to wrong or incoherent results or
        unexpected behaviours.
        """
        last_index = 0
        for item in self.elements:
            if isinstance(item, Multiword):
                yield item
                last_index = item.last_index or 0
            elif isinstance(item, Word) and (item.index or 0) > last_index:
                yield item

[docs]    def is_valid(self) -> bool:
        """Returns whether or not the sentence is valid.

        The checks implemented here are mostly based on the *CoNLL-U* format
        and on the most widely adopted common practices among NLP and
        dependency parsing contexts, yet including a minimum set of essential
        validation, so that you are free to use this as a foundation for
        other custom rules in your application.

        A sentence is considered *valid* only if **all** of the following
        conditions apply:

        - there is at least one element of type :class:`Word`;
        - every single element is valid as well - see
          :meth:`.BaseSentenceElement.is_valid` and the overriding of its
          subclasses;
        - the ordered sequence of the elements and their *ID* is valid, that
          is:

            - the sequence of :attr:`.Word.index` starts from ``1`` and
              progressively increases by 1 step;
            - there are no *index* duplicates or range overlapping;
            - the :class:`.EmptyNode` elements (if any) are correctly placed
              after the :class:`.Word` element related to their
              :attr:`.EmptyNode.main_index` (or before the first word of the
              sentence, when the *main index* is zero), and for each sequence
              of *empty nodes* their :attr:`.EmptyNode.sub_index` starts from
              ``1`` and progressively increases by 1 step;
            - the :class:`.Multiword` elements (if any) are correctly placed
              before the first :class:`.Word` included in their *index* range,
              and each range always cover existing :class:`.Word` elements in
              the sentence;

        - if one or more :attr:`.Word.head` values are set (not ``None``), each
          head must refer to the *index* of a :class:`.Word` existing within
          the sentence, or at least be equal to zero (``0``, for ``root``
          grammatical relations).
        """
        return any(self.words()) and \
            self._all_elements_are_valid() and \
            self._starts_with_valid_index() and \
            self._no_indexes_overlap() and \
            self._sequence_is_valid() and \
            self._heads_are_valid()

    def _all_elements_are_valid(self) -> bool:
        """Returns whether or not all :attr:`elements` are valid, ignoring the
        context of the whole sentence.
        """
        return all(element.is_valid() for element in self.elements)

    def _starts_with_valid_index(self) -> bool:
        """Returns whether or not the first element is a valid candidate for
        the start of a sentence, in relation to its index or indexes values.

        This is a helper method for :meth:`is_valid`; it must be called only
        if :attr:`elements` is not empty.

        The following rules apply, depending on the type of the first element
        of the sentence:
        - a :class:`Word` element is a valid candidate if its *index* is ``1``;
        - a :class:`Multiword` element is a valid candidate if its *first
          index* is ``1``;
        - an :class:`EmptyNode` element is a valid candidate if its *main
          index* is `0`.
        """
        element = self.elements[0]
        return (isinstance(element, Word) and element.index == 1) or \
            (isinstance(element, Multiword) and element.first_index == 1) or \
            (isinstance(element, EmptyNode) and element.main_index == 0)

    def _no_indexes_overlap(self):
        """Returns whether or not there are overlaps among the *index* ranges
        of :class:`.Multiword` elements (if any).

        This is a helper method for :meth:`is_valid`; it must be called only
        if all the of :attr:`elements` are valid (evaluated separately invoking
        :meth:`_all_elements_are_valid`).
        """
        ranges = set()

        for element in self.elements:
            if isinstance(element, Multiword):
                new_range = range(element.first_index, element.last_index + 1)
                range_set = set(new_range)
                if any(range_set.intersection(r) for r in ranges):
                    return False
                ranges.add(new_range)

        return True

    def _sequence_is_valid(
            self,
            position: int = 0,
            index: int = 0,
            sub_index: int = 0
    ) -> bool:
        """Returns whether or not a valid sequence of indexes is respected
        among all elements of the sentence.

        This is a helper method for :meth:`is_valid`; it must be called only
        if all the of :attr:`elements` are valid (evaluated separately invoking
        :meth:`_all_elements_are_valid`) and there are no indexes overlaps
        (see :meth:`_no_indexes_overlap`).

        For more details, refer to the documentation of :meth:`is_valid`.
        """

        if position >= len(self.elements):
            return True

        elem = self.elements[position]

        if isinstance(elem, Word):
            return elem.index == index + 1 and \
                   self._sequence_is_valid(position + 1, elem.index)

        elif isinstance(elem, Multiword):
            return elem.first_index == index + 1 and \
                   self._has_word_with_index(elem.last_index or 0) and \
                   self._sequence_is_valid(position + 1, index)

        elif isinstance(elem, EmptyNode):
            return elem.main_index == index and \
                   elem.sub_index == sub_index + 1 and \
                   self._sequence_is_valid(position + 1, index, elem.sub_index)

        return True

    def _heads_are_valid(self) -> bool:
        """Returns whether or not the *head* values of the element are valid in
        the context of the sentence.

        This is a helper method for :meth:`is_valid`; it must be called only
        if all the of :attr:`elements` are valid (evaluated separately invoking
        :meth:`_all_elements_are_valid`) and the elements sequence is valid
        (see :meth:`_sequence_is_valid`).

        A head is considered valid only if its value is either not set
        (``None``), equals to zero (``0``, for ``root``grammatical relations),
        or less than or equal to the *index* of the last :class:`.Word` within
        the sentence.
        """
        words = list(self.words())
        last_index = words[-1].index or 0
        return all(0 <= (word.head or 0) <= last_index for word in words)

    def _has_word_with_index(self, index: int) -> bool:
        """Returns whether or not :meth:`elements` contains a :class:`.Word`
        element with the given *index*.
        """
        return any(word.index == index for word in self.words())

[docs]    def to_conllu(self) -> str:
        """Returns a *CoNLL-U* formatted representation of the sentence.

        No validity check is performed on the sentence and its element;
        elements and values not compatible with *CoNLL-U* format could lead to
        an incorrect output value or raising of exceptions.
        """
        comments = ''.join(f'# {c}\n' for c in self.comments or [])
        word_lines = ''.join(f'{e.to_conllu()}\n' for e in self.elements)
        return f'{comments}{word_lines}\n'