Module bases.alphabet.range_alphabet

Alphabets implicitly specified by Unicode codepoint range.

Expand source code
"""
    Alphabets implicitly specified by Unicode codepoint range.
"""

from typing import Any, Iterator, Mapping, overload, Union
from typing_validation import validate

from .abstract import Alphabet
from .string_alphabet import StringAlphabet

class RangeAlphabet(Alphabet):
    """
        Class for alphabets implicitly specified by a range of Unicode codepoints
        and optional case sensitivity (default: case-sensitive).

        The character at index `i` of the alphabet is `chr(codepoints[i])`, so
        ranges with a step other than 1 (e.g. those obtained by slicing) are
        supported as well.

        Example usage:

        ```py
        >>> from bases.alphabet import RangeAlphabet
        >>> RangeAlphabet(range(0x00, 0x100))
        RangeAlphabet(range(0x0, 0x100))
        ```
    """

    _codepoints: range
    _revdir: Mapping[str, int]
    _case_sensitive: bool

    def __init__(self, codepoints: range, *,
                 case_sensitive: bool = True):
        """
            Creates the alphabet for the given codepoint range.

            Raises `ValueError` if the range has fewer than two codepoints,
            if it reaches outside the Unicode codepoint space, or if
            `case_sensitive` is false and the range contains both the
            uppercase and the lowercase version of some character.
        """
        super().__init__(case_sensitive)
        validate(codepoints, range)
        self._codepoints = codepoints
        self._revdir = _RangeAlphabetRevdir(self)
        self.__validate_init()

    def __validate_init(self) -> None:
        """
            Checks the constraints on the codepoint range, raising `ValueError`
            if any of them is violated (see `__init__`).
        """
        codepoints = self._codepoints
        if len(codepoints) <= 1:
            raise ValueError("Alphabet must have at least two characters.")
        # A range is monotone, so its first and last items bound all the others.
        lowest, highest = sorted((codepoints[0], codepoints[-1]))
        if lowest < 0 or highest > 0x10FFFF:
            # Fail here rather than at the first chr() call made on the alphabet.
            raise ValueError("Alphabet codepoints must be valid Unicode codepoints (0x0 to 0x10ffff).")
        if self.case_sensitive:
            return
        for i in codepoints:
            c = chr(i)
            upper, lower = c.upper(), c.lower()
            if upper == lower:
                # Uncased character (digit, punctuation, ...): nothing to clash with.
                continue
            if len(upper) != 1 or len(lower) != 1:
                # Multi-character case mapping (e.g. 'ß' -> 'SS'): the mapped
                # string is not a single character, so it cannot be in the range.
                continue
            # Membership in a range is O(1) for integers, no auxiliary set needed.
            if ord(upper) in codepoints and ord(lower) in codepoints:
                raise ValueError("Alphabet contains lowercase and uppercase versions of the same character, "
                                 "encoding must be case-sensitive.")

    @property
    def codepoints(self) -> range:
        """
            The codepoint range that defines this alphabet.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x00, 0x100)).codepoints
            range(0, 256)
            ```
        """
        return self._codepoints

    @property
    def revdir(self) -> Mapping[str, int]:
        """
            Read-only mapping sending each character of the alphabet
            to its index in the alphabet.
        """
        return self._revdir

    def __len__(self) -> int:
        """ Number of characters in the alphabet. """
        return len(self._codepoints)

    @overload
    def __getitem__(self, idx: int) -> str:
        ...

    @overload
    def __getitem__(self, idx: slice) -> "RangeAlphabet":
        ...

    def __getitem__(self, idx: Union[int, slice]) -> Union[str, "RangeAlphabet"]:
        """
            Returns the character at an integer index, or the sub-alphabet
            selected by a slice (with the same case sensitivity).

            Slicing builds a new `RangeAlphabet`, so a slice selecting fewer
            than two characters raises `ValueError`.
        """
        validate(idx, Union[int, slice])
        if isinstance(idx, slice):
            new_codepoints = self._codepoints[idx]
            return RangeAlphabet(new_codepoints, case_sensitive=self.case_sensitive)
        return chr(self._codepoints[idx])

    def with_case_sensitivity(self, case_sensitive: bool) -> "RangeAlphabet":
        """
            Returns an alphabet with the same codepoints and the given case
            sensitivity; this very alphabet is returned if nothing changes.
        """
        validate(case_sensitive, bool)
        if case_sensitive == self.case_sensitive:
            return self
        return RangeAlphabet(self.codepoints, case_sensitive=case_sensitive)

    def as_string_alphabet(self) -> StringAlphabet:
        r"""
            Converts this alphabet into a string alphabet explicitly defined
            by the string containing all characters in the codepoint range.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
            StringAlphabet(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                            OPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}')
            ```
        """
        chars = "".join(self)
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def upper(self) -> StringAlphabet:
        """
            Returns the string alphabet defined by the uppercase version of
            the string of all characters in this alphabet.
        """
        chars = "".join(self).upper()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def lower(self) -> StringAlphabet:
        """
            Returns the string alphabet defined by the lowercase version of
            the string of all characters in this alphabet.
        """
        chars = "".join(self).lower()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def __eq__(self, other: Any) -> bool:
        """
            Range alphabets are equal when both their codepoint ranges
            and their case sensitivity are equal.
        """
        if not isinstance(other, RangeAlphabet):
            return NotImplemented
        return self.codepoints == other.codepoints and self.case_sensitive == other.case_sensitive

    def __hash__(self) -> int:
        """ Hash consistent with `__eq__`. """
        return hash((type(self), self.codepoints, self.case_sensitive))

    def __repr__(self) -> str:
        """
            Constructor-like representation, with range bounds in hexadecimal.
            The range step is shown only when it differs from 1.
        """
        codepoints = self.codepoints
        range_args = [hex(codepoints.start), hex(codepoints.stop)]
        if codepoints.step != 1:
            # Without the step, stepped ranges (e.g. from slicing) would be misrepresented.
            range_args.append(str(codepoints.step))
        codepoints_str = f"range({', '.join(range_args)})"
        if self.case_sensitive:
            return f"RangeAlphabet({codepoints_str})"
        case_sensitive_str = f"case_sensitive={self.case_sensitive}"
        return f"RangeAlphabet({codepoints_str}, {case_sensitive_str})"

class _RangeAlphabetRevdir(Mapping[str, int]):
    """
        Read-only mapping from the characters of a `RangeAlphabet` to their
        index in the alphabet, computed on the fly from the codepoint range.

        For case-insensitive alphabets, a character is also looked up through
        its uppercase and lowercase versions.
    """

    _alphabet: RangeAlphabet

    def __init__(self, alphabet: RangeAlphabet):
        self._alphabet = alphabet

    def __iter__(self) -> Iterator[str]:
        return iter(self._alphabet)

    def __len__(self) -> int:
        return len(self._alphabet)

    def _candidate_codepoints(self, char: str) -> Iterator[int]:
        """
            Yields, in lookup order, the codepoints under which `char` may
            appear in the alphabet: the character itself, followed by its
            uppercase and lowercase versions if the alphabet is case-insensitive.
        """
        if self._alphabet.case_sensitive:
            variants = (char,)
        else:
            variants = (char, char.upper(), char.lower())
        for variant in variants:
            # Empty strings, longer strings and multi-character case mappings
            # (e.g. 'ß'.upper() == 'SS') have no codepoint of their own.
            if len(variant) == 1:
                yield ord(variant)

    def __contains__(self, char: Any) -> bool:
        if not isinstance(char, str):
            return False
        codepoints = self._alphabet.codepoints
        return any(codepoint in codepoints
                   for codepoint in self._candidate_codepoints(char))

    def __getitem__(self, char: str) -> int:
        validate(char, str)
        codepoints = self._alphabet.codepoints
        for codepoint in self._candidate_codepoints(char):
            if codepoint in codepoints:
                # range.index accounts for the range step and direction,
                # which a plain offset from the range start does not.
                return codepoints.index(codepoint)
        raise KeyError(f"Character {repr(char)} not in alphabet.")

Classes

class RangeAlphabet (codepoints: range, *, case_sensitive: bool = True)

Class for alphabets implicitly specified by a range of Unicode codepoints and optional case sensitivity (default: case-sensitive).

Example usage:

>>> from bases.alphabet import RangeAlphabet
>>> RangeAlphabet(range(0x00, 0x100))
RangeAlphabet(range(0x0, 0x100))
Expand source code
class RangeAlphabet(Alphabet):
    """
        Class for alphabets implicitly specified by a range of Unicode codepoints
        and optional case sensitivity (default: case-sensitive).

        The character at index `i` of the alphabet is `chr(codepoints[i])`, so
        ranges with a step other than 1 (e.g. those obtained by slicing) are
        supported as well.

        Example usage:

        ```py
        >>> from bases.alphabet import RangeAlphabet
        >>> RangeAlphabet(range(0x00, 0x100))
        RangeAlphabet(range(0x0, 0x100))
        ```
    """

    _codepoints: range
    _revdir: Mapping[str, int]
    _case_sensitive: bool

    def __init__(self, codepoints: range, *,
                 case_sensitive: bool = True):
        """
            Creates the alphabet for the given codepoint range.

            Raises `ValueError` if the range has fewer than two codepoints,
            if it reaches outside the Unicode codepoint space, or if
            `case_sensitive` is false and the range contains both the
            uppercase and the lowercase version of some character.
        """
        super().__init__(case_sensitive)
        validate(codepoints, range)
        self._codepoints = codepoints
        self._revdir = _RangeAlphabetRevdir(self)
        self.__validate_init()

    def __validate_init(self) -> None:
        """
            Checks the constraints on the codepoint range, raising `ValueError`
            if any of them is violated (see `__init__`).
        """
        codepoints = self._codepoints
        if len(codepoints) <= 1:
            raise ValueError("Alphabet must have at least two characters.")
        # A range is monotone, so its first and last items bound all the others.
        lowest, highest = sorted((codepoints[0], codepoints[-1]))
        if lowest < 0 or highest > 0x10FFFF:
            # Fail here rather than at the first chr() call made on the alphabet.
            raise ValueError("Alphabet codepoints must be valid Unicode codepoints (0x0 to 0x10ffff).")
        if self.case_sensitive:
            return
        for i in codepoints:
            c = chr(i)
            upper, lower = c.upper(), c.lower()
            if upper == lower:
                # Uncased character (digit, punctuation, ...): nothing to clash with.
                continue
            if len(upper) != 1 or len(lower) != 1:
                # Multi-character case mapping (e.g. 'ß' -> 'SS'): the mapped
                # string is not a single character, so it cannot be in the range.
                continue
            # Membership in a range is O(1) for integers, no auxiliary set needed.
            if ord(upper) in codepoints and ord(lower) in codepoints:
                raise ValueError("Alphabet contains lowercase and uppercase versions of the same character, "
                                 "encoding must be case-sensitive.")

    @property
    def codepoints(self) -> range:
        """
            The codepoint range that defines this alphabet.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x00, 0x100)).codepoints
            range(0, 256)
            ```
        """
        return self._codepoints

    @property
    def revdir(self) -> Mapping[str, int]:
        """
            Read-only mapping sending each character of the alphabet
            to its index in the alphabet.
        """
        return self._revdir

    def __len__(self) -> int:
        """ Number of characters in the alphabet. """
        return len(self._codepoints)

    @overload
    def __getitem__(self, idx: int) -> str:
        ...

    @overload
    def __getitem__(self, idx: slice) -> "RangeAlphabet":
        ...

    def __getitem__(self, idx: Union[int, slice]) -> Union[str, "RangeAlphabet"]:
        """
            Returns the character at an integer index, or the sub-alphabet
            selected by a slice (with the same case sensitivity).

            Slicing builds a new `RangeAlphabet`, so a slice selecting fewer
            than two characters raises `ValueError`.
        """
        validate(idx, Union[int, slice])
        if isinstance(idx, slice):
            new_codepoints = self._codepoints[idx]
            return RangeAlphabet(new_codepoints, case_sensitive=self.case_sensitive)
        return chr(self._codepoints[idx])

    def with_case_sensitivity(self, case_sensitive: bool) -> "RangeAlphabet":
        """
            Returns an alphabet with the same codepoints and the given case
            sensitivity; this very alphabet is returned if nothing changes.
        """
        validate(case_sensitive, bool)
        if case_sensitive == self.case_sensitive:
            return self
        return RangeAlphabet(self.codepoints, case_sensitive=case_sensitive)

    def as_string_alphabet(self) -> StringAlphabet:
        r"""
            Converts this alphabet into a string alphabet explicitly defined
            by the string containing all characters in the codepoint range.

            Example usage:

            ```py
            >>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
            StringAlphabet(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                            OPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}')
            ```
        """
        chars = "".join(self)
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def upper(self) -> StringAlphabet:
        """
            Returns the string alphabet defined by the uppercase version of
            the string of all characters in this alphabet.
        """
        chars = "".join(self).upper()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def lower(self) -> StringAlphabet:
        """
            Returns the string alphabet defined by the lowercase version of
            the string of all characters in this alphabet.
        """
        chars = "".join(self).lower()
        return StringAlphabet(chars, case_sensitive=self.case_sensitive)

    def __eq__(self, other: Any) -> bool:
        """
            Range alphabets are equal when both their codepoint ranges
            and their case sensitivity are equal.
        """
        if not isinstance(other, RangeAlphabet):
            return NotImplemented
        return self.codepoints == other.codepoints and self.case_sensitive == other.case_sensitive

    def __hash__(self) -> int:
        """ Hash consistent with `__eq__`. """
        return hash((type(self), self.codepoints, self.case_sensitive))

    def __repr__(self) -> str:
        """
            Constructor-like representation, with range bounds in hexadecimal.
            The range step is shown only when it differs from 1.
        """
        codepoints = self.codepoints
        range_args = [hex(codepoints.start), hex(codepoints.stop)]
        if codepoints.step != 1:
            # Without the step, stepped ranges (e.g. from slicing) would be misrepresented.
            range_args.append(str(codepoints.step))
        codepoints_str = f"range({', '.join(range_args)})"
        if self.case_sensitive:
            return f"RangeAlphabet({codepoints_str})"
        case_sensitive_str = f"case_sensitive={self.case_sensitive}"
        return f"RangeAlphabet({codepoints_str}, {case_sensitive_str})"

Ancestors

  • Alphabet
  • abc.ABC
  • collections.abc.Sequence
  • collections.abc.Reversible
  • collections.abc.Collection
  • collections.abc.Sized
  • collections.abc.Iterable
  • collections.abc.Container
  • typing.Generic

Instance variables

var codepoints : range

The codepoint range that defines this alphabet.

Example usage:

>>> RangeAlphabet(range(0x00, 0x100)).codepoints
range(0, 256)
Expand source code
@property
def codepoints(self) -> range:
    """
        Range of Unicode codepoints backing this alphabet,
        returned exactly as it is stored on the instance.

        Example usage:

        ```py
        >>> RangeAlphabet(range(0x00, 0x100)).codepoints
        range(0, 256)
        ```
    """
    backing_range = self._codepoints
    return backing_range

Methods

def as_string_alphabet(self) ‑> StringAlphabet

Converts this alphabet into a string alphabet explicitly defined by the string containing all characters in the codepoint range.

Example usage:

>>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
StringAlphabet(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                OPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}')
Expand source code
def as_string_alphabet(self) -> StringAlphabet:
    r"""
        Returns the `StringAlphabet` explicitly defined by the string of all
        characters of this alphabet, taken in iteration order; the case
        sensitivity setting is carried over unchanged.

        Example usage:

        ```py
        >>> RangeAlphabet(range(0x20, 0x7E)).as_string_alphabet()
        StringAlphabet(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMN
                        OPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}')
        ```
    """
    return StringAlphabet("".join(self), case_sensitive=self.case_sensitive)

Inherited members