Skip to content

Grapheme string

GraphemeString ל

An object that unifies the functions available from the grapheme library under a single class. All functions work as they do in the grapheme library; this class is simply an interface to them.

Source code in hebrew/grapheme_string.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
class GraphemeString:
    """
    An object that unifies the functions available from the grapheme library under a single class.
    All functions work as they do in the grapheme library; this class is simply an interface to them.

    The wrapped text is stored unchanged in the `string` attribute, and every method
    delegates to the `grapheme` function of the same purpose with that text as its first argument.
    """

    # Unicode version reported by the underlying grapheme library.
    UNICODE_VERSION: str = grapheme.UNICODE_VERSION

    def __init__(self, string: str):
        """
        Wrap `string` so that it can be inspected grapheme by grapheme.

        :param string: The text to wrap; it is stored as-is on `self.string`.
        """
        self.string = string

    @property
    def graphemes(self) -> Iterator[str]:
        """
        Returns an iterator of all graphemes of given string.

        ``` python
        >>> rainbow_flag = "🏳️‍🌈"
        >>> [codepoint for codepoint in rainbow_flag]
        ['🏳', '️', '\\u200d', '🌈']
        >>> list(GraphemeString("multi codepoint grapheme: " + rainbow_flag).graphemes)
        ['m', 'u', 'l', 't', 'i', ' ', 'c', 'o', 'd', 'e', 'p', 'o', 'i', 'n', 't', ' ', 'g', 'r', 'a', 'p', 'h', 'e', 'm', 'e', ':', ' ', '🏳️‍🌈']
        ```
        """
        return grapheme.graphemes(self.string)

    @property
    def length(self) -> int:
        """
        Returns the number of graphemes in the string.

        Note that this property needs to traverse the full string to calculate the length,
        unlike `len(string)`, so its time consumption is linear in the length of the string.
        Use `get_length(until)` when counting may stop at a limit.

        ``` python
        >>> rainbow_flag = "🏳️‍🌈"
        >>> len(rainbow_flag)
        4
        >>> GraphemeString(rainbow_flag).length
        1
        ```
        """
        return grapheme.length(self.string)

    def get_length(self, until: int) -> int:
        """
        Returns the number of graphemes in the string, counting no further than `until`.

        Note that this function needs to traverse the string to calculate the length,
        unlike `len(string)`, and its time consumption is linear in the length of the string
        (up to the `until` value).

        Only counts up to the `until` argument. This is useful when testing
        the length of a string against some limit and the excess length is not interesting.

        ``` python
        >>> rainbow_flag = "🏳️‍🌈"
        >>> len(rainbow_flag)
        4
        >>> GraphemeString(rainbow_flag).length
        1
        >>> GraphemeString("".join(str(i) for i in range(100))).get_length(30)
        30
        ```

        :param until: The grapheme count at which counting stops.
        :return: The number of graphemes counted.
        """
        return grapheme.length(self.string, until)

    @property
    def grapheme_lengths(self) -> Iterator[int]:
        """
        Returns an iterator of the number of code points in each grapheme of the string.
        """
        return grapheme.grapheme_lengths(self.string)

    def slice(self, start: int = None, end: int = None) -> str:
        """
        Returns a substring of the given string, counting graphemes instead of codepoints.

        Negative indices are currently not supported.

        ``` python
        >>> string = "tamil நி (ni)"

        >>> string[:7]
        'tamil ந'
        >>> GraphemeString(string).slice(end=7)
        'tamil நி'
        >>> string[7:]
        'ி (ni)'
        >>> GraphemeString(string).slice(start=7)
        ' (ni)'
        ```

        :param start: Grapheme index of the first grapheme to include; `None` is passed through unchanged.
        :param end: Grapheme index at which to stop; `None` is passed through unchanged.
        :return: The selected substring.
        """
        return grapheme.slice(self.string, start, end)

    def contains(self, substring: str) -> bool:
        """
        Returns true if the sequence of graphemes in substring is also present in string.

        This differs from the normal Python `in` operator, since the Python operator will return
        true if the sequence of codepoints is within the other string, without considering
        grapheme boundaries.

        Performance notes: Very fast if `substring not in string`, since that also means that
        the same graphemes cannot be in the two strings. Otherwise this function has linear time
        complexity in relation to the string length. It will traverse the sequence of graphemes until
        a match is found, so it will generally perform better for grapheme sequences that match early.

        ``` python
        >>> "🇸🇪" in "🇪🇸🇪🇪"
        True
        >>> GraphemeString("🇪🇸🇪🇪").contains("🇸🇪")
        False
        ```

        :param substring: The text whose grapheme sequence is searched for.
        """
        return grapheme.contains(self.string, substring)

    def safe_split_index(self, max_length: int) -> int:
        """
        Returns the highest index up to `max_length` at which the given string can be sliced, without breaking a grapheme.

        This is useful for when you want to split or take a substring from a string, and don't really care about
        the exact grapheme length, but don't want to risk breaking existing graphemes.

        This function normally does not traverse the full grapheme sequence up to the given length, so it can be used
        for arbitrarily long strings and high `max_length` values. However, some grapheme boundaries depend on the
        previous state, so the worst case performance is O(n). In practice, it's only very long non-broken sequences
        of country flags (represented as Regional Indicators) that will perform badly.

        The return value will always be between `0` and `len(string)`.

        ``` python
        >>> string = "tamil நி (ni)"
        >>> i = GraphemeString(string).safe_split_index(7)
        >>> i
        6
        >>> string[:i]
        'tamil '
        >>> string[i:]
        'நி (ni)'
        ```

        :param max_length: Upper bound (a code point index) for the returned split position.
        """
        return grapheme.safe_split_index(self.string, max_length)

    def startswith(self, prefix: str) -> bool:
        """
        Like str.startswith, but also checks that the string starts with the given prefix's sequence of graphemes.

        str.startswith may return true for a prefix that is not visually represented as a prefix if a grapheme cluster
        is continued after the prefix ends.

        ``` python
        >>> GraphemeString("✊🏾").startswith("✊")
        False
        >>> "✊🏾".startswith("✊")
        True
        ```

        :param prefix: The text expected at the start of the string.
        """
        return grapheme.startswith(self.string, prefix)

    def endswith(self, suffix: str) -> bool:
        """
        Like str.endswith, but also checks that the string ends with the given suffix's sequence of graphemes.

        str.endswith may return true for a suffix that is not visually represented as a suffix if a grapheme cluster
        is initiated before the suffix starts.

        ``` python
        >>> GraphemeString("🏳️‍🌈").endswith("🌈")
        False
        >>> "🏳️‍🌈".endswith("🌈")
        True
        ```

        :param suffix: The text expected at the end of the string.
        """
        return grapheme.endswith(self.string, suffix)

    def __str__(self) -> str:
        """Return the wrapped text unchanged."""
        return self.string

    def __repr__(self) -> str:
        """Return the same text as `__str__` (the raw wrapped string, without quoting)."""
        return self.__str__()

    def __add__(self, other) -> GraphemeStringT:
        """
        Concatenate this string with `str(other)` and wrap the result in a new `GraphemeString`.
        """
        return GraphemeString(self.string + str(other))

    def __key(self) -> str:
        """Return the value that defines both equality and hashing: the wrapped text."""
        return self.string

    def __eq__(self, other) -> bool:
        """
        Compare by wrapped text against another `GraphemeString`.

        For any other type `NotImplemented` is returned, so Python can try the reflected
        comparison on `other`; when that is not handled either, `==` still evaluates to `False`.
        """
        if isinstance(other, GraphemeString):
            return self.__key() == other.__key()
        return NotImplemented

    def __hash__(self) -> int:
        """Hash the wrapped text, keeping hashing consistent with `__eq__`."""
        return hash(self.__key())

grapheme_lengths: Iterator[int] property ל

Returns an iterator of number of code points in each grapheme of the string.

graphemes: Iterator[GraphemeIterator] property ל

Returns an iterator of all graphemes of given string.

>>> rainbow_flag = "🏳️‍🌈"
>>> [codepoint for codepoint in rainbow_flag]
['🏳', '️', '\u200d', '🌈']
>>> list(GraphemeString("multi codepoint grapheme: " + rainbow_flag).graphemes)
['m', 'u', 'l', 't', 'i', ' ', 'c', 'o', 'd', 'e', 'p', 'o', 'i', 'n', 't', ' ', 'g', 'r', 'a', 'p', 'h', 'e', 'm', 'e', ':', ' ', '🏳️‍🌈']

length: int property ל

Returns the number of graphemes in the string.

Note that this function needs to traverse the full string to calculate the length, unlike len(string), and its time consumption is linear in the length of the string (up to the until value).

Only counts up to the until argument, if given. This is useful when testing the length of a string against some limit and the excess length is not interesting.

>>> rainbow_flag = "🏳️‍🌈"
>>> len(rainbow_flag)
4
>>> GraphemeString(rainbow_flag).length
1

contains(substring) ל

Returns true if the sequence of graphemes in substring is also present in string.

This differs from the normal Python in operator, since the Python operator will return true if the sequence of codepoints is within the other string, without considering grapheme boundaries.

Performance notes: Very fast if substring not in string, since that also means that the same graphemes can not be in the two strings. Otherwise this function has linear time complexity in relation to the string length. It will traverse the sequence of graphemes until a match is found, so it will generally perform better for grapheme sequences that match early.

>>> "🇸🇪" in "🇪🇸🇪🇪"
True
>>> GraphemeString("🇪🇸🇪🇪").contains("🇸🇪")
False
Source code in hebrew/grapheme_string.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
def contains(self, substring: str) -> bool:
    """
    Report whether the grapheme sequence of `substring` occurs in this string.

    Unlike Python's `in` operator, which only looks for a matching run of codepoints,
    this check respects grapheme boundaries: a codepoint match that starts or ends in
    the middle of a grapheme does not count.

    Performance notes: the check is very fast when `substring not in string`, because
    then the graphemes cannot match either. Otherwise the time is linear in the string
    length; graphemes are traversed until a match is found, so early matches are cheaper.

    ``` python
    >>> "🇸🇪" in "🇪🇸🇪🇪"
    True
    >>> GraphemeString("🇪🇸🇪🇪").contains("🇸🇪")
    False
    ```
    """
    haystack = self.string
    return grapheme.contains(haystack, substring)

endswith(suffix) ל

Like str.endswith, but also checks that the string ends with the given suffix's sequence of graphemes.

str.endswith may return true for a suffix that is not visually represented as a suffix if a grapheme cluster is initiated before the suffix starts.

>>> GraphemeString("🏳️‍🌈").endswith("🌈")
False
>>> "🏳️‍🌈".endswith("🌈")
True
Source code in hebrew/grapheme_string.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
def endswith(self, suffix: str) -> bool:
    """
    Grapheme-aware counterpart of `str.endswith`.

    In addition to the codepoint comparison, the string must end with the grapheme
    sequence of `suffix`. Plain `str.endswith` can report a match for a suffix that is
    not visually a suffix, namely when a grapheme cluster begins before the suffix does.

    ``` python
    >>> GraphemeString("🏳️‍🌈").endswith("🌈")
    False
    >>> "🏳️‍🌈".endswith("🌈")
    True
    ```
    """
    text = self.string
    return grapheme.endswith(text, suffix)

get_length(until) ל

Returns the number of graphemes in the string.

Note that this function needs to traverse the full string to calculate the length, unlike len(string), and its time consumption is linear in the length of the string (up to the until value).

Only counts up to the until argument, if given. This is useful when testing the length of a string against some limit and the excess length is not interesting.

>>> rainbow_flag = "🏳️‍🌈"
>>> len(rainbow_flag)
4
>>> GraphemeString(rainbow_flag).length
1
>>> GraphemeString("".join(str(i) for i in range(100))).get_length(30)
30
Source code in hebrew/grapheme_string.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def get_length(self, until: int) -> int:
    """
    Count the graphemes in the string, stopping once `until` is reached.

    Counting has to walk the string (unlike `len(string)`), so the cost grows linearly
    with the string length, bounded by the `until` value. Capping the count is handy
    when the length is only compared against a limit and any excess is irrelevant.

    ``` python
    >>> rainbow_flag = "🏳️‍🌈"
    >>> len(rainbow_flag)
    4
    >>> GraphemeString(rainbow_flag).length
    1
    >>> GraphemeString("".join(str(i) for i in range(100))).get_length(30)
    30
    ```
    """
    text = self.string
    return grapheme.length(text, until)

safe_split_index(max_length) ל

Returns the highest index up to max_length at which the given string can be sliced, without breaking a grapheme.

This is useful for when you want to split or take a substring from a string, and don't really care about the exact grapheme length, but don't want to risk breaking existing graphemes.

This function normally does not traverse the full grapheme sequence up to the given length, so it can be used for arbitrarily long strings and high max_length values. However, some grapheme boundaries depend on the previous state, so the worst case performance is O(n). In practice, it's only very long non-broken sequences of country flags (represented as Regional Indicators) that will perform badly.

The return value will always be between 0 and len(string).

>>> string = "tamil நி (ni)"
>>> i = GraphemeString(string).safe_split_index(7)
>>> i
6
>>> string[:i]
'tamil '
>>> string[i:]
'நி (ni)'
Source code in hebrew/grapheme_string.py
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def safe_split_index(self, max_length: int) -> int:
    """
    Find the largest index not exceeding `max_length` where the string can be cut
    without splitting a grapheme.

    Use this when a substring of roughly some size is wanted and the exact grapheme
    count does not matter, as long as no grapheme gets broken.

    The grapheme sequence is normally not traversed all the way up to the given length,
    so long strings and large `max_length` values are fine. Some boundaries depend on
    earlier state, though, which makes the worst case O(n); in practice only very long
    unbroken runs of country flags (Regional Indicators) perform badly.

    The result always lies between `0` and `len(string)`.

    ``` python
    >>> string = "tamil நி (ni)"
    >>> i = GraphemeString(string).safe_split_index(7)
    >>> i
    6
    >>> string[:i]
    'tamil '
    >>> string[i:]
    'நி (ni)'
    ```
    """
    text = self.string
    return grapheme.safe_split_index(text, max_length)

slice(start=None, end=None) ל

Returns a substring of the given string, counting graphemes instead of codepoints.

Negative indices are currently not supported.

>>> string = "tamil நி (ni)"

>>> string[:7]
'tamil ந'
>>> GraphemeString(string).slice(end=7)
'tamil நி'
>>> string[7:]
'ி (ni)'
>>> GraphemeString(string).slice(start=7)
' (ni)'
Source code in hebrew/grapheme_string.py
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def slice(self, start: int = None, end: int = None) -> str:
    """
    Cut out a substring, with `start` and `end` counted in graphemes rather than codepoints.

    Negative indices are currently not supported.

    ``` python
    >>> string = "tamil நி (ni)"

    >>> string[:7]
    'tamil ந'
    >>> GraphemeString(string).slice(end=7)
    'tamil நி'
    >>> string[7:]
    'ி (ni)'
    >>> GraphemeString(string).slice(start=7)
    ' (ni)'
    ```
    """
    text = self.string
    return grapheme.slice(text, start, end)

startswith(prefix) ל

Like str.startswith, but also checks that the string starts with the given prefix's sequence of graphemes.

str.startswith may return true for a prefix that is not visually represented as a prefix if a grapheme cluster is continued after the prefix ends.

>>> GraphemeString("✊🏾").startswith("✊")
False
>>> "✊🏾".startswith("✊")
True
Source code in hebrew/grapheme_string.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def startswith(self, prefix: str) -> bool:
    """
    Grapheme-aware counterpart of `str.startswith`.

    In addition to the codepoint comparison, the string must start with the grapheme
    sequence of `prefix`. Plain `str.startswith` can report a match for a prefix that is
    not visually a prefix, namely when a grapheme cluster continues past the prefix's end.

    ``` python
    >>> GraphemeString("✊🏾").startswith("✊")
    False
    >>> "✊🏾".startswith("✊")
    True
    ```
    """
    text = self.string
    return grapheme.startswith(text, prefix)