Nuspell
spell checker
Loading...
Searching...
No Matches
unicode.hxx
1/* Copyright 2021-2022 Dimitrij Mijoski
2 *
3 * This file is part of Nuspell.
4 *
5 * Nuspell is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * Nuspell is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public License
16 * along with Nuspell. If not, see <http://www.gnu.org/licenses/>.
17 */
18#ifndef NUSPELL_UNICODE_HXX
19#define NUSPELL_UNICODE_HXX
20#include <string>
21#include <string_view>
22#include <unicode/utf16.h>
23#include <unicode/utf8.h>
24
25namespace nuspell {
26inline namespace v5 {
27
// UTF-8, works on malformed input
29
30inline constexpr auto u8_max_cp_length = U8_MAX_LENGTH;
31
/**
 * @brief Tells whether a value produced by the malformed-tolerant UTF-8
 * decoders signals a decoding error.
 *
 * Usable in constant expressions and never throws.
 *
 * @param cp code point value to inspect.
 * @return true when cp is negative, false otherwise.
 */
inline constexpr auto u8_is_cp_error(int32_t cp) noexcept -> bool
{
	return cp < 0;
}
33
/**
 * @brief Decodes the code point starting at index i of a possibly malformed
 * UTF-8 string and moves i past it.
 *
 * Thin wrapper around ICU's U8_NEXT.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index where decoding starts; updated by the macro.
 * @param[out] cp receives the decoded value; test it with u8_is_cp_error().
 */
template <class Range>
auto u8_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size, std::data;
#if U_ICU_VERSION_MAJOR_NUM > 60
	auto length = size(str);
	U8_NEXT(str, i, length, cp);
#else
	// ICU 60 and older: the macro is given a raw pointer plus a 32-bit
	// signed index and length; the advanced index is copied back to i.
	auto units = data(str);
	int32_t pos = i;
	int32_t length = size(str);
	U8_NEXT(units, pos, length, cp);
	i = pos;
#endif
}
49
/**
 * @brief Moves i forward over one code point of a possibly malformed UTF-8
 * string without decoding it.
 *
 * Thin wrapper around ICU's U8_FWD_1, bounded by the size of str.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index to advance.
 */
template <class Range>
auto u8_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	auto length = size(str);
	U8_FWD_1(str, i, length);
}
57
/**
 * @brief Decodes the code point that ends right before index i of a possibly
 * malformed UTF-8 string and moves i back to its start.
 *
 * Thin wrapper around ICU's U8_PREV with 0 as the lower bound. The macro is
 * driven through a raw pointer and a 32-bit signed index, which is copied
 * back to i afterwards.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index just past the code point; updated by the macro.
 * @param[out] cp receives the decoded value; test it with u8_is_cp_error().
 */
template <class Range>
auto u8_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::data;
	auto units = data(str);
	int32_t pos = i;
	U8_PREV(units, 0, pos, cp);
	i = pos;
}
67
/**
 * @brief Moves i backward over one code point of a possibly malformed UTF-8
 * string without decoding it.
 *
 * Thin wrapper around ICU's U8_BACK_1 with 0 as the lower bound; uses a raw
 * pointer and a 32-bit signed index that is copied back to i.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index to move back.
 */
template <class Range>
auto u8_reverse_index(const Range& str, size_t& i) -> void
{
	using std::data;
	auto units = data(str);
	int32_t pos = i;
	U8_BACK_1(units, 0, pos);
	i = pos;
}
77
/**
 * @brief Encodes cp as UTF-8 into buf at index i and advances i, checking
 * the capacity of buf.
 *
 * Thin wrapper around ICU's U8_APPEND, with std::size(buf) as the capacity.
 *
 * @param buf destination range of code units.
 * @param[in,out] i index where writing starts; updated by the macro.
 * @param cp code point to encode.
 * @param[in,out] error raised to true by the macro when the code point cannot
 * be written. ICU documents that the flag is left unmodified on success, so
 * the caller must initialise it to false before the call.
 */
template <class Range>
auto u8_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size;
#if U_ICU_VERSION_MAJOR_NUM <= 60
	// ICU 60 and older: the macro is given a 32-bit signed index and
	// capacity; the advanced index is copied back to i.
	int32_t idx = i;
	int32_t len = size(buf);
	U8_APPEND(buf, idx, len, cp, error);
	i = idx;
#else
	auto len = size(buf);
	U8_APPEND(buf, i, len, cp, error);
#endif
}
94
95// UTF-8, valid
96
/**
 * @brief Decodes the code point starting at index i of a well-formed UTF-8
 * string and moves i past it.
 *
 * Thin wrapper around ICU's U8_NEXT_UNSAFE; no bounds or validity checks
 * are made here.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index where decoding starts; updated by the macro.
 * @param[out] cp receives the decoded code point.
 */
template <class Range>
auto valid_u8_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U8_NEXT_UNSAFE(str, i, cp);
}
102
/**
 * @brief Moves i forward over one code point of a well-formed UTF-8 string
 * without decoding it.
 *
 * Thin wrapper around ICU's U8_FWD_1_UNSAFE; no bounds check is made here.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index to advance.
 */
template <class Range>
auto valid_u8_advance_index(const Range& str, size_t& i) -> void
{
	U8_FWD_1_UNSAFE(str, i);
}
108
/**
 * @brief Decodes the code point that ends right before index i of a
 * well-formed UTF-8 string and moves i back to its start.
 *
 * Thin wrapper around ICU's U8_PREV_UNSAFE; no bounds or validity checks
 * are made here.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index just past the code point; updated by the macro.
 * @param[out] cp receives the decoded code point.
 */
template <class Range>
auto valid_u8_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U8_PREV_UNSAFE(str, i, cp);
}
114
/**
 * @brief Moves i backward over one code point of a well-formed UTF-8 string
 * without decoding it.
 *
 * Thin wrapper around ICU's U8_BACK_1_UNSAFE; no bounds check is made here.
 *
 * @param str range of UTF-8 code units.
 * @param[in,out] i index to move back.
 */
template <class Range>
auto valid_u8_reverse_index(const Range& str, size_t& i) -> void
{
	U8_BACK_1_UNSAFE(str, i);
}
120
/**
 * @brief Encodes a valid code point as UTF-8 into buf at index i and
 * advances i.
 *
 * Thin wrapper around ICU's U8_APPEND_UNSAFE; the capacity of buf is not
 * checked here, so the caller must provide enough room.
 *
 * @param buf destination range of code units.
 * @param[in,out] i index where writing starts; updated by the macro.
 * @param cp code point to encode.
 */
template <class Range>
auto valid_u8_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	U8_APPEND_UNSAFE(buf, i, cp);
}
126
// UTF-16, works on malformed input
128
129inline constexpr auto u16_max_cp_length = U16_MAX_LENGTH;
130
131auto inline u16_is_cp_error(int32_t cp) -> bool { return U_IS_SURROGATE(cp); }
132
/**
 * @brief Decodes the code point starting at index i of a possibly malformed
 * UTF-16 string and moves i past it.
 *
 * Thin wrapper around ICU's U16_NEXT, bounded by the size of str.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index where decoding starts; updated by the macro.
 * @param[out] cp receives the decoded value; test it with u16_is_cp_error().
 */
template <class Range>
auto u16_advance_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	using std::size;
	auto length = size(str);
	U16_NEXT(str, i, length, cp);
}
140
/**
 * @brief Moves i forward over one code point of a possibly malformed UTF-16
 * string without decoding it.
 *
 * Thin wrapper around ICU's U16_FWD_1, bounded by the size of str.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index to advance.
 */
template <class Range>
auto u16_advance_index(const Range& str, size_t& i) -> void
{
	using std::size;
	auto length = size(str);
	U16_FWD_1(str, i, length);
}
148
/**
 * @brief Decodes the code point that ends right before index i of a possibly
 * malformed UTF-16 string and moves i back to its start.
 *
 * Thin wrapper around ICU's U16_PREV with 0 as the lower bound.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index just past the code point; updated by the macro.
 * @param[out] cp receives the decoded value; test it with u16_is_cp_error().
 */
template <class Range>
auto u16_reverse_cp(const Range& str, size_t& i, int32_t& cp) -> void
{
	U16_PREV(str, 0, i, cp);
}
154
/**
 * @brief Moves i backward over one code point of a possibly malformed UTF-16
 * string without decoding it.
 *
 * Thin wrapper around ICU's U16_BACK_1 with 0 as the lower bound.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index to move back.
 */
template <class Range>
auto u16_reverse_index(const Range& str, size_t& i) -> void
{
	U16_BACK_1(str, 0, i);
}
160
/**
 * @brief Encodes cp as UTF-16 into buf at index i and advances i, checking
 * the capacity of buf.
 *
 * Thin wrapper around ICU's U16_APPEND, with std::size(buf) as the capacity.
 *
 * @param buf destination range of code units.
 * @param[in,out] i index where writing starts; updated by the macro.
 * @param cp code point to encode.
 * @param[in,out] error raised to true by the macro on failure. ICU documents
 * that the flag is left unmodified on success, so initialise it to false
 * before the call.
 */
template <class Range>
auto u16_write_cp_and_advance(Range& buf, size_t& i, int32_t cp, bool& error)
    -> void
{
	using std::size;
	auto capacity = size(buf);
	U16_APPEND(buf, i, capacity, cp, error);
}
169
170// UTF-16, valid
171
/**
 * @brief Decodes the code point starting at index i of a well-formed UTF-16
 * string and moves i past it.
 *
 * Thin wrapper around ICU's U16_NEXT_UNSAFE; no bounds or validity checks
 * are made here.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index where decoding starts; updated by the macro.
 * @param[out] cp receives the decoded code point.
 */
template <class Range>
auto valid_u16_advance_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U16_NEXT_UNSAFE(str, i, cp);
}
177
/**
 * @brief Moves i forward over one code point of a well-formed UTF-16 string
 * without decoding it.
 *
 * Thin wrapper around ICU's U16_FWD_1_UNSAFE; no bounds check is made here.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index to advance.
 */
template <class Range>
auto valid_u16_advance_index(const Range& str, size_t& i) -> void
{
	U16_FWD_1_UNSAFE(str, i);
}
183
/**
 * @brief Decodes the code point that ends right before index i of a
 * well-formed UTF-16 string and moves i back to its start.
 *
 * Thin wrapper around ICU's U16_PREV_UNSAFE; no bounds or validity checks
 * are made here.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index just past the code point; updated by the macro.
 * @param[out] cp receives the decoded code point.
 */
template <class Range>
auto valid_u16_reverse_cp(const Range& str, size_t& i, char32_t& cp) -> void
{
	U16_PREV_UNSAFE(str, i, cp);
}
189
/**
 * @brief Moves i backward over one code point of a well-formed UTF-16 string
 * without decoding it.
 *
 * Thin wrapper around ICU's U16_BACK_1_UNSAFE; no bounds check is made here.
 *
 * @param str range of UTF-16 code units.
 * @param[in,out] i index to move back.
 */
template <class Range>
auto valid_u16_reverse_index(const Range& str, size_t& i) -> void
{
	U16_BACK_1_UNSAFE(str, i);
}
195
/**
 * @brief Encodes a valid code point as UTF-16 into buf at index i and
 * advances i.
 *
 * Thin wrapper around ICU's U16_APPEND_UNSAFE; the capacity of buf is not
 * checked here, so the caller must provide enough room.
 *
 * @param buf destination range of code units.
 * @param[in,out] i index where writing starts; updated by the macro.
 * @param cp code point to encode.
 */
template <class Range>
auto valid_u16_write_cp_and_advance(Range& buf, size_t& i, char32_t cp) -> void
{
	U16_APPEND_UNSAFE(buf, i, cp);
}
201
// higher-level functions
203
/**
 * @brief Half-open index range [begin_i, end_i) of one encoded code point
 * inside a UTF-8 string.
 *
 * When only begin_i is given, end_i defaults to the same value, which
 * describes an empty range.
 */
struct U8_CP_Pos {
	size_t begin_i{0};
	size_t end_i{begin_i};
};
208
210 char d[u8_max_cp_length];
211 int sz;
212
213 public:
214 explicit U8_Encoded_CP(std::string_view str, U8_CP_Pos pos)
215 : sz(pos.end_i - pos.begin_i)
216 {
217 auto i = sz;
218 auto j = pos.end_i;
219 auto max_len = 4;
220 do {
221 d[--i] = str[--j];
222 } while (i && --max_len);
223 }
224 U8_Encoded_CP(char32_t cp)
225 {
226 size_t z = 0;
227 valid_u8_write_cp_and_advance(d, z, cp);
228 sz = z;
229 }
230 auto size() const noexcept -> size_t { return sz; }
231 auto data() const noexcept -> const char* { return d; }
232 operator std::string_view() const noexcept
233 {
234 return std::string_view(data(), size());
235 }
236 auto copy_to(std::string& str, size_t j) const
237 {
238 auto i = sz;
239 j += sz;
240 auto max_len = 4;
241 do {
242 str[--j] = d[--i];
243 } while (i && --max_len);
244 }
245};
246
247auto inline u8_swap_adjacent_cp(std::string& str, size_t i1, size_t i2,
248 size_t i3) -> size_t
249{
250 auto cp1 = U8_Encoded_CP(str, {i1, i2});
251 auto cp2 = U8_Encoded_CP(str, {i2, i3});
252 auto new_i2 = i1 + std::size(cp2);
253 cp1.copy_to(str, new_i2);
254 cp2.copy_to(str, i1);
255 return new_i2;
256}
257
258auto inline u8_swap_cp(std::string& str, U8_CP_Pos pos1, U8_CP_Pos pos2)
259 -> std::pair<size_t, size_t>
260{
261 using std::size;
262 auto cp1 = U8_Encoded_CP(str, pos1);
263 auto cp2 = U8_Encoded_CP(str, pos2);
264 auto new_p1_end_i = pos1.begin_i + size(cp2);
265 auto new_p2_begin_i = pos2.end_i - size(cp1);
266 std::char_traits<char>::move(&str[new_p1_end_i], &str[pos1.end_i],
267 pos2.begin_i - pos1.end_i);
268 cp2.copy_to(str, pos1.begin_i);
269 cp1.copy_to(str, new_p2_begin_i);
270 return {new_p1_end_i, new_p2_begin_i};
271}
272
// below are the functions without out-parameters
274
// UTF-8, can be malformed, no out-parameters
276
/**
 * @brief Result of u8_next_cp(): the index after the decoded sequence and
 * the decoded value.
 */
struct Idx_And_Next_CP {
	size_t end_i; ///< index right after the decoded code point
	int32_t cp;   ///< decoded value; check with u8_is_cp_error()
};
281
/**
 * @brief Result of u8_prev_cp(): the index where the decoded sequence
 * begins and the decoded value.
 */
struct Idx_And_Prev_CP {
	size_t begin_i; ///< index where the decoded code point starts
	int32_t cp;     ///< decoded value; check with u8_is_cp_error()
};
286
/**
 * @brief Result of a checked UTF-8 write: the index after the written
 * sequence and an error flag.
 */
// NOTE(review): the struct's header line is absent from this listing; the
// name used here must match the return type of u8_write_cp() - confirm.
struct Write_CP_Idx_and_Error {
	size_t end_i; ///< index right after the written code units
	bool error;   ///< true when the code point could not be written
};
291
292template <class Range>
293[[nodiscard]] auto u8_next_cp(const Range& str, size_t i) -> Idx_And_Next_CP
294{
295 int32_t cp;
296 u8_advance_cp(str, i, cp);
297 return {i, cp};
298}
299
/**
 * @brief Returns the index that follows the code point starting at i in a
 * possibly malformed UTF-8 string.
 *
 * @param str range of UTF-8 code units.
 * @param i index of the code point to skip.
 * @return the index after it, as computed by u8_advance_index().
 */
template <class Range>
[[nodiscard]] auto u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next_i = i;
	u8_advance_index(str, next_i);
	return next_i;
}
306
307template <class Range>
308[[nodiscard]] auto u8_prev_cp(const Range& str, size_t i) -> Idx_And_Prev_CP
309{
310 int32_t cp;
311 u8_reverse_cp(str, i, cp);
312 return {i, cp};
313}
314
/**
 * @brief Returns the start index of the code point that ends right before i
 * in a possibly malformed UTF-8 string.
 *
 * @param str range of UTF-8 code units.
 * @param i index just past the code point.
 * @return the index where it starts, as computed by u8_reverse_index().
 */
template <class Range>
[[nodiscard]] auto u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev_i = i;
	u8_reverse_index(str, prev_i);
	return prev_i;
}
321
322template <class Range>
323[[nodiscard]] auto u8_write_cp(Range& buf, size_t i, int32_t cp)
325{
326 bool err;
327 u8_write_cp_and_advance(buf, i, cp, err);
328 return {i, err};
329}
330
// UTF-8, valid, no out-parameters
332
/**
 * @brief Result of decoding forward in well-formed UTF-8: the index after
 * the decoded sequence and the code point.
 */
// NOTE(review): the struct's header line is absent from this listing; the
// name used here must match the return type of valid_u8_next_cp() - confirm.
struct Idx_And_Next_CP_Valid {
	size_t end_i; ///< index right after the decoded code point
	char32_t cp;  ///< decoded code point
};
337
/**
 * @brief Result of decoding backward in well-formed UTF-8: the index where
 * the decoded sequence begins and the code point.
 */
// NOTE(review): the struct's header line is absent from this listing; the
// name used here must match the return type of valid_u8_prev_cp() - confirm.
struct Idx_And_Prev_CP_Valid {
	size_t begin_i; ///< index where the decoded code point starts
	char32_t cp;    ///< decoded code point
};
342
343template <class Range>
344[[nodiscard]] auto valid_u8_next_cp(const Range& str, size_t i)
346{
347 char32_t cp;
348 valid_u8_advance_cp(str, i, cp);
349 return {i, cp};
350}
351
/**
 * @brief Returns the index that follows the code point starting at i in a
 * well-formed UTF-8 string.
 *
 * @param str range of UTF-8 code units.
 * @param i index of the code point to skip.
 * @return the index after it, as computed by valid_u8_advance_index().
 */
template <class Range>
[[nodiscard]] auto valid_u8_next_index(const Range& str, size_t i) -> size_t
{
	auto next_i = i;
	valid_u8_advance_index(str, next_i);
	return next_i;
}
358
359template <class Range>
360[[nodiscard]] auto valid_u8_prev_cp(const Range& str, size_t i)
362{
363 char32_t cp;
364 valid_u8_reverse_cp(str, i, cp);
365 return {i, cp};
366}
367
/**
 * @brief Returns the start index of the code point that ends right before i
 * in a well-formed UTF-8 string.
 *
 * @param str range of UTF-8 code units.
 * @param i index just past the code point.
 * @return the index where it starts, as computed by valid_u8_reverse_index().
 */
template <class Range>
[[nodiscard]] auto valid_u8_prev_index(const Range& str, size_t i) -> size_t
{
	auto prev_i = i;
	valid_u8_reverse_index(str, prev_i);
	return prev_i;
}
374
/**
 * @brief Encodes a valid code point as UTF-8 into buf at index i without a
 * capacity check, returning the new index instead of using an out-parameter.
 *
 * @param buf destination range of code units; must have enough room.
 * @param i index where writing starts.
 * @param cp code point to encode; converted to char32_t when it is passed on
 * to valid_u8_write_cp_and_advance().
 * @return the index right after the written code units.
 */
template <class Range>
[[nodiscard]] auto valid_u8_write_cp(Range& buf, size_t i, int32_t cp) -> size_t
{
	auto end_i = i;
	valid_u8_write_cp_and_advance(buf, end_i, cp);
	return end_i;
}
381} // namespace v5
382} // namespace nuspell
383#endif // NUSPELL_UNICODE_HXX
Definition unicode.hxx:209
Library main namespace with version number attached.
Definition aff_data.cxx:42
Library main namespace.
Definition aff_data.cxx:33
Definition unicode.hxx:333
Definition unicode.hxx:277
Definition unicode.hxx:338
Definition unicode.hxx:282
Definition unicode.hxx:204
Definition unicode.hxx:287