El Octavio  1.0
This is a video game about adventures.
Utf.inl
Go to the documentation of this file.
1
2//
3// SFML - Simple and Fast Multimedia Library
4// Copyright (C) 2007-2018 Laurent Gomila (laurent@sfml-dev.org)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25
27// References:
28//
29// https://www.unicode.org/
30// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
31// https://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h
32// https://people.w3.org/rishida/scripts/uniview/conversion
33//
35
36
38template <typename In>
39In Utf<8>::decode(In begin, In end, Uint32& output, Uint32 replacement)
40{
41 // Some useful precomputed data
42 static const int trailing[256] =
43 {
44 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
52 };
53 static const Uint32 offsets[6] =
54 {
55 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080
56 };
57
58 // decode the character
59 int trailingBytes = trailing[static_cast<Uint8>(*begin)];
60 if (begin + trailingBytes < end)
61 {
62 output = 0;
63 switch (trailingBytes)
64 {
65 case 5: output += static_cast<Uint8>(*begin++); output <<= 6;
66 case 4: output += static_cast<Uint8>(*begin++); output <<= 6;
67 case 3: output += static_cast<Uint8>(*begin++); output <<= 6;
68 case 2: output += static_cast<Uint8>(*begin++); output <<= 6;
69 case 1: output += static_cast<Uint8>(*begin++); output <<= 6;
70 case 0: output += static_cast<Uint8>(*begin++);
71 }
72 output -= offsets[trailingBytes];
73 }
74 else
75 {
76 // Incomplete character
77 begin = end;
78 output = replacement;
79 }
80
81 return begin;
82}
83
84
86template <typename Out>
87Out Utf<8>::encode(Uint32 input, Out output, Uint8 replacement)
88{
89 // Some useful precomputed data
90 static const Uint8 firstBytes[7] =
91 {
92 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
93 };
94
95 // encode the character
96 if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF)))
97 {
98 // Invalid character
99 if (replacement)
100 *output++ = replacement;
101 }
102 else
103 {
104 // Valid character
105
106 // Get the number of bytes to write
107 std::size_t bytestoWrite = 1;
108 if (input < 0x80) bytestoWrite = 1;
109 else if (input < 0x800) bytestoWrite = 2;
110 else if (input < 0x10000) bytestoWrite = 3;
111 else if (input <= 0x0010FFFF) bytestoWrite = 4;
112
113 // Extract the bytes to write
114 Uint8 bytes[4];
115 switch (bytestoWrite)
116 {
117 case 4: bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
118 case 3: bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
119 case 2: bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
120 case 1: bytes[0] = static_cast<Uint8> (input | firstBytes[bytestoWrite]);
121 }
122
123 // Add them to the output
124 output = std::copy(bytes, bytes + bytestoWrite, output);
125 }
126
127 return output;
128}
129
130
132template <typename In>
133In Utf<8>::next(In begin, In end)
134{
135 Uint32 codepoint;
136 return decode(begin, end, codepoint);
137}
138
139
141template <typename In>
142std::size_t Utf<8>::count(In begin, In end)
143{
144 std::size_t length = 0;
145 while (begin < end)
146 {
147 begin = next(begin, end);
148 ++length;
149 }
150
151 return length;
152}
153
154
156template <typename In, typename Out>
157Out Utf<8>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
158{
159 while (begin < end)
160 {
161 Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale);
162 output = encode(codepoint, output);
163 }
164
165 return output;
166}
167
168
170template <typename In, typename Out>
171Out Utf<8>::fromWide(In begin, In end, Out output)
172{
173 while (begin < end)
174 {
175 Uint32 codepoint = Utf<32>::decodeWide(*begin++);
176 output = encode(codepoint, output);
177 }
178
179 return output;
180}
181
182
184template <typename In, typename Out>
185Out Utf<8>::fromLatin1(In begin, In end, Out output)
186{
187 // Latin-1 is directly compatible with Unicode encodings,
188 // and can thus be treated as (a sub-range of) UTF-32
189 while (begin < end)
190 output = encode(*begin++, output);
191
192 return output;
193}
194
195
197template <typename In, typename Out>
198Out Utf<8>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
199{
200 while (begin < end)
201 {
202 Uint32 codepoint;
203 begin = decode(begin, end, codepoint);
204 output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale);
205 }
206
207 return output;
208}
209
210
212template <typename In, typename Out>
213Out Utf<8>::toWide(In begin, In end, Out output, wchar_t replacement)
214{
215 while (begin < end)
216 {
217 Uint32 codepoint;
218 begin = decode(begin, end, codepoint);
219 output = Utf<32>::encodeWide(codepoint, output, replacement);
220 }
221
222 return output;
223}
224
225
227template <typename In, typename Out>
228Out Utf<8>::toLatin1(In begin, In end, Out output, char replacement)
229{
230 // Latin-1 is directly compatible with Unicode encodings,
231 // and can thus be treated as (a sub-range of) UTF-32
232 while (begin < end)
233 {
234 Uint32 codepoint;
235 begin = decode(begin, end, codepoint);
236 *output++ = codepoint < 256 ? static_cast<char>(codepoint) : replacement;
237 }
238
239 return output;
240}
241
242
244template <typename In, typename Out>
245Out Utf<8>::toUtf8(In begin, In end, Out output)
246{
247 return std::copy(begin, end, output);
248}
249
250
252template <typename In, typename Out>
253Out Utf<8>::toUtf16(In begin, In end, Out output)
254{
255 while (begin < end)
256 {
257 Uint32 codepoint;
258 begin = decode(begin, end, codepoint);
259 output = Utf<16>::encode(codepoint, output);
260 }
261
262 return output;
263}
264
265
267template <typename In, typename Out>
268Out Utf<8>::toUtf32(In begin, In end, Out output)
269{
270 while (begin < end)
271 {
272 Uint32 codepoint;
273 begin = decode(begin, end, codepoint);
274 *output++ = codepoint;
275 }
276
277 return output;
278}
279
280
282template <typename In>
283In Utf<16>::decode(In begin, In end, Uint32& output, Uint32 replacement)
284{
285 Uint16 first = *begin++;
286
287 // If it's a surrogate pair, first convert to a single UTF-32 character
288 if ((first >= 0xD800) && (first <= 0xDBFF))
289 {
290 if (begin < end)
291 {
292 Uint32 second = *begin++;
293 if ((second >= 0xDC00) && (second <= 0xDFFF))
294 {
295 // The second element is valid: convert the two elements to a UTF-32 character
296 output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000);
297 }
298 else
299 {
300 // Invalid character
301 output = replacement;
302 }
303 }
304 else
305 {
306 // Invalid character
307 begin = end;
308 output = replacement;
309 }
310 }
311 else
312 {
313 // We can make a direct copy
314 output = first;
315 }
316
317 return begin;
318}
319
320
322template <typename Out>
323Out Utf<16>::encode(Uint32 input, Out output, Uint16 replacement)
324{
325 if (input <= 0xFFFF)
326 {
327 // The character can be copied directly, we just need to check if it's in the valid range
328 if ((input >= 0xD800) && (input <= 0xDFFF))
329 {
330 // Invalid character (this range is reserved)
331 if (replacement)
332 *output++ = replacement;
333 }
334 else
335 {
336 // Valid character directly convertible to a single UTF-16 character
337 *output++ = static_cast<Uint16>(input);
338 }
339 }
340 else if (input > 0x0010FFFF)
341 {
342 // Invalid character (greater than the maximum Unicode value)
343 if (replacement)
344 *output++ = replacement;
345 }
346 else
347 {
348 // The input character will be converted to two UTF-16 elements
349 input -= 0x0010000;
350 *output++ = static_cast<Uint16>((input >> 10) + 0xD800);
351 *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00);
352 }
353
354 return output;
355}
356
357
359template <typename In>
360In Utf<16>::next(In begin, In end)
361{
362 Uint32 codepoint;
363 return decode(begin, end, codepoint);
364}
365
366
368template <typename In>
369std::size_t Utf<16>::count(In begin, In end)
370{
371 std::size_t length = 0;
372 while (begin < end)
373 {
374 begin = next(begin, end);
375 ++length;
376 }
377
378 return length;
379}
380
381
383template <typename In, typename Out>
384Out Utf<16>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
385{
386 while (begin < end)
387 {
388 Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale);
389 output = encode(codepoint, output);
390 }
391
392 return output;
393}
394
395
397template <typename In, typename Out>
398Out Utf<16>::fromWide(In begin, In end, Out output)
399{
400 while (begin < end)
401 {
402 Uint32 codepoint = Utf<32>::decodeWide(*begin++);
403 output = encode(codepoint, output);
404 }
405
406 return output;
407}
408
409
411template <typename In, typename Out>
412Out Utf<16>::fromLatin1(In begin, In end, Out output)
413{
414 // Latin-1 is directly compatible with Unicode encodings,
415 // and can thus be treated as (a sub-range of) UTF-32
416 return std::copy(begin, end, output);
417}
418
419
421template <typename In, typename Out>
422Out Utf<16>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
423{
424 while (begin < end)
425 {
426 Uint32 codepoint;
427 begin = decode(begin, end, codepoint);
428 output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale);
429 }
430
431 return output;
432}
433
434
436template <typename In, typename Out>
437Out Utf<16>::toWide(In begin, In end, Out output, wchar_t replacement)
438{
439 while (begin < end)
440 {
441 Uint32 codepoint;
442 begin = decode(begin, end, codepoint);
443 output = Utf<32>::encodeWide(codepoint, output, replacement);
444 }
445
446 return output;
447}
448
449
451template <typename In, typename Out>
452Out Utf<16>::toLatin1(In begin, In end, Out output, char replacement)
453{
454 // Latin-1 is directly compatible with Unicode encodings,
455 // and can thus be treated as (a sub-range of) UTF-32
456 while (begin < end)
457 {
458 *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
459 begin++;
460 }
461
462 return output;
463}
464
465
467template <typename In, typename Out>
468Out Utf<16>::toUtf8(In begin, In end, Out output)
469{
470 while (begin < end)
471 {
472 Uint32 codepoint;
473 begin = decode(begin, end, codepoint);
474 output = Utf<8>::encode(codepoint, output);
475 }
476
477 return output;
478}
479
480
482template <typename In, typename Out>
483Out Utf<16>::toUtf16(In begin, In end, Out output)
484{
485 return std::copy(begin, end, output);
486}
487
488
490template <typename In, typename Out>
491Out Utf<16>::toUtf32(In begin, In end, Out output)
492{
493 while (begin < end)
494 {
495 Uint32 codepoint;
496 begin = decode(begin, end, codepoint);
497 *output++ = codepoint;
498 }
499
500 return output;
501}
502
503
505template <typename In>
506In Utf<32>::decode(In begin, In /*end*/, Uint32& output, Uint32 /*replacement*/)
507{
508 output = *begin++;
509 return begin;
510}
511
512
514template <typename Out>
515Out Utf<32>::encode(Uint32 input, Out output, Uint32 /*replacement*/)
516{
517 *output++ = input;
518 return output;
519}
520
521
523template <typename In>
524In Utf<32>::next(In begin, In /*end*/)
525{
526 return ++begin;
527}
528
529
531template <typename In>
532std::size_t Utf<32>::count(In begin, In end)
533{
534 return begin - end;
535}
536
537
539template <typename In, typename Out>
540Out Utf<32>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
541{
542 while (begin < end)
543 *output++ = decodeAnsi(*begin++, locale);
544
545 return output;
546}
547
548
550template <typename In, typename Out>
551Out Utf<32>::fromWide(In begin, In end, Out output)
552{
553 while (begin < end)
554 *output++ = decodeWide(*begin++);
555
556 return output;
557}
558
559
561template <typename In, typename Out>
562Out Utf<32>::fromLatin1(In begin, In end, Out output)
563{
564 // Latin-1 is directly compatible with Unicode encodings,
565 // and can thus be treated as (a sub-range of) UTF-32
566 return std::copy(begin, end, output);
567}
568
569
571template <typename In, typename Out>
572Out Utf<32>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
573{
574 while (begin < end)
575 output = encodeAnsi(*begin++, output, replacement, locale);
576
577 return output;
578}
579
580
582template <typename In, typename Out>
583Out Utf<32>::toWide(In begin, In end, Out output, wchar_t replacement)
584{
585 while (begin < end)
586 output = encodeWide(*begin++, output, replacement);
587
588 return output;
589}
590
591
593template <typename In, typename Out>
594Out Utf<32>::toLatin1(In begin, In end, Out output, char replacement)
595{
596 // Latin-1 is directly compatible with Unicode encodings,
597 // and can thus be treated as (a sub-range of) UTF-32
598 while (begin < end)
599 {
600 *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
601 begin++;
602 }
603
604 return output;
605}
606
607
609template <typename In, typename Out>
610Out Utf<32>::toUtf8(In begin, In end, Out output)
611{
612 while (begin < end)
613 output = Utf<8>::encode(*begin++, output);
614
615 return output;
616}
617
619template <typename In, typename Out>
620Out Utf<32>::toUtf16(In begin, In end, Out output)
621{
622 while (begin < end)
623 output = Utf<16>::encode(*begin++, output);
624
625 return output;
626}
627
628
630template <typename In, typename Out>
631Out Utf<32>::toUtf32(In begin, In end, Out output)
632{
633 return std::copy(begin, end, output);
634}
635
636
638template <typename In>
639Uint32 Utf<32>::decodeAnsi(In input, const std::locale& locale)
640{
641 // On Windows, GCC's standard library (glibc++) has almost
642 // no support for Unicode stuff. As a consequence, in this
643 // context we can only use the default locale and ignore
644 // the one passed as parameter.
645
646 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \
647 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \
648 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */
649
650 (void)locale; // to avoid warnings
651
652 wchar_t character = 0;
653 mbtowc(&character, &input, 1);
654 return static_cast<Uint32>(character);
655
656 #else
657
658 // Get the facet of the locale which deals with character conversion
659 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
660
661 // Use the facet to convert each character of the input string
662 return static_cast<Uint32>(facet.widen(input));
663
664 #endif
665}
666
667
669template <typename In>
670Uint32 Utf<32>::decodeWide(In input)
671{
672 // The encoding of wide characters is not well defined and is left to the system;
673 // however we can safely assume that it is UCS-2 on Windows and
674 // UCS-4 on Unix systems.
675 // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4,
676 // and UCS-4 *is* UTF-32).
677
678 return input;
679}
680
681
683template <typename Out>
684Out Utf<32>::encodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale)
685{
686 // On Windows, gcc's standard library (glibc++) has almost
687 // no support for Unicode stuff. As a consequence, in this
688 // context we can only use the default locale and ignore
689 // the one passed as parameter.
690
691 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \
692 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \
693 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */
694
695 (void)locale; // to avoid warnings
696
697 char character = 0;
698 if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0)
699 *output++ = character;
700 else if (replacement)
701 *output++ = replacement;
702
703 return output;
704
705 #else
706
707 // Get the facet of the locale which deals with character conversion
708 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
709
710 // Use the facet to convert each character of the input string
711 *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement);
712
713 return output;
714
715 #endif
716}
717
718
720template <typename Out>
721Out Utf<32>::encodeWide(Uint32 codepoint, Out output, wchar_t replacement)
722{
723 // The encoding of wide characters is not well defined and is left to the system;
724 // however we can safely assume that it is UCS-2 on Windows and
725 // UCS-4 on Unix systems.
726 // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4).
727 // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32).
728
729 switch (sizeof(wchar_t))
730 {
731 case 4:
732 {
733 *output++ = static_cast<wchar_t>(codepoint);
734 break;
735 }
736
737 default:
738 {
739 if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF)))
740 {
741 *output++ = static_cast<wchar_t>(codepoint);
742 }
743 else if (replacement)
744 {
745 *output++ = replacement;
746 }
747 break;
748 }
749 }
750
751 return output;
752}
unsigned short Uint16
Definition: Config.hpp:218
unsigned char Uint8
Definition: Config.hpp:214
unsigned int Uint32
Definition: Config.hpp:222
unsigned int character