#include "unicode/utf.h"
Go to the source code of this file.
Defines | |
| #define | UTF_SIZE 16 |
| Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode. | |
| #define | UTF_SAFE |
| The default choice for general Unicode string macros is to use the ..._SAFE macro implementations with strict=FALSE. | |
| #define | UTF8_ERROR_VALUE_1 0x15 |
| #define | UTF8_ERROR_VALUE_2 0x9f |
| See documentation on UTF8_ERROR_VALUE_1 for details. | |
| #define | UTF_ERROR_VALUE 0xffff |
| Error value for all UTFs. | |
| #define | UTF_IS_ERROR(c) (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) |
| Is a given 32-bit code an error value as returned by one of the macros for any UTF? | |
| #define | UTF_IS_VALID(c) |
| This is a combined macro: Is c a valid Unicode value _and_ not an error code? | |
| #define | UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) |
| Is this code unit or code point a surrogate (U+d800..U+dfff)? | |
| #define | UTF_IS_UNICODE_NONCHAR(c) |
| Is a given 32-bit code point a Unicode noncharacter? | |
| #define | UTF_IS_UNICODE_CHAR(c) |
| Is a given 32-bit value a Unicode code point value (0..U+10ffff) that can be assigned a character? | |
| #define | UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) |
| Count the trail bytes for a UTF-8 lead byte. | |
| #define | UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
| Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. | |
| #define | UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0) |
| Is this code point a single code unit (byte)? | |
| #define | UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e) |
| Is this code unit the lead code unit (byte) of a code point? | |
| #define | UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80) |
| Is this code unit a trailing code unit (byte) of a code point? | |
| #define | UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f) |
| Does this scalar Unicode value need multiple code units for storage? | |
| #define | UTF8_CHAR_LENGTH(c) |
| Given the code point c, how many bytes (code units) are taken by this code point in UTF-8? | |
| #define | UTF8_MAX_CHAR_LENGTH 4 |
| The maximum number of bytes per code point. | |
| #define | UTF8_ARRAY_SIZE(size) ((5*(size))/2) |
| Average number of code units compared to UTF-16. | |
| #define | UTF8_GET_CHAR_UNSAFE(s, i, c) |
| #define | UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) |
| #define | UTF8_NEXT_CHAR_UNSAFE(s, i, c) |
| #define | UTF8_APPEND_CHAR_UNSAFE(s, i, c) |
| #define | UTF8_FWD_1_UNSAFE(s, i) |
| #define | UTF8_FWD_N_UNSAFE(s, i, n) |
| #define | UTF8_SET_CHAR_START_UNSAFE(s, i) |
| #define | UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) |
| #define | UTF8_APPEND_CHAR_SAFE(s, i, length, c) |
| #define | UTF8_FWD_1_SAFE(s, i, length) U8_FWD_1(s, i, length) |
| #define | UTF8_FWD_N_SAFE(s, i, length, n) U8_FWD_N(s, i, length, n) |
| #define | UTF8_SET_CHAR_START_SAFE(s, start, i) U8_SET_CP_START(s, start, i) |
| #define | UTF8_PREV_CHAR_UNSAFE(s, i, c) |
| #define | UTF8_BACK_1_UNSAFE(s, i) |
| #define | UTF8_BACK_N_UNSAFE(s, i, n) |
| #define | UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) |
| #define | UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) |
| #define | UTF8_BACK_1_SAFE(s, start, i) U8_BACK_1(s, start, i) |
| #define | UTF8_BACK_N_SAFE(s, start, i, n) U8_BACK_N(s, start, i, n) |
| #define | UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) U8_SET_CP_LIMIT(s, start, i, length) |
| #define | UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800) |
| Is uchar a first/lead surrogate? | |
| #define | UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00) |
| Is uchar a second/trail surrogate? | |
| #define | UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0) |
| Assuming c is a surrogate, is it a first/lead surrogate? | |
| #define | UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) |
| Helper constant for UTF16_GET_PAIR_VALUE. | |
| #define | UTF16_GET_PAIR_VALUE(first, second) (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) |
| Get the UTF-32 value from the surrogate code units. | |
| #define | UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) |
| #define | UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) |
| #define | UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary) |
| #define | UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary) |
| #define | UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar) |
| #define | UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar) |
| #define | UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar) |
| #define | UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff) |
| Does this scalar Unicode value need multiple code units for storage? | |
| #define | UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) |
| #define | UTF16_MAX_CHAR_LENGTH 2 |
| #define | UTF16_ARRAY_SIZE(size) (size) |
| Average number of code units compared to UTF-16. | |
| #define | UTF16_GET_CHAR_UNSAFE(s, i, c) |
| Get a single code point from an offset that points to any of the code units that belong to that code point. | |
| #define | UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) |
| #define | UTF16_NEXT_CHAR_UNSAFE(s, i, c) |
| #define | UTF16_APPEND_CHAR_UNSAFE(s, i, c) |
| #define | UTF16_FWD_1_UNSAFE(s, i) |
| #define | UTF16_FWD_N_UNSAFE(s, i, n) |
| #define | UTF16_SET_CHAR_START_UNSAFE(s, i) |
| #define | UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) |
| #define | UTF16_APPEND_CHAR_SAFE(s, i, length, c) |
| #define | UTF16_FWD_1_SAFE(s, i, length) U16_FWD_1(s, i, length) |
| #define | UTF16_FWD_N_SAFE(s, i, length, n) U16_FWD_N(s, i, length, n) |
| #define | UTF16_SET_CHAR_START_SAFE(s, start, i) U16_SET_CP_START(s, start, i) |
| #define | UTF16_PREV_CHAR_UNSAFE(s, i, c) |
| #define | UTF16_BACK_1_UNSAFE(s, i) |
| #define | UTF16_BACK_N_UNSAFE(s, i, n) |
| #define | UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) |
| #define | UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) |
| #define | UTF16_BACK_1_SAFE(s, start, i) U16_BACK_1(s, start, i) |
| #define | UTF16_BACK_N_SAFE(s, start, i, n) U16_BACK_N(s, start, i, n) |
| #define | UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length) |
| #define | UTF32_IS_SAFE(c, strict) |
| #define | UTF32_IS_SINGLE(uchar) 1 |
| #define | UTF32_IS_LEAD(uchar) 0 |
| #define | UTF32_IS_TRAIL(uchar) 0 |
| #define | UTF32_NEED_MULTIPLE_UCHAR(c) 0 |
| #define | UTF32_CHAR_LENGTH(c) 1 |
| #define | UTF32_MAX_CHAR_LENGTH 1 |
| #define | UTF32_ARRAY_SIZE(size) (size) |
| #define | UTF32_GET_CHAR_UNSAFE(s, i, c) |
| #define | UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) |
| #define | UTF32_NEXT_CHAR_UNSAFE(s, i, c) |
| #define | UTF32_APPEND_CHAR_UNSAFE(s, i, c) |
| #define | UTF32_FWD_1_UNSAFE(s, i) |
| #define | UTF32_FWD_N_UNSAFE(s, i, n) |
| #define | UTF32_SET_CHAR_START_UNSAFE(s, i) |
| #define | UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) |
| #define | UTF32_APPEND_CHAR_SAFE(s, i, length, c) |
| #define | UTF32_FWD_1_SAFE(s, i, length) |
| #define | UTF32_FWD_N_SAFE(s, i, length, n) |
| #define | UTF32_SET_CHAR_START_SAFE(s, start, i) |
| #define | UTF32_PREV_CHAR_UNSAFE(s, i, c) |
| #define | UTF32_BACK_1_UNSAFE(s, i) |
| #define | UTF32_BACK_N_UNSAFE(s, i, n) |
| #define | UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) |
| #define | UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) |
| #define | UTF32_BACK_1_SAFE(s, start, i) |
| #define | UTF32_BACK_N_SAFE(s, start, i, n) |
| #define | UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) |
| #define | UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) |
| Estimate the number of code units for a string based on the number of UTF-16 code units. | |
| #define | UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) |
| #define | UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) |
| #define | UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) |
| #define | UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) |
| #define | UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c) |
| #define | UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) |
| #define | UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i) |
| #define | UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length) |
| #define | UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n) |
| #define | UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) |
| #define | UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) |
| #define | UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) |
| #define | UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) |
| #define | UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) |
| #define | UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) |
| #define | UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) |
| #define | UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) |
| #define | UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) |
| #define | UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) |
| #define | UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) |
| #define | UTF_IS_SINGLE(uchar) U16_IS_SINGLE(uchar) |
| Does this code unit alone encode a code point (BMP, not a surrogate)? Same as UTF16_IS_SINGLE. | |
| #define | UTF_IS_LEAD(uchar) U16_IS_LEAD(uchar) |
| Is this code unit the first one of several (a lead surrogate)? Same as UTF16_IS_LEAD. | |
| #define | UTF_IS_TRAIL(uchar) U16_IS_TRAIL(uchar) |
| Is this code unit one of several but not the first one (a trail surrogate)? Same as UTF16_IS_TRAIL. | |
| #define | UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c) |
| Does this code point require multiple code units (is it a supplementary code point)? Same as UTF16_NEED_MULTIPLE_UCHAR. | |
| #define | UTF_CHAR_LENGTH(c) U16_LENGTH(c) |
| How many code units are used to encode this code point (1 or 2)? Same as UTF16_CHAR_LENGTH. | |
| #define | UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH |
| How many code units are used at most for any Unicode code point (2)? Same as UTF16_MAX_CHAR_LENGTH. | |
| #define | UTF_GET_CHAR(s, start, i, length, c) U16_GET(s, start, i, length, c) |
| Set c to the code point that contains the code unit i. | |
| #define | UTF_NEXT_CHAR(s, i, length, c) U16_NEXT(s, i, length, c) |
| Set c to the code point that starts at code unit i and advance i to beyond the code units of this code point (post-increment). | |
| #define | UTF_APPEND_CHAR(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) |
| Append the code units of code point c to the string at index i and advance i to beyond the new code units (post-increment). | |
| #define | UTF_FWD_1(s, i, length) U16_FWD_1(s, i, length) |
| Advance i to beyond the code units of the code point that begins at i. | |
| #define | UTF_FWD_N(s, i, length, n) U16_FWD_N(s, i, length, n) |
| Advance i to beyond the code units of the n code points where the first one begins at i. | |
| #define | UTF_SET_CHAR_START(s, start, i) U16_SET_CP_START(s, start, i) |
| Take the random-access index i and adjust it so that it points to the beginning of a code point. | |
| #define | UTF_PREV_CHAR(s, start, i, c) U16_PREV(s, start, i, c) |
| Set c to the code point that has code units before i and move i backward (towards the beginning of the string) to the first code unit of this code point (pre-decrement). | |
| #define | UTF_BACK_1(s, start, i) U16_BACK_1(s, start, i) |
| Move i backward (towards the beginning of the string) to the first code unit of the code point that has code units before i. | |
| #define | UTF_BACK_N(s, start, i, n) U16_BACK_N(s, start, i, n) |
| Move i backward (towards the beginning of the string) to the first code unit of the n code points that have code units before i. | |
| #define | UTF_SET_CHAR_LIMIT(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length) |
| Take the random-access index i and adjust it so that it points beyond a code point. | |
Definition in file utf_old.h.
| #define UTF16_APPEND_CHAR_SAFE | ( | s, | |||
| i, | |||||
| length, | |||||
| c | ) |
Value:
{ \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else if((uint32_t)(c)<=0x10ffff) { \
if((i)+1<(length)) { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} else /* not enough space */ { \
(s)[(i)++]=UTF_ERROR_VALUE; \
} \
} else /* c>0x10ffff, write error value */ { \
(s)[(i)++]=UTF_ERROR_VALUE; \
} \
}
| #define UTF16_APPEND_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
| #define UTF16_ARRAY_SIZE | ( | size | ) | (size) |
| #define UTF16_BACK_1_SAFE | ( | s, | |||
| start, | |||||
| i | ) | U16_BACK_1(s, start, i) |
| #define UTF16_BACK_1_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
--(i); \
} \
}
| #define UTF16_BACK_N_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| n | ) | U16_BACK_N(s, start, i, n) |
| #define UTF16_BACK_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) |
Value:
{ \
int32_t __N=(n); \
while(__N>0) { \
UTF16_BACK_1_UNSAFE(s, i); \
--__N; \
} \
}
| #define UTF16_CHAR_LENGTH | ( | c | ) | ((uint32_t)(c)<=0xffff ? 1 : 2) |
| #define UTF16_FWD_1_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
++(i); \
} \
}
| #define UTF16_FWD_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) |
Value:
{ \
int32_t __N=(n); \
while(__N>0) { \
UTF16_FWD_1_UNSAFE(s, i); \
--__N; \
} \
}
| #define UTF16_GET_CHAR_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| length, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[i]; \
if(UTF_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(UTF_IS_SURROGATE_FIRST(c)) { \
if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
(c)=UTF16_GET_PAIR_VALUE((c), __c2); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched first surrogate */ \
(c)=UTF_ERROR_VALUE; \
} \
} else { \
if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
(c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched second surrogate */ \
(c)=UTF_ERROR_VALUE; \
} \
} \
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
| #define UTF16_GET_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
Value:
{ \
(c)=(s)[i]; \
if(UTF_IS_SURROGATE(c)) { \
if(UTF_IS_SURROGATE_FIRST(c)) { \
(c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
} else { \
(c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
} \
} \
}
Assume 0<=i<length.
This could be used for iteration together with UTF16_CHAR_LENGTH() and UTF_IS_ERROR(), but the use of UTF16_NEXT_CHAR[_UNSAFE]() and UTF16_PREV_CHAR[_UNSAFE]() is more efficient for that.
| #define UTF16_GET_PAIR_VALUE | ( | first, | |||
| second | ) | (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) |
Get the UTF-32 value from the surrogate code units.
| #define UTF16_IS_LEAD | ( | uchar | ) | UTF_IS_FIRST_SURROGATE(uchar) |
| #define UTF16_IS_SINGLE | ( | uchar | ) | !UTF_IS_SURROGATE(uchar) |
| #define UTF16_IS_TRAIL | ( | uchar | ) | UTF_IS_SECOND_SURROGATE(uchar) |
| #define UTF16_LEAD | ( | supplementary | ) | UTF_FIRST_SURROGATE(supplementary) |
| #define UTF16_MAX_CHAR_LENGTH 2 |
| #define UTF16_NEED_MULTIPLE_UCHAR | ( | c | ) | ((uint32_t)(c)>0xffff) |
Does this scalar Unicode value need multiple code units for storage?
| #define UTF16_NEXT_CHAR_SAFE | ( | s, | |||
| i, | |||||
| length, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[(i)++]; \
if(UTF_IS_FIRST_SURROGATE(c)) { \
uint16_t __c2; \
if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
++(i); \
(c)=UTF16_GET_PAIR_VALUE((c), __c2); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched first surrogate */ \
(c)=UTF_ERROR_VALUE; \
} \
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
/* unmatched second surrogate or other non-character */ \
(c)=UTF_ERROR_VALUE; \
} \
}
| #define UTF16_NEXT_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
Value:
{ \
(c)=(s)[(i)++]; \
if(UTF_IS_FIRST_SURROGATE(c)) { \
(c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
} \
}
| #define UTF16_PREV_CHAR_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[--(i)]; \
if(UTF_IS_SECOND_SURROGATE(c)) { \
uint16_t __c2; \
if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
--(i); \
(c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched second surrogate */ \
(c)=UTF_ERROR_VALUE; \
} \
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
/* unmatched first surrogate or other non-character */ \
(c)=UTF_ERROR_VALUE; \
} \
}
| #define UTF16_PREV_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
Value:
{ \
(c)=(s)[--(i)]; \
if(UTF_IS_SECOND_SURROGATE(c)) { \
(c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
} \
}
| #define UTF16_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
++(i); \
} \
}
| #define UTF16_SET_CHAR_START_SAFE | ( | s, | |||
| start, | |||||
| i | ) | U16_SET_CP_START(s, start, i) |
| #define UTF16_SET_CHAR_START_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
--(i); \
} \
}
| #define UTF16_TRAIL | ( | supplementary | ) | UTF_SECOND_SURROGATE(supplementary) |
| #define UTF32_APPEND_CHAR_SAFE | ( | s, | |||
| i, | |||||
| length, | |||||
| c | ) |
Value:
{ \
if((uint32_t)(c)<=0x10ffff) { \
(s)[(i)++]=(c); \
} else /* c>0x10ffff, write 0xfffd */ { \
(s)[(i)++]=0xfffd; \
} \
}
| #define UTF32_APPEND_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
| #define UTF32_ARRAY_SIZE | ( | size | ) | (size) |
| #define UTF32_BACK_1_SAFE | ( | s, | |||
| start, | |||||
| i | ) |
| #define UTF32_BACK_1_UNSAFE | ( | s, | |||
| i | ) |
| #define UTF32_BACK_N_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| n | ) |
Value:
{ \
(i)-=(n); \
if((i)<(start)) { \
(i)=(start); \
} \
}
| #define UTF32_BACK_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) |
| #define UTF32_CHAR_LENGTH | ( | c | ) | 1 |
| #define UTF32_FWD_1_SAFE | ( | s, | |||
| i, | |||||
| length | ) |
| #define UTF32_FWD_1_UNSAFE | ( | s, | |||
| i | ) |
| #define UTF32_FWD_N_SAFE | ( | s, | |||
| i, | |||||
| length, | |||||
| n | ) |
| #define UTF32_FWD_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) |
| #define UTF32_GET_CHAR_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| length, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[i]; \
if(!UTF32_IS_SAFE(c, strict)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
| #define UTF32_GET_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
| #define UTF32_IS_LEAD | ( | uchar | ) | 0 |
| #define UTF32_IS_SAFE | ( | c, | |||
| strict | ) |
Value:
(!(strict) ? \
(uint32_t)(c)<=0x10ffff : \
UTF_IS_UNICODE_CHAR(c))
| #define UTF32_IS_SINGLE | ( | uchar | ) | 1 |
| #define UTF32_IS_TRAIL | ( | uchar | ) | 0 |
| #define UTF32_MAX_CHAR_LENGTH 1 |
| #define UTF32_NEED_MULTIPLE_UCHAR | ( | c | ) | 0 |
| #define UTF32_NEXT_CHAR_SAFE | ( | s, | |||
| i, | |||||
| length, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[(i)++]; \
if(!UTF32_IS_SAFE(c, strict)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
| #define UTF32_NEXT_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
| #define UTF32_PREV_CHAR_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[--(i)]; \
if(!UTF32_IS_SAFE(c, strict)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
| #define UTF32_PREV_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
| #define UTF32_SET_CHAR_LIMIT_SAFE | ( | s, | |||
| i, | |||||
| length | ) |
| #define UTF32_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
| i | ) |
| #define UTF32_SET_CHAR_START_SAFE | ( | s, | |||
| start, | |||||
| i | ) |
| #define UTF32_SET_CHAR_START_UNSAFE | ( | s, | |||
| i | ) |
| #define UTF8_APPEND_CHAR_SAFE | ( | s, | |||
| i, | |||||
| length, | |||||
| c | ) |
| #define UTF8_APPEND_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
Value:
{ \
if((uint32_t)(c)<=0x7f) { \
(s)[(i)++]=(uint8_t)(c); \
} else { \
if((uint32_t)(c)<=0x7ff) { \
(s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
} else { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
} else { \
(s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
(s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
} \
(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
} \
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
} \
}
| #define UTF8_ARRAY_SIZE | ( | size | ) | ((5*(size))/2) |
| #define UTF8_BACK_1_SAFE | ( | s, | |||
| start, | |||||
| i | ) | U8_BACK_1(s, start, i) |
| #define UTF8_BACK_1_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
while(UTF8_IS_TRAIL((s)[--(i)])) {} \
}
| #define UTF8_BACK_N_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| n | ) | U8_BACK_N(s, start, i, n) |
| #define UTF8_BACK_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) |
Value:
{ \
int32_t __N=(n); \
while(__N>0) { \
UTF8_BACK_1_UNSAFE(s, i); \
--__N; \
} \
}
| #define UTF8_CHAR_LENGTH | ( | c | ) |
Value:
((uint32_t)(c)<=0x7f ? 1 : \
    ((uint32_t)(c)<=0x7ff ? 2 : \
        ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \
    ) \
)
ICU does not deal with code points >0x10ffff unless necessary for advancing in the byte stream.
These length macros take into account that for values >0x10ffff the UTF8_APPEND_CHAR_SAFE macros would write the error code point 0xffff with 3 bytes. Code point comparisons need to be in uint32_t because UChar32 may be a signed type, and negative values must be recognized.
| #define UTF8_COUNT_TRAIL_BYTES | ( | leadByte | ) | (utf8_countTrailBytes[(uint8_t)leadByte]) |
Count the trail bytes for a UTF-8 lead byte.
| #define UTF8_ERROR_VALUE_1 0x15 |
UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8, which need 1 or 2 bytes in UTF-8:
U+0015 = NAK = Negative Acknowledge, C0 control character
U+009f = highest C1 control character
These are used by UTF8_..._SAFE macros so that they can return an error value that needs the same number of code units (bytes) as were seen by a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().
| #define UTF8_ERROR_VALUE_2 0x9f |
| #define UTF8_FWD_1_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
(i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
}
| #define UTF8_FWD_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) |
Value:
{ \
int32_t __N=(n); \
while(__N>0) { \
UTF8_FWD_1_UNSAFE(s, i); \
--__N; \
} \
}
| #define UTF8_GET_CHAR_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| length, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
int32_t _utf8_get_char_safe_index=(int32_t)(i); \
UTF8_SET_CHAR_START_SAFE(s, start, _utf8_get_char_safe_index); \
UTF8_NEXT_CHAR_SAFE(s, _utf8_get_char_safe_index, length, c, strict); \
}
| #define UTF8_GET_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
Value:
{ \
int32_t _utf8_get_char_unsafe_index=(int32_t)(i); \
UTF8_SET_CHAR_START_UNSAFE(s, _utf8_get_char_unsafe_index); \
UTF8_NEXT_CHAR_UNSAFE(s, _utf8_get_char_unsafe_index, c); \
}
| #define UTF8_IS_LEAD | ( | uchar | ) | ((uint8_t)((uchar)-0xc0)<0x3e) |
Is this code unit the lead code unit (byte) of a code point?
| #define UTF8_IS_SINGLE | ( | uchar | ) | (((uchar)&0x80)==0) |
Is this code point a single code unit (byte)?
| #define UTF8_IS_TRAIL | ( | uchar | ) | (((uchar)&0xc0)==0x80) |
Is this code unit a trailing code unit (byte) of a code point?
| #define UTF8_MASK_LEAD_BYTE | ( | leadByte, | |||
| countTrailBytes | ) | ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
| #define UTF8_MAX_CHAR_LENGTH 4 |
The maximum number of bytes per code point.
| #define UTF8_NEED_MULTIPLE_UCHAR | ( | c | ) | ((uint32_t)(c)>0x7f) |
Does this scalar Unicode value need multiple code units for storage?
| #define UTF8_NEXT_CHAR_SAFE | ( | s, | |||
| i, | |||||
| length, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[(i)++]; \
if((c)>=0x80) { \
if(UTF8_IS_LEAD(c)) { \
(c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict); \
} else { \
(c)=UTF8_ERROR_VALUE_1; \
} \
} \
}
| #define UTF8_NEXT_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
Value:
{ \
(c)=(s)[(i)++]; \
if((uint8_t)((c)-0xc0)<0x35) { \
uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
UTF8_MASK_LEAD_BYTE(c, __count); \
switch(__count) { \
/* each following branch falls through to the next one */ \
case 3: \
(c)=((c)<<6)|((s)[(i)++]&0x3f); \
case 2: \
(c)=((c)<<6)|((s)[(i)++]&0x3f); \
case 1: \
(c)=((c)<<6)|((s)[(i)++]&0x3f); \
/* no other branches to optimize switch() */ \
break; \
} \
} \
}
| #define UTF8_PREV_CHAR_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| c, | |||||
| strict | ) |
Value:
{ \
(c)=(s)[--(i)]; \
if((c)>=0x80) { \
if((c)<=0xbf) { \
(c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
} else { \
(c)=UTF8_ERROR_VALUE_1; \
} \
} \
}
| #define UTF8_PREV_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) |
Value:
{ \
(c)=(s)[--(i)]; \
if(UTF8_IS_TRAIL(c)) { \
uint8_t __b, __count=1, __shift=6; \
\
/* c is a trail byte */ \
(c)&=0x3f; \
for(;;) { \
__b=(s)[--(i)]; \
if(__b>=0xc0) { \
UTF8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
break; \
} else { \
(c)|=(UChar32)(__b&0x3f)<<__shift; \
++__count; \
__shift+=6; \
} \
} \
} \
}
| #define UTF8_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
UTF8_BACK_1_UNSAFE(s, i); \
UTF8_FWD_1_UNSAFE(s, i); \
}
| #define UTF8_SET_CHAR_START_SAFE | ( | s, | |||
| start, | |||||
| i | ) | U8_SET_CP_START(s, start, i) |
| #define UTF8_SET_CHAR_START_UNSAFE | ( | s, | |||
| i | ) |
Value:
{ \
while(UTF8_IS_TRAIL((s)[i])) { --(i); } \
}
Append the code units of code point c to the string at index i and advance i to beyond the new code units (post-increment).
The code units beginning at index i will be overwritten. Same as UTF16_APPEND_CHAR.
0<=i<length
| #define UTF_APPEND_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) | UTF16_APPEND_CHAR_UNSAFE(s, i, c) |
| #define UTF_ARRAY_SIZE | ( | size | ) | UTF16_ARRAY_SIZE(size) |
Estimate the number of code units for a string based on the number of UTF-16 code units.
| #define UTF_BACK_1 | ( | s, | |||
| start, | |||||
| i | ) | U16_BACK_1(s, start, i) |
Move i backward (towards the beginning of the string) to the first code unit of the code point that has code units before i.
I.e., move i backward by one code point. i must point to the first code unit after the last unit of a code point (i==length is allowed). Same as UTF16_BACK_1.
| #define UTF_BACK_1_SAFE | ( | s, | |||
| start, | |||||
| i | ) | UTF16_BACK_1_SAFE(s, start, i) |
| #define UTF_BACK_1_UNSAFE | ( | s, | |||
| i | ) | UTF16_BACK_1_UNSAFE(s, i) |
| #define UTF_BACK_N | ( | s, | |||
| start, | |||||
| i, | |||||
| n | ) | U16_BACK_N(s, start, i, n) |
Move i backward (towards the beginning of the string) to the first code unit of the n code points that have code units before i.
I.e., move i backward by n code points. i must point to the first code unit after the last unit of a code point (i==length is allowed). Same as UTF16_BACK_N.
| #define UTF_BACK_N_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| n | ) | UTF16_BACK_N_SAFE(s, start, i, n) |
| #define UTF_BACK_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) | UTF16_BACK_N_UNSAFE(s, i, n) |
| #define UTF_CHAR_LENGTH | ( | c | ) | U16_LENGTH(c) |
How many code units are used to encode this code point (1 or 2)? Same as UTF16_CHAR_LENGTH.
| #define UTF_ERROR_VALUE 0xffff |
Error value for all UTFs.
This code point value will be set by macros with error checking if an error is detected.
| #define UTF_FIRST_SURROGATE | ( | supplementary | ) | (UChar)(((supplementary)>>10)+0xd7c0) |
Advance i to beyond the code units of the code point that begins at i.
I.e., advance i by one code point. Same as UTF16_FWD_1.
| #define UTF_FWD_1_UNSAFE | ( | s, | |||
| i | ) | UTF16_FWD_1_UNSAFE(s, i) |
Advance i to beyond the code units of the n code points where the first one begins at i.
I.e., advance i by n code points. Same as UTF16_FWD_N.
| #define UTF_FWD_N_UNSAFE | ( | s, | |||
| i, | |||||
| n | ) | UTF16_FWD_N_UNSAFE(s, i, n) |
Set c to the code point that contains the code unit i.
i could point to the lead or the trail surrogate for the code point. i is not modified. Same as UTF16_GET_CHAR.
| #define UTF_GET_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) | UTF16_GET_CHAR_UNSAFE(s, i, c) |
| #define UTF_IS_ERROR | ( | c | ) | (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) |
Is a given 32-bit code an error value as returned by one of the macros for any UTF?
| #define UTF_IS_FIRST_SURROGATE | ( | uchar | ) | (((uchar)&0xfffffc00)==0xd800) |
Is uchar a first/lead surrogate?
| #define UTF_IS_LEAD | ( | uchar | ) | U16_IS_LEAD(uchar) |
Is this code unit the first one of several (a lead surrogate)? Same as UTF16_IS_LEAD.
| #define UTF_IS_SECOND_SURROGATE | ( | uchar | ) | (((uchar)&0xfffffc00)==0xdc00) |
Is uchar a second/trail surrogate?
| #define UTF_IS_SINGLE | ( | uchar | ) | U16_IS_SINGLE(uchar) |
Does this code unit alone encode a code point (BMP, not a surrogate)? Same as UTF16_IS_SINGLE.
| #define UTF_IS_SURROGATE | ( | uchar | ) | (((uchar)&0xfffff800)==0xd800) |
Is this code unit or code point a surrogate (U+d800..U+dfff)?
| #define UTF_IS_SURROGATE_FIRST | ( | c | ) | (((c)&0x400)==0) |
Assuming c is a surrogate, is it a first/lead surrogate?
| #define UTF_IS_TRAIL | ( | uchar | ) | U16_IS_TRAIL(uchar) |
Is this code unit one of several but not the first one (a trail surrogate)? Same as UTF16_IS_TRAIL.
| #define UTF_IS_UNICODE_CHAR | ( | c | ) |
Value:
((uint32_t)(c)<0xd800 || \
    ((uint32_t)(c)>0xdfff && \
     (uint32_t)(c)<=0x10ffff && \
     !UTF_IS_UNICODE_NONCHAR(c)))
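Is a given 32-bit value a Unicode code point value (0..U+10ffff) that can be assigned a character?
Code points that are not characters include: single surrogate code points (U+d800..U+dfff), Unicode noncharacters (as tested by UTF_IS_UNICODE_NONCHAR), and any value above the highest Unicode code point U+10ffff.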
This means that all code points below U+d800 are character code points, and that boundary is tested first for performance.
| #define UTF_IS_UNICODE_NONCHAR | ( | c | ) |
| #define UTF_IS_VALID | ( | c | ) |
Value:
(UTF_IS_UNICODE_CHAR(c) && \
 (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2)
| #define UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH |
How many code units are used at most for any Unicode code point (2)? Same as UTF16_MAX_CHAR_LENGTH.
| #define UTF_NEED_MULTIPLE_UCHAR | ( | c | ) | UTF16_NEED_MULTIPLE_UCHAR(c) |
Does this code point require multiple code units (is it a supplementary code point)? Same as UTF16_NEED_MULTIPLE_UCHAR.
Set c to the code point that starts at code unit i and advance i to beyond the code units of this code point (post-increment).
i must point to the first code unit of a code point. Otherwise c is set to the trail unit (surrogate) itself. Same as UTF16_NEXT_CHAR.
| #define UTF_NEXT_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) | UTF16_NEXT_CHAR_UNSAFE(s, i, c) |
| #define UTF_PREV_CHAR | ( | s, | |||
| start, | |||||
| i, | |||||
| c | ) | U16_PREV(s, start, i, c) |
Set c to the code point that has code units before i and move i backward (towards the beginning of the string) to the first code unit of this code point (pre-decrement).
i must point to the first code unit after the last unit of a code point (i==length is allowed). Same as UTF16_PREV_CHAR.
| #define UTF_PREV_CHAR_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| c, | |||||
| strict | ) | UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) |
| #define UTF_PREV_CHAR_UNSAFE | ( | s, | |||
| i, | |||||
| c | ) | UTF16_PREV_CHAR_UNSAFE(s, i, c) |
| #define UTF_SAFE |
The default choice for general Unicode string macros is to use the ..._SAFE macro implementations with strict=FALSE.
| #define UTF_SECOND_SURROGATE | ( | supplementary | ) | (UChar)(((supplementary)&0x3ff)|0xdc00) |
Take the random-access index i and adjust it so that it points beyond a code point.
The input index points beyond any code unit of a code point and is moved to point beyond the last code unit of the same code point. i is never decremented. In other words, if i points to a trail surrogate that is preceded by a matching lead surrogate, then i is incremented. Otherwise it is not modified. This can be used to start an iteration with UTF_PREV_CHAR() from a random index. Same as UTF16_SET_CHAR_LIMIT.
| #define UTF_SET_CHAR_LIMIT_SAFE | ( | s, | |||
| start, | |||||
| i, | |||||
| length | ) | UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) |
| #define UTF_SET_CHAR_LIMIT_UNSAFE | ( | s, | |||
| i | ) | UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) |
| #define UTF_SET_CHAR_START | ( | s, | |||
| start, | |||||
| i | ) | U16_SET_CP_START(s, start, i) |
Take the random-access index i and adjust it so that it points to the beginning of a code point.
The input index points to any code unit of a code point and is moved to point to the first code unit of the same code point. i is never incremented. In other words, if i points to a trail surrogate that is preceded by a matching lead surrogate, then i is decremented. Otherwise it is not modified. This can be used to start an iteration with UTF_NEXT_CHAR() from a random index. Same as UTF16_SET_CHAR_START.
| #define UTF_SET_CHAR_START_SAFE | ( | s, | |||
| start, | |||||
| i | ) | UTF16_SET_CHAR_START_SAFE(s, start, i) |
| #define UTF_SET_CHAR_START_UNSAFE | ( | s, | |||
| i | ) | UTF16_SET_CHAR_START_UNSAFE(s, i) |
| #define UTF_SIZE 16 |
Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode.
| #define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) |
Helper constant for UTF16_GET_PAIR_VALUE.
1.5.1