Changeset View
Changeset View
Standalone View
Standalone View
source/blender/blenlib/intern/string_utf8.c
| Show First 20 Lines • Show All 319 Lines • ▼ Show 20 Lines | size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst, | ||||
| BLI_assert(maxncpy != 0); | BLI_assert(maxncpy != 0); | ||||
| #ifdef DEBUG_STRSIZE | #ifdef DEBUG_STRSIZE | ||||
| memset(dst, 0xff, sizeof(*dst) * maxncpy); | memset(dst, 0xff, sizeof(*dst) * maxncpy); | ||||
| #endif | #endif | ||||
| while (*src && len <= maxlen_secured) { | while (*src && len <= maxlen_secured) { | ||||
| len += BLI_str_utf8_from_unicode((uint)*src++, dst + len); | len += BLI_str_utf8_from_unicode((BLI_unicode)*src++, dst + len); | ||||
| } | } | ||||
| /* We have to be more careful for the last six bytes, | /* We have to be more careful for the last six bytes, | ||||
| * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */ | * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */ | ||||
| while (*src) { | while (*src) { | ||||
| char t[6]; | char t[6]; | ||||
| size_t l = BLI_str_utf8_from_unicode((uint)*src++, t); | size_t l = BLI_str_utf8_from_unicode((BLI_unicode)*src++, t); | ||||
| BLI_assert(l <= 6); | BLI_assert(l <= 6); | ||||
| if (len + l > maxlen) { | if (len + l > maxlen) { | ||||
| break; | break; | ||||
| } | } | ||||
| memcpy(dst + len, t, l); | memcpy(dst + len, t, l); | ||||
| len += l; | len += l; | ||||
| } | } | ||||
| dst[len] = '\0'; | dst[len] = '\0'; | ||||
| return len; | return len; | ||||
| } | } | ||||
| /* wchar len in utf8 */ | /* wchar len in utf8 */ | ||||
| size_t BLI_wstrlen_utf8(const wchar_t *src) | size_t BLI_wstrlen_utf8(const wchar_t *src) | ||||
| { | { | ||||
| size_t len = 0; | size_t len = 0; | ||||
| while (*src) { | while (*src) { | ||||
| len += BLI_str_utf8_from_unicode((uint)*src++, NULL); | len += BLI_str_utf8_from_unicode((BLI_unicode)*src++, NULL); | ||||
| } | } | ||||
| return len; | return len; | ||||
| } | } | ||||
| size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes) | size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes) | ||||
| { | { | ||||
| size_t len; | size_t len; | ||||
| ▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines | size_t BLI_strncpy_wchar_from_utf8(wchar_t *__restrict dst_w, | ||||
| BLI_assert(maxncpy != 0); | BLI_assert(maxncpy != 0); | ||||
| #ifdef DEBUG_STRSIZE | #ifdef DEBUG_STRSIZE | ||||
| memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy); | memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy); | ||||
| #endif | #endif | ||||
| while (*src_c && len != maxlen) { | while (*src_c && len != maxlen) { | ||||
| size_t step = 0; | size_t step = 0; | ||||
| uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step); | BLI_unicode unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step); | ||||
| if (unicode != BLI_UTF8_ERR) { | if (unicode != BLI_UTF8_ERR) { | ||||
| *dst_w = (wchar_t)unicode; | *dst_w = (wchar_t)unicode; | ||||
| src_c += step; | src_c += step; | ||||
| } | } | ||||
| else { | else { | ||||
| *dst_w = '?'; | *dst_w = '?'; | ||||
| src_c = BLI_str_find_next_char_utf8(src_c, NULL); | src_c = BLI_str_find_next_char_utf8(src_c, NULL); | ||||
| } | } | ||||
| Show All 18 Lines | |||||
/**
 * Thin wrapper: hands \a pwcs and \a n unchanged to mk_wcswidth()
 * and returns whatever it reports.
 */
int BLI_wcswidth(const wchar_t *pwcs, size_t n)
{
  const int width = mk_wcswidth(pwcs, n);
  return width;
}
| int BLI_str_utf8_char_width(const char *p) | int BLI_str_utf8_char_width(const char *p) | ||||
| { | { | ||||
| uint unicode = BLI_str_utf8_as_unicode(p); | BLI_unicode unicode = BLI_str_utf8_as_unicode(p); | ||||
| if (unicode == BLI_UTF8_ERR) { | if (unicode == BLI_UTF8_ERR) { | ||||
| return -1; | return -1; | ||||
| } | } | ||||
| return BLI_wcwidth((wchar_t)unicode); | return BLI_wcwidth((wchar_t)unicode); | ||||
| } | } | ||||
| int BLI_str_utf8_char_width_safe(const char *p) | int BLI_str_utf8_char_width_safe(const char *p) | ||||
| { | { | ||||
| int columns; | int columns; | ||||
| uint unicode = BLI_str_utf8_as_unicode(p); | BLI_unicode unicode = BLI_str_utf8_as_unicode(p); | ||||
| if (unicode == BLI_UTF8_ERR) { | if (unicode == BLI_UTF8_ERR) { | ||||
| return 1; | return 1; | ||||
| } | } | ||||
| columns = BLI_wcwidth((wchar_t)unicode); | columns = BLI_wcwidth((wchar_t)unicode); | ||||
| return (columns < 0) ? 1 : columns; | return (columns < 0) ? 1 : columns; | ||||
| } | } | ||||
| /* --------------------------------------------------------------------------*/ | /* --------------------------------------------------------------------------*/ | ||||
| /* copied from glib's gutf8.c, added 'Err' arg */ | /* copied from glib's gutf8.c, added 'Err' arg */ | ||||
| /* note, glib uses uint for unicode, best we do the same, | /* note, glib uses uint for unicode, best we do the same - campbell */ | ||||
| * though we don't typedef it - campbell */ | |||||
| #define UTF8_COMPUTE(Char, Mask, Len, Err) \ | #define UTF8_COMPUTE(Char, Mask, Len, Err) \ | ||||
| if (Char < 128) { \ | if (Char < 128) { \ | ||||
| Len = 1; \ | Len = 1; \ | ||||
| Mask = 0x7f; \ | Mask = 0x7f; \ | ||||
| } \ | } \ | ||||
| else if ((Char & 0xe0) == 0xc0) { \ | else if ((Char & 0xe0) == 0xc0) { \ | ||||
| Len = 2; \ | Len = 2; \ | ||||
| ▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines | |||||
| * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. | * Converts a sequence of bytes encoded as UTF-8 to a Unicode character. | ||||
| * If \a p does not point to a valid UTF-8 encoded character, results are | * If \a p does not point to a valid UTF-8 encoded character, results are | ||||
| * undefined. If you are not sure that the bytes are complete | * undefined. If you are not sure that the bytes are complete | ||||
| * valid Unicode characters, you should use g_utf8_get_char_validated() | * valid Unicode characters, you should use g_utf8_get_char_validated() | ||||
| * instead. | * instead. | ||||
| * | * | ||||
| * Return value: the resulting character | * Return value: the resulting character | ||||
| */ | */ | ||||
BLI_unicode BLI_str_utf8_as_unicode(const char *p)
{
  /* Decode the single UTF-8 sequence starting at `p`; returns BLI_UTF8_ERR
   * when the lead byte is rejected by UTF8_COMPUTE. */
  int i, len;
  uint mask = 0;
  BLI_unicode result;
  /* Lead byte, read as unsigned before it is classified. */
  const unsigned char c = (unsigned char)*p;
  /* Sets `len` (sequence length) and `mask` (payload bits of the lead byte).
   * NOTE(review): the tail of UTF8_COMPUTE is not visible here; presumably it
   * assigns the last argument (-1) to `len` for an invalid lead byte. */
  UTF8_COMPUTE(c, mask, len, -1);
  if (UNLIKELY(len == -1)) {
    return BLI_UTF8_ERR;
  }
  /* NOTE(review): UTF8_GET is defined outside this view; it is given
   * BLI_UTF8_ERR as its error value, so `result` may presumably still be
   * BLI_UTF8_ERR after this line - confirm against the macro. */
  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
  return result;
}
/* Variant that also adds the sequence length (in bytes) to `*index`.
 * `*index` is left untouched when the lead byte is rejected and
 * BLI_UTF8_ERR is returned early. */
BLI_unicode BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
{
  int i, len;
  unsigned mask = 0;
  BLI_unicode result;
  /* Lead byte, read as unsigned before it is classified. */
  const unsigned char c = (unsigned char)*p;
  /* Sets `len` and `mask` from the lead byte.
   * NOTE(review): presumably leaves `len == -1` for an invalid lead byte;
   * the end of the macro is not visible here. */
  UTF8_COMPUTE(c, mask, len, -1);
  if (UNLIKELY(len == -1)) {
    return BLI_UTF8_ERR;
  }
  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
  /* Added unconditionally: there is no check of `result` between UTF8_GET
   * and this increment. */
  *index += (size_t)len;
  return result;
}
/* Like BLI_str_utf8_as_unicode_and_size, but an invalid lead byte is not an
 * error: the raw byte value is returned and `*index` advances by one. */
BLI_unicode BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p,
                                                  size_t *__restrict index)
{
  int i, len;
  uint mask = 0;
  BLI_unicode result;
  /* Lead byte, read as unsigned before it is classified. */
  const unsigned char c = (unsigned char)*p;
  /* Sets `len` and `mask` from the lead byte.
   * NOTE(review): presumably leaves `len == -1` for an invalid lead byte;
   * the end of the macro is not visible here. */
  UTF8_COMPUTE(c, mask, len, -1);
  if (UNLIKELY(len == -1)) {
    /* Fallback: step over exactly one byte and return that byte as-is. */
    *index += 1;
    return c;
  }
  /* NOTE(review): BLI_UTF8_ERR is still passed as the error value, so this
   * path may presumably return BLI_UTF8_ERR - confirm against UTF8_GET. */
  UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
  *index += (size_t)len;
  return result;
}
| /* another variant that steps over the index, | /* another variant that steps over the index, | ||||
| * note, currently this also falls back to latin1 for text drawing. */ | * note, currently this also falls back to latin1 for text drawing. */ | ||||
| uint BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index) | BLI_unicode BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index) | ||||
| { | { | ||||
| int i, len; | int i, len; | ||||
| uint mask = 0; | uint mask = 0; | ||||
| uint result; | BLI_unicode result; | ||||
| unsigned char c; | unsigned char c; | ||||
| p += *index; | p += *index; | ||||
| c = (unsigned char)*p; | c = (unsigned char)*p; | ||||
| UTF8_COMPUTE(c, mask, len, -1); | UTF8_COMPUTE(c, mask, len, -1); | ||||
| if (UNLIKELY(len == -1)) { | if (UNLIKELY(len == -1)) { | ||||
| /* when called with NULL end, result will never be NULL, | /* when called with NULL end, result will never be NULL, | ||||
| Show All 35 Lines | |||||
| * \param outbuf: output buffer, must have at least 6 bytes of space. | * \param outbuf: output buffer, must have at least 6 bytes of space. | ||||
| * If %NULL, the length will be computed and returned | * If %NULL, the length will be computed and returned | ||||
| * and nothing will be written to outbuf. | * and nothing will be written to outbuf. | ||||
| * | * | ||||
| * Converts a single character to UTF-8. | * Converts a single character to UTF-8. | ||||
| * | * | ||||
| * \return number of bytes written | * \return number of bytes written | ||||
| */ | */ | ||||
| size_t BLI_str_utf8_from_unicode(uint c, char *outbuf) | size_t BLI_str_utf8_from_unicode(BLI_unicode c, char *outbuf) | ||||
| { | { | ||||
| /* If this gets modified, also update the copy in g_string_insert_unichar() */ | /* If this gets modified, also update the copy in g_string_insert_unichar() */ | ||||
| uint len = 0; | uint len = 0; | ||||
| uint first; | uint first; | ||||
| uint i; | uint i; | ||||
| if (c < 0x80) { | if (c < 0x80) { | ||||
| first = 0; | first = 0; | ||||
| Show All 26 Lines | for (i = len - 1; i > 0; i--) { | ||||
| c >>= 6; | c >>= 6; | ||||
| } | } | ||||
| outbuf[0] = c | first; | outbuf[0] = c | first; | ||||
| } | } | ||||
| return len; | return len; | ||||
| } | } | ||||
| size_t BLI_str_utf8_as_unicode_array(BLI_unicode *__restrict dst_w, | |||||
| const char *__restrict src_c, | |||||
| const size_t maxncpy) | |||||
| { | |||||
| const size_t maxlen = maxncpy - 1; | |||||
| size_t len = 0; | |||||
| BLI_assert(maxncpy != 0); | |||||
| #ifdef DEBUG_STRSIZE | |||||
| memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy); | |||||
| #endif | |||||
| while (*src_c && len != maxlen) { | |||||
| size_t step = 0; | |||||
| BLI_unicode unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step); | |||||
| if (unicode != BLI_UTF8_ERR) { | |||||
| *dst_w = unicode; | |||||
| src_c += step; | |||||
| } | |||||
| else { | |||||
| *dst_w = '?'; | |||||
| src_c = BLI_str_find_next_char_utf8(src_c, NULL); | |||||
| } | |||||
| dst_w++; | |||||
| len++; | |||||
| } | |||||
| *dst_w = 0; | |||||
| return len; | |||||
| } | |||||
| size_t BLI_unicode_array_as_str_utf8(char *__restrict dst, | |||||
| const BLI_unicode *__restrict src, | |||||
| const size_t maxncpy) | |||||
| { | |||||
| const size_t maxlen = maxncpy - 1; | |||||
| /* 6 is max utf8 length of an unicode char. */ | |||||
| const int64_t maxlen_secured = (int64_t)maxlen - 6; | |||||
| size_t len = 0; | |||||
| BLI_assert(maxncpy != 0); | |||||
| #ifdef DEBUG_STRSIZE | |||||
| memset(dst, 0xff, sizeof(*dst) * maxncpy); | |||||
| #endif | |||||
| while (*src && len <= maxlen_secured) { | |||||
| len += BLI_str_utf8_from_unicode(*src++, dst + len); | |||||
| } | |||||
| /* We have to be more careful for the last six bytes, | |||||
| * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */ | |||||
| while (*src) { | |||||
| char t[6]; | |||||
| size_t l = BLI_str_utf8_from_unicode(*src++, t); | |||||
| BLI_assert(l <= 6); | |||||
| if (len + l > maxlen) { | |||||
| break; | |||||
| } | |||||
| memcpy(dst + len, t, l); | |||||
| len += l; | |||||
| } | |||||
| dst[len] = '\0'; | |||||
| return len; | |||||
| } | |||||
| /* len of utf8 in unicode array */ | |||||
| size_t BLI_unicode_array_utf8_len(const BLI_unicode *src) | |||||
| { | |||||
| size_t len = 0; | |||||
| while (*src) { | |||||
| len += BLI_str_utf8_from_unicode(*src++, NULL); | |||||
| } | |||||
| return len; | |||||
| } | |||||
| /* was g_utf8_find_prev_char */ | /* was g_utf8_find_prev_char */ | ||||
| /** | /** | ||||
| * BLI_str_find_prev_char_utf8: | * BLI_str_find_prev_char_utf8: | ||||
| * \param str: pointer to the beginning of a UTF-8 encoded string | * \param str: pointer to the beginning of a UTF-8 encoded string | ||||
| * \param p: pointer to some position within \a str | * \param p: pointer to some position within \a str | ||||
| * | * | ||||
| * Given a position \a p with a UTF-8 encoded string \a str, find the start | * Given a position \a p with a UTF-8 encoded string \a str, find the start | ||||
| * of the previous UTF-8 character starting before. \a p Returns %NULL if no | * of the previous UTF-8 character starting before. \a p Returns %NULL if no | ||||
| ▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines | while (1) { | ||||
| if ((*p & 0xc0) != 0x80) { | if ((*p & 0xc0) != 0x80) { | ||||
| return (char *)p; | return (char *)p; | ||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| /* end glib copy */ | /* end glib copy */ | ||||
| size_t BLI_str_partition_utf8(const char *str, | size_t BLI_str_partition_utf8(const char *str, | ||||
| const uint delim[], | const BLI_unicode delim[], | ||||
| const char **sep, | const char **sep, | ||||
| const char **suf) | const char **suf) | ||||
| { | { | ||||
| return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, false); | return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, false); | ||||
| } | } | ||||
| size_t BLI_str_rpartition_utf8(const char *str, | size_t BLI_str_rpartition_utf8(const char *str, | ||||
| const uint delim[], | const BLI_unicode delim[], | ||||
| const char **sep, | const char **sep, | ||||
| const char **suf) | const char **suf) | ||||
| { | { | ||||
| return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, true); | return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, true); | ||||
| } | } | ||||
| size_t BLI_str_partition_ex_utf8(const char *str, | size_t BLI_str_partition_ex_utf8(const char *str, | ||||
| const char *end, | const char *end, | ||||
| const uint delim[], | const BLI_unicode delim[], | ||||
| const char **sep, | const char **sep, | ||||
| const char **suf, | const char **suf, | ||||
| const bool from_right) | const bool from_right) | ||||
| { | { | ||||
| const uint *d; | const BLI_unicode *d; | ||||
| const size_t str_len = end ? (size_t)(end - str) : strlen(str); | const size_t str_len = end ? (size_t)(end - str) : strlen(str); | ||||
| size_t index; | size_t index; | ||||
| /* Note that here, we assume end points to a valid utf8 char! */ | /* Note that here, we assume end points to a valid utf8 char! */ | ||||
| BLI_assert(end == NULL || (end >= str && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR))); | BLI_assert(end == NULL || (end >= str && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR))); | ||||
| *suf = (char *)(str + str_len); | *suf = (char *)(str + str_len); | ||||
| for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, str + str_len) : str), | for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, str + str_len) : str), | ||||
| index = 0; | index = 0; | ||||
| *sep >= str && (!end || *sep < end) && **sep != '\0'; | *sep >= str && (!end || *sep < end) && **sep != '\0'; | ||||
| *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) { | *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) { | ||||
| const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index); | const BLI_unicode c = BLI_str_utf8_as_unicode_and_size(*sep, &index); | ||||
| if (c == BLI_UTF8_ERR) { | if (c == BLI_UTF8_ERR) { | ||||
| *suf = *sep = NULL; | *suf = *sep = NULL; | ||||
| break; | break; | ||||
| } | } | ||||
| for (d = delim; *d != '\0'; d++) { | for (d = delim; *d != '\0'; d++) { | ||||
| if (*d == c) { | if (*d == c) { | ||||
| ▲ Show 20 Lines • Show All 64 Lines • Show Last 20 Lines | |||||