From 6b2b7e8803027b7089a853af0f47b53a7d20e950 Mon Sep 17 00:00:00 2001 From: Peter Jones Date: Tue, 18 Jun 2019 13:12:39 -0400 Subject: [PATCH 35/86] ucs2: document things a little better Signed-off-by: Peter Jones --- src/ucs2.h | 135 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/src/ucs2.h b/src/ucs2.h index 478de23b23f..3f8a41d8ccc 100644 --- a/src/ucs2.h +++ b/src/ucs2.h @@ -22,11 +22,20 @@ #define ev_bits(val, mask, shift) \ (((val) & ((mask) << (shift))) >> (shift)) +/* + * ucs2len(): Count the number of characters in a UCS-2 string. + * s: a UCS-2 string + * limit: the maximum number of uint16_t bytepairs to examine + * + * returns the number of characters before NUL is found (i.e., excluding + * the NUL character). If limit is non-negative, no character index above + * limit will be accessed, and the maximum return value is limit. + */ static inline size_t UNUSED -ucs2len(const void *vs, ssize_t limit) +ucs2len(const void *s, ssize_t limit) { ssize_t i; - const uint8_t *s8 = vs; + const uint8_t *s8 = s; for (i = 0; i < (limit >= 0 ? limit : i+1) && !(s8[0] == 0 && s8[1] == 0); @@ -35,6 +44,15 @@ ucs2len(const void *vs, ssize_t limit) return i; } +/* + * ucs2size(): count the number of bytes in use by a UCS-2 string. + * s: a UCS-2 string + * limit: the maximum number of uint16_t bytepairs to examine + * + * returns the number of bytes, including NUL, in the UCS-2 string s. If + * limit is non-negative, no character index above limit will be accessed, + * and the maximum return value is limit. + */ static inline size_t UNUSED ucs2size(const void *s, ssize_t limit) { @@ -46,6 +64,18 @@ ucs2size(const void *s, ssize_t limit) return rc; } +/* + * utf8len(): Count the number of characters in a UTF-8 string. + * s: a UTF-8 string + * limit: the maximum number of bytes to examine + * + * returns the number of UTF-8 characters before NUL is found (i.e., + * excluding the NUL character).
If limit is non-negative, no character + * index above limit will be accessed, and the maximum return value is + * limit. + * + * Caveat: only good up to 3-byte sequences. + */ static inline size_t UNUSED NONNULL(1) utf8len(const unsigned char *s, ssize_t limit) { @@ -63,6 +93,15 @@ utf8len(const unsigned char *s, ssize_t limit) return j; } +/* + * utf8size(): count the number of bytes in use by a UTF-8 string. + * s: a UTF-8 string + * limit: the maximum number of bytes to examine + * + * returns the number of bytes, including NUL, in the UTF-8 string s. + * If limit is non-negative, no character index above limit will be + * accessed, and the maximum return value is limit. + */ static inline size_t UNUSED NONNULL(1) utf8size(const unsigned char *s, ssize_t limit) { @@ -72,68 +111,94 @@ utf8size(const unsigned char *s, ssize_t limit) return ret; } +/* + * ucs2_to_utf8(): convert UCS-2 to UTF-8 + * s: the UCS-2 string + * limit: the maximum number of UCS-2 characters to convert from s, or a + * negative value to convert everything up to the NUL terminator. + * + * returns an allocated string holding the UTF-8 translation of at most + * limit characters of s, or NULL if allocation fails. A non-NULL return + * value is *always* NUL-terminated. + */ static inline unsigned char * UNUSED -ucs2_to_utf8(const void * const voidchars, ssize_t limit) +ucs2_to_utf8(const void * const s, ssize_t limit) { ssize_t i, j; - unsigned char *ret; - const uint16_t * const chars = voidchars; + unsigned char *out, *ret; + const uint16_t * const chars = s; if (limit < 0) limit = ucs2len(chars, -1); - ret = malloc(limit * 6 + 1); - if (!ret) + out = malloc(limit * 6 + 1); + if (!out) return NULL; - memset(ret, 0, limit * 6 +1); + memset(out, 0, limit * 6 +1); for (i=0, j=0; chars[i] && i < (limit >= 0 ?
limit : i+1); i++,j++) { if (chars[i] <= 0x7f) { - ret[j] = chars[i]; + out[j] = chars[i]; } else if (chars[i] > 0x7f && chars[i] <= 0x7ff) { - ret[j++] = 0xc0 | ev_bits(chars[i], 0x1f, 6); - ret[j] = 0x80 | ev_bits(chars[i], 0x3f, 0); + out[j++] = 0xc0 | ev_bits(chars[i], 0x1f, 6); + out[j] = 0x80 | ev_bits(chars[i], 0x3f, 0); #if 1 } else if (chars[i] > 0x7ff) { - ret[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); - ret[j] = 0x80| ev_bits(chars[i], 0x3f, 0); + out[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); + out[j] = 0x80| ev_bits(chars[i], 0x3f, 0); } #else } else if (chars[i] > 0x7ff && chars[i] < 0x10000) { - ret[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); - ret[j] = 0x80| ev_bits(chars[i], 0x3f, 0); + out[j++] = 0xe0 | ev_bits(chars[i], 0xf, 12); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); + out[j] = 0x80| ev_bits(chars[i], 0x3f, 0); } else if (chars[i] > 0xffff && chars[i] < 0x200000) { - ret[j++] = 0xf0 | ev_bits(chars[i], 0x7, 18); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); - ret[j] = 0x80| ev_bits(chars[i], 0x3f, 0); + out[j++] = 0xf0 | ev_bits(chars[i], 0x7, 18); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); + out[j] = 0x80| ev_bits(chars[i], 0x3f, 0); } else if (chars[i] > 0x1fffff && chars[i] < 0x4000000) { - ret[j++] = 0xf8 | ev_bits(chars[i], 0x3, 24); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); - ret[j] = 0x80 | ev_bits(chars[i], 0x3f, 0); + out[j++] = 0xf8 | ev_bits(chars[i], 0x3, 24); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); + out[j] = 0x80 | ev_bits(chars[i], 0x3f, 0); } else if (chars[i] > 0x3ffffff) { 
- ret[j++] = 0xfc | ev_bits(chars[i], 0x1, 30); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 24); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12); - ret[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); - ret[j] = 0x80 | ev_bits(chars[i], 0x3f, 0); + out[j++] = 0xfc | ev_bits(chars[i], 0x1, 30); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 24); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 18); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 12); + out[j++] = 0x80 | ev_bits(chars[i], 0x3f, 6); + out[j] = 0x80 | ev_bits(chars[i], 0x3f, 0); } #endif } - ret[j] = '\0'; + out[j++] = '\0'; + ret = realloc(out, j); + if (!ret) { + free(out); + return NULL; + } return ret; } +/* + * utf8_to_ucs2(): convert UTF-8 to UCS-2 + * s: the destination buffer to write to. + * size: the size of the allocation to write to + * terminate: whether or not to add a terminator to the string + * utf8: the utf8 source + * + * returns the number of characters written to s, including the NUL + * terminator if "terminate" is true, or -1 on error. In the case of an + * error, the buffer will not be modified. + */ static inline ssize_t UNUSED NONNULL(4) -utf8_to_ucs2(void *ucs2void, ssize_t size, int terminate, const unsigned char *utf8) +utf8_to_ucs2(void *s, ssize_t size, bool terminate, const unsigned char *utf8) { ssize_t req; ssize_t i, j; - uint16_t *ucs2 = ucs2void; + uint16_t *ucs2 = s; uint16_t val16; if (!ucs2 && size > 0) { -- 2.24.1