From 4af322cad402fb19de7f77cd29f61f9200f14aba Mon Sep 17 00:00:00 2001 From: Michael Adam Date: Fri, 29 Oct 2010 22:06:05 +0200 Subject: [PATCH 1/7] s3:lib/charcnv: add next_codepoint_ext() that accepts input charset. next_codepoint() takes a string in CH_UNIX encoding and returns the unicode codepoint of the next (possibly multibyte) character of the input string. The new next_codepoint_ext() function adds the encoding of the input string as a parameter. next_codepoint() now only calls next_codepoint_ext() with CH_UNIX as the src_charset argument. (cherry picked from commit b887a7b33a855bc3ac6b06f497136b371340d46a) --- source3/include/proto.h | 2 + source3/lib/charcnv.c | 74 +++++++++++++++++++++++++++++++--------------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/source3/include/proto.h b/source3/include/proto.h index 5064fdb..c9c0b26 100644 --- a/source3/include/proto.h +++ b/source3/include/proto.h @@ -410,6 +410,8 @@ size_t pull_string_talloc_fn(const char *function, size_t src_len, int flags); size_t align_string(const void *base_ptr, const char *p, int flags); +codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, + size_t *bytes_consumed); codepoint_t next_codepoint(const char *str, size_t *size); /* The following definitions come from lib/clobber.c */ diff --git a/source3/lib/charcnv.c b/source3/lib/charcnv.c index 9ac9930..03d1031 100644 --- a/source3/lib/charcnv.c +++ b/source3/lib/charcnv.c @@ -1793,17 +1793,23 @@ size_t align_string(const void *base_ptr, const char *p, int flags) return 0; } -/* - Return the unicode codepoint for the next multi-byte CH_UNIX character - in the string. The unicode codepoint (codepoint_t) is an unsinged 32 bit value. - - Also return the number of bytes consumed (which tells the caller - how many bytes to skip to get to the next CH_UNIX character). - - Return INVALID_CODEPOINT if the next character cannot be converted. 
-*/ +/** + * Return the unicode codepoint for the next character in the input + * string in the given src_charset. + * The unicode codepoint (codepoint_t) is an unsinged 32 bit value. + * + * Also return the number of bytes consumed (which tells the caller + * how many bytes to skip to get to the next src_charset-character). + * + * This is implemented (in the non-ascii-case) by first converting the + * next character in the input string to UTF16_LE and then calculating + * the unicode codepoint from that. + * + * Return INVALID_CODEPOINT if the next character cannot be converted. + */ -codepoint_t next_codepoint(const char *str, size_t *size) +codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, + size_t *bytes_consumed) { /* It cannot occupy more than 4 bytes in UTF16 format */ uint8_t buf[4]; @@ -1813,41 +1819,46 @@ codepoint_t next_codepoint(const char *str, size_t *size) size_t olen; char *outbuf; + /* fastpath if the character is ASCII */ if ((str[0] & 0x80) == 0) { - *size = 1; + *bytes_consumed = 1; return (codepoint_t)str[0]; } - /* We assume that no multi-byte character can take - more than 5 bytes. This is OK as we only - support codepoints up to 1M */ + /* + * We assume that no multi-byte character can take more than + * 5 bytes. This is OK as we only support codepoints up to 1M + */ ilen_orig = strnlen(str, 5); ilen = ilen_orig; - lazy_initialize_conv(); + lazy_initialize_conv(); - descriptor = conv_handles[CH_UNIX][CH_UTF16LE]; + descriptor = conv_handles[src_charset][CH_UTF16LE]; if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) { - *size = 1; + *bytes_consumed = 1; return INVALID_CODEPOINT; } - /* This looks a little strange, but it is needed to cope - with codepoints above 64k which are encoded as per RFC2781. */ + /* + * This looks a little strange, but it is needed to cope + * with codepoints above 64k which are encoded as per RFC2781. 
+ */ olen = 2; outbuf = (char *)buf; smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); if (olen == 2) { - /* We failed to convert to a 2 byte character. - See if we can convert to a 4 UTF16-LE byte char encoding. - */ + /* + * We failed to convert to a 2 byte character. + * See if we can convert to a 4 UTF16-LE byte char encoding. + */ olen = 4; outbuf = (char *)buf; smb_iconv(descriptor, &str, &ilen, &outbuf, &olen); if (olen == 4) { /* We didn't convert any bytes */ - *size = 1; + *bytes_consumed = 1; return INVALID_CODEPOINT; } olen = 4 - olen; @@ -1855,7 +1866,7 @@ codepoint_t next_codepoint(const char *str, size_t *size) olen = 2 - olen; } - *size = ilen_orig - ilen; + *bytes_consumed = ilen_orig - ilen; if (olen == 2) { /* 2 byte, UTF16-LE encoded value. */ @@ -1877,6 +1888,21 @@ codepoint_t next_codepoint(const char *str, size_t *size) } /* + Return the unicode codepoint for the next multi-byte CH_UNIX character + in the string. The unicode codepoint (codepoint_t) is an unsinged 32 bit value. + + Also return the number of bytes consumed (which tells the caller + how many bytes to skip to get to the next CH_UNIX character). + + Return INVALID_CODEPOINT if the next character cannot be converted. +*/ + +codepoint_t next_codepoint(const char *str, size_t *size) +{ + return next_codepoint_ext(str, CH_UNIX, size); +} + +/* push a single codepoint into a CH_UNIX string the target string must be able to hold the full character, which is guaranteed if it is at least 5 bytes in size. 
The caller may pass less than 5 bytes if they -- 1.6.3.3 From 111e2a3bd4663e22cae3b013a09a7426f8cc01ed Mon Sep 17 00:00:00 2001 From: Michael Adam Date: Fri, 29 Oct 2010 22:11:30 +0200 Subject: [PATCH 2/7] s3:lib/charcnv: clarify comments in next_codepoint_ext() (giving the unicode U+ notation of the codepoints referred to in the comments) (cherry picked from commit bd874fec1ca70cdb1d1551ffcc8be51bb95c8d26) --- source3/lib/charcnv.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source3/lib/charcnv.c b/source3/lib/charcnv.c index 03d1031..9c76e1b 100644 --- a/source3/lib/charcnv.c +++ b/source3/lib/charcnv.c @@ -1827,7 +1827,7 @@ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, /* * We assume that no multi-byte character can take more than - * 5 bytes. This is OK as we only support codepoints up to 1M + * 5 bytes. This is OK as we only support codepoints up to 1M (U+100000) */ ilen_orig = strnlen(str, 5); @@ -1843,7 +1843,7 @@ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset, /* * This looks a little strange, but it is needed to cope - * with codepoints above 64k which are encoded as per RFC2781. + * with codepoints above 64k (U+10000) which are encoded as per RFC2781. */ olen = 2; outbuf = (char *)buf; -- 1.6.3.3 From ab4474abd92b0b1134e04166e250edc03bbc8af4 Mon Sep 17 00:00:00 2001 From: Michael Adam Date: Fri, 29 Oct 2010 22:21:47 +0200 Subject: [PATCH 3/7] s3:util_str: clarify the comment header for strlen_m(). (cherry picked from commit cd79c661994530e6bd26aae1a7977a3dc04d42c0) --- source3/lib/util_str.c | 10 ++++++---- 1 files changed, 6 insertions(+), 4 deletions(-) diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c index 9a0b12a..7b2ee05 100644 --- a/source3/lib/util_str.c +++ b/source3/lib/util_str.c @@ -1454,10 +1454,12 @@ void strupper_m(char *s) } /** - Count the number of UCS2 characters in a string. 
Normally this will - be the same as the number of bytes in a string for single byte strings, - but will be different for multibyte. -**/ + * Calculate the number of 16-bit units that would be needed to convert + * the input string which is expected to be in CH_UNIX encoding to UTF16. + * + * This will be the same as the number of bytes in a string for single + * byte strings, but will be different for multibyte. + */ size_t strlen_m(const char *s) { -- 1.6.3.3 From 2eb186e21fc5a0a73d4fbcc331eb79b50ad9d34e Mon Sep 17 00:00:00 2001 From: Michael Adam Date: Sat, 30 Oct 2010 02:03:02 +0200 Subject: [PATCH 4/7] s3:util_str: add strlen_m_ext that takes input and output charset The function calculates the number of units (8 or 16-bit, depending on the destination charset), that would be needed to convert the input string which is expected to be in src_charset encoding to the dst_charset (which should be a unicode charset). (cherry picked from commit 805f7331063db4a5e1156091bff4da0856daa2c2) --- source3/include/proto.h | 2 + source3/lib/util_str.c | 70 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 59 insertions(+), 13 deletions(-) diff --git a/source3/include/proto.h b/source3/include/proto.h index c9c0b26..3c36488 100644 --- a/source3/include/proto.h +++ b/source3/include/proto.h @@ -1541,6 +1541,8 @@ char *strnrchr_m(const char *s, char c, unsigned int n); char *strstr_m(const char *src, const char *findstr); void strlower_m(char *s); void strupper_m(char *s); +size_t strlen_m_ext(const char *s, const charset_t src_charset, + const charset_t dst_charset); size_t strlen_m(const char *s); size_t strlen_m_term(const char *s); size_t strlen_m_term_null(const char *s); diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c index 7b2ee05..5937c9d 100644 --- a/source3/lib/util_str.c +++ b/source3/lib/util_str.c @@ -1454,14 +1454,14 @@ void strupper_m(char *s) } /** - * Calculate the number of 16-bit units that would be needed to convert - * the input 
string which is expected to be in CH_UNIX encoding to UTF16. - * - * This will be the same as the number of bytes in a string for single - * byte strings, but will be different for multibyte. + * Calculate the number of units (8 or 16-bit, depending on the + * destination charset), that would be needed to convert the input + * string which is expected to be in src_charset encoding to the + * destination charset (which should be a unicode charset). */ -size_t strlen_m(const char *s) +size_t strlen_m_ext(const char *s, const charset_t src_charset, + const charset_t dst_charset) { size_t count = 0; @@ -1480,21 +1480,63 @@ size_t strlen_m(const char *s) while (*s) { size_t c_size; - codepoint_t c = next_codepoint(s, &c_size); - if (c < 0x10000) { - /* Unicode char fits into 16 bits. */ + codepoint_t c = next_codepoint_ext(s, src_charset, &c_size); + s += c_size; + + switch (dst_charset) { + case CH_UTF16LE: + case CH_UTF16BE: + case CH_UTF16MUNGED: + if (c < 0x10000) { + /* Unicode char fits into 16 bits. */ + count += 1; + } else { + /* Double-width unicode char - 32 bits. */ + count += 2; + } + break; + case CH_UTF8: + /* + * this only checks ranges, and does not + * check for invalid codepoints + */ + if (c < 0x80) { + count += 1; + } else if (c < 0x800) { + count += 2; + } else if (c < 0x10000) { + count += 3; + } else { + count += 4; + } + break; + default: + /* + * non-unicode encoding: + * assume that each codepoint fits into + * one unit in the destination encoding. + */ count += 1; - } else { - /* Double-width unicode char - 32 bits. */ - count += 2; } - s += c_size; } return count; } /** + * Calculate the number of 16-bit units that would bee needed to convert + * the input string which is expected to be in CH_UNIX encoding to UTF16. + * + * This will be the same as the number of bytes in a string for single + * byte strings, but will be different for multibyte. 
+ */ + +size_t strlen_m(const char *s) +{ + return strlen_m_ext(s, CH_UNIX, CH_UTF16LE); +} + +/** Count the number of UCS2 characters in a string including the null terminator. **/ -- 1.6.3.3 From 767cac2ba518d5bc1526dda519d7bbcec0853e7d Mon Sep 17 00:00:00 2001 From: Michael Adam Date: Sun, 31 Oct 2010 02:02:16 +0200 Subject: [PATCH 5/7] s3:util_str: add strlen_m_ext_term() - variant of strlen_m_ext() counting terminator (cherry picked from commit f9cc1fa45dad2702ffdd155cec98ad4f51aac39a) --- source3/include/proto.h | 2 ++ source3/lib/util_str.c | 9 +++++++++ 2 files changed, 11 insertions(+), 0 deletions(-) diff --git a/source3/include/proto.h b/source3/include/proto.h index 3c36488..f817c2d 100644 --- a/source3/include/proto.h +++ b/source3/include/proto.h @@ -1543,6 +1543,8 @@ void strlower_m(char *s); void strupper_m(char *s); size_t strlen_m_ext(const char *s, const charset_t src_charset, const charset_t dst_charset); +size_t strlen_m_ext_term(const char *s, const charset_t src_charset, + const charset_t dst_charset); size_t strlen_m(const char *s); size_t strlen_m_term(const char *s); size_t strlen_m_term_null(const char *s); diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c index 5937c9d..33c2bce 100644 --- a/source3/lib/util_str.c +++ b/source3/lib/util_str.c @@ -1525,6 +1525,15 @@ size_t strlen_m_ext(const char *s, const charset_t src_charset, return count; } +size_t strlen_m_ext_term(const char *s, const charset_t src_charset, + const charset_t dst_charset) +{ + if (!s) { + return 0; + } + return strlen_m_ext(s, src_charset, dst_charset) + 1; +} + /** * Calculate the number of 16-bit units that would bee needed to convert * the input string which is expected to be in CH_UNIX encoding to UTF16. -- 1.6.3.3 From 4bbd327651794becf5b0a67c6cab8589869faf28 Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 25 Aug 2010 10:05:15 +0200 Subject: [PATCH 6/7] librpc/ndr: correctly implement ndr_charset_length() Before we ignored the charset type. 
metze Signed-off-by: Michael Adam --- source3/librpc/ndr/ndr_string.c | 16 ++++++++++++++-- 1 files changed, 14 insertions(+), 2 deletions(-) diff --git a/source3/librpc/ndr/ndr_string.c b/source3/librpc/ndr/ndr_string.c index 519be7b..51b0d89 100644 --- a/source3/librpc/ndr/ndr_string.c +++ b/source3/librpc/ndr/ndr_string.c @@ -714,7 +714,19 @@ _PUBLIC_ enum ndr_err_code ndr_push_charset(struct ndr_push *ndr, int ndr_flags, /* Return number of elements in a string in the specified charset */ _PUBLIC_ uint32_t ndr_charset_length(const void *var, charset_t chset) { - /* FIXME: Treat special chars special here, taking chset into account */ - /* Also include 0 byte */ + switch (chset) { + /* case CH_UTF16: this has the same value as CH_UTF16LE */ + case CH_UTF16LE: + case CH_UTF16BE: + case CH_UTF16MUNGED: + return strlen_m_term((const char *)var); + case CH_DISPLAY: + case CH_DOS: + case CH_UNIX: + case CH_UTF8: + return strlen((const char *)var)+1; + } + + /* Fallback, this should never happen */ return strlen((const char *)var)+1; } -- 1.6.3.3 From 7b9b874dc434763ef83397997840d203a656f97c Mon Sep 17 00:00:00 2001 From: Michael Adam Date: Sun, 31 Oct 2010 02:04:25 +0200 Subject: [PATCH 7/7] s3:librpc/ndr: use new strlen_m_ext_term() in ndr_charset_length(): fix bug #7594 This fixes the calculation of needed space for destination unicode charset. 
--- source3/librpc/ndr/ndr_string.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source3/librpc/ndr/ndr_string.c b/source3/librpc/ndr/ndr_string.c index 51b0d89..ec745e9 100644 --- a/source3/librpc/ndr/ndr_string.c +++ b/source3/librpc/ndr/ndr_string.c @@ -719,11 +719,11 @@ _PUBLIC_ uint32_t ndr_charset_length(const void *var, charset_t chset) case CH_UTF16LE: case CH_UTF16BE: case CH_UTF16MUNGED: - return strlen_m_term((const char *)var); + case CH_UTF8: + return strlen_m_ext_term((const char *)var, CH_UNIX, chset); case CH_DISPLAY: case CH_DOS: case CH_UNIX: - case CH_UTF8: return strlen((const char *)var)+1; } -- 1.6.3.3