Attachment 6041 Details for Bug 7594 – Proposed patchset for fixing the bug.

[patch] Proposed patchset for fixing the bug.

patchset-bug7594.mbox (text/plain), 15.58 KB, created by Michael Adam on 2010-11-01 09:07:43 UTC

(hide)

Description:

Filename:

MIME Type:

Creator: Michael Adam

Created: 2010-11-01 09:07:43 UTC

Size: 15.58 KB

patch

obsolete

>From 4af322cad402fb19de7f77cd29f61f9200f14aba Mon Sep 17 00:00:00 2001
>From: Michael Adam <obnox@samba.org>
>Date: Fri, 29 Oct 2010 22:06:05 +0200
>Subject: [PATCH 1/7] s3:lib/charcnv: add next_codepoint_ext() that accepts input charset.
>
>next_codepoint() takes a string in CH_UNIX encoding and returns the
>unicode codepoint of the next (possibly multibyte) character of the
>input string.
>
>The new next_codepoint_ext() function adds the encoding of the input
>string as a parameter. next_codepoint() now only calls next_codepoint_ext()
>with CH_UNIX as src_charset argument.
>(cherry picked from commit b887a7b33a855bc3ac6b06f497136b371340d46a)
>---
> source3/include/proto.h |    2 +
> source3/lib/charcnv.c   |   74 +++++++++++++++++++++++++++++++---------------
> 2 files changed, 52 insertions(+), 24 deletions(-)
>
>diff --git a/source3/include/proto.h b/source3/include/proto.h
>index 5064fdb..c9c0b26 100644
>--- a/source3/include/proto.h
>+++ b/source3/include/proto.h
>@@ -410,6 +410,8 @@ size_t pull_string_talloc_fn(const char *function,
> 			size_t src_len,
> 			int flags);
> size_t align_string(const void *base_ptr, const char *p, int flags);
>+codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
>+			       size_t *bytes_consumed);
> codepoint_t next_codepoint(const char *str, size_t *size);
> 
> /* The following definitions come from lib/clobber.c  */
>diff --git a/source3/lib/charcnv.c b/source3/lib/charcnv.c
>index 9ac9930..03d1031 100644
>--- a/source3/lib/charcnv.c
>+++ b/source3/lib/charcnv.c
>@@ -1793,17 +1793,23 @@ size_t align_string(const void *base_ptr, const char *p, int flags)
> 	return 0;
> }
> 
>-/*
>-  Return the unicode codepoint for the next multi-byte CH_UNIX character
>-  in the string. The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
>-
>-  Also return the number of bytes consumed (which tells the caller
>-  how many bytes to skip to get to the next CH_UNIX character).
>-
>-  Return INVALID_CODEPOINT if the next character cannot be converted.
>-*/
>+/**
>+ * Return the unicode codepoint for the next character in the input
>+ * string in the given src_charset.
>+ * The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
>+ *
>+ * Also return the number of bytes consumed (which tells the caller
>+ * how many bytes to skip to get to the next src_charset-character).
>+ *
>+ * This is implemented (in the non-ascii-case) by first converting the
>+ * next character in the input string to UTF16_LE and then calculating
>+ * the unicode codepoint from that.
>+ *
>+ * Return INVALID_CODEPOINT if the next character cannot be converted.
>+ */
> 
>-codepoint_t next_codepoint(const char *str, size_t *size)
>+codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
>+			       size_t *bytes_consumed)
> {
> 	/* It cannot occupy more than 4 bytes in UTF16 format */
> 	uint8_t buf[4];
>@@ -1813,41 +1819,46 @@ codepoint_t next_codepoint(const char *str, size_t *size)
> 	size_t olen;
> 	char *outbuf;
> 
>+	/* fastpath if the character is ASCII */
> 	if ((str[0] & 0x80) == 0) {
>-		*size = 1;
>+		*bytes_consumed = 1;
> 		return (codepoint_t)str[0];
> 	}
> 
>-	/* We assume that no multi-byte character can take
>-	   more than 5 bytes. This is OK as we only
>-	   support codepoints up to 1M */
>+	/*
>+	 * We assume that no multi-byte character can take more than
>+	 * 5 bytes. This is OK as we only support codepoints up to 1M
>+	 */
> 
> 	ilen_orig = strnlen(str, 5);
> 	ilen = ilen_orig;
> 
>-        lazy_initialize_conv();
>+	lazy_initialize_conv();
> 
>-        descriptor = conv_handles[CH_UNIX][CH_UTF16LE];
>+	descriptor = conv_handles[src_charset][CH_UTF16LE];
> 	if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
>-		*size = 1;
>+		*bytes_consumed = 1;
> 		return INVALID_CODEPOINT;
> 	}
> 
>-	/* This looks a little strange, but it is needed to cope
>-	   with codepoints above 64k which are encoded as per RFC2781. */
>+	/*
>+	 * This looks a little strange, but it is needed to cope
>+	 * with codepoints above 64k which are encoded as per RFC2781.
>+	 */
> 	olen = 2;
> 	outbuf = (char *)buf;
> 	smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
> 	if (olen == 2) {
>-		/* We failed to convert to a 2 byte character.
>-		   See if we can convert to a 4 UTF16-LE byte char encoding.
>-		*/
>+		/*
>+		 * We failed to convert to a 2 byte character.
>+		 * See if we can convert to a 4 UTF16-LE byte char encoding.
>+		 */
> 		olen = 4;
> 		outbuf = (char *)buf;
> 		smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
> 		if (olen == 4) {
> 			/* We didn't convert any bytes */
>-			*size = 1;
>+			*bytes_consumed = 1;
> 			return INVALID_CODEPOINT;
> 		}
> 		olen = 4 - olen;
>@@ -1855,7 +1866,7 @@ codepoint_t next_codepoint(const char *str, size_t *size)
> 		olen = 2 - olen;
> 	}
> 
>-	*size = ilen_orig - ilen;
>+	*bytes_consumed = ilen_orig - ilen;
> 
> 	if (olen == 2) {
> 		/* 2 byte, UTF16-LE encoded value. */
>@@ -1877,6 +1888,21 @@ codepoint_t next_codepoint(const char *str, size_t *size)
> }
> 
> /*
>+  Return the unicode codepoint for the next multi-byte CH_UNIX character
>+  in the string. The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
>+
>+  Also return the number of bytes consumed (which tells the caller
>+  how many bytes to skip to get to the next CH_UNIX character).
>+
>+  Return INVALID_CODEPOINT if the next character cannot be converted.
>+*/
>+
>+codepoint_t next_codepoint(const char *str, size_t *size)
>+{
>+	return next_codepoint_ext(str, CH_UNIX, size);
>+}
>+
>+/*
>   push a single codepoint into a CH_UNIX string the target string must
>   be able to hold the full character, which is guaranteed if it is at
>   least 5 bytes in size. The caller may pass less than 5 bytes if they
>-- 
>1.6.3.3
>
>
>From 111e2a3bd4663e22cae3b013a09a7426f8cc01ed Mon Sep 17 00:00:00 2001
>From: Michael Adam <obnox@samba.org>
>Date: Fri, 29 Oct 2010 22:11:30 +0200
>Subject: [PATCH 2/7] s3:lib/charcnv: clarify comments in next_codepoint_ext()
>
>(giving the unicode U+<hexnumber> notation of the codepoints
> referred to in the comments)
>(cherry picked from commit bd874fec1ca70cdb1d1551ffcc8be51bb95c8d26)
>---
> source3/lib/charcnv.c |    4 ++--
> 1 files changed, 2 insertions(+), 2 deletions(-)
>
>diff --git a/source3/lib/charcnv.c b/source3/lib/charcnv.c
>index 03d1031..9c76e1b 100644
>--- a/source3/lib/charcnv.c
>+++ b/source3/lib/charcnv.c
>@@ -1827,7 +1827,7 @@ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
> 
> 	/*
> 	 * We assume that no multi-byte character can take more than
>-	 * 5 bytes. This is OK as we only support codepoints up to 1M
>+	 * 5 bytes. This is OK as we only support codepoints up to 1M (U+100000)
> 	 */
> 
> 	ilen_orig = strnlen(str, 5);
>@@ -1843,7 +1843,7 @@ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
> 
> 	/*
> 	 * This looks a little strange, but it is needed to cope
>-	 * with codepoints above 64k which are encoded as per RFC2781.
>+	 * with codepoints above 64k (U+10000) which are encoded as per RFC2781.
> 	 */
> 	olen = 2;
> 	outbuf = (char *)buf;
>-- 
>1.6.3.3
>
>
>From ab4474abd92b0b1134e04166e250edc03bbc8af4 Mon Sep 17 00:00:00 2001
>From: Michael Adam <obnox@samba.org>
>Date: Fri, 29 Oct 2010 22:21:47 +0200
>Subject: [PATCH 3/7] s3:util_str: clarify the comment header for strlen_m().
> (cherry picked from commit cd79c661994530e6bd26aae1a7977a3dc04d42c0)
>
>---
> source3/lib/util_str.c |   10 ++++++----
> 1 files changed, 6 insertions(+), 4 deletions(-)
>
>diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c
>index 9a0b12a..7b2ee05 100644
>--- a/source3/lib/util_str.c
>+++ b/source3/lib/util_str.c
>@@ -1454,10 +1454,12 @@ void strupper_m(char *s)
> }
> 
> /**
>- Count the number of UCS2 characters in a string. Normally this will
>- be the same as the number of bytes in a string for single byte strings,
>- but will be different for multibyte.
>-**/
>+ * Calculate the number of 16-bit units that would be needed to convert
>+ * the input string which is expected to be in CH_UNIX encoding to UTF16.
>+ *
>+ * This will be the same as the number of bytes in a string for single
>+ * byte strings, but will be different for multibyte.
>+ */
> 
> size_t strlen_m(const char *s)
> {
>-- 
>1.6.3.3
>
>
>From 2eb186e21fc5a0a73d4fbcc331eb79b50ad9d34e Mon Sep 17 00:00:00 2001
>From: Michael Adam <obnox@samba.org>
>Date: Sat, 30 Oct 2010 02:03:02 +0200
>Subject: [PATCH 4/7] s3:util_str: add strlen_m_ext that takes input and output charset
>
>The function calculates the number of units (8 or 16-bit, depending
>on the destination charset), that would be needed to convert the
>input string which is expected to be in src_charset encoding
>to the dst_charset (which should be a unicode charset).
>(cherry picked from commit 805f7331063db4a5e1156091bff4da0856daa2c2)
>---
> source3/include/proto.h |    2 +
> source3/lib/util_str.c  |   70 ++++++++++++++++++++++++++++++++++++++---------
> 2 files changed, 59 insertions(+), 13 deletions(-)
>
>diff --git a/source3/include/proto.h b/source3/include/proto.h
>index c9c0b26..3c36488 100644
>--- a/source3/include/proto.h
>+++ b/source3/include/proto.h
>@@ -1541,6 +1541,8 @@ char *strnrchr_m(const char *s, char c, unsigned int n);
> char *strstr_m(const char *src, const char *findstr);
> void strlower_m(char *s);
> void strupper_m(char *s);
>+size_t strlen_m_ext(const char *s, const charset_t src_charset,
>+		    const charset_t dst_charset);
> size_t strlen_m(const char *s);
> size_t strlen_m_term(const char *s);
> size_t strlen_m_term_null(const char *s);
>diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c
>index 7b2ee05..5937c9d 100644
>--- a/source3/lib/util_str.c
>+++ b/source3/lib/util_str.c
>@@ -1454,14 +1454,14 @@ void strupper_m(char *s)
> }
> 
> /**
>- * Calculate the number of 16-bit units that would be needed to convert
>- * the input string which is expected to be in CH_UNIX encoding to UTF16.
>- *
>- * This will be the same as the number of bytes in a string for single
>- * byte strings, but will be different for multibyte.
>+ * Calculate the number of units (8 or 16-bit, depending on the
>+ * destination charset), that would be needed to convert the input
>+ * string which is expected to be in src_charset encoding to the
>+ * destination charset (which should be a unicode charset).
>  */
> 
>-size_t strlen_m(const char *s)
>+size_t strlen_m_ext(const char *s, const charset_t src_charset,
>+		    const charset_t dst_charset)
> {
> 	size_t count = 0;
> 
>@@ -1480,21 +1480,65 @@ size_t strlen_m(const char *s)
> 
> 	while (*s) {
> 		size_t c_size;
>-		codepoint_t c = next_codepoint(s, &c_size);
>-		if (c < 0x10000) {
>-			/* Unicode char fits into 16 bits. */
>+		codepoint_t c = next_codepoint_ext(s, src_charset, &c_size);
>+		s += c_size;
>+
>+		switch (dst_charset) {
>+		case CH_UTF16LE:
>+		case CH_UTF16BE:
>+		case CH_UTF16MUNGED:
>+			if (c < 0x10000) {
>+				/* Unicode char fits into 16 bits. */
>+				count += 1;
>+			} else {
>+				/* Double-width unicode char - 32 bits. */
>+				count += 2;
>+			}
>+			break;
>+		case CH_UTF8:
>+			/*
>+			 * UTF-8 unit count by codepoint range (RFC 3629);
>+			 * invalid codepoints are not rejected here
>+			 */
>+			if (c < 0x80) {
>+				count += 1;
>+			} else if (c < 0x800) {
>+				count += 2;
>+			} else if (c < 0x10000) {
>+				count += 3;
>+			} else {
>+				count += 4;
>+			}
>+			break;
>+		default:
>+			/* non-unicode encoding - does not apply */
>+			 return 0;
>+			/*
>+			 * non-unicode encoding:
>+			 * assume that each codepoint fits into
>+			 * one unit in the destination encoding.
>+			 */
> 			count += 1;
>-		} else {
>-			/* Double-width unicode char - 32 bits. */
>-			count += 2;
> 		}
>-		s += c_size;
> 	}
> 
> 	return count;
> }
> 
> /**
>+ * Calculate the number of 16-bit units that would bee needed to convert
>+ * the input string which is expected to be in CH_UNIX encoding to UTF16.
>+ *
>+ * This will be the same as the number of bytes in a string for single
>+ * byte strings, but will be different for multibyte.
>+ */
>+
>+size_t strlen_m(const char *s)
>+{
>+	return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
>+}
>+
>+/**
>  Count the number of UCS2 characters in a string including the null
>  terminator.
> **/
>-- 
>1.6.3.3
>
>
>From 767cac2ba518d5bc1526dda519d7bbcec0853e7d Mon Sep 17 00:00:00 2001
>From: Michael Adam <obnox@samba.org>
>Date: Sun, 31 Oct 2010 02:02:16 +0200
>Subject: [PATCH 5/7] s3:util_str: add strlen_m_ext_term() - variant of strlen_m_ext() counting terminator
> (cherry picked from commit f9cc1fa45dad2702ffdd155cec98ad4f51aac39a)
>
>---
> source3/include/proto.h |    2 ++
> source3/lib/util_str.c  |    9 +++++++++
> 2 files changed, 11 insertions(+), 0 deletions(-)
>
>diff --git a/source3/include/proto.h b/source3/include/proto.h
>index 3c36488..f817c2d 100644
>--- a/source3/include/proto.h
>+++ b/source3/include/proto.h
>@@ -1543,6 +1543,8 @@ void strlower_m(char *s);
> void strupper_m(char *s);
> size_t strlen_m_ext(const char *s, const charset_t src_charset,
> 		    const charset_t dst_charset);
>+size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
>+			 const charset_t dst_charset);
> size_t strlen_m(const char *s);
> size_t strlen_m_term(const char *s);
> size_t strlen_m_term_null(const char *s);
>diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c
>index 5937c9d..33c2bce 100644
>--- a/source3/lib/util_str.c
>+++ b/source3/lib/util_str.c
>@@ -1525,6 +1525,15 @@ size_t strlen_m_ext(const char *s, const charset_t src_charset,
> 	return count;
> }
> 
>+size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
>+			 const charset_t dst_charset)
>+{
>+	if (!s) {
>+		return 0;
>+	}
>+	return strlen_m_ext(s, src_charset, dst_charset) + 1;
>+}
>+
> /**
>  * Calculate the number of 16-bit units that would bee needed to convert
>  * the input string which is expected to be in CH_UNIX encoding to UTF16.
>-- 
>1.6.3.3
>
>
>From 4bbd327651794becf5b0a67c6cab8589869faf28 Mon Sep 17 00:00:00 2001
>From: Stefan Metzmacher <metze@samba.org>
>Date: Wed, 25 Aug 2010 10:05:15 +0200
>Subject: [PATCH 6/7] librpc/ndr: correctly implement ndr_charset_length()
>
>Before we ignored the charset type.
>
>metze
>
>Signed-off-by: Michael Adam <obnox@samba.org>
>---
> source3/librpc/ndr/ndr_string.c |   16 ++++++++++++++--
> 1 files changed, 14 insertions(+), 2 deletions(-)
>
>diff --git a/source3/librpc/ndr/ndr_string.c b/source3/librpc/ndr/ndr_string.c
>index 519be7b..51b0d89 100644
>--- a/source3/librpc/ndr/ndr_string.c
>+++ b/source3/librpc/ndr/ndr_string.c
>@@ -714,7 +714,19 @@ _PUBLIC_ enum ndr_err_code ndr_push_charset(struct ndr_push *ndr, int ndr_flags,
> /* Return number of elements in a string in the specified charset */
> _PUBLIC_ uint32_t ndr_charset_length(const void *var, charset_t chset)
> {
>-	/* FIXME: Treat special chars special here, taking chset into account */
>-	/* Also include 0 byte */
>+	switch (chset) {
>+	/* case CH_UTF16: this has the same value as CH_UTF16LE */
>+	case CH_UTF16LE:
>+	case CH_UTF16BE:
>+	case CH_UTF16MUNGED:
>+		return strlen_m_term((const char *)var);
>+	case CH_DISPLAY:
>+	case CH_DOS:
>+	case CH_UNIX:
>+	case CH_UTF8:
>+		return strlen((const char *)var)+1;
>+	}
>+
>+	/* Fallback, this should never happen */
> 	return strlen((const char *)var)+1;
> }
>-- 
>1.6.3.3
>
>
>From 7b9b874dc434763ef83397997840d203a656f97c Mon Sep 17 00:00:00 2001
>From: Michael Adam <obnox@samba.org>
>Date: Sun, 31 Oct 2010 02:04:25 +0200
>Subject: [PATCH 7/7] s3:librpc/ndr: use new strlen_m_ext_term() in ndr_charset_length(): fix bug #7594
>
>This fixes the calculation of needed space for destination unicode charset.
>---
> source3/librpc/ndr/ndr_string.c |    4 ++--
> 1 files changed, 2 insertions(+), 2 deletions(-)
>
>diff --git a/source3/librpc/ndr/ndr_string.c b/source3/librpc/ndr/ndr_string.c
>index 51b0d89..ec745e9 100644
>--- a/source3/librpc/ndr/ndr_string.c
>+++ b/source3/librpc/ndr/ndr_string.c
>@@ -719,11 +719,11 @@ _PUBLIC_ uint32_t ndr_charset_length(const void *var, charset_t chset)
> 	case CH_UTF16LE:
> 	case CH_UTF16BE:
> 	case CH_UTF16MUNGED:
>-		return strlen_m_term((const char *)var);
>+	case CH_UTF8:
>+		return strlen_m_ext_term((const char *)var, CH_UNIX, chset);
> 	case CH_DISPLAY:
> 	case CH_DOS:
> 	case CH_UNIX:
>-	case CH_UTF8:
> 		return strlen((const char *)var)+1;
> 	}
> 
>-- 
>1.6.3.3
>

Actions: View

Attachments on bug 7594: 5876 | 6041 | 6042