From 4af322cad402fb19de7f77cd29f61f9200f14aba Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 29 Oct 2010 22:06:05 +0200
Subject: [PATCH 1/7] s3:lib/charcnv: add next_codepoint_ext() that accepts input charset.

next_codepoint() takes a string in CH_UNIX encoding and returns the
unicode codepoint of the next (possibly multibyte) character of the
input string.

The new next_codepoint_ext() function adds the encoding of the input
string as a parameter. next_codepoint() now only calls next_codepoint_ext()
with CH_UNIX as src_charset argument.
(cherry picked from commit b887a7b33a855bc3ac6b06f497136b371340d46a)
---
 source3/include/proto.h |    2 +
 source3/lib/charcnv.c   |   74 +++++++++++++++++++++++++++++++---------------
 2 files changed, 52 insertions(+), 24 deletions(-)

diff --git a/source3/include/proto.h b/source3/include/proto.h
index 5064fdb..c9c0b26 100644
--- a/source3/include/proto.h
+++ b/source3/include/proto.h
@@ -410,6 +410,8 @@ size_t pull_string_talloc_fn(const char *function,
 			size_t src_len,
 			int flags);
 size_t align_string(const void *base_ptr, const char *p, int flags);
+codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
+			       size_t *bytes_consumed);
 codepoint_t next_codepoint(const char *str, size_t *size);
 
 /* The following definitions come from lib/clobber.c  */
diff --git a/source3/lib/charcnv.c b/source3/lib/charcnv.c
index 9ac9930..03d1031 100644
--- a/source3/lib/charcnv.c
+++ b/source3/lib/charcnv.c
@@ -1793,17 +1793,23 @@ size_t align_string(const void *base_ptr, const char *p, int flags)
 	return 0;
 }
 
-/*
-  Return the unicode codepoint for the next multi-byte CH_UNIX character
-  in the string. The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
-
-  Also return the number of bytes consumed (which tells the caller
-  how many bytes to skip to get to the next CH_UNIX character).
-
-  Return INVALID_CODEPOINT if the next character cannot be converted.
-*/
+/**
+ * Return the unicode codepoint for the next character in the input
+ * string in the given src_charset.
+ * The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
+ *
+ * Also return the number of bytes consumed (which tells the caller
+ * how many bytes to skip to get to the next src_charset-character).
+ *
+ * This is implemented (in the non-ascii-case) by first converting the
+ * next character in the input string to UTF16_LE and then calculating
+ * the unicode codepoint from that.
+ *
+ * Return INVALID_CODEPOINT if the next character cannot be converted.
+ */
 
-codepoint_t next_codepoint(const char *str, size_t *size)
+codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
+			       size_t *bytes_consumed)
 {
 	/* It cannot occupy more than 4 bytes in UTF16 format */
 	uint8_t buf[4];
@@ -1813,41 +1819,46 @@ codepoint_t next_codepoint(const char *str, size_t *size)
 	size_t olen;
 	char *outbuf;
 
+	/* fastpath if the character is ASCII */
 	if ((str[0] & 0x80) == 0) {
-		*size = 1;
+		*bytes_consumed = 1;
 		return (codepoint_t)str[0];
 	}
 
-	/* We assume that no multi-byte character can take
-	   more than 5 bytes. This is OK as we only
-	   support codepoints up to 1M */
+	/*
+	 * We assume that no multi-byte character can take more than
+	 * 5 bytes. This is OK as we only support codepoints up to 1M
+	 */
 
 	ilen_orig = strnlen(str, 5);
 	ilen = ilen_orig;
 
-        lazy_initialize_conv();
+	lazy_initialize_conv();
 
-        descriptor = conv_handles[CH_UNIX][CH_UTF16LE];
+	descriptor = conv_handles[src_charset][CH_UTF16LE];
 	if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
-		*size = 1;
+		*bytes_consumed = 1;
 		return INVALID_CODEPOINT;
 	}
 
-	/* This looks a little strange, but it is needed to cope
-	   with codepoints above 64k which are encoded as per RFC2781. */
+	/*
+	 * This looks a little strange, but it is needed to cope
+	 * with codepoints above 64k which are encoded as per RFC2781.
+	 */
 	olen = 2;
 	outbuf = (char *)buf;
 	smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
 	if (olen == 2) {
-		/* We failed to convert to a 2 byte character.
-		   See if we can convert to a 4 UTF16-LE byte char encoding.
-		*/
+		/*
+		 * We failed to convert to a 2 byte character.
+		 * See if we can convert to a 4 UTF16-LE byte char encoding.
+		 */
 		olen = 4;
 		outbuf = (char *)buf;
 		smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
 		if (olen == 4) {
 			/* We didn't convert any bytes */
-			*size = 1;
+			*bytes_consumed = 1;
 			return INVALID_CODEPOINT;
 		}
 		olen = 4 - olen;
@@ -1855,7 +1866,7 @@ codepoint_t next_codepoint(const char *str, size_t *size)
 		olen = 2 - olen;
 	}
 
-	*size = ilen_orig - ilen;
+	*bytes_consumed = ilen_orig - ilen;
 
 	if (olen == 2) {
 		/* 2 byte, UTF16-LE encoded value. */
@@ -1877,6 +1888,21 @@ codepoint_t next_codepoint(const char *str, size_t *size)
 }
 
 /*
+  Return the unicode codepoint for the next multi-byte CH_UNIX character
+  in the string. The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
+
+  Also return the number of bytes consumed (which tells the caller
+  how many bytes to skip to get to the next CH_UNIX character).
+
+  Return INVALID_CODEPOINT if the next character cannot be converted.
+*/
+
+codepoint_t next_codepoint(const char *str, size_t *size)
+{
+	return next_codepoint_ext(str, CH_UNIX, size);
+}
+
+/*
   push a single codepoint into a CH_UNIX string the target string must
   be able to hold the full character, which is guaranteed if it is at
   least 5 bytes in size. The caller may pass less than 5 bytes if they
-- 
1.6.3.3


From 111e2a3bd4663e22cae3b013a09a7426f8cc01ed Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 29 Oct 2010 22:11:30 +0200
Subject: [PATCH 2/7] s3:lib/charcnv: clarify comments in next_codepoint_ext()

(giving the unicode U+<hexnumber> notation of the codepoints
 referred to in the comments)
(cherry picked from commit bd874fec1ca70cdb1d1551ffcc8be51bb95c8d26)
---
 source3/lib/charcnv.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/source3/lib/charcnv.c b/source3/lib/charcnv.c
index 03d1031..9c76e1b 100644
--- a/source3/lib/charcnv.c
+++ b/source3/lib/charcnv.c
@@ -1827,7 +1827,7 @@ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
 
 	/*
 	 * We assume that no multi-byte character can take more than
-	 * 5 bytes. This is OK as we only support codepoints up to 1M
+	 * 5 bytes. This is OK as we only support codepoints up to 1M (U+100000)
 	 */
 
 	ilen_orig = strnlen(str, 5);
@@ -1843,7 +1843,7 @@ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
 
 	/*
 	 * This looks a little strange, but it is needed to cope
-	 * with codepoints above 64k which are encoded as per RFC2781.
+	 * with codepoints above 64k (U+10000) which are encoded as per RFC2781.
 	 */
 	olen = 2;
 	outbuf = (char *)buf;
-- 
1.6.3.3


From ab4474abd92b0b1134e04166e250edc03bbc8af4 Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Fri, 29 Oct 2010 22:21:47 +0200
Subject: [PATCH 3/7] s3:util_str: clarify the comment header for strlen_m().
 (cherry picked from commit cd79c661994530e6bd26aae1a7977a3dc04d42c0)

---
 source3/lib/util_str.c |   10 ++++++----
 1 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c
index 9a0b12a..7b2ee05 100644
--- a/source3/lib/util_str.c
+++ b/source3/lib/util_str.c
@@ -1454,10 +1454,12 @@ void strupper_m(char *s)
 }
 
 /**
- Count the number of UCS2 characters in a string. Normally this will
- be the same as the number of bytes in a string for single byte strings,
- but will be different for multibyte.
-**/
+ * Calculate the number of 16-bit units that would be needed to convert
+ * the input string which is expected to be in CH_UNIX encoding to UTF16.
+ *
+ * This will be the same as the number of bytes in a string for single
+ * byte strings, but will be different for multibyte.
+ */
 
 size_t strlen_m(const char *s)
 {
-- 
1.6.3.3


From 2eb186e21fc5a0a73d4fbcc331eb79b50ad9d34e Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Sat, 30 Oct 2010 02:03:02 +0200
Subject: [PATCH 4/7] s3:util_str: add strlen_m_ext that takes input and output charset

The function calculates the number of units (8 or 16-bit, depending
on the destination charset), that would be needed to convert the
input string which is expected to be in src_charset encoding
to the dst_charset (which should be a unicode charset).
(cherry picked from commit 805f7331063db4a5e1156091bff4da0856daa2c2)
---
 source3/include/proto.h |    2 +
 source3/lib/util_str.c  |   70 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/source3/include/proto.h b/source3/include/proto.h
index c9c0b26..3c36488 100644
--- a/source3/include/proto.h
+++ b/source3/include/proto.h
@@ -1541,6 +1541,8 @@ char *strnrchr_m(const char *s, char c, unsigned int n);
 char *strstr_m(const char *src, const char *findstr);
 void strlower_m(char *s);
 void strupper_m(char *s);
+size_t strlen_m_ext(const char *s, const charset_t src_charset,
+		    const charset_t dst_charset);
 size_t strlen_m(const char *s);
 size_t strlen_m_term(const char *s);
 size_t strlen_m_term_null(const char *s);
diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c
index 7b2ee05..5937c9d 100644
--- a/source3/lib/util_str.c
+++ b/source3/lib/util_str.c
@@ -1454,14 +1454,14 @@ void strupper_m(char *s)
 }
 
 /**
- * Calculate the number of 16-bit units that would be needed to convert
- * the input string which is expected to be in CH_UNIX encoding to UTF16.
- *
- * This will be the same as the number of bytes in a string for single
- * byte strings, but will be different for multibyte.
+ * Calculate the number of units (8 or 16-bit, depending on the
+ * destination charset), that would be needed to convert the input
+ * string which is expected to be in src_charset encoding to the
+ * destination charset (which should be a unicode charset).
  */
 
-size_t strlen_m(const char *s)
+size_t strlen_m_ext(const char *s, const charset_t src_charset,
+		    const charset_t dst_charset)
 {
 	size_t count = 0;
 
@@ -1480,21 +1480,65 @@ size_t strlen_m(const char *s)
 
 	while (*s) {
 		size_t c_size;
-		codepoint_t c = next_codepoint(s, &c_size);
-		if (c < 0x10000) {
-			/* Unicode char fits into 16 bits. */
+		codepoint_t c = next_codepoint_ext(s, src_charset, &c_size);
+		s += c_size;
+
+		switch (dst_charset) {
+		case CH_UTF16LE:
+		case CH_UTF16BE:
+		case CH_UTF16MUNGED:
+			if (c < 0x10000) {
+				/* Unicode char fits into 16 bits. */
+				count += 1;
+			} else {
+				/* Double-width unicode char - 32 bits. */
+				count += 2;
+			}
+			break;
+		case CH_UTF8:
+			/*
+			 * UTF-8 byte count by range only (3 bytes up to
+			 * U+FFFF); invalid codepoints are not detected
+			 */
+			if (c < 0x80) {
+				count += 1;
+			} else if (c < 0x800) {
+				count += 2;
+			} else if (c < 0x10000) {
+				count += 3;
+			} else {
+				count += 4;
+			}
+			break;
+		default:
+			/* non-unicode encoding - does not apply */
+			 return 0;
+			/*
+			 * non-unicode encoding:
+			 * assume that each codepoint fits into
+			 * one unit in the destination encoding.
+			 */
 			count += 1;
-		} else {
-			/* Double-width unicode char - 32 bits. */
-			count += 2;
 		}
-		s += c_size;
 	}
 
 	return count;
 }
 
 /**
+ * Calculate the number of 16-bit units that would bee needed to convert
+ * the input string which is expected to be in CH_UNIX encoding to UTF16.
+ *
+ * This will be the same as the number of bytes in a string for single
+ * byte strings, but will be different for multibyte.
+ */
+
+size_t strlen_m(const char *s)
+{
+	return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
+}
+
+/**
  Count the number of UCS2 characters in a string including the null
  terminator.
 **/
-- 
1.6.3.3


From 767cac2ba518d5bc1526dda519d7bbcec0853e7d Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Sun, 31 Oct 2010 02:02:16 +0200
Subject: [PATCH 5/7] s3:util_str: add strlen_m_ext_term() - variant of strlen_m_ext() counting terminator
 (cherry picked from commit f9cc1fa45dad2702ffdd155cec98ad4f51aac39a)

---
 source3/include/proto.h |    2 ++
 source3/lib/util_str.c  |    9 +++++++++
 2 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/source3/include/proto.h b/source3/include/proto.h
index 3c36488..f817c2d 100644
--- a/source3/include/proto.h
+++ b/source3/include/proto.h
@@ -1543,6 +1543,8 @@ void strlower_m(char *s);
 void strupper_m(char *s);
 size_t strlen_m_ext(const char *s, const charset_t src_charset,
 		    const charset_t dst_charset);
+size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
+			 const charset_t dst_charset);
 size_t strlen_m(const char *s);
 size_t strlen_m_term(const char *s);
 size_t strlen_m_term_null(const char *s);
diff --git a/source3/lib/util_str.c b/source3/lib/util_str.c
index 5937c9d..33c2bce 100644
--- a/source3/lib/util_str.c
+++ b/source3/lib/util_str.c
@@ -1525,6 +1525,15 @@ size_t strlen_m_ext(const char *s, const charset_t src_charset,
 	return count;
 }
 
+size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
+			 const charset_t dst_charset)
+{
+	if (!s) {
+		return 0;
+	}
+	return strlen_m_ext(s, src_charset, dst_charset) + 1;
+}
+
 /**
  * Calculate the number of 16-bit units that would bee needed to convert
  * the input string which is expected to be in CH_UNIX encoding to UTF16.
-- 
1.6.3.3


From 4bbd327651794becf5b0a67c6cab8589869faf28 Mon Sep 17 00:00:00 2001
From: Stefan Metzmacher <metze@samba.org>
Date: Wed, 25 Aug 2010 10:05:15 +0200
Subject: [PATCH 6/7] librpc/ndr: correctly implement ndr_charset_length()

Before we ignored the charset type.

metze

Signed-off-by: Michael Adam <obnox@samba.org>
---
 source3/librpc/ndr/ndr_string.c |   16 ++++++++++++++--
 1 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/source3/librpc/ndr/ndr_string.c b/source3/librpc/ndr/ndr_string.c
index 519be7b..51b0d89 100644
--- a/source3/librpc/ndr/ndr_string.c
+++ b/source3/librpc/ndr/ndr_string.c
@@ -714,7 +714,19 @@ _PUBLIC_ enum ndr_err_code ndr_push_charset(struct ndr_push *ndr, int ndr_flags,
 /* Return number of elements in a string in the specified charset */
 _PUBLIC_ uint32_t ndr_charset_length(const void *var, charset_t chset)
 {
-	/* FIXME: Treat special chars special here, taking chset into account */
-	/* Also include 0 byte */
+	switch (chset) {
+	/* case CH_UTF16: this has the same value as CH_UTF16LE */
+	case CH_UTF16LE:
+	case CH_UTF16BE:
+	case CH_UTF16MUNGED:
+		return strlen_m_term((const char *)var);
+	case CH_DISPLAY:
+	case CH_DOS:
+	case CH_UNIX:
+	case CH_UTF8:
+		return strlen((const char *)var)+1;
+	}
+
+	/* Fallback, this should never happen */
 	return strlen((const char *)var)+1;
 }
-- 
1.6.3.3


From 7b9b874dc434763ef83397997840d203a656f97c Mon Sep 17 00:00:00 2001
From: Michael Adam <obnox@samba.org>
Date: Sun, 31 Oct 2010 02:04:25 +0200
Subject: [PATCH 7/7] s3:librpc/ndr: use new strlen_m_ext_term() in ndr_charset_length(): fix bug #7594

This fixes the calculation of needed space for destination unicode charset.
---
 source3/librpc/ndr/ndr_string.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/source3/librpc/ndr/ndr_string.c b/source3/librpc/ndr/ndr_string.c
index 51b0d89..ec745e9 100644
--- a/source3/librpc/ndr/ndr_string.c
+++ b/source3/librpc/ndr/ndr_string.c
@@ -719,11 +719,11 @@ _PUBLIC_ uint32_t ndr_charset_length(const void *var, charset_t chset)
 	case CH_UTF16LE:
 	case CH_UTF16BE:
 	case CH_UTF16MUNGED:
-		return strlen_m_term((const char *)var);
+	case CH_UTF8:
+		return strlen_m_ext_term((const char *)var, CH_UNIX, chset);
 	case CH_DISPLAY:
 	case CH_DOS:
 	case CH_UNIX:
-	case CH_UTF8:
 		return strlen((const char *)var)+1;
 	}
 
-- 
1.6.3.3