[EGIT PATCH 03/26] Add Constants.encode as a utility for quick encoding in UTF-8

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



We often need to convert a string into a UTF-8 encoding, so that
we can use this string as a path filter in a TreeWalk or in some
other suitable place where we assume a standard UTF-8 encoding is
being used.  As we have already done the lookup for the CHARSET
we can reuse that same CHARSET reference during future encoding
calls, while allowing the Charset implementation to cache and
reuse the actual encoder instance.

Whenever possible we try to avoid copying the result as most of
the time the returned ByteBuffer's internal array matches the
result array we need to return to our caller.

Signed-off-by: Shawn O. Pearce <spearce@xxxxxxxxxxx>
---
 .../spearce/jgit/lib/ConstantsEncodingTest.java    |   89 ++++++++++++++++++++
 .../src/org/spearce/jgit/lib/Constants.java        |   25 ++++++
 2 files changed, 114 insertions(+), 0 deletions(-)
 create mode 100644 org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java

diff --git a/org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java b/org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java
new file mode 100644
index 0000000..7b3e5a0
--- /dev/null
+++ b/org.spearce.jgit.test/tst/org/spearce/jgit/lib/ConstantsEncodingTest.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright (C) 2008, Google Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials provided
+ *   with the distribution.
+ *
+ * - Neither the name of the Git Development Community nor the
+ *   names of its contributors may be used to endorse or promote
+ *   products derived from this software without specific prior
+ *   written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.spearce.jgit.lib;
+
+import java.io.UnsupportedEncodingException;
+import java.util.Arrays;
+
+import junit.framework.TestCase;
+
+/** Tests for Constants.encodeASCII and Constants.encode. */
+public class ConstantsEncodingTest extends TestCase {
+	public void testEncodeASCII_SimpleASCII() throws UnsupportedEncodingException {
+		final String src = "abc";
+		final byte[] exp = { 'a', 'b', 'c' };
+		final byte[] res = Constants.encodeASCII(src);
+		assertTrue(Arrays.equals(exp, res));
+		assertEquals(src, new String(res, 0, res.length, "UTF-8"));
+	}
+
+	public void testEncodeASCII_FailOnNonASCII() {
+		final String src = "Ūnĭcōde̽"; // U+016A n U+012D c U+014D d e U+033D
+		try {
+			Constants.encodeASCII(src);
+			fail("Incorrectly accepted a Unicode character");
+		} catch (IllegalArgumentException err) {
+			assertEquals("Not ASCII string: " + src, err.getMessage());
+		}
+	}
+
+	public void testEncodeASCII_Number13() {
+		final long src = 13;
+		final byte[] exp = { '1', '3' };
+		final byte[] res = Constants.encodeASCII(src);
+		assertTrue(Arrays.equals(exp, res));
+	}
+
+	public void testEncode_SimpleASCII() throws UnsupportedEncodingException {
+		final String src = "abc";
+		final byte[] exp = { 'a', 'b', 'c' };
+		final byte[] res = Constants.encode(src);
+		assertTrue(Arrays.equals(exp, res));
+		assertEquals(src, new String(res, 0, res.length, "UTF-8"));
+	}
+
+	public void testEncode_Unicode() throws UnsupportedEncodingException {
+		final String src = "Ūnĭcōde̽"; // U+016A n U+012D c U+014D d e U+033D
+		final byte[] exp = { (byte) 0xC5, (byte) 0xAA, 0x6E, (byte) 0xC4,
+				(byte) 0xAD, 0x63, (byte) 0xC5, (byte) 0x8D, 0x64, 0x65,
+				(byte) 0xCC, (byte) 0xBD }; // the UTF-8 bytes of src, in order
+		final byte[] res = Constants.encode(src);
+		assertTrue(Arrays.equals(exp, res));
+		assertEquals(src, new String(res, 0, res.length, "UTF-8"));
+	}
+}
diff --git a/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java b/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java
index 7c2cef9..23ac3ac 100644
--- a/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java
+++ b/org.spearce.jgit/src/org/spearce/jgit/lib/Constants.java
@@ -1,6 +1,7 @@
 /*
  * Copyright (C) 2008, Robin Rosenberg <robin.rosenberg@xxxxxxxxxx>
  * Copyright (C) 2008, Shawn O. Pearce <spearce@xxxxxxxxxxx>
+ * Copyright (C) 2008, Google Inc.
  *
  * All rights reserved.
  *
@@ -38,6 +39,7 @@
 
 package org.spearce.jgit.lib;
 
+import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
@@ -387,6 +389,29 @@ public final class Constants {
 		return r;
 	}
 
+	/**
+	 * Convert a string to a byte array in the standard character encoding,
+	 * reusing the encoder's own array when it already has the exact length.
+	 *
+	 * @param str the string to convert. May contain any Unicode characters.
+	 * @return a byte array representing the requested string, encoded using the
+	 *         default character encoding (UTF-8).
+	 * @see #CHARACTER_ENCODING
+	 */
+	public static byte[] encode(final String str) {
+		final ByteBuffer bb = Constants.CHARSET.encode(str);
+		final int len = bb.limit(); // used below as the size of the result
+		if (bb.hasArray() && bb.arrayOffset() == 0) {
+			final byte[] arr = bb.array();
+			if (arr.length == len)
+				return arr; // exact fit: return the backing array, no copy
+		}
+
+		final byte[] arr = new byte[len]; // otherwise copy len bytes out
+		bb.get(arr);
+		return arr;
+	}
+
 	static {
 		if (OBJECT_ID_LENGTH != newMessageDigest().getDigestLength())
 			throw new LinkageError("Incorrect OBJECT_ID_LENGTH.");
-- 
1.6.0.rc2.22.g71b99

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux