[JGIT PATCH 1/2] Add getEncoding() to RevCommit to discover the encoding

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



If an application needs to parse the raw buffer by hand, it might
benefit from knowing the encoding of the commit.  We can make the
encoding available through a getEncoding() method, using the same
logic we already use for getFullMessage() and getShortMessage().
Note that the result is still only an estimate based on the "encoding"
header and may not match reality if the commit is horribly malformed.

Signed-off-by: Shawn O. Pearce <spearce@xxxxxxxxxxx>
---
 .../spearce/jgit/revwalk/RevCommitParseTest.java   |    6 ++++++
 .../src/org/spearce/jgit/revwalk/RevCommit.java    |   16 ++++++++++++++++
 2 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java b/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java
index 9b95924..62a4ab5 100644
--- a/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java
+++ b/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java
@@ -39,6 +39,7 @@
 
 import java.io.ByteArrayOutputStream;
 
+import org.spearce.jgit.lib.Constants;
 import org.spearce.jgit.lib.ObjectId;
 import org.spearce.jgit.lib.PersonIdent;
 import org.spearce.jgit.lib.RepositoryTestCase;
@@ -145,6 +146,7 @@ public void testParse_implicit_UTF8_encoded() throws Exception {
 		c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id
 		c.parseCanonical(new RevWalk(db), b.toByteArray());
 
+		assertSame(Constants.CHARSET, c.getEncoding());
 		assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName());
 		assertEquals("Sm\u00f6rg\u00e5sbord", c.getShortMessage());
 		assertEquals("Sm\u00f6rg\u00e5sbord\n\n\u304d\u308c\u3044\n", c.getFullMessage());
@@ -163,6 +165,7 @@ public void testParse_implicit_mixed_encoded() throws Exception {
 		c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id
 		c.parseCanonical(new RevWalk(db), b.toByteArray());
 
+		assertSame(Constants.CHARSET, c.getEncoding());
 		assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName());
 		assertEquals("Sm\u00f6rg\u00e5sbord", c.getShortMessage());
 		assertEquals("Sm\u00f6rg\u00e5sbord\n\n\u304d\u308c\u3044\n", c.getFullMessage());
@@ -187,6 +190,7 @@ public void testParse_explicit_encoded() throws Exception {
 		c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id
 		c.parseCanonical(new RevWalk(db), b.toByteArray());
 
+		assertEquals("EUC-JP", c.getEncoding().name());
 		assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName());
 		assertEquals("\u304d\u308c\u3044", c.getShortMessage());
 		assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
@@ -215,6 +219,7 @@ public void testParse_explicit_bad_encoded() throws Exception {
 		c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id
 		c.parseCanonical(new RevWalk(db), b.toByteArray());
 
+		assertEquals("EUC-JP", c.getEncoding().name());
 		assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName());
 		assertEquals("\u304d\u308c\u3044", c.getShortMessage());
 		assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
@@ -244,6 +249,7 @@ public void testParse_explicit_bad_encoded2() throws Exception {
 		c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id
 		c.parseCanonical(new RevWalk(db), b.toByteArray());
 
+		assertEquals("ISO-8859-1", c.getEncoding().name());
 		assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName());
 		assertEquals("\u304d\u308c\u3044", c.getShortMessage());
 		assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage());
diff --git a/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java b/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java
index f211dfd..284a183 100644
--- a/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java
+++ b/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2009, Google Inc.
  * Copyright (C) 2008, Shawn O. Pearce <spearce@xxxxxxxxxxx>
  *
  * All rights reserved.
@@ -377,6 +378,21 @@ static boolean hasLF(final byte[] r, int b, final int e) {
 	}
 
 	/**
+	 * Determine the encoding of the commit message buffer.
+	 * <p>
+	 * Locates the "encoding" header (if present) and then returns the proper
+	 * character set to apply to this buffer to evaluate its contents as
+	 * character data.
+	 * <p>
+	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
+	 *
+	 * @return the preferred encoding of {@link #getRawBuffer()}.
+	 */
+	public final Charset getEncoding() {
+		return RawParseUtils.parseEncoding(buffer);
+	}
+
+	/**
 	 * Reset this commit to allow another RevWalk with the same instances.
 	 * <p>
 	 * Subclasses <b>must</b> call <code>super.reset()</code> to ensure the
-- 
1.6.3.3.420.gd4b46

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]