If an application needs to parse the raw buffer by hand, it might benefit from knowing the encoding of the commit. We can make it available through a getEncoding() method, using the same logic we already use for getFullMessage() and getShortMessage(). This is still only an estimate based on the "encoding" header, and may not reflect reality if the commit is horribly malformed. Signed-off-by: Shawn O. Pearce <spearce@xxxxxxxxxxx> --- .../spearce/jgit/revwalk/RevCommitParseTest.java | 6 ++++++ .../src/org/spearce/jgit/revwalk/RevCommit.java | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 0 deletions(-) diff --git a/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java b/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java index 9b95924..62a4ab5 100644 --- a/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java +++ b/org.spearce.jgit.test/tst/org/spearce/jgit/revwalk/RevCommitParseTest.java @@ -39,6 +39,7 @@ import java.io.ByteArrayOutputStream; +import org.spearce.jgit.lib.Constants; import org.spearce.jgit.lib.ObjectId; import org.spearce.jgit.lib.PersonIdent; import org.spearce.jgit.lib.RepositoryTestCase; @@ -145,6 +146,7 @@ public void testParse_implicit_UTF8_encoded() throws Exception { c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertSame(Constants.CHARSET, c.getEncoding()); assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName()); assertEquals("Sm\u00f6rg\u00e5sbord", c.getShortMessage()); assertEquals("Sm\u00f6rg\u00e5sbord\n\n\u304d\u308c\u3044\n", c.getFullMessage()); @@ -163,6 +165,7 @@ public void testParse_implicit_mixed_encoded() throws Exception { c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertSame(Constants.CHARSET, c.getEncoding()); assertEquals("F\u00f6r fattare", 
c.getAuthorIdent().getName()); assertEquals("Sm\u00f6rg\u00e5sbord", c.getShortMessage()); assertEquals("Sm\u00f6rg\u00e5sbord\n\n\u304d\u308c\u3044\n", c.getFullMessage()); @@ -187,6 +190,7 @@ public void testParse_explicit_encoded() throws Exception { c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertEquals("EUC-JP", c.getEncoding().name()); assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName()); assertEquals("\u304d\u308c\u3044", c.getShortMessage()); assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage()); @@ -215,6 +219,7 @@ public void testParse_explicit_bad_encoded() throws Exception { c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertEquals("EUC-JP", c.getEncoding().name()); assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName()); assertEquals("\u304d\u308c\u3044", c.getShortMessage()); assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage()); @@ -244,6 +249,7 @@ public void testParse_explicit_bad_encoded2() throws Exception { c = new RevCommit(id("9473095c4cb2f12aefe1db8a355fe3fafba42f67")); // bogus id c.parseCanonical(new RevWalk(db), b.toByteArray()); + assertEquals("ISO-8859-1", c.getEncoding().name()); assertEquals("F\u00f6r fattare", c.getAuthorIdent().getName()); assertEquals("\u304d\u308c\u3044", c.getShortMessage()); assertEquals("\u304d\u308c\u3044\n\nHi\n", c.getFullMessage()); diff --git a/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java b/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java index f211dfd..284a183 100644 --- a/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java +++ b/org.spearce.jgit/src/org/spearce/jgit/revwalk/RevCommit.java @@ -1,4 +1,5 @@ /* + * Copyright (C) 2009, Google Inc. * Copyright (C) 2008, Shawn O. Pearce <spearce@xxxxxxxxxxx> * * All rights reserved. 
@@ -377,6 +378,21 @@ static boolean hasLF(final byte[] r, int b, final int e) { } /** + * Determine the encoding of the commit message buffer. + * <p> + * Locates the "encoding" header (if present) and then returns the proper + * character set to apply to this buffer to evaluate its contents as + * character data. + * <p> + * If no encoding header is present, {@link Constants#CHARSET} is assumed. + * + * @return the preferred encoding of {@link #getRawBuffer()}. + */ + public final Charset getEncoding() { + return RawParseUtils.parseEncoding(buffer); + } + + /** * Reset this commit to allow another RevWalk with the same instances. * <p> * Subclasses <b>must</b> call <code>super.reset()</code> to ensure the -- 1.6.3.3.420.gd4b46 -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html