Re: Git, Mac OS X and German special characters

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hej,
I have the same problem here.
Below there is a patch, which may solve the problem.
(Yes, the whitespace is broken. I'm still fighting with
git format-patch -s --cover-letter -M --stdout origin/master | git imap-send)
But this patch may be a starting point for improvements.
Comments welcome
BR
/Torsten



Improved interworking between Mac OS X and Linux when umlauts are used
When a git repository containing UTF-8 encoded umlaut characters
is cloned onto a Mac OS X machine, the Mac OS system will convert
all filenames returned by readdir() into decomposed (NFD) UTF-8.
As a result of this conversion, git will not find them on disk.
This patch helps by treating the NFC and NFD versions of filenames as
identical on Mac OS.






Signed-off-by: Torsten Bögershausen <tboegi@xxxxxx>
---
name-hash.c |   40 ++++++++++++++++++++++++++++++++++++++++
utf8.c      |   55 ++++++++++++++++++++++++++++++++++++++++++++++++-------
utf8.h      |   11 +++++++++++
3 files changed, 99 insertions(+), 7 deletions(-)

diff --git a/name-hash.c b/name-hash.c
index 0031d78..e6494e8 100644
--- a/name-hash.c
+++ b/name-hash.c
@@ -7,6 +7,7 @@
 */
#define NO_THE_INDEX_COMPATIBILITY_MACROS
#include "cache.h"
+#include "utf8.h"

/*
 * This removes bit 5 if bit 6 is set.
@@ -100,6 +101,25 @@ static int same_name(const struct cache_entry *ce, const char *name, int namelen
    return icase && slow_same_name(name, namelen, ce->name, len);
}

+#ifdef __APPLE__
+struct cache_entry *index_name_exists2(struct index_state *istate, const char *name, int icase)
+{
+    int namelen = (int)strlen(name);
+    unsigned int hash = hash_name(name, namelen);
+    struct cache_entry *ce;
+
+    ce = lookup_hash(hash, &istate->name_hash);
+    while (ce) {
+        if (!(ce->ce_flags & CE_UNHASHED)) {
+            if (same_name(ce, name, namelen, icase))
+                return ce;
+        }
+        ce = ce->next;
+    }
+    return NULL;
+}
+#endif
+
struct cache_entry *index_name_exists(struct index_state *istate, const char *name, int namelen, int icase)
{
    unsigned int hash = hash_name(name, namelen);
@@ -115,5 +135,25 @@ struct cache_entry *index_name_exists(struct index_state *istate, const char *na
        }
        ce = ce->next;
    }
+#ifdef __APPLE__
+    {
+        char *name_nfc_nfd;
+        name_nfc_nfd = str_nfc2nfd(name);
+        if (name_nfc_nfd) {
+            ce = index_name_exists2(istate, name_nfc_nfd, icase);
+            free(name_nfc_nfd);
+            if (ce)
+                return ce;
+        }
+        name_nfc_nfd = str_nfd2nfc(name);
+        if (name_nfc_nfd) {
+            ce = index_name_exists2(istate, name_nfc_nfd, icase);
+            free(name_nfc_nfd);
+            if (ce)
+                return ce;
+        }
+    }
+#endif
+
    return NULL;
}
diff --git a/utf8.c b/utf8.c
index 84cfc72..8e794dc 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2,6 +2,11 @@
#include "strbuf.h"
#include "utf8.h"

+#ifdef __APPLE__
+static iconv_t my_iconv_nfd2nfc = (iconv_t) -1;
+static iconv_t my_iconv_nfc2nfd = (iconv_t) -1;
+#endif
+
/* This code is originally from http://www.cl.cam.ac.uk/~mgk25/ucs/ */

struct interval {
@@ -424,18 +429,13 @@ int is_encoding_utf8(const char *name)
#else
    typedef char * iconv_ibp;
#endif
-char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding)
+
+char *reencode_string_iconv(const char *in, iconv_t conv)
{
-    iconv_t conv;
    size_t insz, outsz, outalloc;
    char *out, *outpos;
    iconv_ibp cp;

-    if (!in_encoding)
-        return NULL;
-    conv = iconv_open(out_encoding, in_encoding);
-    if (conv == (iconv_t) -1)
-        return NULL;
    insz = strlen(in);
    outsz = insz;
    outalloc = outsz + 1; /* for terminating NUL */
@@ -469,7 +469,48 @@ char *reencode_string(const char *in, const char *out_encoding, const char *in_e
            break;
        }
    }
+    return out;
+}
+
+char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding)
+{
+    iconv_t conv;
+    char *out;
+
+    if (!in_encoding)
+        return NULL;
+    conv = iconv_open(out_encoding, in_encoding);
+    if (conv == (iconv_t) -1)
+        return NULL;
+    out = reencode_string_iconv(in, conv);
    iconv_close(conv);
    return out;
}
+
+#ifdef __APPLE__
+char*
+str_nfc2nfd(const char *in)
+{
+    if (my_iconv_nfc2nfd == (iconv_t) -1) {
+        my_iconv_nfc2nfd = iconv_open("utf-8-mac", "utf-8");
+        if (my_iconv_nfc2nfd == (iconv_t) -1) {
+            return NULL;
+        }
+    }
+    return reencode_string_iconv(in, my_iconv_nfc2nfd);
+}
+
+char*
+str_nfd2nfc(const char *in)
+{
+    if (my_iconv_nfd2nfc == (iconv_t) -1){
+        my_iconv_nfd2nfc = iconv_open("utf-8", "utf-8-mac");
+        if (my_iconv_nfd2nfc == (iconv_t) -1) {
+            return NULL;
+        }
+    }
+    return reencode_string_iconv(in, my_iconv_nfd2nfc);
+}
+#endif /* APPLE */
+
#endif
diff --git a/utf8.h b/utf8.h
index ebc4d2f..db29c8a 100644
--- a/utf8.h
+++ b/utf8.h
@@ -13,8 +13,19 @@ int strbuf_add_wrapped_text(struct strbuf *buf,

#ifndef NO_ICONV
char *reencode_string(const char *in, const char *out_encoding, const char *in_encoding);
+char *reencode_string_iconv(const char *in, iconv_t conv);
+#ifdef __APPLE__
+char *str_nfc2nfd(const char *in);
+char *str_nfd2nfc(const char *in);
+#else
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
+#endif
#else
#define reencode_string(a,b,c) NULL
+#define reencode_string2(a,b) NULL
+#define str_nfc2nfd(in) (NULL)
+#define str_nfd2nfc(in) (NULL)
#endif

#endif
--
1.7.1.dirty










On 20.05.10 10:50, Michael J Gruber wrote:
Ævar Arnfjörð Bjarmason venit, vidit, dixit 20.05.2010 10:34:
On Thu, May 20, 2010 at 07:26, Matthias Moeller
<matthias.moeller@xxxxxxxxxxxxxxxxxxx>  wrote:
I have been searching the web for help and found lengthy discussions
which state that this is a common problem of the HFS+ filesystem.
What I did not find was a solution to this problem. Is there a solution
to this problem?
Is this problem particular to Git, or do you also get it if you
e.g. rsync from the Linux box to the Mac OS X box?

#       "U\314\210bersicht.xls"
You probably have to configure your shell on OSX to render UTF-8
correctly. It's just showing the raw escaped byte sequence instead of
a character there.

There isn't anything wrong with OSX in this case, filename encoding on
any POSIX system is only done by convention. You'll find that you have
similar problems on Linux if you encode filenames in Big5 or
UTF-32.

Linux will happily accept it, but your shell / other applications will
render it as unknown goo because they expect UTF-8.
No, the problem with git status is not the display. Matthias' problem is
that git status reports a tracked file as untracked. The reason is that
on HFS+, you create a file with name A and get a file with name B, where
A and B are different representations of the same name. There seems to
be no way to reliably detect which one HFS+ uses.

Michael
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]