---
 Makefile          |    8 ++-
 git-compat-util.h |    1 +
 git.c             |    9 +++
 t/test-lib.sh     |    4 +-
 test-utf.c        |   70 +++++++++++
 utf.c             |  269 ++++++++++++++++++++++++++++++++++++++++
 utf.h             |   45 +++++++
 7 files changed, 402 insertions(+), 4 deletions(-)
 create mode 100644 test-utf.c
 create mode 100644 utf.c
 create mode 100644 utf.h

diff --git a/Makefile b/Makefile
index 2d62efb..2d71f01 100644
--- a/Makefile
+++ b/Makefile
@@ -259,7 +259,7 @@ LIB_OBJS = \
 	object.o pack-check.o patch-delta.o path.o pkt-line.o sideband.o \
 	quote.o read-cache.o refs.o run-command.o dir.o object-refs.o \
 	server-info.o setup.o sha1_file.o sha1_name.o strbuf.o \
-	tag.o tree.o usage.o config.o environment.o ctype.o copy.o \
+	tag.o tree.o utf.o usage.o config.o environment.o ctype.o copy.o \
 	fetch-clone.o revision.o pager.o tree-walk.o xdiff-interface.o \
 	write_or_die.o trace.o list-objects.o grep.o \
 	alloc.o merge-file.o path-list.o help.o unpack-trees.o $(DIFF_OBJS) \
@@ -564,6 +564,9 @@ ifdef NO_ACCURATE_DIFF
 endif
 
 # Shell quote (do not use $(call) to accommodate ancient setups);
+ALL_CFLAGS += -DUTF8INTERNAL=1
+ALL_CFLAGS += -DDEBUG=1
+#ALL_CFLAGS += -DTEST=1
 
 SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
 
@@ -811,6 +814,9 @@ export NO_SVN_TESTS
 test: all
 	$(MAKE) -C t/ all
 
+test-utf$X: test-utf.c ctype.o utf.o usage.o
+	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) test-utf.c utf.c ctype.o usage.o
+
 test-date$X: test-date.c date.o ctype.o
 	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) test-date.c date.o ctype.o
 
diff --git a/git-compat-util.h b/git-compat-util.h
index 0272d04..f83352b 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -25,6 +25,7 @@
 #include <netinet/in.h>
 #include <sys/types.h>
 #include <dirent.h>
+#include "utf.h"
 
 /* On most systems <limits.h> would have given us this, but
  * not on some systems (e.g. GNU/Hurd).
diff --git a/git.c b/git.c
index 6475847..bd4e726 100644
--- a/git.c
+++ b/git.c
@@ -272,6 +272,15 @@ static void handle_internal_command(int argc, const char **argv, char **envp)
 	};
 	int i;
 
+#ifdef DEBUG
+	if (debug()) {
+		fprintf(stderr,"GIT-");
+		for (i = 1; i<argc; ++i)
+			fprintf(stderr,"%s",argv[i]);
+		fprintf(stderr,"\n");
+	}
+#endif
+
 	/* Turn "git cmd --help" into "git help cmd" */
 	if (argc > 1 && !strcmp(argv[1], "--help")) {
 		argv[1] = argv[0];
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 07cb706..e8aefd8 100755
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -4,11 +4,9 @@
 #
 
 # For repeatability, reset the environment to known value.
-LANG=C
-LC_ALL=C
 PAGER=cat
 TZ=UTC
-export LANG LC_ALL PAGER TZ
+export PAGER TZ
 EDITOR=:
 VISUAL=:
 unset AUTHOR_DATE
diff --git a/test-utf.c b/test-utf.c
new file mode 100644
--- /dev/null
+++ b/test-utf.c
@@ -0,0 +1,70 @@
+/*
+ * Smoke tests for the locale <-> UTF-8 helpers declared in utf.h.
+ *
+ * The locale-encoded literals below are passed with a length of 7, i.e. one
+ * octet per character plus the NUL.  NOTE(review): this presumes the file is
+ * stored in a single-byte encoding matching the test locale - confirm.
+ */
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "cache.h"
+#include "utf.h"
+
+int main(int argc, char **argv)
+{
+	int i;
+
+#if 0
+	for (i = 1; i < argc; i++) {
+		char result1[100];
+		char result2[100];
+
+		utfcpy(result1, argv[i], strlen(argv[i])+1);
+		localcpy(result2, result1, strlen(result1)+1);
+
+		printf("%s -> %s -> %s\n", argv[i], result1, result2);
+	}
+	return 0;
+#endif
+
+/* Each test() body becomes one case of a switch keyed on its source line. */
+#define test(name) case __LINE__: current_name=name; n++; printf("Testing case #%d: %s\n", n, current_name);
+#define end_test break;
+#define begin_suite() char *current_name=0; int n=0; for (i=0; i<1000; ++i) { switch(i) {
+#define concats(a,b) #a #b
+
+#undef strcmp
+#define assertStringEquals(a,b) assert(#a #b && strcmp(a,b)==0)
+#define assertIntEquals(a,b) assert(#a #b && (a)==(b))
+
+#define end_suite() }}
+
+	begin_suite();
+
+	test("utfcpy") {
+		char result[100];
+		utfcpy(result,"Ändrad",7);
+		assertStringEquals(result,"\303\204ndrad");
+	} end_test;
+
+	test("utflen") {
+		int result=utflen("Ändrad",7);
+		assertIntEquals(result,8);
+	} end_test;
+
+	test("localcpy") {
+		char result[100];
+		localcpy(result,"\303\204ndrad",8);
+		assertStringEquals(result,"Ändrad");
+	} end_test;
+
+	test("locallen") {
+		int result=locallen("\303\204ndrad",8);
+		assertIntEquals(result,7);
+	} end_test;
+
+	end_suite();
+	return 0;
+}
diff --git a/utf.c b/utf.c
new file mode 100644
--- /dev/null
+++ b/utf.c
@@ -0,0 +1,269 @@
+/*
+ * Conversion between the locale's character encoding and UTF-8.
+ *
+ * With NO_ICONV defined every function degenerates to a plain copy or an
+ * identity length; the same happens at run time when nl_langinfo(CODESET)
+ * reports "UTF-8".
+ */
+#undef UTF8INTERNAL
+
+#ifndef NO_ICONV
+#include <langinfo.h>
+#include <iconv.h>
+#endif
+#include "cache.h"
+#include <locale.h>
+#include <stdarg.h>
+
+#ifndef NO_ICONV
+static iconv_t local_to_utf8 = (iconv_t)-1;
+static iconv_t utf8_to_local = (iconv_t)-1;
+static iconv_t utf8_to_utf8 = (iconv_t)-1;
+#endif
+/* Set once the locale is found to use UTF-8, so no conversion is needed. */
+static int same = 0;
+
+#if TEST
+#define die printf
+#endif
+
+/* Open the iconv descriptors on first use; later calls do nothing. */
+static void initlocale(void)
+{
+#ifndef NO_ICONV
+	char *local_encoding;
+
+	if (same || local_to_utf8 != (iconv_t)-1)
+		return;
+
+	setlocale(LC_CTYPE, "");
+	local_encoding = nl_langinfo(CODESET);
+#ifdef DEBUG
+	if (debug()) fprintf(stderr,"encoding=%s\n", local_encoding);
+#endif
+	if (strcmp(local_encoding,"UTF-8") == 0) {
+		same = 1;
+		return;
+	}
+	local_to_utf8 = iconv_open("UTF-8", local_encoding);
+	if (local_to_utf8 == (iconv_t)-1) {
+		die("cannot setup locale conversion from %s: %s", local_encoding, strerror(errno));
+	}
+#ifdef DEBUG
+	if (debug()) fprintf(stderr,"utf8_to_local = iconv_open(%s,UTF-8)\n",local_encoding);
+#endif
+	utf8_to_local = iconv_open(local_encoding, "UTF-8");
+	if (utf8_to_local == (iconv_t)-1) {
+		die("cannot setup locale conversion to %s: %s", local_encoding, strerror(errno));
+	}
+
+	utf8_to_utf8 = iconv_open("UTF-8","UTF-8");
+	if (utf8_to_utf8 == (iconv_t)-1) {
+		die("cannot setup locale conversion from UTF-8 to UTF-8: %s",strerror(errno));
+	}
+#endif
+}
+
+#ifndef NO_ICONV
+/*
+ * Return non-zero if the 'len' octets at 'local' pass through a UTF-8 to
+ * UTF-8 iconv conversion without error, i.e. they already look like UTF-8.
+ * Uses the descriptor opened by initlocale().
+ */
+int maybe_utf8(const char *local, size_t len)
+{
+	char *self = xcalloc(1,len+1);
+	char *in = (char *)local;
+	char *selfp = self;
+	size_t inlen = len;
+	size_t outlen = len+1;
+	size_t ret;
+
+	iconv(utf8_to_utf8, NULL, NULL, NULL, NULL);
+	ret = iconv(utf8_to_utf8, &in, &inlen, &selfp, &outlen);
+	free(self);
+	/* Trace the caller's buffer, not the cursor iconv() has advanced. */
+	P(("maybelocal: %.*s %s\n", (int)len, local, ret!=(size_t)-1 ? "yes" : "no"));
+	return ret != (size_t)-1;
+}
+#endif
+
+/*
+ * Return the number of octets the 'locallen' octets at 'local' occupy once
+ * converted to UTF-8.  Input that already looks like UTF-8, or that fails
+ * to convert, is reported with its length unchanged.
+ */
+size_t utflen(const char *local, size_t locallen)
+{
+#ifndef NO_ICONV
+	size_t outsize, outlen, inlen = locallen;
+	char *outbuf, *out;
+	char *in = (char *)local;
+
+	initlocale();
+	if (same) {
+		return locallen;
+	}
+	if (maybe_utf8(local, locallen))
+		return locallen;
+
+	outsize = outlen = locallen*6;
+	out = outbuf = xcalloc(outsize,1);
+	iconv(local_to_utf8, NULL, NULL, NULL, NULL);
+	if (iconv(local_to_utf8, &in, &inlen, &out, &outlen) == (size_t)-1) {
+#if TEST
+		perror("failed");
+#endif
+		free(outbuf);
+		return locallen;
+	}
+	/* Only the count matters; the scratch buffer is never read back. */
+	free(outbuf);
+	return outsize - outlen;
+#else
+	return locallen;
+#endif
+}
+
+/*
+ * Copy 'localsize' octets from 'from_local' to 'to_utf', converting them
+ * from the locale's encoding to UTF-8.  Up to 6 * 'localsize' octets may be
+ * written to 'to_utf'.  Input that already looks like UTF-8, or that fails
+ * to convert, is copied verbatim.
+ */
+void utfcpy(char *to_utf, char *from_local, size_t localsize)
+{
+	/* Octets stored in 'to_utf'; only the trace below reads it. */
+	size_t written = localsize;
+#ifndef NO_ICONV
+	size_t outsize = localsize*6;
+	size_t outlen = outsize;
+	size_t vlocalsize = localsize;
+	char *vfrom_local = from_local;
+	char *vto_utf = to_utf;
+
+	initlocale();
+	if (same) {
+		memcpy(to_utf, from_local, localsize);
+		return;
+	}
+	if (maybe_utf8(from_local, localsize)) {
+		memcpy(to_utf, from_local, localsize);
+		return;
+	}
+
+	iconv(local_to_utf8, NULL, NULL, NULL, NULL);
+	if (iconv(local_to_utf8, &vfrom_local, &vlocalsize, &vto_utf, &outlen) == (size_t)-1) {
+		fprintf(stderr,"Failed to convert %.*s to UTF\n", (int)localsize, from_local);
+		memcpy(to_utf, from_local, localsize);
+	} else {
+		written = outsize - outlen;
+	}
+#else
+	memcpy(to_utf, from_local, localsize);
+#endif
+#ifdef DEBUG
+	if (debug()) fprintf(stderr,"%.*s ->UTF %.*s\n", (int)localsize, from_local, (int)written, to_utf);
+#else
+	(void)written;
+#endif
+}
+
+/*
+ * Return the number of octets the 'utflen' octets of UTF-8 at 'utf' occupy
+ * once converted to the locale's encoding, or 'utflen' itself when the
+ * conversion fails or is not needed.
+ */
+size_t locallen(const char *utf, size_t utflen)
+{
+#ifndef NO_ICONV
+	size_t outsize, outlen, inlen = utflen;
+	char *outbuf, *out;
+	char *in = (char *)utf;
+
+	initlocale();
+	if (same) {
+		return utflen;
+	}
+	outsize = outlen = utflen*4; /* ??, can we be more specific? */
+	out = outbuf = xcalloc(outsize,1);
+	iconv(utf8_to_local, NULL, NULL, NULL, NULL);
+	if (iconv(utf8_to_local, &in, &inlen, &out, &outlen) == (size_t)-1) {
+#ifdef DEBUG
+		perror("failed");
+#endif
+		free(outbuf);
+		return utflen;
+	}
+	/* Only the count matters; the scratch buffer is never read back. */
+	free(outbuf);
+	return outsize - outlen;
+#else
+	return utflen;
+#endif
+}
+
+/*
+ * Copy 'utflen' octets of UTF-8 from 'fromutf' to 'tolocal', converting
+ * them to the locale's encoding.  Up to 4 * 'utflen' octets may be written
+ * to 'tolocal'.  Input that fails to convert is copied verbatim.
+ */
+void localcpy(char *tolocal, char *fromutf, size_t utflen)
+{
+	/* Octets stored in 'tolocal'; only the trace below reads it. */
+	size_t written = utflen;
+#ifndef NO_ICONV
+	size_t outsize = utflen*4;
+	size_t outlen = outsize;
+	size_t vutflen = utflen;
+	char *vfromutf = fromutf;
+	char *vtolocal = tolocal;
+#endif
+
+	initlocale();
+	if (same) {
+		memcpy(tolocal, fromutf, utflen);
+		return;
+	}
+#ifndef NO_ICONV
+	iconv(utf8_to_local, NULL, NULL, NULL, NULL);
+	if (iconv(utf8_to_local, &vfromutf, &vutflen, &vtolocal, &outlen) == (size_t)-1) {
+		fprintf(stderr,"Failed to convert %.*s to LOCAL\n", (int)utflen, fromutf);
+		memcpy(tolocal, fromutf, utflen);
+	} else {
+		written = outsize - outlen;
+	}
+#else
+	memcpy(tolocal, fromutf, utflen);
+#endif
+#ifdef DEBUG
+	if (debug()) fprintf(stderr,"%.*s ->LOCAL %.*s\n", (int)utflen, fromutf, (int)written, tolocal);
+#else
+	(void)written;
+#endif
+}
+
+/* printf-style trace to stderr; the P() macro in utf.h expands to this. */
+int PP(const char *fmt,...)
+{
+	va_list va;
+	int ret;
+
+	va_start(va,fmt);
+	ret=vfprintf(stderr,fmt,va);
+	va_end(va);
+	return ret;
+}
+
+/* Cached answer of debug(): -1 until the environment has been examined. */
+int debugf=-1;
+
+/* Return non-zero if the DEBUG environment variable is set and non-empty. */
+int debug(void)
+{
+	if (debugf == -1) {
+		const char *f = getenv("DEBUG");
+		debugf = (f && f[0] != 0) ? 1 : 0;
+	}
+	return debugf == 1;
+}
diff --git a/utf.h b/utf.h
new file mode 100644
--- /dev/null
+++ b/utf.h
@@ -0,0 +1,45 @@
+#ifndef UTF_H
+#define UTF_H 1
+
+#include <stddef.h>
+#include <stdio.h>
+
+/**
+ * Return the number of octets the 'locallen' octets at 'local' would occupy
+ * encoded as UTF-8.  The input is assumed to be in the locale's encoding;
+ * if it already looks like UTF-8 or cannot be converted, the result is 'locallen'.
+ */
+extern size_t utflen(const char *local,size_t locallen);
+
+/**
+ * Return the number of octets the 'utflen' octets of UTF-8 at 'utf' would
+ * occupy in the locale's encoding, or 'utflen' if conversion fails.
+ */
+extern size_t locallen(const char *utf,size_t utflen);
+
+/**
+ * Copy 'localsize' octets from 'fromlocal' to 'toutf', transforming them to
+ * UTF-8.  'toutf' must have room for 6 * 'localsize' octets.
+ */
+extern void utfcpy(char *toutf,char *fromlocal,size_t localsize);
+
+/**
+ * Copy 'utflen' octets of UTF-8 from 'fromutf' to 'tolocal', transforming
+ * them to the locale's encoding.  'tolocal' must have room for
+ * 4 * 'utflen' octets.
+ */
+extern void localcpy(char *tolocal,char *fromutf,size_t utflen);
+
+#ifdef DEBUG
+/* Trace helpers; both are no-ops unless debug() returns non-zero. */
+#define D(x) do { if (debug()) fprintf(stderr,"%s:%d:%s\n",__FILE__,__LINE__,x); } while(0)
+#define P(x) do { if (debug()) { fprintf(stderr,"%s:%d:",__FILE__,__LINE__); PP x; } } while(0)
+int PP(const char *fmt,...);
+int debug(void);
+
+#else
+#define D(x)
+#define P(x)
+#endif
+
+#endif
-- 
1.6.3.dirty

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html