[RFC 1/8] UTF helpers

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



---
 Makefile          |    8 ++-
 git-compat-util.h |    1 +
 git.c             |    9 +++
 t/test-lib.sh     |    4 +-
 test-utf.c        |   61 ++++++++++++++++
 utf.c             |  207 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf.h             |   27 +++++++
 7 files changed, 313 insertions(+), 4 deletions(-)
 create mode 100644 test-utf.c
 create mode 100644 utf.c
 create mode 100644 utf.h

diff --git a/Makefile b/Makefile
index 2d62efb..2d71f01 100644
--- a/Makefile
+++ b/Makefile
@@ -259,7 +259,7 @@ LIB_OBJS = \
 	object.o pack-check.o patch-delta.o path.o pkt-line.o sideband.o \
 	quote.o read-cache.o refs.o run-command.o dir.o object-refs.o \
 	server-info.o setup.o sha1_file.o sha1_name.o strbuf.o \
-	tag.o tree.o usage.o config.o environment.o ctype.o copy.o \
+	tag.o tree.o utf.o usage.o config.o environment.o ctype.o copy.o \
 	fetch-clone.o revision.o pager.o tree-walk.o xdiff-interface.o \
 	write_or_die.o trace.o list-objects.o grep.o \
 	alloc.o merge-file.o path-list.o help.o unpack-trees.o $(DIFF_OBJS) \
@@ -564,6 +564,9 @@ ifdef NO_ACCURATE_DIFF
 endif
 
 # Shell quote (do not use $(call) to accommodate ancient setups);
+ALL_CFLAGS += -DUTF8INTERNAL=1
+ALL_CFLAGS += -DDEBUG=1
+#ALL_CFLAGS += -DTEST=1
 
 SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
 
@@ -811,6 +814,9 @@ export NO_SVN_TESTS
 test: all
 	$(MAKE) -C t/ all
 
+# UTF helper self-test; link the prerequisite objects, as the test-date rule does.
+test-utf$X: test-utf.c ctype.o utf.o usage.o
+	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) test-utf.c utf.o ctype.o usage.o
 test-date$X: test-date.c date.o ctype.o
 	$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) test-date.c date.o ctype.o
 
diff --git a/git-compat-util.h b/git-compat-util.h
index 0272d04..f83352b 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -25,6 +25,7 @@
 #include <netinet/in.h>
 #include <sys/types.h>
 #include <dirent.h>
+#include "utf.h"
 
 /* On most systems <limits.h> would have given us this, but
  * not on some systems (e.g. GNU/Hurd).
diff --git a/git.c b/git.c
index 6475847..bd4e726 100644
--- a/git.c
+++ b/git.c
@@ -272,6 +272,15 @@ static void handle_internal_command(int argc, const char **argv, char **envp)
 	};
 	int i;
 
+#ifdef DEBUG
+	if (debug()) {
+		fprintf(stderr,"GIT-");
+		for (i = 1; i<argc; ++i)
+			fprintf(stderr,"%s",argv[i]);
+		fprintf(stderr,"\n");
+	}
+#endif
+
 	/* Turn "git cmd --help" into "git help cmd" */
 	if (argc > 1 && !strcmp(argv[1], "--help")) {
 		argv[1] = argv[0];
diff --git a/t/test-lib.sh b/t/test-lib.sh
index 07cb706..e8aefd8 100755
--- a/t/test-lib.sh
+++ b/t/test-lib.sh
@@ -4,11 +4,9 @@
 #
 
 # For repeatability, reset the environment to known value.
-LANG=C
-LC_ALL=C
 PAGER=cat
 TZ=UTC
-export LANG LC_ALL PAGER TZ
+export PAGER TZ
 EDITOR=:
 VISUAL=:
 unset AUTHOR_DATE
diff --git a/test-utf.c b/test-utf.c
new file mode 100644
index 0000000..133eea0
--- /dev/null
+++ b/test-utf.c
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
+#include "cache.h"
+#include "utf.h"
+
+int main(int argc, char **argv)
+{
+	int i;
+	/* Self-test driver for utfcpy/utflen/localcpy/locallen; a failing assert aborts the run. */
+#if 0
+	for (i = 1; i < argc; i++) {
+		char result1[100];
+		char result2[100];
+
+		utfcpy(result1, argv[i], strlen(argv[i])+1);
+		localcpy(result2, result1, strlen(result1)+1);
+
+		printf("%s -> %s -> %s\n", argv[i], result1, result2);
+	}
+	return 0;
+#endif
+	/* Mini test framework: the loop counts i up and each test() is a `case __LINE__`, so cases run in source order. */
+#define test(name) case __LINE__: current_name=name; n++; printf("Testing case #%d: %s\n", n, current_name);
+#define end_test break;
+#define begin_suite() char *current_name=0; int n=0; for (i=0; i<1000; ++i) { switch(i) { 
+#define concats(a,b) #a #b
+	/* test() bumps n before printing, so begin_suite() starts it at 0 to report the first case as #1 */
+#undef strcmp
+#define assertStringEquals(a,b) assert(#a #b && strcmp(a,b)==0)
+#define assertIntEquals(a,b) assert(#a #b && (a)==(b))
+
+#define end_suite() }}
+
+	begin_suite();
+
+	test("utfcpy") {
+	  char result[100];
+	  utfcpy(result,"Ändrad",7);
+	  assertStringEquals(result,"\303\204ndrad");
+	} end_test;
+
+	test("utflen") {
+	  int result=utflen("Ändrad",7);
+	  assertIntEquals(result,8);
+	} end_test;
+
+	test("localcpy") {
+	  char result[100];
+	  localcpy(result,"\303\204ndrad",8);
+	  assertStringEquals(result,"Ändrad");
+	} end_test;
+
+	test("locallen") {
+	  int result=locallen("\303\204ndrad",8);
+	  assertIntEquals(result,7);
+	} end_test;
+
+	end_suite();
+}
diff --git a/utf.c b/utf.c
new file mode 100644
index 0000000..eb430b2
--- /dev/null
+++ b/utf.c
@@ -0,0 +1,207 @@
+#undef UTF8INTERNAL
+
+#include <langinfo.h>
+#include <iconv.h>
+#include "cache.h"
+#include <locale.h>
+#include <stdarg.h>
+
+static iconv_t local_to_utf8 = (iconv_t)-1; /* locale codeset -> UTF-8; (iconv_t)-1 until initlocale() opens it */
+static iconv_t utf8_to_local = (iconv_t)-1; /* UTF-8 -> locale codeset; opened by initlocale() */
+static iconv_t utf8_to_utf8 = (iconv_t)-1; /* UTF-8 -> UTF-8; maybe_utf8() uses it to test whether input converts cleanly */
+static int same = 0; /* set to 1 by initlocale() when nl_langinfo(CODESET) is "UTF-8" */
+
+#if TEST
+#define die printf /* with TEST set, die(...) calls below this line become printf(...) */
+#endif
+
+/* Open the iconv descriptors on first use; set `same` instead if the locale codeset is UTF-8. */
+static void initlocale(void)
+{
+#ifndef NO_ICONV
+	if (!same && local_to_utf8 == (iconv_t)-1) {
+		/* adopt the environment's LC_CTYPE so nl_langinfo() reports its codeset */
+		setlocale(LC_CTYPE, "");
+		char *local_encoding = nl_langinfo(CODESET);
+#ifdef DEBUG
+		if (debug()) fprintf(stderr,"encoding=%s\n", local_encoding);
+#endif
+		if (strcmp(local_encoding,"UTF-8") == 0) {
+			same = 1;
+			return;
+		}
+		local_to_utf8 = iconv_open("UTF-8",  local_encoding);
+		if (local_to_utf8 == (iconv_t)-1)
+			die("cannot setup locale conversion from %s: %s", local_encoding, strerror(errno));
+#ifdef DEBUG
+		if (debug()) fprintf(stderr,"utf8_to_local = iconv_open(%s,UTF-8)\n",local_encoding);
+#endif
+		/* reverse direction (UTF-8 -> locale), so the failure message says "to" */
+		utf8_to_local = iconv_open(local_encoding,  "UTF-8");
+		if (utf8_to_local == (iconv_t)-1)
+			die("cannot setup locale conversion to %s: %s", local_encoding, strerror(errno));
+		/* identity converter: maybe_utf8() runs input through it to see whether it is valid UTF-8 */
+		utf8_to_utf8 = iconv_open("UTF-8","UTF-8");
+		if (utf8_to_utf8 == (iconv_t)-1)
+			die("cannot setup locale conversion from UTF-8 to UTF-8: %s",strerror(errno));
+	}
+#endif
+}
+
+/* Nonzero if local[0..len) converts cleanly as UTF-8 per iconv; callers run initlocale() first. */
+int maybe_utf8(const char *local, size_t len)
+{
+  char *self = xcalloc(1,len+1), *selfp = self, *in = (char*)local;
+  size_t inleft = len, outleft = len+1; /* iconv advances its arguments; keep local/len intact for P() */
+  size_t ret = iconv(utf8_to_utf8, &in, &inleft, &selfp, &outleft);
+  free(self);
+  P(("maybelocal: %.*s %s\n", (int)len, local, ret!=(size_t)-1 ? "yes" : "no"));
+  return ret != (size_t)-1;
+}
+
+/* Byte count `local` needs as UTF-8; locallen is returned as-is when no conversion applies or iconv() fails. */
+size_t utflen(const char *local, size_t locallen)
+{
+#ifndef NO_ICONV
+	initlocale();
+	/* UTF-8 locale, or input that already parses as UTF-8: nothing to convert */
+	if (same || maybe_utf8(local, locallen))
+		return locallen;
+
+	const size_t capacity = locallen*6;	/* size of the scratch output buffer */
+	size_t remaining = capacity, srcleft = locallen;
+	size_t converted = locallen;	/* result if iconv() fails */
+	char *scratch = xcalloc(capacity,1);
+	char *dst = scratch;
+	const char *src = local;
+	iconv(local_to_utf8, NULL, NULL, NULL, NULL);
+	if (iconv(local_to_utf8,  (char**)&src,  &srcleft,  &dst,  &remaining) != (size_t)-1) {
+		*dst = 0;
+		converted = capacity - remaining;
+	} else {
+#if TEST
+		perror("failed");
+#endif
+	}
+	free(scratch);
+	return converted;
+#else
+	return locallen;
+#endif
+}
+
+/* Copy localsize bytes from from_local to to_utf, converting locale codeset -> UTF-8.
+ * The input is copied verbatim when the locale is UTF-8, when it already is
+ * valid UTF-8, or when iconv() fails.  to_utf is assumed to hold localsize*6 bytes. */
+void utfcpy(char *to_utf, char *from_local, size_t localsize)
+{
+	size_t written = localsize;	/* bytes stored in to_utf, for the debug trace */
+#ifndef NO_ICONV
+	initlocale();
+	/* UTF-8 locale, or input that already is UTF-8: plain copy */
+	if (same || maybe_utf8(from_local, localsize)) {
+		memcpy(to_utf, from_local, localsize);
+		return;
+	}
+
+	size_t outlen=localsize*6;
+	iconv(local_to_utf8, NULL, NULL, NULL, NULL);
+	char *vfrom_local = from_local;
+	char *vto_utf = to_utf;
+	size_t vlocalsize = localsize;
+	if (iconv(local_to_utf8,  &vfrom_local,  &vlocalsize,  &vto_utf,  &outlen) != (size_t)-1) {
+		written = localsize*6 - outlen;
+	} else {
+		/* a "%.*s" precision must be an int, hence the casts */
+		fprintf(stderr,"Failed to convert %.*s to UTF\n", (int)localsize, from_local);
+		memcpy(to_utf,  from_local,  localsize);
+	}
+#else
+	memcpy(to_utf, from_local, localsize);
+#endif
+#ifdef DEBUG
+	if (debug()) fprintf(stderr,"%.*s ->UTF %.*s\n", (int)localsize, from_local, (int)written, to_utf);
+#endif
+}
+
+/* Byte count `utf` needs in the locale codeset; utflen is returned as-is if the locale is UTF-8 or iconv() fails. */
+size_t locallen(const char *utf, size_t utflen)
+{
+#ifndef NO_ICONV
+	initlocale();
+	if (same)
+		return utflen;
+	char *outbuf=xcalloc(utflen*4,1); /* ??, can we be more specific? */
+	char *out=outbuf;
+	size_t outlen=utflen*4;
+	iconv(utf8_to_local, NULL, NULL, NULL, NULL);
+	const char *vutf = utf;	/* iconv() advances this copy; keep it const like the parameter */
+	size_t vutflen = utflen;
+	if (iconv(utf8_to_local,  (char**)&vutf,  &vutflen,  &out,  &outlen) == (size_t)-1) {
+#ifdef DEBUG
+		perror("failed");
+#endif
+		free(outbuf);
+		return utflen;
+	}
+	/* No NUL is stored: only the length is needed, and a full or zero-sized buffer has no room for one. */
+	free(outbuf);
+	return utflen*4 - outlen;
+#else
+	return utflen;
+#endif
+}
+
+/* Copy utflen bytes from fromutf to tolocal, converting UTF-8 -> locale codeset; copied verbatim
+ * when the locale is UTF-8 or iconv() fails.  tolocal is assumed to hold utflen*4 bytes. */
+void localcpy(char *tolocal, char *fromutf, size_t utflen)
+{
+	size_t written = utflen;	/* bytes stored in tolocal, for the debug trace */
+	initlocale();
+	if (same) {
+		memcpy(tolocal, fromutf, utflen);
+		return;
+	}
+#ifndef NO_ICONV
+	iconv(utf8_to_local,  NULL,  NULL,  NULL,  NULL);
+	size_t outlen=utflen*4, vutflen = utflen;
+	char *vfromutf = fromutf, *vtolocal = tolocal;
+	if (iconv(utf8_to_local,  &vfromutf,  &vutflen,  &vtolocal,  &outlen) != (size_t)-1) {
+		written = utflen*4 - outlen;
+	} else {
+		fprintf(stderr,"Failed to convert %.*s to LOCAL\n", (int)utflen, fromutf); /* precision must be int */
+		memcpy(tolocal, fromutf, utflen);
+	}
+#else
+	memcpy(tolocal, fromutf, utflen);	/* built without iconv: plain copy */
+#endif
+#ifdef DEBUG
+	if (debug()) fprintf(stderr,"%.*s ->LOCAL %.*s\n", (int)utflen, fromutf, (int)written, tolocal);
+#endif
+}
+
+int PP(const char *fmt,...) /* printf-style output to stderr; returns vfprintf()'s result */
+{
+  va_list args;
+  va_start(args,fmt);
+  int written=vfprintf(stderr,fmt,args);
+  va_end(args);
+  return written;
+}
+
+/* Cached result of the DEBUG lookup: -1 = not looked up yet, otherwise 0 or 1. */
+int debugf=-1;
+
+/*
+ * Return 1 when the DEBUG environment variable is set to a non-empty
+ * string, 0 otherwise.  getenv() is consulted only while debugf is -1.
+ */
+int debug()
+{
+	if (debugf == -1) {
+		const char *value = getenv("DEBUG");
+		debugf = (value && value[0] != 0) ? 1 : 0;
+	}
+	return debugf == 1;
+}
+
diff --git a/utf.h b/utf.h
new file mode 100644
index 0000000..c6c6224
--- /dev/null
+++ b/utf.h
@@ -0,0 +1,27 @@
+#ifndef UTF_H
+#define UTF_H 1
+
+/** utflen: the number of octets 'local' (locallen octets, assumed to be in
+ *  the locale's charset) would occupy encoded as utf8.
+ *  locallen: the reverse, the octets 'utf' would occupy in the locale's charset. */
+extern size_t utflen(const char *local,size_t locallen);
+extern size_t locallen(const char *utf,size_t utflen);
+
+/* Copy 'localen' octets from 'fromlocal' to 'toutf', transforming to utf8 */
+extern void utfcpy(char *toutf,char *fromlocal,size_t localen);
+
+/* Copy 'utflen' octets from 'fromutf' to 'tolocal', transforming to the locale's charset */
+extern void localcpy(char *tolocal,char *fromutf,size_t utflen);
+
+#ifdef DEBUG
+#define D(x) do { if (debug()) fprintf(stderr,"%s:%d:%s\n",__FILE__,__LINE__,x); } while(0) /* trace string x with file:line when debug() is on */
+#define P(x) do { if (debug()) { fprintf(stderr,"%s:%d:",__FILE__,__LINE__); PP x; } } while(0) /* x is a parenthesised printf argument list: P(("fmt", args)) */
+int PP(const char *fmt,...);
+int debug();
+
+#else
+#define D(x)
+#define P(x)
+#endif
+
+#endif
-- 
1.6.3.dirty

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]