Re: [RFC] i18n.pathencoding

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Torsten Bögershausen skrev 2012-09-01 08.11:
> Allow path names to be encoded in UTF-8 in the repository
> and checkout out as e.g. ISO-8859-1 in the working tree.

Ack for attempting this.

Did it myself in 2007, but the time wasn't ripe then, I guess.

> +i18n.pathEncoding::
> +	This option is only used by some implementations of git.
> +	When "git init" sets core.supportspathencoding to true,
> +	i18n.pathEncoding can be set to re-encode path names when
> +	a working tree is checked out.
> +	Path names may be e.g. encoded in ISO-8859-1 and are stored as
> +	UTF-8 encoded in the repository.
> +	When not set, the encoding of path names is the same in working tree
> +	and the repository.

"If set, then core.precomposeunicode is ignored on Mac OS X."

> diff --git a/compat/reencode_pathname.c b/compat/reencode_pathname.c
> new file mode 100644
> index 0000000..3bdc776
> --- /dev/null
> +++ b/compat/reencode_pathname.c
> @@ -0,0 +1,441 @@
> +/*
> + * Converts pathnames from one encoding into another.
> + * The pathnames are stored as UTF-8 in the repository,
> + * and might be checkout out as e.g. ISO-8859-1 in the working tree
> + *
> + * On MacOS X decomposed unicode is converted into precomposed unicode.
, ignoring the setting of core.precomposeunicode.

[...]
> + */
> +
> +#define REENCODE_PATHNAME_C
> +#include "cache.h"
> +#include "utf8.h"
> +#include "reencode_pathname.h"
> +
> +#if defined(OLD_ICONV) || (defined(__sun__) && !defined(_XPG6))
> +	typedef const char *iconv_ibp;
> +#else
> +	typedef char *iconv_ibp;
> +#endif
> +
> +const static char *repo_path_encoding = "UTF-8";
> +
> +static iconv_t iconv_open_or_die(const char *tocode, const char *fromcode)
> +{
> +	iconv_t my_iconv;
> +	my_iconv = iconv_open(tocode, fromcode);
join these two lines

> +	if (my_iconv == (iconv_t) -1)
> +		die_errno(_("iconv_open(%s,%s) failed"), tocode, fromcode);
> +	return my_iconv;
> +}
> +
> +static size_t has_non_ascii(const char *s, size_t maxlen, size_t *strlen_c)
> +{
> +	const uint8_t *ptr = (const uint8_t *)s;
> +	size_t strlen_chars = 0;
> +	size_t ret = 0;
> +
> +	if (!ptr || !*ptr)
> +		return 0;
> +
> +	while (*ptr && maxlen) {
> +		if (*ptr & 0x80)
> +			ret++;
> +		strlen_chars++;
> +		ptr++;
> +		maxlen--;
> +	}
> +	if (strlen_c)
> +		*strlen_c = strlen_chars;
> +
> +	return ret;
> +}
> +
> +#ifdef PRECOMPOSE_UNICODE
> +void probe_utf8_pathname_composition(char *path, int len)
> +{
> +	static const char *auml_nfc = "\xc3\xa4";
> +	static const char *auml_nfd = "\x61\xcc\x88";
> +	int output_fd;
> +	if (precomposed_unicode != -1)
> +		return; /* We found it defined in the global config, respect it */
a blank line here would be nice

> +	strcpy(path + len, auml_nfc);
> +	output_fd = open(path, O_CREAT|O_EXCL|O_RDWR, 0600);
> +	if (output_fd >= 0) {
> +		close(output_fd);
> +		strcpy(path + len, auml_nfd);
> +		/* Indicate to the user, that we can configure it to true */
> +		if (!access(path, R_OK))
> +			git_config_set("core.precomposeunicode", "false");
> +		/* To be backward compatible, set precomposed_unicode to 0 */
> +		precomposed_unicode = 0;
> +		strcpy(path + len, auml_nfc);
> +		if (unlink(path))
> +			die_errno(_("failed to unlink '%s'"), path);
> +	}
> +}
> +#endif

[...]

> +struct dirent_psx *renc_pn_readdir(RENC_FN_DIR *renc_pn_dir)
> +{
> +	struct dirent *res;
> +	res = readdir(renc_pn_dir->dirp);
> +	if (res) {
> +		size_t namelenz = strlen(res->d_name) + 1; /* \0 */
> +		size_t new_len_needed = 0;
> +		int ret_errno = errno;
> +
> +		renc_pn_dir->dirent_utf8->d_ino	 = res->d_ino;
> +		renc_pn_dir->dirent_utf8->d_type = res->d_type;
> +	do {
> +		 if (new_len_needed > renc_pn_dir->dirent_utf8->max_name_len) {
indent

[...]

> diff --git a/environment.c b/environment.c
> index 85edd7f..ba81575 100644
> --- a/environment.c
> +++ b/environment.c
> @@ -59,6 +59,7 @@ int grafts_replace_parents = 1;
>   int core_apply_sparse_checkout;
>   int merge_log_config = -1;
>   int precomposed_unicode = -1; /* see probe_utf8_pathname_composition() */
> +const char *wt_path_encoding = NULL;
indent

>   struct startup_info *startup_info;
>   unsigned long pack_size_limit_cfg;
>
> diff --git a/git-compat-util.h b/git-compat-util.h
> index 35b095e..877b060 100644
> --- a/git-compat-util.h
> +++ b/git-compat-util.h
> @@ -153,13 +153,21 @@
>   #endif
>   #endif
>
> -/* used on Mac OS X */
> -#ifdef PRECOMPOSE_UNICODE
> -#include "compat/precompose_utf8.h"
> +#if defined(PATH_ENCODING) || defined(PRECOMPOSE_UNICODE)
> +#include "compat/reencode_pathname.h"
>   #else
> -#define precompose_str(in,i_nfd2nfc)
> -#define precompose_argv(c,v)
> -#define probe_utf8_pathname_composition(a,b)
> +#define reencode_argv(c,v)
> +#endif
> +
> +/* needed for Mac OS X */
> +#ifndef PRECOMPOSE_UNICODE
> +#define probe_utf8_pathname_composition(a,b);
> +#endif
> +
> +#ifndef PATH_ENCODING
> +#define str_worktree2repolen(in, insz) (NULL)
> +#define str_repo2worktree(in) (NULL)
> +#define str_worktree2repo(in) (NULL)
>   #endif
>
>   #ifndef NO_LIBGEN_H
> diff --git a/parse-options.c b/parse-options.c
> index c1c66bd..5840c18 100644
> --- a/parse-options.c
> +++ b/parse-options.c
> @@ -476,7 +476,7 @@ int parse_options(int argc, const char **argv, const char *prefix,
>   		usage_with_options(usagestr, options);
>   	}
>
> -	precompose_argv(argc, argv);
> +	reencode_argv(argc, argv);
>   	return parse_options_end(&ctx);
>   }
>
> diff --git a/t/t3911-i18n-filename-8859.sh b/t/t3911-i18n-filename-8859.sh
> new file mode 100755
> index 0000000..aa2be57
> --- /dev/null
> +++ b/t/t3911-i18n-filename-8859.sh
> @@ -0,0 +1,251 @@
> +#!/bin/sh
> +#
> +# Copyright (c) 2010 Torsten Bögershausen
> +#
> +
> +test_description='file system encodings UTF-8 ISO8859-1'
> +
> +. ./test-lib.sh
> +
> +fname_UTF_8=`printf '\303\206\302\242'`
> +fname_ISO8859_1=`printf '\306\242'`
> +Euro_utf8=`printf '\342\202\254'`
> +supportspathencoding=`git config core.supportspathencoding` || :
> +
> +
> +add_file_dir_link() {
> +	local bname=$1
> +	local fname=$2
> +	test_expect_success "add file $fname.f $bname" '
> +		git checkout master &&
> +		git checkout -b add_f_$bname &&
> +		>$fname.f &&
> +		git add $fname.f &&
> +		git commit -m "add fname"
> +	'
> +
> +	test_expect_success "add dir $fname.d $bname" '
> +		git checkout master &&
> +		git checkout -b add_d_$bname &&
> +		mkdir $fname.d &&
> +		touch $fname.d/$fname.f &&
> +		git add $fname.d/$fname.f &&
> +		git commit -m "add fname.d/fname"
> +	'
> +
> +	i=0
> +	for src in x $fname; do
> +		for dst in x $fname; do
> +			test_expect_success "add link $dst.l->$src.f on branch add_l_${i}_$bname" '
> +				git checkout master &&
> +				git checkout -b add_l_${i}_$bname &&
> +				ln -s $src.f $dst.l &&
> +				git add $dst.l &&
> +				git commit -m "add fname.l $i"
> +			'
> +			i=$(($i+1))
> +		done
> +	done
> +}
> +
> +test_expect_success "setup add rm x" '
> +	>x &&
> +	git add x &&
> +	git commit -m "1st commit" &&
> +	git rm x &&
> +	git commit -m "rm x"
> +'
> +
> +#combinations to be tested:
> +# UTF-8     -> ISO8859-1
> +# ISO8859-1 -> UTF-8
> +
> +if test "$supportspathencoding"
> +then
> +	srcencodings="ISO8859-1 UTF-8"
> +	for srcenc in $srcencodings
> +	do
> +		case $srcenc in
> +		ISO8859-1)
> +			dstenc=UTF-8
> +		;;
> +		UTF-8)
> +			dstenc=ISO8859-1
> +		;;
> +		UTF-8-MAC)
> +			dstenc=UTF-8
> +		;;
> +		*)
> +			echo >&2 "Wrong encoding $srcenc"
> +			exit 1
> +		;;
> +		esac
> +		eval fname_src=\$fname_$(echo $srcenc | sed -e 's/-/_/g' -e 's/_MAC//')
> +		eval fname_dst=\$fname_$(echo $dstenc | sed -e 's/-/_/g')
> +		test_expect_success "setup $srcenc" '
> +			git checkout master &&
> +			git config i18n.pathencoding $srcenc
> +		'
> +		add_file_dir_link $srcenc $fname_src
> +
> +		test_expect_success "setup $dstenc" '
> +			git checkout master &&
> +			echo "git checkout Master" >&2
> +			ls -l >&2
> +			git config i18n.pathencoding $dstenc
> +		'
> +
> +		test_expect_success "checkout file $dstenc (was $srcenc)" '
> +			git checkout add_f_$srcenc
> +		'
> +
> +		test_expect_success "exists file $dstenc (was $srcenc)" '
> +			test -f $fname_dst.f
> +		'
> +
> +		test_expect_success "log file $dstenc (was $srcenc)" '
> +			git log $fname_dst.f
> +		'
> +
> +		test_expect_success "git mv" '
> +			git checkout -b mv_file_$srcenc &&
> +			git mv $fname_dst.f XX.f &&
> +			git commit -m "git mv fname_dst.f XX.f"
> +		'
> +
> +		test_expect_success "checkout dir $dstenc (was $srcenc)" '
> +			git checkout add_d_$srcenc
> +		'
> +
> +		test_expect_success "exist dir $dstenc (was $srcenc)" '
> +			test -d $fname_dst.d
> +		'
> +
> +		test_expect_success "log dir $dstenc (was $srcenc)" '
> +			git log $fname_dst.d
> +		'
> +
> +		i=0
> +		for src in x $fname_dst; do
> +			for dst in x $fname_dst; do
> +				test_expect_success "checkout link $dst.l->$src.f branch add_l_${i}_$srcenc" '
> +					git checkout add_l_${i}_$srcenc
> +				'
> +				test_expect_success "exist link $dst.l->$src.f branch add_l_${i}_$srcenc" '
> +					test -L $dst.l
> +				'
> +				test_expect_success "log link $dst.l->$src.f branch add_l_${i}_$srcenc" '
> +					git log $dst.l
> +				'
> +				test_expect_success "readlink $dst.l->$src.f branch add_l_${i}_$srcenc" '
> +					echo "$src.f" >expect &&
> +					readlink "$dst.l" > actual &&
> +					test_cmp expect actual &&
> +					rm expect actual
> +				'
> +				i=$(($i+1))
> +			done
> +		done
> +	done
> +	# Make sure that Euro sign can NOT be checked out in 8859
"8859-1", The euro sign exists in 8859-15.

> +	#fname_src=Euro
> +	test_expect_success "setup UTF-8" '
> +		git checkout master &&
> +		git config i18n.pathencoding UTF-8
> +	'
> +	add_file_dir_link Euro $Euro_utf8
> +
> +	test_expect_success "setup ISO8859-1" '
> +		git checkout master &&
> +		rm -rf * &&
> +		git config i18n.pathencoding ISO8859-1
> +	'
> +	test_expect_success "checkout file Euro branch add_f_Euro" '
> +		git checkout add_f_Euro
Missing && ?

> +		echo *  >actual &&
> +		echo "*" >expect &&
> +		test_cmp expect actual &&
> +		rm expect actual
> +	'
> +
> +	test_expect_success "checkout dir Euro branch add_d_Euro" '
> +		rm -rf * &&
> +		test_must_fail git checkout add_d_Euro
> +	'
> +
> +	test_expect_success "Cleanup" '
> +		git config i18n.pathencoding UTF-8 &&
> +		git checkout master &&
> +		rm -rf * &&
> +		git reset --hard &&
> +		git config i18n.pathencoding ISO8859-1
> +	'
> +
> +	test_expect_success "checkout link Euro.l->x.f branch add_l_1_Euro" '
> +		! git checkout add_l_1_Euro
> +	'
> +
> +	test_expect_success "No link Euro.l->x.f" '
> +		echo *  >actual &&
> +		echo "*" >expect &&
> +		test_cmp expect actual &&
> +		rm expect actual
> +	'
> +
> +	test_expect_success "Cleanup after Euro.l->x.f" '
> +		git config i18n.pathencoding UTF-8 &&
> +		git checkout master &&
> +		rm -rf * &&
> +		git reset --hard &&
> +		git config i18n.pathencoding ISO8859-1
> +	'
> +
> +	# Checkoing out a soft link pointing to a filename outside
"checking"

> +	# 8859-1 should fail
> +	test_expect_failure "checkout link x.l->Euro.f branch add_l_2_Euro" '
> +		! git checkout add_l_2_Euro
> +	'
> +
> +	test_expect_success "No link x.f->Euro.l" '
> +		echo *  >actual &&
> +		echo "*" >expect &&
> +		test_cmp expect actual &&
> +		rm expect actual
> +	'
> +
> +	test_expect_success "Cleanup after link x.l->Euro.f branch" '
> +		git config i18n.pathencoding UTF-8 &&
> +		git checkout master &&
> +		rm -rf * &&
> +		git reset --hard &&
> +		git config i18n.pathencoding ISO8859-1
> +	'
> +
> +	test_expect_success "checkout link Euro.l->Euro.f branch add_l_3_Euro" '
> +		! git checkout add_l_3_Euro
> +	'
> +
> +	test_expect_success "No link Euro.l->Euro.f" '
> +		echo *  >actual &&
> +		echo "*" >expect &&
> +		test_cmp expect actual &&
> +		rm expect actual
> +	'
> +
> +else
> +	test_expect_success "setup 8859" '
"8859-1"

> +		git config i18n.pathencoding ISO8859-1 &&
> +		git checkout -b add_file_8859 &&
> +		> $fname_src.f &&
> +		git add $fname_src.f &&
> +		git commit -m "add fname_src" &&
> +		git config i18n.pathencoding UTF-8 &&
> +		rm -rf * &&
> +		git reset --hard
> +	'
> +	test_expect_success "Silent support of pathencoding" '
> +		test_must_fail test -f $fname_UTF_8.f
> +	'
> +fi
> +
> +test_done

-- robin

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]