Re: [PATCH] hfsplus: fix decomposition of Hangul characters

Viacheslav Dubeyko <slava@xxxxxxxxxxx> · Mon, 27 Nov 2017 14:40:53 -0800

On Mon, 2017-11-27 at 16:36 -0300, Ernesto A. Fernández wrote:
> It would be good to have some feedback on the use of code under the
> Unicode License, I don't know if that can be a problem.
> 
> To Ting-Chang Hou: This patch is slightly different from the one you
> tested with your mac. You are still tagged as Tested-by, but you may
> want to check again that I have not broken anything.
> 
> Thanks,
> Ernest
> 
> -- >8 --
> Subject: [PATCH] hfsplus: fix decomposition of Hangul characters
> 
> Files created under macOS become essentially unusable under linux if
> their names contain any Hangul, and vice versa.
> 
> This happens because the normalization of Hangul characters is a special
> case: it can be done algorithmically, without need of a table. The
> hfsplus module deals with Hangul correctly when composing, but
> completely forgets about it when performing a decomposition.
> 
> Solve this by using the Hangul decomposition function provided in the
> Unicode Standard. It's under the Unicode License, compatible with the
> GPL.
> 

I believe it makes sense to share more details about algorithm details.
I think that the Unicode standard contains some explanation how to
decompose correctly. It will be great to have some brief citation in the
patch description.

> This patch will cause trouble for Hangul filenames already created by
> the module in the past. This shouldn't be a special concern because the
> main purpose of the module was always sharing with macOS. If a user
> actually needs to access such a file the nodecompose mount option should
> be enough.
> 
> Reported-by: Ting-Chang Hou <tchou@xxxxxxxxxxxx>
> Tested-by: Ting-Chang Hou <tchou@xxxxxxxxxxxx>
> Signed-off-by: Ernesto A. Fernández <ernesto.mnd.fernandez@xxxxxxxxx>
> ---
>  fs/hfsplus/unicode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 56 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
> index dfa90c2..7ceae91 100644
> --- a/fs/hfsplus/unicode.c
> +++ b/fs/hfsplus/unicode.c
> @@ -272,8 +272,8 @@ static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
>  	return size;
>  }
>  
> -/* Decomposes a single unicode character. */
> -static inline u16 *decompose_unichar(wchar_t uc, int *size)
> +/* Decomposes a non-Hangul unicode character. */
> +static u16 *decompose_nonhangul(wchar_t uc, int *size)
>  {
>  	int off;
>  
> @@ -296,6 +296,51 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size)
>  	return hfsplus_decompose_table + (off / 4);
>  }
>  
> +/*
> + * Decomposes a Hangul unicode character.
> + *
> + * This function was adapted from sample code in section 3.12 of the
> + * Unicode Standard, version 10.0.
> + *
> + * Copyright (C) 1991-2017 Unicode, Inc.  All rights reserved.  Distributed
> + * under the Terms of Use in http://www.unicode.org/copyright.html.
> + */
> +static int decompose_hangul(wchar_t uc, u16 *result)
> +{
> +	int index;
> +	int l, v, t;
> +
> +	index = uc - Hangul_SBase;
> +	if (index < 0 || index >= Hangul_SCount)
> +		return 0;
> +
> +	l = Hangul_LBase + index / Hangul_NCount;
> +	v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount;
> +	t = Hangul_TBase + index % Hangul_TCount;
> +
> +	result[0] = l;
> +	result[1] = v;
> +	if (t != Hangul_TBase) {
> +		result[2] = t;
> +		return 3;
> +	}
> +	return 2;
> +}
> +
> +/* Decomposes a single unicode character. */
> +static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer)
> +{
> +	u16 *result;
> +
> +	/* Hangul is handled separately */
> +	result = hangul_buffer;
> +	*size = decompose_hangul(uc, result);

Is it possible to distinguish the different cases beforehand and to
select one proper function for execution without the execution both ones
in the worst case?

> +	if (*size == 0)
> +		/* not Hangul */
> +		result = decompose_nonhangul(uc, size);

I think it makes sense to use the brackets here. If I remember
correctly, this is the code style's requirement.

Thanks,
Vyacheslav Dubeyko.

> +	return result;
> +}
> +
>  int hfsplus_asc2uni(struct super_block *sb,
>  		    struct hfsplus_unistr *ustr, int max_unistr_len,
>  		    const char *astr, int len)
> @@ -303,13 +348,14 @@ int hfsplus_asc2uni(struct super_block *sb,
>  	int size, dsize, decompose;
>  	u16 *dstr, outlen = 0;
>  	wchar_t c;
> +	u16 dhangul[3];
>  
>  	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
>  	while (outlen < max_unistr_len && len > 0) {
>  		size = asc2unichar(sb, astr, len, &c);
>  
>  		if (decompose)
> -			dstr = decompose_unichar(c, &dsize);
> +			dstr = decompose_unichar(c, &dsize, &dhangul[0]);
>  		else
>  			dstr = NULL;
>  		if (dstr) {
> @@ -344,6 +390,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
>  	unsigned long hash;
>  	wchar_t c;
>  	u16 c2;
> +	u16 dhangul[3];
>  
>  	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
>  	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
> @@ -357,7 +404,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
>  		len -= size;
>  
>  		if (decompose)
> -			dstr = decompose_unichar(c, &dsize);
> +			dstr = decompose_unichar(c, &dsize, &dhangul[0]);
>  		else
>  			dstr = NULL;
>  		if (dstr) {
> @@ -396,6 +443,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
>  	const char *astr1, *astr2;
>  	u16 c1, c2;
>  	wchar_t c;
> +	u16 dhangul_1[3], dhangul_2[3];
>  
>  	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
>  	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
> @@ -413,7 +461,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
>  			len1 -= size;
>  
>  			if (decompose)
> -				dstr1 = decompose_unichar(c, &dsize1);
> +				dstr1 = decompose_unichar(c, &dsize1,
> +							  &dhangul_1[0]);
>  			if (!decompose || !dstr1) {
>  				c1 = c;
>  				dstr1 = &c1;
> @@ -427,7 +476,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry,
>  			len2 -= size;
>  
>  			if (decompose)
> -				dstr2 = decompose_unichar(c, &dsize2);
> +				dstr2 = decompose_unichar(c, &dsize2,
> +							  &dhangul_2[0]);
>  			if (!decompose || !dstr2) {
>  				c2 = c;
>  				dstr2 = &c2;