From: Ben Myers <bpm@xxxxxxx> Here's a preliminary test for utf8 support in xfs. It is based on code that also does some testing in the trie generator. Here too we are using the NormalizationTest.txt file from the unicode distribution. We check that the normalization in libxfs is working and then run checks on a filesystem. Note that there are some 'blacklisted' unichars which normalize to reserved characters. FIXME: For convenience of build this patch is against xfsprogs access to libxfs. Handling of ignorables and case fold is also not implemented here. --- Makefile | 2 +- chkutf8data/Makefile | 21 +++ chkutf8data/chkutf8data.c | 430 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 452 insertions(+), 1 deletion(-) create mode 100644 chkutf8data/Makefile create mode 100644 chkutf8data/chkutf8data.c diff --git a/Makefile b/Makefile index c442da6..d4c0a23 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ endif LIB_SUBDIRS = support libxfs libxlog libxcmd libhandle libdisk TOOL_SUBDIRS = copy db estimate fsck fsr growfs io logprint mkfs quota \ - mdrestore repair rtcp m4 man doc po debian + mdrestore repair rtcp m4 man doc po debian chkutf8data SUBDIRS = include $(LIB_SUBDIRS) $(TOOL_SUBDIRS) diff --git a/chkutf8data/Makefile b/chkutf8data/Makefile new file mode 100644 index 0000000..6ce5706 --- /dev/null +++ b/chkutf8data/Makefile @@ -0,0 +1,21 @@ +# +# Copyright (c) 2014 SGI. All Rights Reserved. +# + +TOPDIR = .. +include $(TOPDIR)/include/builddefs + +LTCOMMAND = chkutf8data +CFILES = chkutf8data.c + +LLDLIBS = $(LIBXFS) +LTDEPENDENCIES = $(LIBXFS) +LLDFLAGS = -static + +default: depend $(LTCOMMAND) + +include $(BUILDRULES) + +install: default + +-include .ltdep diff --git a/chkutf8data/chkutf8data.c b/chkutf8data/chkutf8data.c new file mode 100644 index 0000000..487cf1e --- /dev/null +++ b/chkutf8data/chkutf8data.c @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2014 SGI. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <sys/types.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include "utf8norm.h" + +#define FOLD_NAME "CaseFolding.txt" +#define TEST_NAME "NormalizationTest.txt" + +const char *fold_name = FOLD_NAME; +const char *test_name = TEST_NAME; + +/* An arbitrary line size limit on input lines. */ + +#define LINESIZE 1024 +char line[LINESIZE]; +char buf0[LINESIZE]; +char buf1[LINESIZE]; +char buf2[LINESIZE]; +char buf3[LINESIZE]; +char buf4[LINESIZE]; +char buf5[LINESIZE]; + +const char *mtpt; +int verbose = 0; + +/* ------------------------------------------------------------------ */ + +static void +help(void) +{ + printf("The input files:\n"); + printf("\t-f %s\n", FOLD_NAME); + printf("\t-t %s\n", TEST_NAME); + printf("\n\n"); + printf("\t-m mtpt\n"); + printf("\t-v (verbose)\n"); + printf("\t-h (help)\n"); + printf("\n"); +} + +static void +usage(void) +{ + help(); + exit(1); +} + +static void +open_fail(const char *name, int error) +{ + printf("Error %d opening %s: %s\n", error, name, strerror(error)); + exit(1); +} + +static void +file_fail(const char *filename) +{ + printf("Error parsing %s\n", filename); + exit(1); +} + +/* ------------------------------------------------------------------ */ + +/* + * UTF8 valid ranges. + * + * The UTF-8 encoding spreads the bits of a 32bit word over several + * bytes. This table gives the ranges that can be held and how they'd + * be represented. + * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * There is an additional requirement on UTF-8, in that only the + * shortest representation of a 32bit value is to be used. A decoder + * must not decode sequences that do not satisfy this requirement. + * Thus the allowed ranges have a lower bound. + * + * 0x00000000 0x0000007F: 0xxxxxxx + * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, + * 17 planes of 65536 values. This limits the sequences actually seen + * even more, to just the following. + * + * 0 - 0x7f: 0 0x7f + * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf + * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf + * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf + * + * Even within those ranges not all values are allowed: the surrogates + * 0xd800 - 0xdfff should never be seen. + * + * Note that the longest sequence seen with valid usage is 4 bytes, + * the same a single UTF-32 character. This makes the UTF-8 + * representation of Unicode strictly smaller than UTF-32. + * + * The shortest sequence requirement was introduced by: + * Corrigendum #1: UTF-8 Shortest Form + * It can be found here: + * http://www.unicode.org/versions/corrigendum1.html + * + */ + +#define UTF8_2_BITS 0xC0 +#define UTF8_3_BITS 0xE0 +#define UTF8_4_BITS 0xF0 +#define UTF8_N_BITS 0x80 +#define UTF8_2_MASK 0xE0 +#define UTF8_3_MASK 0xF0 +#define UTF8_4_MASK 0xF8 +#define UTF8_N_MASK 0xC0 +#define UTF8_V_MASK 0x3F +#define UTF8_V_SHIFT 6 + +static int +utf8key(unsigned int key, char keyval[]) +{ + int keylen; + + if (key < 0x80) { + keyval[0] = key; + keylen = 1; + } else if (key < 0x800) { + keyval[1] = key & UTF8_V_MASK; + keyval[1] |= UTF8_N_BITS; + key >>= UTF8_V_SHIFT; + keyval[0] = key; + keyval[0] |= UTF8_2_BITS; + keylen = 2; + } else if (key < 0x10000) { + keyval[2] = key & UTF8_V_MASK; + keyval[2] |= UTF8_N_BITS; + key >>= UTF8_V_SHIFT; + keyval[1] = key & UTF8_V_MASK; + keyval[1] |= UTF8_N_BITS; + key >>= UTF8_V_SHIFT; + keyval[0] = key; + keyval[0] |= UTF8_3_BITS; + keylen = 3; + } else if (key < 0x110000) { + keyval[3] = key & UTF8_V_MASK; + keyval[3] |= UTF8_N_BITS; + key >>= UTF8_V_SHIFT; + keyval[2] = key & UTF8_V_MASK; + keyval[2] |= UTF8_N_BITS; + key >>= UTF8_V_SHIFT; + keyval[1] = key & UTF8_V_MASK; + keyval[1] |= UTF8_N_BITS; + key >>= UTF8_V_SHIFT; + keyval[0] = key; + keyval[0] |= UTF8_4_BITS; + keylen = 4; + } else { + printf("%#x: illegal key\n", key); + keylen = 0; + } + return keylen; +} + +static int +normalize_line(utf8data_t tree, char *s, char *t) +{ + struct utf8cursor u8c; + + if (utf8cursor(&u8c, tree, s)) { + printf("%s return utf8cursor failed\n", __func__); + return -1; + } + + while ((*t = utf8byte(&u8c)) > 0) + t++; + + if (*t < 0) { + printf("%s return error %d\r", __func__, *t); + return -1; + } + if (*t != 0) { + printf("%s return t not 0\n", __func__); + return -1; + } + + return 0; +} + +static void +test_key(char *source, + char *NFC, + char *NFD, + char *NFKC, + char *NFKD) +{ + int fd; + int error; + + if (verbose) + printf("Testing %s -> %s\n", source, NFKD); + + error = chdir(mtpt); /* XXX hardcoded mount point */ + if (error) { + perror(mtpt); + exit(-1); + } + + /* the initial create should succeed */ + if (verbose) + printf("Initial create %s... ", source); + fd = open(source, O_CREAT|O_EXCL, 0); + if (fd < 0) { + printf("Failed to create %s XXX\n", source); + perror(source); + close(fd); + exit(-1); + } + close(fd); + if (verbose) + printf("Success\n"); + + /* a second create should fail */ + if (verbose) + printf("Second create %s (should return EEXIST)... ", NFKD); + fd = open(NFKD, O_CREAT|O_EXCL, 0); + if (fd >= 1) { + printf("Test Failed. Was able to create %s XXX\n", NFKD); + perror(NFKD); + close(fd); + exit(-1); + } + close(fd); + if (verbose) + printf("EEXIST\n"); + + error = unlink(NFKD); + if (error) { + printf("Unlink failed\n"); + perror(NFKD); + exit(-1); + } +} + +int +blacklisted(unsigned int unichar) +{ + /* these unichars normalize to characters we don't allow */ + unsigned int list[] = { 0x2024 /* . */, + 0x2025 /* .. */, + 0x2100 /* a/c */, + 0x2101 /* a/s */, + 0x2105 /* c/o */, + 0x2106 /* c/u */, + 0xFE30 /* .. */, + 0xFE52 /* . */, + 0xFF0E /* . */, + 0xFF0F /* / */}; + int i; + + for (i=0; i < (sizeof(list) / sizeof(unichar)); i++) { + if (list[i] == unichar) + return 1; + } + return 0; +} + +static void +normalization_test(void) +{ + FILE *file; + unsigned int unichar; + char *s; + char *t; + int ret; + int tests = 0; + int failures = 0; + char source[LINESIZE]; + char NFKD[LINESIZE]; + int skip; + utf8data_t nfkdi = utf8nfkdi(utf8version); + + printf("Parsing %s\n", test_name); + /* Step one, read data from file. */ + file = fopen(test_name, "r"); + if (!file) + open_fail(test_name, errno); + + while (fgets(line, LINESIZE, file)) { + ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];", + source, NFKD); + //NFC, NFD, NFKC, NFKD); + if (ret != 2 || *line == '#') + continue; + + s = source; + t = buf2; + skip = 0; + while (*s) { + unichar = strtoul(s, &s, 16); + if (blacklisted(unichar)) + skip++; + t += utf8key(unichar, t); + } + *t = '\0'; + + if (skip) + continue; + + s = NFKD; + t = buf3; + while (*s) { + unichar = strtoul(s, &s, 16); + t += utf8key(unichar, t); + } + *t = '\0'; + + /* normalize source */ + if (normalize_line(nfkdi, buf2, buf4) < 0) { + printf("normalize_line for unichar %s Failed\n", buf0); + exit(1); + } + if (verbose) + printf("(%s) %s normalized to %s... ", + source, buf2, buf4); + + /* does it match NFKD? */ + tests++; + if (memcmp(buf4, buf3, strlen(buf3))) { + if (verbose) + printf("Fail!\n"); + failures++; + } else { + if (verbose) + printf("Correct!\n"); + } + + /* normalize NFKD */ + if (normalize_line(nfkdi, buf3, buf5) < 0) { + printf("normalize_line for unichar %s Failed\n", + buf3); + exit(1); + } + if (verbose) + printf("(%s) %s normalized to %s... ", + NFKD, buf3, buf5); + + /* does it normalize to itself? */ + tests++; + if (memcmp(buf5, buf3, strlen(buf3))) { + if (verbose) + printf("Fail!\n"); + failures++; + } else { + if (verbose) + printf("Correct!\n"); + } + + /* XXX ignorables need to be taken into account? */ + test_key(buf2, NULL, NULL, NULL, buf3); + } + fclose(file); + printf("Ran %d tests with %d failures\n", tests, failures); + if (failures) + file_fail(test_name); +} + +int +main(int argc, char *argv[]) +{ + int opt; + + while ((opt = getopt(argc, argv, "f:t:m:vh")) != -1) { + switch (opt) { + case 'f': + fold_name = optarg; + break; + case 't': + test_name = optarg; + break; + case 'm': + mtpt = optarg; + break; + case 'v': + verbose++; + break; + case 'h': + help(); + exit(0); + default: + usage(); + } + } + + if (!test_name || !mtpt) { + usage(); + exit(-1); + } + + normalization_test(); + + return 0; +} -- 1.7.12.4 _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs