>From 20fb95dcc1b1f85f0bc1afff39824729fea8297b Mon Sep 17 00:00:00 2001
From: Peter Radisson <--show-origin>
Date: Sun, 20 Jun 2021 22:14:59 +0200
Subject: [PATCH] convert between multibyte sequence and 16-bit wide character

documentation including example

Signed-off-by: Peter Radisson <--show-origin>
---
 man3/mbrtoc16.3 | 156 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 man3/mbrtoc16.3

diff --git a/man3/mbrtoc16.3 b/man3/mbrtoc16.3
new file mode 100644
index 000000000..18cb48adc
--- /dev/null
+++ b/man3/mbrtoc16.3
@@ -0,0 +1,156 @@
+.TH MBRTOC16 3 "2021-06-02" Linux "Linux Programmer's Manual"
+.SH NAME
+mbrtoc16, c16rtomb \- convert between multibyte sequence and 16-bit wide character
+.SH SYNOPSIS
+.nf
+.B #include <uchar.h>
+.PP
+.BI "size_t mbrtoc16(char16_t *restrict " c16 ,
+.BI "                const char *restrict " s ", size_t " n ,
+.BI "                mbstate_t *restrict " p );
+.PP
+.BI "size_t c16rtomb(char *restrict " s ", char16_t " c16 ,
+.BI "                mbstate_t *restrict " p );
+.fi
+.SH DESCRIPTION
+The
+.BR mbrtoc16 ()
+function inspects at most
+.I n
+bytes of the UTF-8 multibyte string starting at
+.IR s .
+If a valid multibyte character is identified, the corresponding UTF-16
+16-bit wide character is stored in
+.IR *c16 .
+If the multibyte character is the null wide character, the function
+resets the shift state
+.I *p
+to the initial state and returns 0.
+If
+.I p
+is NULL, a static anonymous state known only to the
+function is used instead.
+.PP
+The
+.BR c16rtomb ()
+function converts the 16-bit wide character stored in
+.I c16
+into a multibyte sequence and stores it in the memory at
+.IR s .
+.SH RETURN VALUE
+The
+.BR mbrtoc16 ()
+function returns
+0 for the null character,
+(size_t)\ \-1 for invalid input,
+(size_t)\ \-2 for a truncated input, and
+(size_t)\ \-3 when the next unit of a surrogate pair (U+D800 to U+DFFF) is
+written to
+.IR *c16 .
+No bytes are processed from the input in this case.
+.PP
+Otherwise, the number of bytes in the multibyte sequence is returned.
+.PP
+The
+.BR c16rtomb ()
+function returns (size_t)\ \-1 on error, otherwise the number of bytes used
+for the multibyte sequence.
+.SH EXAMPLES
+The input sequence is written with escape sequences to allow a proper display.
+Note that the input is UTF-8 and the output is UTF-16;
+it may not be possible to convert every code.
+.EX
+.\" https://en.cppreference.com/w/c/string/multibyte/mbrtoc16
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include <uchar.h>
+#include <wchar.h>
+
+/* Convert in_len bytes at in to UTF-16; the caller frees *outbuf. */
+void toc16(const char *in, int in_len, char16_t **outbuf, int *len)
+{
+    const char *p_in = in, *end = in + in_len;
+    char16_t *out, *p_out;
+    mbstate_t state = {0};
+    size_t rc;
+
+    *outbuf = p_out = out = malloc(in_len * sizeof(*out));
+    *len = 0;
+    if (out == NULL)
+        return;
+    do {
+        rc = mbrtoc16(p_out, p_in, end \- p_in, &state);
+        if (rc == (size_t) \-1 || rc == (size_t) \-2)
+            break;      /* invalid input, or input used up */
+        p_out++;        /* one code unit was stored */
+        if (rc != (size_t) \-3)
+            p_in += rc; /* \-3: low surrogate, no input consumed */
+    } while (rc != 0);  /* 0: the terminating null was stored */
+    *len = p_out \- out;
+}
+
+/* Convert in_len UTF-16 code units; the caller frees *outbuf. */
+void fromc16(const char16_t *in, int in_len, char **outbuf, int *len)
+{
+    char *out, *p;
+    mbstate_t state = {0};
+
+    *outbuf = p = out = malloc(MB_CUR_MAX * in_len);
+    *len = 0;
+    if (out == NULL)
+        return;
+    for (int i = 0; i < in_len; i++) {
+        size_t rc = c16rtomb(p, in[i], &state);
+        if (rc == (size_t) \-1)
+            break;
+        p += rc;
+    }
+    *len = p \- out;
+}
+
+void dump_u8(const char *in, int len)
+{
+    printf("Processing %d UTF-8 code units: [ ", len);
+    for (int i = 0; i < len; i++)
+        printf("%#x ", (unsigned char) in[i]);
+    puts("]");
+}
+
+void dump_u16(const char16_t *in, int len)
+{
+    printf("Processing %d UTF-16 code units: [ ", len);
+    for (int i = 0; i < len; i++)
+        printf("0x%04x ", (unsigned) in[i]);
+    puts("]");
+}
+
+int main(void)
+{
+    char in[] = u8"z\eu00df\eu6c34\eU0001F34C";
+    char16_t *out;
+    char *p;
+    int out_len, len;
+    /* The conversions depend on LC_CTYPE; a UTF-8 locale is required. */
+    if (setlocale(LC_ALL, "de_DE.utf8") == NULL) {
+        fprintf(stderr, "locale de_DE.utf8 is not available\en");
+        exit(EXIT_FAILURE);
+    }
+    dump_u8(in, sizeof(in));
+    toc16(in, sizeof(in), &out, &out_len);
+    dump_u16(out, out_len);
+    fromc16(out, out_len, &p, &len);
+    dump_u8(p, len);
+    free(p);
+    free(out);
+    exit(EXIT_SUCCESS);
+}
+.EE
+This is a simple example and is not production ready.
+.SH NOTES
+UTF-16 is superseded by UTF-32.
+.SH "CONFORMING TO"
+C11.
+.SH "SEE ALSO"
+.BR mbrtoc32 (3),
+.BR c32rtomb (3)
-- 
2.26.2