{
char *p_in , *end ;
char32_t *p_out,*out;
size_t rc;
out=malloc(in_len*sizeof(*out));
p_out = out;
p_in = in;
end = in + in_len;
while((rc = mbrtoc32(p_out, p_in, end ‐ p_in, NULL)))
{
if(rc == ‐1) // invalid input
break;
else if(rc == (size_t)‐2) // truncated input
break;
else if(rc == (size_t)‐3) // UTF‐32 high surrogate
p_out += 1;
else {
p_in += rc;
p_out += 1;
};
}
// out_sz = p_out ‐ out + 1;
*len=p_out ‐ out + 1;
*outbuf=out;
}
/*
 * Convert an array of UTF-32 code units to a multibyte string in the
 * encoding of the current locale.
 *
 * in      code units to convert
 * in_len  number of code units in `in`
 * outbuf  receives a malloc()ed buffer that the caller must free();
 *         set to NULL when in_len <= 0 or the allocation fails
 * len     receives the number of bytes actually written to *outbuf
 *
 * Conversion stops at the first code unit c32rtomb() rejects; the bytes
 * produced up to that point are still returned.
 */
void fromc32(char32_t *in, int in_len, char **outbuf, int *len)
{
    char *out, *p;
    size_t rc;
    int i;

    *outbuf = NULL;
    *len = 0;
    if (in_len <= 0)
        return;

    /* No code unit needs more than MB_CUR_MAX bytes. */
    out = malloc(MB_CUR_MAX * (size_t)in_len);
    if (out == NULL)
        return;

    p = out;
    for (i = 0; i < in_len; i++) {
        /* A NULL state makes c32rtomb() use its own internal state. */
        rc = c32rtomb(p, in[i], NULL);
        if (rc == (size_t)-1)
            break;
        p += rc;
    }

    *outbuf = out;
    /* Bytes written only: a terminating null in the input is already counted. */
    *len = (int)(p - out);
}
/*
 * Print a byte buffer as hexadecimal UTF-8 code units on one line.
 *
 * in   bytes to print
 * len  number of bytes; no values are listed when len <= 0
 */
void dump_u8(char *in, int len)
{
    int i;

    /* Plain ASCII '-' in "UTF-8": the typographic hyphen was a rendering artifact. */
    printf("Processing %d UTF-8 code units: [ ", len);
    for (i = 0; i < len; ++i) {
        /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
        printf("%#x ", (unsigned char)in[i]);
    }
    puts("]");
}
/*
 * Print a UTF-32 buffer as hexadecimal code units on one line.
 *
 * in   code units to print
 * len  number of code units; no values are listed when len <= 0
 */
void dump_u32(char32_t *in, int len)
{
    int i;

    /* Plain ASCII '-' in "UTF-32": the typographic hyphen was a rendering artifact. */
    printf("Processing %d UTF-32 code units: [ ", len);
    for (i = 0; i < len; ++i) {
        /* unsigned long is at least 32 bits wide, so the cast matches %lx exactly. */
        printf("0x%04lx ", (unsigned long)in[i]);
    }
    puts("]");
}
/*
 * Round-trip demonstration: dump a UTF-8 string, convert it to UTF-32 with
 * toc32(), dump the result, convert it back with fromc32() and dump it again.
 */
int main(void)
{
    /*
     * 'z', U+00DF, U+6C34 and U+1F34C written as universal character names.
     * The backslashes are required; without them this is plain ASCII text.
     */
    char in[] = "z\u00df\u6c34\U0001F34C";
    char32_t *out;
    char *p;
    int out_len, len;

    /* make sure we have utf8; the conversions depend on the active locale */
    if (setlocale(LC_ALL, "de_DE.utf8") == NULL) {
        fputs("locale de_DE.utf8 is not available\n", stderr);
        return EXIT_FAILURE;
    }

    /* sizeof counts the terminating null byte, so it is converted as well. */
    dump_u8(in, sizeof in / sizeof *in);
    toc32(in, sizeof in / sizeof *in, &out, &out_len);
    dump_u32(out, out_len);
    fromc32(out, out_len, &p, &len);
    dump_u8(p, len);

    /* Both helpers hand back malloc()ed buffers. */
    free(p);
    free(out);
    return 0;
}
This is a simple example and not production ready.
CONFORMING TO
C11
SEE ALSO
mbrtoc16(3), c16rtomb(3), mbsrtowcs(3)
Linux 2021‐06‐02 MBRTOC32(3)
]]
---
man3/mbrtoc32.3 | 154 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 154 insertions(+)
create mode 100644 man3/mbrtoc32.3
diff --git a/man3/mbrtoc32.3 b/man3/mbrtoc32.3
new file mode 100644
index 000000000..8d0c33de1
--- /dev/null
+++ b/man3/mbrtoc32.3
@@ -0,0 +1,154 @@
+.TH MBRTOC32 3 "2021-06-02" Linux "Linux Programmer's Manual"
+.SH NAME
+mbrtoc32, c32rtomb \- convert between multibyte sequence and 32-bit wide character
+.SH SYNOPSIS
+.nf
+.B #include <uchar.h>
+.PP
+.BI "size_t mbrtoc32(char32_t *restrict " c32 ,
+.BI "                const char *restrict " s ", size_t " n ,
+.BI "                mbstate_t *restrict " p );
+.PP
+.BI "size_t c32rtomb (char * restrict " s ", char32_t " c32 " ,"
+.BI " mbstate_t * restrict " p " );"
+.fi
+.SH DESCRIPTION
+The
+.BR mbrtoc32 ()
+function inspects at most
+.I n
+bytes of the UTF-8 multibyte string starting at
+.IR s .
+If a valid multibyte character is identified, the corresponding
+32-bit wide character (UTF-32) is stored in
+.IR c32 .
+If the multibyte character is the null wide character, it
+resets the shift state
+.I *p
+to the initial state and returns 0.
+If
+.I p
+is NULL, a static anonymous state known only to the
+function is used instead.
+.PP
+The
+.BR c32rtomb ()
+function converts the 32-bit wide character stored in
+.I c32
+into a multibyte sequence stored in the memory pointed to by
+.IR s .
+.SH "RETURN VALUES"
+The
+.BR mbrtoc32 ()
+function returns
+0 for the null character,
+\-1 for invalid input,
+\-2 for truncated input, and
+\-3 if the next character resulting from a previous call was
+written to
+.IR *c32 ;
+in that case no bytes are consumed from the input.
+.PP
+Otherwise the number of bytes in the multibyte sequence is returned.
+.PP
+The
+.BR c32rtomb ()
+function returns \-1 on error, otherwise the number of bytes used
+for the multibyte sequence.
+.SH EXAMPLE
+The input sequence is written as a byte sequence to allow a proper
+display. Note that the example converts between UTF-8 and UTF-32; it may
+not be possible to convert every code.
+.EX
+.nf
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <locale.h>
+#include <uchar.h>
+#include <wchar.h>
+
+/* Convert in_len bytes of in to UTF-32; caller frees *outbuf (NULL on failure). */
+void toc32(char *in, int in_len, char32_t **outbuf, int *len)
+{
+    char *p_in = in, *end = in + in_len;
+    char32_t *out, *p_out;
+    size_t rc;
+
+    *outbuf = NULL;
+    *len = 0;
+    p_out = out = malloc(in_len * sizeof(*out));   /* one unit per byte at most */
+    if (out == NULL)
+        return;
+    while (p_in < end) {
+        rc = mbrtoc32(p_out, p_in, end \- p_in, NULL);
+        if (rc == (size_t)\-1)          /* invalid input */
+            break;
+        if (rc == (size_t)\-2)          /* truncated input */
+            break;
+        p_out += 1;                    /* a code unit was stored */
+        if (rc == 0)                   /* terminating null character */
+            break;
+        if (rc != (size_t)\-3)          /* \-3 consumes no input bytes */
+            p_in += rc;
+    }
+    *len = p_out \- out;                /* only units actually stored */
+    *outbuf = out;
+}
+
+/* Convert in_len UTF-32 units to multibyte; caller frees *outbuf (may be NULL). */
+void fromc32(char32_t *in, int in_len, char **outbuf, int *len)
+{
+    char *out, *p;
+    size_t rc;
+    p = out = malloc(MB_CUR_MAX * in_len);   /* MB_CUR_MAX bytes per unit at most */
+    for (int i = 0; out != NULL && i < in_len; i++) {
+        rc = c32rtomb(p, in[i], NULL);
+        if (rc == (size_t)\-1) break;          /* unit cannot be converted */
+        p += rc;
+    }
+    *outbuf = out;
+    *len = (out != NULL) ? p \- out : 0;       /* number of bytes actually written */
+}
+
+/* Print len bytes of in as hexadecimal UTF-8 code units. */
+void dump_u8(char *in, int len)
+{
+    printf("Processing %d UTF-8 code units: [ ", len);
+    for (int k = 0; k < len; k++) printf("%#x ", (unsigned char)in[k]);
+    puts("]");
+}
+
+/* Print len code units of in as hexadecimal UTF-32 values. */
+void dump_u32(char32_t *in, int len)
+{
+    printf("Processing %d UTF-32 code units: [ ", len);
+    for (int k = 0; k < len; k++)
+        printf("0x%04x ", in[k]);
+    puts("]");
+}
+
+int main(void)    /* round trip: UTF-8 to UTF-32 and back, dumping each step */
+{
+    char in[] = "z\eu00df\eu6c34\eU0001F34C";    /* z, U+00DF, U+6C34, U+1F34C */
+    char32_t *out;
+    char *p;
+    int out_len, len;
+    if (!setlocale(LC_ALL, "de_DE.utf8")) return EXIT_FAILURE;   /* need UTF-8 */
+    dump_u8(in, sizeof in);    /* sizeof counts the terminating null byte */
+    toc32(in, sizeof in, &out, &out_len);
+    dump_u32(out, out_len);
+    fromc32(out, out_len, &p, &len);
+    dump_u8(p, len);
+    free(out); free(p); return 0;    /* both buffers were malloc()ed */
+}
+
+.fi
+.EE
+This is a simple example and not production ready.
+.SH "CONFORMING TO"
+C11
+.SH "SEE ALSO"
+.BR mbrtoc16 (3),
+.BR c16rtomb (3),
+.BR mbsrtowcs (3)
--
2.26.2