On Sat, Aug 24, 2013 at 12:20 PM, Hans de Goede <hdegoede@xxxxxxxxxx> wrote: > Hi, > > > On 08/23/2013 10:25 PM, Marc-André Lureau wrote: >> >> Convert line endings from/to LF/CRLF, in utf8. >> --- >> gtk/spice-util-priv.h | 2 + >> gtk/spice-util.c | 122 >> ++++++++++++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 124 insertions(+) >> >> diff --git a/gtk/spice-util-priv.h b/gtk/spice-util-priv.h >> index ee5a42d..cc559dc 100644 >> --- a/gtk/spice-util-priv.h >> +++ b/gtk/spice-util-priv.h >> @@ -29,6 +29,8 @@ gboolean spice_strv_contains(const GStrv strv, const >> gchar *str); >> gchar* spice_uuid_to_string(const guint8 uuid[16]); >> const gchar* spice_yes_no(gboolean value); >> guint16 spice_make_scancode(guint scancode, gboolean release); >> +gchar* spice_unix2dos(const gchar *str, gssize len, GError **error); >> +gchar* spice_dos2unix(const gchar *str, gssize len, GError **error); >> >> #if GLIB_CHECK_VERSION(2,32,0) >> #define STATIC_MUTEX GMutex >> diff --git a/gtk/spice-util.c b/gtk/spice-util.c >> index 774a145..be10edc 100644 >> --- a/gtk/spice-util.c >> +++ b/gtk/spice-util.c >> @@ -19,6 +19,7 @@ >> #ifdef HAVE_CONFIG_H >> # include "config.h" >> #endif >> + >> #include <stdlib.h> >> #include <string.h> >> #include <glib-object.h> >> @@ -245,3 +246,124 @@ guint16 spice_make_scancode(guint scancode, gboolean >> release) >> >> g_return_val_if_reached(0); >> } >> + >> +typedef enum { >> + NEWLINE_TYPE_LF, >> + NEWLINE_TYPE_CR_LF >> +} NewlineType; >> + >> +static gssize get_line(const gchar *str, gsize len, >> + NewlineType type, gsize *nl_len, >> + GError **error) >> +{ >> + const gchar *p = str; >> + gsize nl = 0; >> + >> + if (type == NEWLINE_TYPE_CR_LF) { >> + while ((p - str) < len) { >> + p = g_utf8_strchr(p, len, '\r'); >> + if (!p) >> + break; >> + p = g_utf8_next_char(p); >> + if (g_utf8_get_char(p) == '\n') { >> + len = (p - str) - 1; >> + nl = 2; >> + break; >> + } >> + } >> + } else { >> + p = g_utf8_strchr(str, len, '\n'); >> + if 
(p) { >> + len = p - str; >> + nl = 1; >> + } >> + } > > > This looks way more complicated than it needs to be; in UTF-8 > 0x00 - 0x7f are only valid as single-byte sequences. Multi-byte > encoded characters will never contain 0x00 - 0x7f. UTF-8 was designed > this way so that existing string parsing code for non multi-byte > encodings, which may look, for example, for ' " = or LF characters, does > not break when parsing strings with multi-byte characters in there. > > TL;DR: LF and CR will never be part of a multi-byte character, so > you can simply do: strstr(str, "\r\n") to find the CRLF. g_utf8_strchr is implemented using a regular strstr. Speed shouldn't be different here. I prefer to use utf8 functions on utf8 strings. > > >> + >> + if (!g_utf8_validate(str, len, NULL)) { >> + g_set_error_literal(error, G_CONVERT_ERROR, >> + G_CONVERT_ERROR_ILLEGAL_SEQUENCE, >> + "Invalid byte sequence in conversion input"); >> + return -1; >> + } > > > And once you simply treat this as a regular C-string without worrying > about multi-byte encodings you can also drop this. Actually, during implementation, I have encountered/produced invalid utf8 that will break later on in gtk+, so I prefer to validate the output. 
>> + >> + *nl_len = nl; >> + return len; >> +} >> + >> + >> +static gchar* spice_convert_newlines(const gchar *str, gssize len, >> + NewlineType from, >> + NewlineType to, >> + GError **error) >> +{ >> + GError *err = NULL; >> + gssize length; >> + gsize nl; >> + GString *output; >> + gboolean free_segment = FALSE; >> + gint i; >> + >> + g_return_val_if_fail(str != NULL, NULL); >> + g_return_val_if_fail(len >= -1, NULL); >> + g_return_val_if_fail(error == NULL || *error == NULL, NULL); >> + /* only 2 supported combinations */ >> + g_return_val_if_fail((from == NEWLINE_TYPE_LF && >> + to == NEWLINE_TYPE_CR_LF) || >> + (from == NEWLINE_TYPE_CR_LF && >> + to == NEWLINE_TYPE_LF), NULL); >> + >> + if (len == -1) >> + len = strlen(str); >> + /* sometime we get \0 terminated strings, skip that, or it fails >> + to utf8 validate line with \0 end */ >> + else if (str[len] == 0) >> + len -= 1; >> + >> + /* allocate worst case, if it's small enough, we don't care much, >> + * if it's big, malloc will put us in mmap'd region, and we can >> + * over allocate. 
>> + */ >> + output = g_string_sized_new(len * 2 + 1); >> + >> + for (i = 0; i < len; i += length + nl) { >> + length = get_line(str + i, len - i, from, &nl, error); >> + if (length < 0) >> + break; >> + >> + g_string_append_len(output, str + i, length); >> + >> + if (nl) { >> + /* let's not double \r if it's already in the line */ >> + if (to == NEWLINE_TYPE_CR_LF && >> + output->str[output->len - 1] != '\r') >> + g_string_append_c(output, '\r'); >> + >> + g_string_append_c(output, '\n'); >> + } >> + } >> + >> + if (err) { >> + g_propagate_error(error, err); >> + free_segment = TRUE; >> + } >> + >> + return g_string_free(output, free_segment); >> +} >> + >> +G_GNUC_INTERNAL >> +gchar* spice_dos2unix(const gchar *str, gssize len, GError **error) >> +{ >> + return spice_convert_newlines(str, len, >> + NEWLINE_TYPE_CR_LF, >> + NEWLINE_TYPE_LF, >> + error); >> +} >> + >> +G_GNUC_INTERNAL >> +gchar* spice_unix2dos(const gchar *str, gssize len, GError **error) >> +{ >> + return spice_convert_newlines(str, len, >> + NEWLINE_TYPE_LF, >> + NEWLINE_TYPE_CR_LF, >> + error); >> +} >> > > Regards, > > Hans -- Marc-André Lureau _______________________________________________ Spice-devel mailing list Spice-devel@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/spice-devel