Re: [libosinfo 7/8] rfc: Infer ISO language from label

Christophe Fergeau <cfergeau@xxxxxxxxxx> · Mon, 10 Dec 2012 10:08:33 +0100

On Wed, Dec 05, 2012 at 07:00:41PM +0200, Zeeshan Ali (Khattak) wrote:
> On Mon, Dec 3, 2012 at 1:23 PM, Christophe Fergeau <cfergeau@xxxxxxxxxx> wrote:
> > Now that libosinfo has an osinfo_db_identify_media method which
> > modifies the media it was passed, we can generate properties which
> > need information from the media stored in the OsinfoDB, and
> > information from the actual media (ISO volume ID).
> > This is useful to guess what languages are supported by a given
> > Windows ISO: the end of the ISO volume ID has a language code, which
> > we can translate to a locale identifier.
> >
> > This commit adds a lang-regex property to the OsinfoDB database to
> > extract the language code from Windows ISO volume IDs, and
> > then adds mapping tables to turn it into a locale identifier.
> > ---
> >  data/oses/windows.xml.in   |   2 +
> >  data/schemas/libosinfo.rng |   5 ++
> >  osinfo/libosinfo.syms      |   4 +-
> >  osinfo/osinfo_db.c         | 177 +++++++++++++++++++++++++++++++++++++++++++++
> >  osinfo/osinfo_loader.c     |   4 +-
> >  osinfo/osinfo_media.c      |  67 ++++++++++++++++-
> >  osinfo/osinfo_media.h      |   3 +
> >  7 files changed, 258 insertions(+), 4 deletions(-)
> >
> > diff --git a/data/oses/windows.xml.in b/data/oses/windows.xml.in
> > index d09e873..e8c29f9 100644
> > --- a/data/oses/windows.xml.in
> > +++ b/data/oses/windows.xml.in
> > @@ -739,12 +739,14 @@
> >        <iso>
> >          <volume-id>(HB1_CCPA_X86FRE|HRM_CCSA_X86FRE|HRM_CCSA_X86CHK|HRM_CCSNA_X86CHK|HRM_CCSNA_X86FRE|HRM_CENA_X86FREV|HRM_CENA_X86CHKV|HRM_CENNA_X86FREV|HRM_CENNA_X86CHKV|HRM_CPRA_X86FREV|HRM_CPRNA_X86FREV)_</volume-id>
> >          <publisher-id>MICROSOFT CORPORATION</publisher-id>
> > +        <lang-regex>[[:upper:][:digit:]_]*_([[:upper:]]*-[[:upper:]]*)</lang-regex>
> >        </iso>
> >      </media>
> >      <media arch="x86_64">
> >        <iso>
> >          <volume-id>(HB1_CCPA_X64FRE|HRM_CCSA_X64FRE|HRM_CCSA_X64CHK|HRM_CCSNA_X64FRE|HRM_CCSNA_X64CHK|HRM_CENNA_X64FREV|HRM_CENNA_X64CHKV|HRM_CENA_X64FREV|HRM_CENA_X64CHKV|HRM_CPRA_X64FREV|HRM_CPRNA_X64FREV)_</volume-id>
> >          <publisher-id>MICROSOFT CORPORATION</publisher-id>
> > +        <lang-regex>[[:upper:][:digit:]_]*_([[:upper:]]*-[[:upper:]]*)</lang-regex>
> >        </iso>
> >      </media>
> >
> > diff --git a/data/schemas/libosinfo.rng b/data/schemas/libosinfo.rng
> > index 87635dd..36fa1a1 100644
> > --- a/data/schemas/libosinfo.rng
> > +++ b/data/schemas/libosinfo.rng
> > @@ -281,6 +281,11 @@
> >              <text/>
> >            </element>
> >          </optional>
> > +        <optional>
> > +          <element name='lang-regex'>
> > +            <text/>
> > +          </element>
> > +        </optional>
> >        </interleave>
> >      </element>
> >    </define>
> > diff --git a/osinfo/libosinfo.syms b/osinfo/libosinfo.syms
> > index d45e58e..7c3efe1 100644
> > --- a/osinfo/libosinfo.syms
> > +++ b/osinfo/libosinfo.syms
> > @@ -341,11 +341,11 @@ LIBOSINFO_0.2.2 {
> >         osinfo_install_config_set_target_disk;
> >         osinfo_install_config_get_script_disk;
> >         osinfo_install_config_set_script_disk;
> > -
> >         osinfo_install_script_get_avatar_format;
> >         osinfo_install_script_get_path_format;
> > -
> >         osinfo_install_script_get_product_key_format;
> > +
> > +       osinfo_media_get_languages;
> >  } LIBOSINFO_0.2.1;
> >
> >  /* Symbols in next release...
> > diff --git a/osinfo/osinfo_db.c b/osinfo/osinfo_db.c
> > index 46101d6..2c2eb5a 100644
> > --- a/osinfo/osinfo_db.c
> > +++ b/osinfo/osinfo_db.c
> > @@ -38,6 +38,177 @@ G_DEFINE_TYPE (OsinfoDb, osinfo_db, G_TYPE_OBJECT);
> >       (((str) != NULL) &&                                                \
> >        g_regex_match_simple((pattern), (str), 0, 0)))
> >
> > +static gchar *get_raw_lang(const char *volume_id, const gchar *regex_str)
> > +{
> > +    GRegex *regex;
> > +    GMatchInfo *match;
> > +    gboolean matched;
> > +    gchar *raw_lang = NULL;
> > +
> > +    regex = g_regex_new(regex_str, G_REGEX_ANCHORED,
> > +                        G_REGEX_MATCH_ANCHORED, NULL);
> > +    if (regex == NULL)
> > +        return NULL;
> > +
> > +    matched = g_regex_match(regex, volume_id, G_REGEX_MATCH_ANCHORED, &match);
> > +    if (!matched || !g_match_info_matches(match))
> > +        goto end;
> > +    raw_lang = g_match_info_fetch(match, 1);
> > +    if (raw_lang == NULL)
> > +        goto end;
> > +
> > +end:
> > +    g_match_info_unref(match);
> > +    g_regex_unref(regex);
> > +
> > +    return raw_lang;
> > +}
> > +
> > +struct LanguageMapping {
> > +    const char *iso_label_lang;
> > +    const char *gettext_lang;
> > +};
> > +
> > +static GHashTable *init_win_lang_map(void)
> > +{
> > +    GHashTable *lang_map;
> > +    const struct LanguageMapping lang_table[] = {
> > +        /* ISO label strings up to Windows 7 */
> > +        { "EN", "en_US" },
> > +        { "AR", "ar_SA" },
> > +        { "BG", "bg_BG" },
> > +        { "HK", "zh_HK" },
> > +        { "CN", "zh_CN" },
> > +        { "TW", "zh_TW" },
> > +        { "HR", "hr_HR" },
> > +        { "CS", "cs_CZ" },
> > +        { "DA", "da_DK" },
> > +        { "NL", "nl_NL" },
> > +        { "ET", "et_EE" },
> > +        { "FI", "fi_FI" },
> > +        { "FR", "fr_FR" },
> > +        { "DE", "de_DE" },
> > +        { "EL", "el_GR" },
> > +        { "HE", "he_IL" },
> > +        { "HU", "hu_HU" },
> > +        { "IT", "it_IT" },
> > +        { "JA", "ja_JP" },
> > +        { "KO", "ko_KR" },
> > +        { "LV", "lv_LV" },
> > +        { "LT", "lt_LT" },
> > +        { "NO", "nb_NO" },
> > +        { "PL", "pl_PL" },
> > +        { "BR", "pt_BR" },
> > +        { "PT", "pt_PT" },
> > +        { "RO", "ro_RO" },
> > +        { "RU", "ru_RU" },
> > +        { "SRL", "sr_RS@latin" },
> > +        { "SK", "sk_SK" },
> > +        { "SL", "sl_SI" },
> > +        { "ES", "es_ES" },
> > +        { "SV", "sv_SE" },
> > +        { "TH", "th_TH" },
> > +        { "TR", "tr_TR" },
> > +        { "UK", "uk_UA" },
> > +
> > +        /* starting from Windows 8, the ISO label contains both
> > +         * language and country code */
> > +        { "EN-US", "en_US" },
> > +        { "EN-GB", "en_GB" },
> > +        { "AR-SA", "ar_SA" },
> > +        { "BG-BG", "bg_BG" },
> > +        { "ZH-HK", "zh_HK" },
> > +        { "ZH-CN", "zh_CN" },
> > +        { "ZH-TW", "zh_TW" },
> > +        { "HR-HR", "hr_HR" },
> > +        { "CS-CZ", "cs_CZ" },
> > +        { "DA-DK", "da_DK" },
> > +        { "NL-NL", "nl_NL" },
> > +        { "ET-EE", "et_EE" },
> > +        { "FI-FI", "fi_FI" },
> > +        { "FR-FR", "fr_FR" },
> > +        { "DE-DE", "de_DE" },
> > +        { "EL-GR", "el_GR" },
> > +        { "HE-IL", "he_IL" },
> > +        { "HU-HU", "hu_HU" },
> > +        { "IT-IT", "it_IT" },
> > +        { "JA-JP", "ja_JP" },
> > +        { "KO-KR", "ko_KR" },
> > +        { "LV-LV", "lv_LV" },
> > +        { "LT-LT", "lt_LT" },
> > +        { "NB-NO", "nb_NO" },
> > +        { "PL-PL", "pl_PL" },
> > +        { "PT-BR", "pt_BR" },
> > +        { "PT-PT", "pt_PT" },
> > +        { "RO-RO", "ro_RO" },
> > +        { "RU-RU", "ru_RU" },
> > +        { "SR-LATN-CS", "sr_RS@latin" },
> > +        { "SK-SK", "sk_SK" },
> > +        { "SL-SI", "sl_SI" },
> > +        { "ES-ES", "es_ES" },
> > +        { "SV-SE", "sv_SE" },
> > +        { "TH-TH", "th_TH" },
> > +        { "TR-TR", "tr_TR" },
> > +        { "UK-UA", "uk_UA" },
> > +
> > +        { "EU-ES", "eu_ES" }, //language pack
> > +        { "CA-ES", "ca_ES" }, //language pack
> > +        { "GL-ES", "gl_ES" }, //language pack
> > +        { "KY-KG", "ky_KG" }, //language pack
> > +
> > +        { NULL, NULL }
> > +    };
> 
> Seems all of these except for 1 can be covered by a simple 's/-/_/'
> conversion and thus do not need all this hard coding.

I prefer an explicit list of the languages we expect to find, especially as
there are already 2 different formats.

Christophe
Attachment:
pgp53IAzms3yC.pgp

Description: PGP signature
_______________________________________________
virt-tools-list mailing list
virt-tools-list@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/virt-tools-list