Re: [libvirt PATCH 00/51] Use permutable format strings in translations

Jiri Denemark <jdenemar@xxxxxxxxxx> · Mon, 27 Mar 2023 13:08:09 +0200

On Fri, Mar 10, 2023 at 17:14:32 +0000, Daniel P. Berrangé wrote:
> Even if fixed, it might be worth switching the .pot file anyway, but
> this can't be done without us bulk updating the translations, and
> bulk re-importing them, which will be challenging. We'll almost
> certainly want to try this on a throw-away repo in weblate first,
> not our main repo.

I was able to come up with steps leading to the desired state:

 0. lock weblate repository
 1. update libvirt.pot from the most recent potfile job
 2. push to libvirt.git
 3. wait for translations update from Fedora Weblate and merge it
 4. pull from libvirt.git
 5. apply the first 50 patches from this series (with required changes
    to make sure all translation strings are updated)
 6. update all po files with the attached script
 7. update libvirt.pot by running meson compile libvirt-pot
 8. apply patch 51 of this series
 9. push to libvirt.git
10. wait for translations update from Fedora Weblate and merge it
11. unlock weblate repository

The process takes about an hour if we're lucky as weblate is quite slow
when processing such a large amount of changes.

The result can be seen at

    https://gitlab.com/jirkade/libvirt/-/commits/format-strings

and the corresponding weblate repository at

    https://translate.fedoraproject.org/projects/libvirt/test/

I used d05ad0f15e737fa2327dd68870a485821505b58f commit as a base.

If we agree this is a reasonable approach, I think we should apply it
just after a release to give translators the whole release cycle to
check or update the translations if they so wish.

The attached script analyzes a single po file and updates all msgid
strings to use permutable format strings. It also tries to update all
translations, but only if the format strings in them exactly match
(including their order) the corresponding msgid format string. That is,
a msgstr will not be updated if format strings in it were incorrect or
reordered or they already used the permutable form. That is, the
processing should be a NO-OP except for strings that already used the
permutable format in msgstr; such translations were failing the c-format
check in weblate before but would be marked as correct now.

Jirka
#!/usr/bin/env python3

import sys
import re

# Building blocks of a printf conversion specification; see man 3 printf.

# Optional positional argument selector such as "2$".
reIndex = r"([1-9][0-9]*\$)?"
# Zero or more flag characters. The "space" flag is a plain blank, so it is
# listed in the character class next to the other flags.
reFlags = r"([-#0+I' ])*"
# Field width: a decimal number not starting with zero (a leading zero would
# be the "0" flag), or "*" optionally followed by a positional selector.
reWidth = rf"([1-9][0-9]*|\*{reIndex})?"
# Precision: "." followed by an optional digit string or "*". Unlike the
# width, the digit string may be zero or start with zero ("%.0f", "%.02f").
rePrecision = rf"(\.([0-9]+|\*{reIndex})?)?"
reLenghtMod = r"(hh|h|l|ll|q|L|j|z|Z|t)?"
reConversion = r"[diouxXeEfFgGaAcspnm%]"
# Complete conversion specification. Every part is wrapped in a named group
# that always participates in a match (possibly as an empty string).
reCFormat = "".join([
    r"%",
    rf"(?P<index>{reIndex})",
    rf"(?P<flags>{reFlags})",
    rf"(?P<width>{reWidth})",
    rf"(?P<precision>{rePrecision})",
    rf"(?P<length>{reLenghtMod})",
    rf"(?P<conversion>{reConversion})"])

def translateFormat(fmt, idx, m):
    """Convert one printf conversion specification to its positional form.

    fmt is the matched specification text, idx is the number of the next
    unused argument and m is a match object providing the named groups
    "index", "flags", "width", "precision", "length" and "conversion".

    Returns a tuple (next idx, new specification). The specification is
    returned unchanged (and idx is not advanced) when it is already
    positional or when it does not consume any argument.
    """
    groups = m.groupdict()

    # "%%" is a literal percent sign and "%m" prints strerror(errno); neither
    # of them takes an argument, so they must not be numbered. A "$" inside
    # width or precision means the format already uses positional arguments.
    if (groups["index"] or
            groups["conversion"] in ("%", "m") or
            "$" in groups["width"] or
            "$" in groups["precision"]):
        print(f"Ignoring c-format '{fmt}'")
        return idx, fmt

    # A "*" width or precision is passed as an extra argument preceding the
    # converted value, so it gets its number first.
    for field in ("width", "precision"):
        if "*" in groups[field]:
            groups[field] = f"{groups[field]}{idx}$"
            idx += 1

    # Assemble the parts explicitly rather than relying on the order and
    # contents of groupdict().
    spec = "".join(groups[name] for name in
                   ("flags", "width", "precision", "length", "conversion"))
    newFmt = f"%{idx}${spec}"
    idx += 1

    return idx, newFmt

def process(ids, strs, fuzzy):
    """Convert the format strings of one PO entry to the positional form.

    ids and strs are the lists of string fragments making up the msgid and
    the msgstr of the entry. Every conversion specification found in ids is
    passed through translateFormat(), numbering the arguments consecutively
    across all fragments.

    The translation is returned untouched when fuzzy is true or when one of
    its specifications differs from the msgid specification at the same
    position; otherwise its specifications are replaced by the converted
    msgid ones.

    Returns a tuple (new ids, new or original strs).
    """
    pattern = re.compile(reCFormat)
    fmts = []
    idx = 1

    newIds = []
    for text in ids:
        pieces = []
        pos = 0
        for m in pattern.finditer(text):
            # literal text preceding the specification
            pieces.append(text[pos:m.start()])

            oldFmt = m.group(0)
            idx, newFmt = translateFormat(oldFmt, idx, m)
            fmts.append((oldFmt, newFmt))
            pieces.append(newFmt)

            pos = m.end()

        pieces.append(text[pos:])
        newIds.append("".join(pieces))

    if fuzzy:
        return newIds, strs

    # Specifications in the translation have to show up in exactly the same
    # order as they did in the msgid.
    expected = iter(fmts)
    newStrs = []
    for text in strs:
        pieces = []
        pos = 0
        for m in pattern.finditer(text):
            pieces.append(text[pos:m.start()])

            oldFmt, newFmt = next(expected, (None, None))
            if oldFmt != m.group(0):
                print("Ignoring translation", strs)
                print("              for id", newIds)
                return newIds, strs

            pieces.append(newFmt)
            pos = m.end()

        pieces.append(text[pos:])
        newStrs.append("".join(pieces))

    return newIds, newStrs

def writeMsg(po, header, strs):
    """Write one PO keyword followed by its quoted string fragments.

    The first fragment is written on the same line as header, every other
    fragment on a line of its own. Nothing is written when strs is empty.
    """
    if not strs:
        return

    po.write(f"{header} ")
    po.writelines(f'"{fragment}"\n' for fragment in strs)

def main():
    """Rewrite the PO file given on the command line in place.

    msgid/msgstr strings of entries flagged as c-format are passed through
    process(); every other line is copied verbatim. Lines using keywords the
    parser does not recognize as msgid or msgstr (msgctxt, msgid_plural,
    msgstr[N] and their continuation lines) are copied unchanged.
    """
    import io

    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} PO-FILE", file=sys.stderr)
        sys.exit(1)

    pofile = sys.argv[1]

    with open(pofile, "r") as po:
        polines = po.readlines()

    # The result is built in memory and written only once it is complete so
    # that an exception raised while processing cannot truncate the PO file.
    out = io.StringIO()

    current = None
    cfmt = False
    fuzzy = False
    ids = []
    strs = []

    def flush():
        # Emit the msgid/msgstr collected so far and start a new collection.
        nonlocal ids, strs
        if cfmt:
            ids, strs = process(ids, strs, fuzzy)
        writeMsg(out, "msgid", ids)
        writeMsg(out, "msgstr", strs)
        ids = []
        strs = []

    for line in polines:
        m = re.search(r'^(([a-z]+) )?"(.*)"', line)

        if m is not None:
            keyword = m.group(2)

            # A new msgid directly following the strings of the previous
            # entry (no separating line) must not be merged into it.
            if keyword == "msgid" and (ids or strs):
                flush()
                cfmt = False
                fuzzy = False

            if keyword:
                current = keyword

            if current == "msgid":
                ids.append(m.group(3))
                continue
            if current == "msgstr":
                strs.append(m.group(3))
                continue

            # Quoted line belonging to some other keyword (e.g. msgctxt or
            # a continuation of msgid_plural / msgstr[N]): copy it verbatim
            # instead of dropping it.
            entryEnded = bool(ids or strs)
            flush()
            out.write(line)
            if entryEnded:
                cfmt = False
                fuzzy = False
            continue

        entryEnded = bool(ids or strs)
        flush()
        out.write(line)

        if line.startswith("#,"):
            cfmt = " c-format" in line
            fuzzy = " fuzzy" in line
        elif entryEnded or not line.strip():
            # The flags stay valid until the entry they belong to ends; other
            # comment lines (such as "#|" previous msgid) may follow the
            # "#," line and must not reset them.
            cfmt = False
            fuzzy = False

        current = None

    flush()

    with open(pofile, "w") as po:
        po.write(out.getvalue())


if __name__ == "__main__":
    main()