Re: Possible regression with cgroups in 3.11

>
> Out of curiosity, do you have memcg swap accounting enabled? Or do you
> use kmem accounting? How does your cgroup tree look like?

I have compiled my kernel with the CONFIG_MEMCG_SWAP,
CONFIG_MEMCG_SWAP_ENABLED, CONFIG_MEMCG_KMEM and CONFIG_CGROUP_HUGETLB
options, although our machines have no active swap space at the moment
(a quick way to check which accounting is actually active on a node is
sketched right after the tree below).
The cgroup tree is maintained by the SLURM cluster queueing system and
looks like this:

/sys/fs/cgroup/memory
`-- slurm
    `-- uid_181994
        |-- job_56870
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56871
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56872
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56873
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56874
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56875
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56876
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56877
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56878
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56879
        |   |-- step_0
        |   `-- step_4294967294
        `-- job_56885
            |-- step_0
            `-- step_4294967294
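For completeness, this is roughly how I check on a node which of those
accounting features is actually active (a minimal sketch, assuming the
standard memcg v1 control files; job_56870 is just one of the jobs
above):

# the memory.memsw.* files only show up when swap accounting is compiled
# in and not disabled on the kernel command line (swapaccount=0)
ls /sys/fs/cgroup/memory/slurm/uid_181994/job_56870/memory.memsw.limit_in_bytes
grep -o 'swapaccount=[01]' /proc/cmdline

# kmem accounting only becomes effective once a kmem limit has been set;
# a still-unlimited memory.kmem.limit_in_bytes means it is compiled in
# but unused
cat /sys/fs/cgroup/memory/slurm/uid_181994/job_56870/memory.kmem.limit_in_bytes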

memory.use_hierarchy is enabled and the memory limit is applied at the
job_* level. I have mounted the memory cgroup at /var/slurm/cgroup/memory
in addition to the normal mount point at /sys/fs/cgroup/memory. The SLURM
release_agent script is the following (the script is called
release_memory, so the subsystem variable is "memory"); a rough sketch of
how the limit and the release agent are wired up follows the script:

#!/bin/bash
#
# Generic release agent for SLURM cgroup usage
#
# Manages a cgroup hierarchy like:
#
# /sys/fs/cgroup/subsystem/uid_%/job_%/step_%/task_%
#
# Automatically syncs the uid_% cgroups so that they stay coherent
# with the remaining job children when one of them is removed
# by a call to this release agent.
# The synchronisation is done under a flock on the root cgroup
# to ensure coherency of the cgroup contents.
#

progname=$(basename $0)
subsystem=${progname##*_}

get_mount_dir()
{
    local lssubsys=$(type -p lssubsys)
    if [[ $lssubsys ]]; then
        $lssubsys -m $subsystem | awk '{print $2}'
    else
        echo "/sys/fs/cgroup/$subsystem"
    fi
}

mountdir=$(get_mount_dir)

if [[ $# -eq 0 ]]
then
    echo "Usage: $(basename $0) [sync] cgroup"
    exit 1
fi

# build orphan cg path
if [[ $# -eq 1 ]]
then
    rmcg=${mountdir}$1
else
    rmcg=${mountdir}$2
fi
slurmcg=${rmcg%/uid_*}
if [[ ${slurmcg} == ${rmcg} ]]
then
    # not a slurm job pattern, perhaps the slurm root cgroup; just remove
    # the dir under a lock and exit
    flock -x ${mountdir} -c "rmdir ${rmcg}"
    exit $?
fi
orphancg=${slurmcg}/orphan

# make sure the orphan cgroup exists
if [[ ! -d ${orphancg} ]]
then
    mkdir ${orphancg}
    case ${subsystem} in
        cpuset)
            cat ${mountdir}/cpuset.cpus > ${orphancg}/cpuset.cpus
            cat ${mountdir}/cpuset.mems > ${orphancg}/cpuset.mems
            ;;
        *)
            ;;
    esac
fi

# kernel call
if [[ $# -eq 1 ]]
then

    rmcg=${mountdir}$@

    # try to extract the uid cgroup from the input one
    # (extract .../uid_% from .../uid_%/job_*...)
    uidcg=${rmcg%/job_*}
    if [[ ${uidcg} == ${rmcg} ]]
    then
        # not a slurm job pattern, perhaps the uid cgroup; just remove
        # the dir under a lock and exit
        flock -x ${mountdir} -c "rmdir ${rmcg}"
        exit $?
    fi

    if [[ -d ${mountdir} ]]
    then
        flock -x ${mountdir} -c "$0 sync $@"
    fi

    exit $?

# sync subcall (called under flock by the kernel hook to be sure
# that no one else is manipulating the hierarchy, e.g. PAM, SLURM, ...)
elif [[ $# -eq 2 ]] && [[ $1 == "sync" ]]
then

    shift
    rmcg=${mountdir}$@
    uidcg=${rmcg%/job_*}

    # remove this cgroup
    if [[ -d ${rmcg} ]]
    then
        case ${subsystem} in
            memory)
                # help to correctly remove a lazily cleaned memcg,
                # but still not perfect
                sleep 1
                ;;
            *)
                ;;
        esac
        rmdir ${rmcg}
    fi
    if [[ ${uidcg} == ${rmcg} ]]
    then
        ## not a slurm job pattern; exit now, do not sync
        exit 0
    fi

    # sync the user cgroup based on targeted subsystem
    # and the remaining job
    if [[ -d ${uidcg} ]]
    then
        case ${subsystem} in
            cpuset)
                cpus=$(cat ${uidcg}/job_*/cpuset.cpus 2>/dev/null)
                if [[ -n ${cpus} ]]
                then
                    cpus=$(scontrol show hostnames $(echo ${cpus} | tr ' ' ','))
                    cpus=$(echo ${cpus} | tr ' ' ',')
                    echo ${cpus} > ${uidcg}/cpuset.cpus
                else
                    # first move the remaining processes to
                    # a cgroup reserved for orphaned processes
                    for t in $(cat ${uidcg}/tasks)
                    do
                        echo $t > ${orphancg}/tasks
                    done
                    # then remove the remaining cpus from the cgroup
                    echo "" > ${uidcg}/cpuset.cpus
                fi
                ;;
            *)
                ;;
        esac
    fi

# error
else
    echo "Usage: $(basename $0) [sync] cgroup"
    exit 1
fi

exit 0
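
For reference, this is roughly how the job-level limit and the release
agent are wired up on our nodes (a minimal sketch with placeholder
values; the real work is done by the SLURM cgroup plugins, and
/path/to/release_memory and the 4G limit are just examples):

# extra mount point used by SLURM next to the normal one
mount -t cgroup -o memory none /var/slurm/cgroup/memory

# the release agent is a property of the hierarchy root;
# hierarchical accounting is turned on below it
echo /path/to/release_memory > /sys/fs/cgroup/memory/release_agent
echo 1 > /sys/fs/cgroup/memory/slurm/memory.use_hierarchy

# per-job setup: notify the release agent when the cgroup empties,
# and apply the memory limit at the job level
echo 1 > /sys/fs/cgroup/memory/slurm/uid_181994/job_56870/notify_on_release
echo 4G > /sys/fs/cgroup/memory/slurm/uid_181994/job_56870/memory.limit_in_bytes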