>
> Out of curiosity, do you have memcg swap accounting enabled? Or do you
> use kmem accounting? How does your cgroup tree look like?

I have compiled my kernel with the CONFIG_MEMCG_SWAP, CONFIG_MEMCG_SWAP_ENABLED,
CONFIG_MEMCG_KMEM and CONFIG_CGROUP_HUGETLB options, although our machines have
no active swap space at the moment.

The cgroup tree is maintained by the SLURM cluster queueing system and looks
like this:

/sys/fs/cgroup/memory
`-- slurm
    `-- uid_181994
        |-- job_56870
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56871
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56872
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56873
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56874
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56875
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56876
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56877
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56878
        |   |-- step_0
        |   `-- step_4294967294
        |-- job_56879
        |   |-- step_0
        |   `-- step_4294967294
        `-- job_56885
            |-- step_0
            `-- step_4294967294

memory.use_hierarchy is enabled and the memory limit is set on the job_%
directory. In addition to the normal mount point at /sys/fs/cgroup/memory,
I have also mounted the memory cgroup at /var/slurm/cgroup/memory.

The SLURM release_agent script is the following (the script is called
release_memory, so the subsystem variable is "memory"):

#!/bin/bash
#
# Generic release agent for SLURM cgroup usage
#
# Manage a cgroup hierarchy like:
#
#   /sys/fs/cgroup/subsystem/uid_%/job_%/step_%/task_%
#
# Automatically sync uid_% cgroups to stay coherent with the remaining job
# children when one of them is removed by a call to this release agent.
# The synchronisation is done under a flock on the root cgroup
# to ensure coherency of the cgroup contents.
#

progname=$(basename $0)
subsystem=${progname##*_}

get_mount_dir()
{
    local lssubsys=$(type -p lssubsys)
    if [[ $lssubsys ]]; then
        $lssubsys -m $subsystem | awk '{print $2}'
    else
        echo "/sys/fs/cgroup/$subsystem"
    fi
}

mountdir=$(get_mount_dir)

if [[ $# -eq 0 ]]
then
    echo "Usage: $(basename $0) [sync] cgroup"
    exit 1
fi

# build orphan cgroup path
if [[ $# -eq 1 ]]
then
    rmcg=${mountdir}$1
else
    rmcg=${mountdir}$2
fi
slurmcg=${rmcg%/uid_*}
if [[ ${slurmcg} == ${rmcg} ]]
then
    # not a slurm job pattern, perhaps the slurmcg, just remove
    # the dir under a lock and exit
    flock -x ${mountdir} -c "rmdir ${rmcg}"
    exit $?
fi
orphancg=${slurmcg}/orphan

# make sure the orphan cgroup exists
if [[ ! -d ${orphancg} ]]
then
    mkdir ${orphancg}
    case ${subsystem} in
        cpuset)
            cat ${mountdir}/cpuset.cpus > ${orphancg}/cpuset.cpus
            cat ${mountdir}/cpuset.mems > ${orphancg}/cpuset.mems
            ;;
        *)
            ;;
    esac
fi

# kernel call
if [[ $# -eq 1 ]]
then

    rmcg=${mountdir}$@

    # try to extract the uid cgroup from the input one
    # (extract /uid_% from /uid_%/job_*...)
    uidcg=${rmcg%/job_*}
    if [[ ${uidcg} == ${rmcg} ]]
    then
        # not a slurm job pattern, perhaps the uidcg, just remove
        # the dir under a lock and exit
        flock -x ${mountdir} -c "rmdir ${rmcg}"
        exit $?
    fi

    if [[ -d ${mountdir} ]]
    then
        flock -x ${mountdir} -c "$0 sync $@"
    fi

    exit $?

# sync subcall (called under flock by the kernel hook to be sure that
# no one else is manipulating the hierarchy, i.e. PAM, SLURM, ...)
elif [[ $# -eq 2 ]] && [[ $1 == "sync" ]]
then

    shift
    rmcg=${mountdir}$@
    uidcg=${rmcg%/job_*}

    # remove this cgroup
    if [[ -d ${rmcg} ]]
    then
        case ${subsystem} in
            memory)
                # helps to correctly remove a lazily cleaned memcg,
                # but still not perfect
                sleep 1
                ;;
            *)
                ;;
        esac
        rmdir ${rmcg}
    fi
    if [[ ${uidcg} == ${rmcg} ]]
    then
        ## not a slurm job pattern, exit now, do not sync
        exit 0
    fi

    # sync the user cgroup based on the targeted subsystem
    # and the remaining jobs
    if [[ -d ${uidcg} ]]
    then
        case ${subsystem} in
            cpuset)
                cpus=$(cat ${uidcg}/job_*/cpuset.cpus 2>/dev/null)
                if [[ -n ${cpus} ]]
                then
                    cpus=$(scontrol show hostnames $(echo ${cpus} | tr ' ' ','))
                    cpus=$(echo ${cpus} | tr ' ' ',')
                    echo ${cpus} > ${uidcg}/cpuset.cpus
                else
                    # first move the remaining processes to
                    # a cgroup reserved for orphaned processes
                    for t in $(cat ${uidcg}/tasks)
                    do
                        echo $t > ${orphancg}/tasks
                    done

                    # then remove the remaining cpus from the cgroup
                    echo "" > ${uidcg}/cpuset.cpus
                fi
                ;;
            *)
                ;;
        esac
    fi

# error
else
    echo "Usage: $(basename $0) [sync] cgroup"
    exit 1
fi

exit 0
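
For reference, this is roughly how the pieces described above fit together: a
minimal sketch assuming the standard cgroup v1 interface files (release_agent,
notify_on_release, memory.use_hierarchy, memory.limit_in_bytes). The agent
install path and the 4G limit are illustrative values only, not our actual
configuration:

#!/bin/bash
# Sketch only: wire up the release agent and a job-level memory limit
# in a cgroup v1 memory hierarchy mirroring the tree shown above.

memroot=/sys/fs/cgroup/memory

# The kernel invokes the agent with the path of the emptied cgroup
# (relative to the hierarchy mount point) as its single argument,
# e.g. "/slurm/uid_181994/job_56870/step_0".
# (/usr/local/sbin/release_memory is a hypothetical install path.)
echo /usr/local/sbin/release_memory > ${memroot}/release_agent

# Create the job cgroup, enable hierarchical accounting so the steps are
# charged against the job, and set the job limit (4G is a made-up value).
jobcg=${memroot}/slurm/uid_181994/job_56870
mkdir -p ${jobcg}
echo 1 > ${jobcg}/memory.use_hierarchy
echo $((4 * 1024 * 1024 * 1024)) > ${jobcg}/memory.limit_in_bytes

# Create a step cgroup and ask the kernel to notify the release agent
# when it becomes empty (no tasks and no child cgroups left).
mkdir ${jobcg}/step_0
echo 1 > ${jobcg}/step_0/notify_on_release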