Re: Annoying MDS_CLIENT_RECALL Warning

Thanks Dan,

I chose one of the stuck clients to investigate, as shown below. It currently holds ~269700 caps, which is pretty high with no obvious reason. I cannot understand most of the output, and I failed to find any documentation about it.

# ceph tell mds.cephfs.gpu018.ovxvoz client ls id=7915658
[
    {
        "id": 7915658,
        "entity": {
            "name": {
                "type": "client",
                "num": 7915658
            },
            "addr": {
                "type": "v1",
                "addr": "202.38.247.227:0",
                "nonce": 3019311016
            }
        },
        "state": "open",
        "num_leases": 0,
        "num_caps": 269695,
        "request_load_avg": 184,
        "uptime": 1340483.111458218,
        "requests_in_flight": 0,
        "num_completed_requests": 0,
        "num_completed_flushes": 1,
        "reconnecting": false,
        "recall_caps": {
            "value": 1625220.0378812221,
            "halflife": 60
        },
        "release_caps": {
            "value": 69.432671270941171,
            "halflife": 60
        },
        "recall_caps_throttle": {
            "value": 63255.667075845187,
            "halflife": 1.5
        },
        "recall_caps_throttle2o": {
            "value": 26064.679002183591,
            "halflife": 0.5
        },
        "session_cache_liveness": {
            "value": 259.9718480278375,
            "halflife": 300
        },
        "cap_acquisition": {
            "value": 0,
            "halflife": 10
        },
        "delegated_inos": [... 7 items removed ],
        "inst": "client.7915658 v1:202.38.247.227:0/3019311016",
        "completed_requests": [],
        "prealloc_inos": [ ... 9 items removed ],
        "client_metadata": {
            "client_features": {
                "feature_bits": "0x0000000000007bff"
            },
            "metric_spec": {
                "metric_flags": {
                    "feature_bits": "0x000000000000001f"
                }
            },
            "entity_id": "smil",
            "hostname": "gpu027",
            "kernel_version": "5.11.0-37-generic",
            "root": "/"
        }
    }
]
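
To watch how these numbers change over time (as Dan suggested), something like the following works; the jq filter is only a sketch based on the JSON layout above:

# watch -n 60 "ceph tell mds.cephfs.gpu018.ovxvoz client ls id=7915658 | jq '.[0] | {num_caps, recall_caps, release_caps, session_cache_liveness}'"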

I suspect that some files are in use, so their caps cannot be released. However, "sudo lsof +f -- /mnt/cephfs | wc -l" shows only about 9k open files, well below "num_caps".
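
A closer comparison to "num_caps" is the number of distinct inodes, since lsof prints one line per process/file-descriptor pair. A rough one-liner, assuming the default lsof column layout where the 8th field (NODE) is the inode number:

# sudo lsof +f -- /mnt/cephfs | awk 'NR>1 {print $8}' | sort -u | wc -l

Either way, the count cannot exceed the ~9k lines above, so it remains far below "num_caps".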

I also looked at /sys/kernel/debug/ceph/e88d509a-f6fc-11ea-b25d-a0423f3ac864.client7915658/caps on the client. The number of lines in it matches the "num_caps" reported by the MDS. This file also tells me which caps are not released. I investigated some of them, but could not see anything special. One example is attached below.
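
To pick inodes to inspect, something like the following pulls inode numbers out of that file to feed into "dump inode"; I am assuming here that each cap line starts with the inode number in 0x... hex form:

# grep -oE '^0x[0-9a-f]+' /sys/kernel/debug/ceph/e88d509a-f6fc-11ea-b25d-a0423f3ac864.client7915658/caps | head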

# ceph tell mds.cephfs.gpu018.ovxvoz dump inode 0x100068b9d24
{
    "path": "/dataset/coco2017/train2017/000000342643.jpg",
    "ino": 1099621440804,
    "rdev": 0,
    "ctime": "2021-04-23T09:49:54.433652+0000",
    "btime": "2021-04-23T09:49:54.425652+0000",
    "mode": 33204,
    "uid": 859600009,
    "gid": 859600009,
    "nlink": 1,
    "dir_layout": {
        "dir_hash": 0,
        "unused1": 0,
        "unused2": 0,
        "unused3": 0
    },
    "layout": {
        "stripe_unit": 4194304,
        "stripe_count": 1,
        "object_size": 4194304,
        "pool_id": 5,
        "pool_ns": ""
    },
    "old_pools": [],
    "size": 147974,
    "truncate_seq": 1,
    "truncate_size": 18446744073709551615,
    "truncate_from": 0,
    "truncate_pending": 0,
    "mtime": "2021-04-23T09:49:54.433652+0000",
    "atime": "2021-04-23T09:49:54.425652+0000",
    "time_warp_seq": 0,
    "change_attr": 1,
    "export_pin": -1,
    "export_ephemeral_random_pin": 0,
    "export_ephemeral_distributed_pin": false,
    "client_ranges": [],
    "dirstat": {
        "version": 0,
        "mtime": "0.000000",
        "num_files": 0,
        "num_subdirs": 0,
        "change_attr": 0
    },
    "rstat": {
        "version": 0,
        "rbytes": 147974,
        "rfiles": 1,
        "rsubdirs": 0,
        "rsnaps": 0,
        "rctime": "2021-04-23T09:49:54.433652+0000"
    },
    "accounted_rstat": {
        "version": 0,
        "rbytes": 147974,
        "rfiles": 1,
        "rsubdirs": 0,
        "rsnaps": 0,
        "rctime": "2021-04-23T09:49:54.433652+0000"
    },
    "version": 182894,
    "file_data_version": 0,
    "xattr_version": 1,
    "backtrace_version": 177717,
    "stray_prior_path": "",
    "max_size_ever": 0,
    "quota": {
        "max_bytes": 0,
        "max_files": 0
    },
    "last_scrub_stamp": "0.000000",
    "last_scrub_version": 0,
    "symlink": "",
    "xattrs": [],
    "dirfragtree": {
        "splits": []
    },
    "old_inodes": [],
    "oldest_snap": 18446744073709551614,
    "damage_flags": 0,
    "is_auth": true,
    "auth_state": {
        "replicas": {}
    },
    "replica_state": {
        "authority": [
            0,
            -2
        ],
        "replica_nonce": 0
    },
    "auth_pins": 0,
    "is_frozen": false,
    "is_freezing": false,
    "pins": {
        "caps": 1
    },
    "nref": 1,
    "versionlock": {
        "gather_set": [],
        "state": "lock",
        "is_leased": false,
        "num_rdlocks": 0,
        "num_wrlocks": 0,
        "num_xlocks": 0,
        "xlock_by": {}
    },
    "authlock": {},
    "linklock": {},
    "dirfragtreelock": {},
    "filelock": {},
    "xattrlock": {},
    "snaplock": {},
    "nestlock": {},
    "flocklock": {},
    "policylock": {},
    "states": [
        "auth"
    ],
    "client_caps": [
        {
            "client_id": 7915658,
            "pending": "pAsLsXsFscr",
            "issued": "pAsLsXsFscr",
            "wanted": "-",
            "last_sent": 1
        }
    ],
    "loner": -1,
    "want_loner": -1,
    "mds_caps_wanted": []
}

I also did an experiment: I executed "find > /dev/null" in one directory to acquire some caps. As expected, "num_caps" quickly increased to over 1M. But after around an hour, it dropped back to about 269700. So new caps are released as expected, while the old caps are still not released.
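
To check that it is really the same old set of caps that stays behind, something like this should work: snapshot the inode list from the debugfs caps file before and after the experiment and intersect the two (the directory path is just a placeholder, and again I assume the first column of each cap line is the inode number):

# awk '$1 ~ /^0x/ {print $1}' /sys/kernel/debug/ceph/e88d509a-f6fc-11ea-b25d-a0423f3ac864.client7915658/caps | sort > /tmp/caps.before
# find /path/to/some/dir > /dev/null
(wait about an hour for the recall to settle)
# awk '$1 ~ /^0x/ {print $1}' /sys/kernel/debug/ceph/e88d509a-f6fc-11ea-b25d-a0423f3ac864.client7915658/caps | sort > /tmp/caps.after
# comm -12 /tmp/caps.before /tmp/caps.after | wc -l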

It seems I need to find out why the client doesn't want to release some specific caps.

> -----Original Message-----
> From: Dan van der Ster <dan@xxxxxxxxxxxxxx>
> Sent: 18 November 2021 20:45
> To: 胡 玮文 <huww98@xxxxxxxxxxx>
> Cc: ceph-users@xxxxxxx
> Subject: Re: Annoying MDS_CLIENT_RECALL Warning
> 
> Hi,
> 
> We sometimes have similar stuck client recall warnings.
> To debug you can try:
> 
> (1) ceph health detail
>      that will show you the client ids which are generating the warning. (e.g.
> 1234)
> (2) ceph tell mds.* client ls id=1234
>      this will show lots of client statistics for the session. Notably you need to
> look at the num_caps and the various recall metrics, and see how they are
> changing over time.
> 
> From experience this can be one of two or three things:
> 1. the client is very quickly iterating through files and the mds isn't recalling
> them fast enough -- that shouldn't be the case if you're running the default caps
> recall tuning on 16.2.6.
> 2. the client has some files open and cannot release the caps it holds for some
> reason. We've seen some apps behave like this, and also I've noticed that if users
> mount cephfs on another cephfs, or use bind mounts, in some cases the caps of
> the "lower" cephfs will simply never be released.

We don't mount cephfs on another cephfs, nor do we use bind mounts.

> I've found that issuing an `ls
> -lR` in the working dir of the other client sometimes provokes the caps to be
> released. Or, if they really will never release, you can try `echo 2 >
> /proc/sys/vm/drop_caches` or umount / mount on the client.

I tried "echo 2 > /proc/sys/vm/drop_caches" with no luck. This cephfs mount is used as home directory for dozens of users. So umount is nearly impossible as I need to kill almost every process on this system before umount.

> If the cause isn't obvious to you, you can share some client session stats and we
> can try to help here.
> 
> Best Regards,
> 
> dan
> 
> 
> 
> 
> 
> On Thu, Nov 18, 2021 at 6:36 AM 胡 玮文 <huww98@xxxxxxxxxxx> wrote:
> >
> > Hi all,
> >
> > We are consistently seeing the MDS_CLIENT_RECALL warning in our cluster, it
> seems harmless, but we cannot get HEALTH_OK, which is annoying.
> >
> > The clients that are reported failing to respond to cache pressure are
> constantly changing, and most of the time we got 1-5 such clients out of ~20. All
> of the clients are kernel clients, running HWE kernel 5.11 of Ubuntu 20.04. The
> load is pretty low.
> >
> > We are reading datasets that consist of millions of small files from cephfs, so
> we have tuned some config for performance. Some configs from "ceph config
> dump" that might be relevant:
> >
> > WHO       LEVEL     OPTION                   VALUE
> >   mds     basic     mds_cache_memory_limit   51539607552
> >   mds     advanced  mds_max_caps_per_client  8388608
> >   client  basic     client_cache_size        32768
> >
> > We also manually pinned almost every directory to either rank 0 or rank 1.
> >
> > Any thoughts about what causes the warning, or how can we get rid of it?
> >
> > Thanks,
> > Weiwen Hu
> >
> >
> > # ceph -s
> >   cluster:
> >     id:     e88d509a-f6fc-11ea-b25d-a0423f3ac864
> >     health: HEALTH_WARN
> >             4 clients failing to respond to cache pressure
> >
> >   services:
> >     mon: 5 daemons, quorum gpu024,gpu006,gpu023,gpu013,gpu018 (age 7d)
> >     mgr: gpu014.kwbqcf(active, since 2w), standbys: gpu024.bapbcz
> >     mds: 2/2 daemons up, 2 hot standby
> >     osd: 45 osds: 45 up (since 2h), 45 in (since 5d)
> >     rgw: 2 daemons active (2 hosts, 1 zones)
> >
> >   data:
> >     volumes: 1/1 healthy
> >     pools:   16 pools, 1713 pgs
> >     objects: 265.84M objects, 55 TiB
> >     usage:   115 TiB used, 93 TiB / 208 TiB avail
> >     pgs:     1711 active+clean
> >              2    active+clean+scrubbing+deep
> >
> >   io:
> >     client:   55 MiB/s rd, 5.2 MiB/s wr, 513 op/s rd, 14 op/s wr
> >
> >
> > # ceph fs status
> > cephfs - 23 clients
> > ======
> > RANK      STATE               MDS              ACTIVITY     DNS    INOS   DIRS   CAPS
> >  0        active      cephfs.gpu018.ovxvoz  Reqs:  241 /s  17.3M  17.3M  41.3k  5054k
> >  1        active      cephfs.gpu023.aetiph  Reqs:    1 /s  13.1M  12.1M   864k   586k
> > 1-s   standby-replay  cephfs.gpu006.ddpekw  Evts:    2 /s  2517k  2393k   216k     0
> > 0-s   standby-replay  cephfs.gpu024.rpfbnh  Evts:   17 /s  9587k  9587k   214k     0
> >           POOL              TYPE     USED  AVAIL
> >    cephfs.cephfs.meta     metadata   126G   350G
> >    cephfs.cephfs.data       data     102T  25.9T
> >  cephfs.cephfs.data_ssd     data       0    525G
> > cephfs.cephfs.data_mixed    data    9.81T   350G
> >  cephfs.cephfs.data_ec      data     166G  41.4T
> > MDS version: ceph version 16.2.6 (ee28fb57e47e9f88813e24bbf4c14496ca299d31) pacific (stable)
> > _______________________________________________
> > ceph-users mailing list -- ceph-users@xxxxxxx To unsubscribe send an
> > email to ceph-users-leave@xxxxxxx
_______________________________________________
ceph-users mailing list -- ceph-users@xxxxxxx
To unsubscribe send an email to ceph-users-leave@xxxxxxx



