Re: [malware-list] A few concerns about fanotify implementation.

Hi Eric,

We are moving to fanotify at the moment. It meets our needs, except for a couple of minor issues we would like to see fixed in future versions:

1. A file is thrown out of the cache only when it is modified. But when different directories have different scan options, that is not enough: we also need a file to be evicted from the cache when it is renamed or when its number of hard links changes.
This is the most important issue for us.
The patch is attached as add_clear_cache_events.patch.
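
For context, this is roughly how we use ignore marks as a scan cache today (a minimal userspace sketch; the helper name and the FAN_OPEN_PERM mask are only illustrative):

/* After a file scans clean, add an ignore mark so further
 * FAN_OPEN_PERM events for it are suppressed.  The kernel clears
 * ignored_mask on FS_MODIFY, but today not on a rename or on a
 * hard-link-count change, which is what this item is about. */
#include <fcntl.h>		/* AT_FDCWD */
#include <sys/fanotify.h>

static int cache_clean_file(int fanotify_fd, const char *path)
{
	return fanotify_mark(fanotify_fd,
			     FAN_MARK_ADD | FAN_MARK_IGNORED_MASK,
			     FAN_OPEN_PERM, AT_FDCWD, path);
}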

2. We can't use an unlimited cache because it could grab too much memory and slow the system down. If we use a limited cache, it easily fills up with infrequently used files, so the only option we have at the moment is to clear the whole cache every time it is full. The solution we consider most appropriate is to introduce replaceable marks in fanotify and keep them in an LRU cache, so we suggest adding a new mark flag, FAN_MARK_REPLACEABLE. This does not break the current API.
The patch is attached as marks_lru_cache.patch.
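
To illustrate the intended userspace usage (a sketch: the flag value is the one defined in the attached patch, and the helper name is again only illustrative):

/* With FAN_MARK_REPLACEABLE the ignore mark is also linked into the
 * group's LRU list, so when max_marks is reached the kernel can evict
 * the least recently hit replaceable mark instead of failing with
 * ENOSPC. */
#include <fcntl.h>
#include <sys/fanotify.h>

#ifndef FAN_MARK_REPLACEABLE
#define FAN_MARK_REPLACEABLE	0x00000200	/* from marks_lru_cache.patch */
#endif

static int cache_clean_file_lru(int fanotify_fd, const char *path)
{
	return fanotify_mark(fanotify_fd,
			     FAN_MARK_ADD | FAN_MARK_IGNORED_MASK |
			     FAN_MARK_REPLACEABLE,
			     FAN_OPEN_PERM, AT_FDCWD, path);
}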

3. The fanotify file descriptor is always ready to be written to, but its poll method says the opposite. If you handle the descriptor yourself, that is not a problem; but if, like us, you use an async I/O library that polls fds internally, it does not work.
The patch is attached as fanotify_write_poll.patch.
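
To show why this matters in practice, here is a sketch of the pattern such a library follows internally (the function is illustrative; struct fanotify_response and FAN_ALLOW are the regular permission-response API):

/* A loop that only writes permission responses once the fd reports
 * itself writable: with the current fanotify poll method POLLOUT is
 * never set, so the response below is never sent, even though a plain
 * write() on the fanotify fd would succeed immediately. */
#include <poll.h>
#include <unistd.h>
#include <sys/fanotify.h>

static void answer_allow(int fanotify_fd, int event_fd)
{
	struct fanotify_response resp = {
		.fd = event_fd,
		.response = FAN_ALLOW,
	};
	struct pollfd pfd = { .fd = fanotify_fd, .events = POLLOUT };

	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLOUT))
		write(fanotify_fd, &resp, sizeof(resp));
}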

--
Best regards,

Vasily Novikov | Software developer | Kaspersky Lab

Direct: +7 495 123 45 67 x2344 | Mobile: +7 964 786 44 82 | vasily.novikov@xxxxxxxxxxxxx
10/1, 1st Volokolamsky Proezd, Moscow, 123060, Russia | www.kaspersky.com, www.securelist.com

#
# Patch managed by http://www.holgerschurig.de/patcher.html
#

--- //media/storage/src/linux-2.6.38/fs/notify/fsnotify.c~add_clear_cache_events
+++ //media/storage/src/linux-2.6.38/fs/notify/fsnotify.c
@@ -140,7 +140,7 @@
 	}
 
 	/* clear ignored on inode modification */
-	if (mask & FS_MODIFY) {
+	if (mask & FS_CLEAR_IGNORED_MASK_EVENTS) {
 		if (inode_mark &&
 		    !(inode_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
 			inode_mark->ignored_mask = 0;
@@ -220,19 +220,19 @@
 	 * otherwise return if neither the inode nor the vfsmount care about
 	 * this type of event.
 	 */
-	if (!(mask & FS_MODIFY) &&
+	if (!(mask & FS_CLEAR_IGNORED_MASK_EVENTS) &&
 	    !(test_mask & to_tell->i_fsnotify_mask) &&
 	    !(mnt && test_mask & mnt->mnt_fsnotify_mask))
 		return 0;
 
 	idx = srcu_read_lock(&fsnotify_mark_srcu);
 
-	if ((mask & FS_MODIFY) ||
+	if ((mask & FS_CLEAR_IGNORED_MASK_EVENTS) ||
 	    (test_mask & to_tell->i_fsnotify_mask))
 		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
 					      &fsnotify_mark_srcu);
 
-	if (mnt && ((mask & FS_MODIFY) ||
+	if (mnt && ((mask & FS_CLEAR_IGNORED_MASK_EVENTS) ||
 		    (test_mask & mnt->mnt_fsnotify_mask))) {
 		vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
 						 &fsnotify_mark_srcu);
--- //media/storage/src/linux-2.6.38/include/linux/fsnotify_backend.h~add_clear_cache_events
+++ //media/storage/src/linux-2.6.38/include/linux/fsnotify_backend.h
@@ -75,6 +75,8 @@
 			     FS_ISDIR | FS_IN_ONESHOT | FS_DN_RENAME | \
 			     FS_DN_MULTISHOT | FS_EVENT_ON_CHILD)
 
+#define FS_CLEAR_IGNORED_MASK_EVENTS (FS_MODIFY | FS_ATTRIB | FS_MOVE_SELF)
+
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark;
#
# Patch managed by http://www.holgerschurig.de/patcher.html
#

--- //media/storage/src/linux-2.6.38/fs/notify/fanotify/fanotify_user.c~fanotify_write_poll
+++ //media/storage/src/linux-2.6.38/fs/notify/fanotify/fanotify_user.c
@@ -301,12 +301,16 @@
 static unsigned int fanotify_poll(struct file *file, poll_table *wait)
 {
 	struct fsnotify_group *group = file->private_data;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	int ret = POLLOUT | POLLWRNORM;
+#else
 	int ret = 0;
+#endif
 
 	poll_wait(file, &group->notification_waitq, wait);
 	mutex_lock(&group->notification_mutex);
 	if (!fsnotify_notify_queue_is_empty(group))
-		ret = POLLIN | POLLRDNORM;
+		ret |= POLLIN | POLLRDNORM;
 	mutex_unlock(&group->notification_mutex);
 
 	return ret;
#
# Patch managed by http://www.holgerschurig.de/patcher.html
#

--- //media/storage/src/linux-2.6.38/include/linux/fanotify.h~marks_lru_cache
+++ //media/storage/src/linux-2.6.38/include/linux/fanotify.h
@@ -49,6 +49,7 @@
 #define FAN_MARK_IGNORED_MASK	0x00000020
 #define FAN_MARK_IGNORED_SURV_MODIFY	0x00000040
 #define FAN_MARK_FLUSH		0x00000080
+#define FAN_MARK_REPLACEABLE	0x00000200
 #ifdef __KERNEL__
 /* not valid from userspace, only kernel internal */
 #define FAN_MARK_ONDIR		0x00000100
@@ -61,7 +62,8 @@
 				 FAN_MARK_MOUNT |\
 				 FAN_MARK_IGNORED_MASK |\
 				 FAN_MARK_IGNORED_SURV_MODIFY |\
-				 FAN_MARK_FLUSH)
+				 FAN_MARK_FLUSH |\
+				 FAN_MARK_REPLACEABLE)
 
 /*
  * All of the events - we build the list by hand so that we can add flags in
--- //media/storage/src/linux-2.6.38/include/linux/fsnotify_backend.h~marks_lru_cache
+++ //media/storage/src/linux-2.6.38/include/linux/fsnotify_backend.h
@@ -93,6 +93,9 @@
  * freeing-mark - this means that a mark has been flagged to die when everything
  *		finishes using it.  The function is supplied with what must be a
  *		valid group and inode to use to clean up.
+ * on_ignored_event - called when an event is masked by ignored_mask and
+ *		therefore should_send_event & handle_event are not called.
+ *		This callback is required to implement the LRU cache in fanotify.
  */
 struct fsnotify_ops {
 	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode,
@@ -106,6 +109,10 @@
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
 	void (*free_event_priv)(struct fsnotify_event_private_data *priv);
+	void (*on_ignored_event)(struct fsnotify_group *group, struct inode *inode,
+				  struct fsnotify_mark *inode_mark,
+				  struct fsnotify_mark *vfsmount_mark,
+				  __u32 mask, __u32 ignored_mask, void *data, int data_type);
 };
 
 /*
@@ -173,6 +180,8 @@
 			int f_flags;
 			unsigned int max_marks;
 			struct user_struct *user;
+			struct list_head lru_cache_list;
+			spinlock_t lru_cache_lock;		/* protects lru_cache_list */
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
--- //media/storage/src/linux-2.6.38/fs/notify/dnotify/dnotify.c~marks_lru_cache
+++ //media/storage/src/linux-2.6.38/fs/notify/dnotify/dnotify.c
@@ -157,6 +157,7 @@
 	.free_group_priv = NULL,
 	.freeing_mark = NULL,
 	.free_event_priv = NULL,
+	.on_ignored_event = NULL,
 };
 
 /*
--- //media/storage/src/linux-2.6.38/fs/notify/fanotify/fanotify.c~marks_lru_cache
+++ //media/storage/src/linux-2.6.38/fs/notify/fanotify/fanotify.c
@@ -9,6 +9,8 @@
 #include <linux/types.h>
 #include <linux/wait.h>
 
+#include "fanotify.h"
+
 static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
 {
 	pr_debug("%s: old=%p new=%p\n", __func__, old, new);
@@ -219,10 +221,49 @@
 	free_uid(user);
 }
 
+static void fanotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
+{
+	struct fanotify_mark *fan_mark = container_of(fsn_mark, struct fanotify_mark, fsn_mark);
+
+	spin_lock(&group->fanotify_data.lru_cache_lock);
+	if (!list_empty(&fan_mark->lru_cache_list))
+		list_del_init(&fan_mark->lru_cache_list);
+	spin_unlock(&group->fanotify_data.lru_cache_lock);
+}
+
+static void fanotify_lru_cache_hit(struct fsnotify_group *group, struct fsnotify_mark *fsn_mark)
+{
+	if (!fsn_mark) {
+		return;
+	} else {
+		struct fanotify_mark *fan_mark = container_of(fsn_mark, struct fanotify_mark, fsn_mark);
+
+		/* move fsn_mark to the head of lru_cache_list */
+		spin_lock(&group->fanotify_data.lru_cache_lock);
+		if (!list_empty(&fan_mark->lru_cache_list)) {
+			list_del_init(&fan_mark->lru_cache_list);
+			list_add(&fan_mark->lru_cache_list, &group->fanotify_data.lru_cache_list);
+		}
+		spin_unlock(&group->fanotify_data.lru_cache_lock);
+	}
+}
+
+static void fanotify_ignored_event(struct fsnotify_group *group,
+				       struct inode *to_tell,
+				       struct fsnotify_mark *inode_mark,
+				       struct fsnotify_mark *vfsmnt_mark,
+				       __u32 event_mask, __u32 ignored_mask,
+				       void *data, int data_type)
+{
+	fanotify_lru_cache_hit(group, inode_mark);
+	fanotify_lru_cache_hit(group, vfsmnt_mark);
+}
+
 const struct fsnotify_ops fanotify_fsnotify_ops = {
 	.handle_event = fanotify_handle_event,
 	.should_send_event = fanotify_should_send_event,
 	.free_group_priv = fanotify_free_group_priv,
 	.free_event_priv = NULL,
-	.freeing_mark = NULL,
+	.freeing_mark = fanotify_freeing_mark,
+	.on_ignored_event = fanotify_ignored_event,
 };
--- //media/storage/src/linux-2.6.38/fs/notify/inotify/inotify_fsnotify.c~marks_lru_cache
+++ //media/storage/src/linux-2.6.38/fs/notify/inotify/inotify_fsnotify.c
@@ -218,4 +218,5 @@
 	.free_group_priv = inotify_free_group_priv,
 	.free_event_priv = inotify_free_event_priv,
 	.freeing_mark = inotify_freeing_mark,
+	.on_ignored_event = NULL,
 };
--- //media/storage/src/linux-2.6.38/fs/notify/fsnotify.c~marks_lru_cache
+++ //media/storage/src/linux-2.6.38/fs/notify/fsnotify.c
@@ -133,6 +133,7 @@
 	struct fsnotify_group *group = NULL;
 	__u32 inode_test_mask = 0;
 	__u32 vfsmount_test_mask = 0;
+	__u32 ignored_mask = 0;
 
 	if (unlikely(!inode_mark && !vfsmount_mark)) {
 		BUG();
@@ -154,6 +155,7 @@
 		group = inode_mark->group;
 		inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 		inode_test_mask &= inode_mark->mask;
+		ignored_mask |= inode_test_mask & inode_mark->ignored_mask;
 		inode_test_mask &= ~inode_mark->ignored_mask;
 	}
 
@@ -162,9 +164,12 @@
 		vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 		group = vfsmount_mark->group;
 		vfsmount_test_mask &= vfsmount_mark->mask;
+		ignored_mask |= vfsmount_test_mask & vfsmount_mark->ignored_mask;
 		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
-		if (inode_mark)
+		if (inode_mark) {
+			ignored_mask |= vfsmount_test_mask & inode_mark->ignored_mask;
 			vfsmount_test_mask &= ~inode_mark->ignored_mask;
+		}
 	}
 
 	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
@@ -174,8 +179,14 @@
 		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
 		 data_is, cookie, *event);
 
-	if (!inode_test_mask && !vfsmount_test_mask)
+	if (!inode_test_mask && !vfsmount_test_mask) {
+		if (group->ops->on_ignored_event && ignored_mask) {
+			group->ops->on_ignored_event(group, to_tell, inode_mark,
+						     vfsmount_mark, mask, ignored_mask,
+						     data, data_is);
+		}
 		return 0;
+	}
 
 	if (group->ops->should_send_event(group, to_tell, inode_mark,
 					  vfsmount_mark, mask, data,
--- //media/storage/src/linux-2.6.38/fs/notify/fanotify/fanotify_user.c~marks_lru_cache
+++ //media/storage/src/linux-2.6.38/fs/notify/fanotify/fanotify_user.c
@@ -16,6 +16,8 @@
 
 #include <asm/ioctls.h>
 
+#include "fanotify.h"
+
 #define FANOTIFY_DEFAULT_MAX_EVENTS	16384
 #define FANOTIFY_DEFAULT_MAX_MARKS	8192
 #define FANOTIFY_DEFAULT_MAX_LISTENERS	128
@@ -464,7 +466,51 @@
 
 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
 {
-	kmem_cache_free(fanotify_mark_cache, fsn_mark);
+	struct fanotify_mark *fan_mark = container_of(fsn_mark, struct fanotify_mark, fsn_mark);
+	kmem_cache_free(fanotify_mark_cache, fan_mark);
+}
+
+static struct fsnotify_mark* fanotify_alloc_mark(void)
+{
+	struct fanotify_mark *fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+
+	if (!fan_mark)
+		return NULL;
+
+	fsnotify_init_mark(&fan_mark->fsn_mark, fanotify_free_mark);
+	INIT_LIST_HEAD(&fan_mark->lru_cache_list);
+
+	return &fan_mark->fsn_mark;
+}
+
+static void fanotify_lru_cache_add(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
+{
+	struct fanotify_mark *fan_mark = container_of(fsn_mark, struct fanotify_mark, fsn_mark);
+	
+	spin_lock(&group->fanotify_data.lru_cache_lock);
+	list_add(&fan_mark->lru_cache_list, &group->fanotify_data.lru_cache_list);
+	spin_unlock(&group->fanotify_data.lru_cache_lock);
+}
+
+static bool fanotify_lru_cache_evict(struct fsnotify_group *group)
+{
+	struct fanotify_mark *fan_mark = NULL;
+
+	/* evict lru_cache_list tail */
+	spin_lock(&group->fanotify_data.lru_cache_lock);
+	if (!list_empty(&group->fanotify_data.lru_cache_list)) {
+		fan_mark = list_entry(group->fanotify_data.lru_cache_list.prev, struct fanotify_mark, lru_cache_list);
+		fsnotify_get_mark(&fan_mark->fsn_mark);
+		list_del_init(&fan_mark->lru_cache_list);
+	}
+	spin_unlock(&group->fanotify_data.lru_cache_lock);
+
+	if (fan_mark) {
+		fsnotify_destroy_mark(&fan_mark->fsn_mark);
+		fsnotify_put_mark(&fan_mark->fsn_mark);
+	}
+
+	return !!fan_mark;
 }
 
 static int fanotify_find_path(int dfd, const char __user *filename,
@@ -614,16 +660,19 @@
 	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
 	if (!fsn_mark) {
 		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
+			if (!fanotify_lru_cache_evict(group))
+				return -ENOSPC;
 
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+		fsn_mark = fanotify_alloc_mark();
 		if (!fsn_mark)
 			return -ENOMEM;
 
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
 		ret = fsnotify_add_mark(fsn_mark, group, NULL, mnt, 0);
 		if (ret)
 			goto err;
+
+		if (flags & FAN_MARK_REPLACEABLE)
+			fanotify_lru_cache_add(fsn_mark, group);
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 
@@ -657,16 +706,19 @@
 	fsn_mark = fsnotify_find_inode_mark(group, inode);
 	if (!fsn_mark) {
 		if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
-			return -ENOSPC;
+			if (!fanotify_lru_cache_evict(group))
+				return -ENOSPC;
 
-		fsn_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+		fsn_mark = fanotify_alloc_mark();
 		if (!fsn_mark)
 			return -ENOMEM;
 
-		fsnotify_init_mark(fsn_mark, fanotify_free_mark);
 		ret = fsnotify_add_mark(fsn_mark, group, inode, NULL, 0);
 		if (ret)
 			goto err;
+
+		if (flags & FAN_MARK_REPLACEABLE)
+			fanotify_lru_cache_add(fsn_mark, group);
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
 
@@ -722,6 +774,9 @@
 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
 	atomic_set(&group->fanotify_data.bypass_perm, 0);
 #endif
+	INIT_LIST_HEAD(&group->fanotify_data.lru_cache_list);
+	spin_lock_init(&group->fanotify_data.lru_cache_lock);
+
 	switch (flags & FAN_ALL_CLASS_BITS) {
 	case FAN_CLASS_NOTIF:
 		group->priority = FS_PRIO_0;
@@ -886,7 +941,7 @@
  */
 static int __init fanotify_user_setup(void)
 {
-	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
+	fanotify_mark_cache = KMEM_CACHE(fanotify_mark, SLAB_PANIC);
 	fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
 						   SLAB_PANIC);
 
--- /dev/null
+++ //media/storage/src/linux-2.6.38/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,12 @@
+#ifndef __FS_NOTIFY_FANOTIFY_FANOTIFY_H_
+#define __FS_NOTIFY_FANOTIFY_FANOTIFY_H_
+
+#include <linux/list.h>
+#include <linux/fsnotify.h>
+
+struct fanotify_mark {
+	struct fsnotify_mark fsn_mark;
+	struct list_head lru_cache_list;	/* list of marks that can be replaced when the cache is full */
+};
+
+#endif /* __FS_NOTIFY_FANOTIFY_FANOTIFY_H_ */
