torvalds
diff --git a/‎Documentation/filesystems/inotify.txt
Lines changed: 138 additions & 0 deletions b/‎Documentation/filesystems/inotify.txt
Lines changed: 138 additions & 0 deletions
diff --git a/‎arch/i386/kernel/syscall_table.S
Lines changed: 3 additions & 0 deletions b/‎arch/i386/kernel/syscall_table.S
Lines changed: 3 additions & 0 deletions
diff --git a/‎fs/Kconfig
Lines changed: 13 additions & 0 deletions b/‎fs/Kconfig
Lines changed: 13 additions & 0 deletions
diff --git a/‎fs/Makefile
Lines changed: 1 addition & 0 deletions b/‎fs/Makefile
Lines changed: 1 addition & 0 deletions
diff --git a/‎fs/attr.c
Lines changed: 4 additions & 29 deletions b/‎fs/attr.c
Lines changed: 4 additions & 29 deletions
diff --git a/‎fs/compat.c
Lines changed: 8 additions & 4 deletions b/‎fs/compat.c
Lines changed: 8 additions & 4 deletions
diff --git a/‎fs/file_table.c
Lines changed: 3 additions & 0 deletions b/‎fs/file_table.c
Lines changed: 3 additions & 0 deletions
diff --git a/‎fs/inode.c
Lines changed: 6 additions & 0 deletions b/‎fs/inode.c
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,138 @@
+				    inotify
+	     a powerful yet simple file change notification system
+
+
+
+Document started 15 Mar 2005 by Robert Love <[email protected]>
+
+(i) User Interface
+
+Inotify is controlled by a set of three system calls.
+
+The first step in using inotify is to initialise an inotify instance:
+
+	int fd = inotify_init ();
+
+Change events are managed by "watches".  A watch is an (object,mask) pair where
+the object is a file or directory and the mask is a bit mask of one or more
+inotify events that the application wishes to receive.  See <linux/inotify.h>
+for valid events.  A watch is referenced by a watch descriptor, or wd.
+
+Watches are added via a path to the file.
+
+Watches on a directory will return events on any files inside of the directory.
+
+Adding a watch is simple,
+
+	int wd = inotify_add_watch (fd, path, mask);
+
+You can add a large number of files via something like
+
+	for each file to watch {
+		int wd = inotify_add_watch (fd, file, mask);
+	}
+
+You can update an existing watch in the same manner, by passing in a new mask.
+
+An existing watch is removed via the inotify_rm_watch system call, for example
+
+	inotify_rm_watch (fd, wd);
+
+Events are provided in the form of an inotify_event structure that is read(2)
+from an inotify instance fd.  The filename is of dynamic length and follows the
+struct.  It is of size len.  The filename is padded with null bytes to ensure
+proper alignment.  This padding is reflected in len.
+
+You can slurp multiple events by passing a large buffer, for example
+
+	size_t len = read (fd, buf, BUF_LEN);
+
+This will return as many events as are available and fit in BUF_LEN.
+
+Each inotify instance fd is also select()- and poll()-able.
+
+You can find the size of the current event queue via the FIONREAD ioctl.
+
+All watches are destroyed and cleaned up on close.
+
+
+(ii) Internal Kernel Implementation
+
+Each open inotify instance is associated with an inotify_device structure.
+
+Each watch is associated with an inotify_watch structure.  Watches are chained
+off of each associated device and each associated inode.
+
+See fs/inotify.c for the locking and lifetime rules.
+
+
+(iii) Rationale
+
+Q: What is the design decision behind not tying the watch to the open fd of
+   the watched object?
+
+A: Watches are associated with an open inotify device, not an open file.
+   This solves the primary problem with dnotify: keeping the file open pins
+   the file and thus, worse, pins the mount.  Dnotify is therefore infeasible
+   for use on a desktop system with removable media as the media cannot be
+   unmounted.
+
+Q: What is the design decision behind using an fd-per-device as opposed to
+   an fd-per-watch?
+
+A: An fd-per-watch quickly consumes more file descriptors than are allowed,
+   more fd's than are feasible to manage, and more fd's than are optimally
+   select()-able.  Yes, root can bump the per-process fd limit and yes, users
+   can use epoll, but requiring both is a silly and extraneous requirement.
+   A watch consumes less memory than an open file; separating the number
+   spaces is thus sensible.  The current design is what user-space developers
+   want: Users initialize inotify, once, and add n watches, requiring but one fd
+   and no twiddling with fd limits.  Initializing an inotify instance two
+   thousand times is silly.  If we can implement user-space's preferences 
+   cleanly--and we can, the idr layer makes stuff like this trivial--then we 
+   should.
+
+   There are other good arguments.  With a single fd, there is a single
+   item to block on, which is mapped to a single queue of events.  The single
+   fd returns all watch events and also any potential out-of-band data.  If
+   every fd was a separate watch,
+
+   - There would be no way to get event ordering.  Events on file foo and
+     file bar would pop poll() on both fd's, but there would be no way to tell
+     which happened first.  A single queue trivially gives you ordering.  Such
+     ordering is crucial to existing applications such as Beagle.  Imagine
+     "mv a b ; mv b a" events without ordering.
+
+   - We'd have to maintain n fd's and n internal queues with state,
+     versus just one.  It is a lot messier in the kernel.  A single, linear
+     queue is the data structure that makes sense.
+
+   - User-space developers prefer the current API.  The Beagle guys, for
+     example, love it.  Trust me, I asked.  It is not a surprise: Who'd want
+     to manage and block on 1000 fd's via select?
+
+   - You'd have to manage the fd's, as an example: Call close() when you
+     received a delete event.
+
+   - No way to get out of band data.
+
+   - 1024 is still too low.  ;-)
+
+   When you talk about designing a file change notification system that
+   scales to 1000s of directories, juggling 1000s of fd's just does not seem
+   the right interface.  It is too heavy.
+
+Q: Why the system call approach?
+
+A: The poor user-space interface is the second biggest problem with dnotify.
+   Signals are a terrible, terrible interface for file notification.  Or for
+   anything, for that matter.  The ideal solution, from all perspectives, is a
+   file descriptor-based one that allows basic file I/O and poll/select.
+   Obtaining the fd and managing the watches could have been done either via a
+   device file or a family of new system calls.  We decided to implement a
+   family of system calls because that is the preferred approach for new kernel
+   features and it meets our user interface requirements.
+
+   Additionally, it _is_ possible to initialize more than one instance and
+   juggle more than one queue and thus more than one associated fd.
+
@@ -291,3 +291,6 @@ ENTRY(sys_call_table)
 	.long sys_keyctl
 	.long sys_ioprio_set
 	.long sys_ioprio_get		/* 290 */
+	.long sys_inotify_init
+	.long sys_inotify_add_watch
+	.long sys_inotify_rm_watch
@@ -359,6 +359,19 @@ config ROMFS_FS
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
 
+config INOTIFY
+	bool "Inotify file change notification support"
+	default y
+	---help---
+	  Say Y here to enable inotify support and its family of system
+	  calls.  Inotify is a file change notification system and a
+	  replacement for dnotify.  Inotify fixes numerous shortcomings in
+	  dnotify and introduces several new features.  It allows monitoring
+	  of both files and directories via a single open fd.  Multiple file
+	  events are supported.
+
+	  If unsure, say Y.
+
 config QUOTA
 	bool "Quota support"
 	help
 
@@ -12,6 +12,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
 		ioprio.o
 
+obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o
 
 
@@ -10,7 +10,7 @@
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/smp_lock.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/fcntl.h>
 #include <linux/quotaops.h>
 #include <linux/security.h>
@@ -107,31 +107,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
 out:
 	return error;
 }
-
 EXPORT_SYMBOL(inode_setattr);
 
-int setattr_mask(unsigned int ia_valid)
-{
-	unsigned long dn_mask = 0;
-
-	if (ia_valid & ATTR_UID)
-		dn_mask |= DN_ATTRIB;
-	if (ia_valid & ATTR_GID)
-		dn_mask |= DN_ATTRIB;
-	if (ia_valid & ATTR_SIZE)
-		dn_mask |= DN_MODIFY;
-	/* both times implies a utime(s) call */
-	if ((ia_valid & (ATTR_ATIME|ATTR_MTIME)) == (ATTR_ATIME|ATTR_MTIME))
-		dn_mask |= DN_ATTRIB;
-	else if (ia_valid & ATTR_ATIME)
-		dn_mask |= DN_ACCESS;
-	else if (ia_valid & ATTR_MTIME)
-		dn_mask |= DN_MODIFY;
-	if (ia_valid & ATTR_MODE)
-		dn_mask |= DN_ATTRIB;
-	return dn_mask;
-}
-
 int notify_change(struct dentry * dentry, struct iattr * attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -197,11 +174,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	if (ia_valid & ATTR_SIZE)
 		up_write(&dentry->d_inode->i_alloc_sem);
 
-	if (!error) {
-		unsigned long dn_mask = setattr_mask(ia_valid);
-		if (dn_mask)
-			dnotify_parent(dentry, dn_mask);
-	}
+	if (!error)
+		fsnotify_change(dentry, ia_valid);
+
 	return error;
 }
 
 
@@ -37,7 +37,7 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/dirent.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/highuid.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -1307,9 +1307,13 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 out:
 	if (iov != iovstack)
 		kfree(iov);
-	if ((ret + (type == READ)) > 0)
-		dnotify_parent(file->f_dentry,
-				(type == READ) ? DN_ACCESS : DN_MODIFY);
+	if ((ret + (type == READ)) > 0) {
+		struct dentry *dentry = file->f_dentry;
+		if (type == READ)
+			fsnotify_access(dentry);
+		else
+			fsnotify_modify(dentry);
+	}
 	return ret;
 }
 
 
@@ -16,6 +16,7 @@
 #include <linux/eventpoll.h>
 #include <linux/mount.h>
 #include <linux/cdev.h>
+#include <linux/fsnotify.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
@@ -126,6 +127,8 @@ void fastcall __fput(struct file *file)
 	struct inode *inode = dentry->d_inode;
 
 	might_sleep();
+
+	fsnotify_close(file);
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
 
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
+#include <linux/inotify.h>
 
 /*
  * This is needed for the following functions:
@@ -202,6 +203,10 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	spin_lock_init(&inode->i_lock);
 	i_size_ordered_init(inode);
+#ifdef CONFIG_INOTIFY
+	INIT_LIST_HEAD(&inode->inotify_watches);
+	sema_init(&inode->inotify_sem, 1);
+#endif
 }
 
 EXPORT_SYMBOL(inode_init_once);
@@ -351,6 +356,7 @@ int invalidate_inodes(struct super_block * sb)
 
 	down(&iprune_sem);
 	spin_lock(&inode_lock);
+	inotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);