Fix a nasty bug related to use of dup() with epoll on Linux

Current versions of the Linux kernel don't seem to remove the struct epitem for a given (file,fd) combo when the fd is closed unless the file itself is also completely closed. This means that if you do: fd = dup(fd_orig); add(fd); close(fd); dup2(fd_orig, fd); add(fd); you will get an EEXIST when you should have gotten a success. This could cause warnings and dropped events when using dup and epoll. The solution is pretty simple: when we get an EEXIST from EPOLL_CTL_ADD, we retry with EPOLL_CTL_MOD. Unit test included to demonstrate the bug. Found due to the patient efforts of Gilad Benjamini; diagnosed with help from Nicholas Marriott.
2025-09-09 12:28:19 -04:00 · 2010-10-24 11:38:29 -04:00 · 2010-10-24 11:38:29 -04:00 · c281aba30e
commit c281aba30e
parent bf11e7ddf7
2 changed files with 92 additions and 13 deletions
--- a/epoll.c
+++ b/epoll.c
@ -155,7 +155,6 @@ epoll_apply_changes(struct event_base *base)
 	int op, events;

 	for (i = 0; i < changelist->n_changes; ++i) {
-		int precautionary_add = 0;
 		ch = &changelist->changes[i];
 		events = 0;

@ -203,13 +202,12 @@ epoll_apply_changes(struct event_base *base)
 				  on the fd, we need to try the ADD anyway, in
 				  case the fd was closed at some in the
 				  middle.  If it wasn't, the ADD operation
-				  will fail with; that's okay.
-				*/
-				precautionary_add = 1;
+				  will fail with EEXIST; and we retry it as a
+				  MOD. */
+				op = EPOLL_CTL_ADD;
 			} else if (ch->old_events) {
 				op = EPOLL_CTL_MOD;
 			}
-
 		} else if ((ch->read_change & EV_CHANGE_DEL) ||
 		    (ch->write_change & EV_CHANGE_DEL)) {
 			/* If we're deleting anything, we'll want to do a MOD
@ -255,14 +253,22 @@ epoll_apply_changes(struct event_base *base)
 						(int)epev.events,
 						ch->fd));
 				}
-			} else if (op == EPOLL_CTL_ADD && errno == EEXIST &&
-			    precautionary_add) {
-				/* If a precautionary ADD operation fails with
-				   EEXIST, that's fine too.
-				 */
-				event_debug(("Epoll ADD(%d) on fd %d gave %s: ADD was redundant",
-					(int)epev.events,
-					ch->fd, strerror(errno)));
+			} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
+				/* If an ADD operation fails with EEXIST,
+				 * either the operation was redundant (as with a
+				 * precautionary add), or we ran into a fun
+				 * kernel bug where using dup*() to duplicate the
+				 * same file into the same fd gives you the same epitem
+				 * rather than a fresh one.  For the second case,
+				 * we must retry with MOD. */
+				if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
+					event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
+					    (int)epev.events, ch->fd);
+				} else {
+					event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
+						(int)epev.events,
+						ch->fd));
+				}
 			} else if (op == EPOLL_CTL_DEL &&
 			    (errno == ENOENT || errno == EBADF ||
 				errno == EPERM)) {
--- a/test/regress.c
+++ b/test/regress.c
@ -2061,6 +2061,76 @@ end:
 	}
 }

+#ifndef WIN32
+/* You can't do this test on windows, since dup2 doesn't work on sockets */
+
+static void
+dfd_cb(evutil_socket_t fd, short e, void *data)
+{
+	*(int*)data = (int)e;
+}
+
+/* Regression test for our workaround for a fun epoll/linux related bug
+ * where fd2 = dup(fd1); add(fd2); close(fd2); dup2(fd1,fd2); add(fd2)
+ * will get you an EEXIST */
+static void
+test_dup_fd(void *arg)
+{
+	struct basic_test_data *data = arg;
+	struct event_base *base = data->base;
+	struct event *ev1=NULL, *ev2=NULL;
+	int fd, dfd=-1;
+	int ev1_got, ev2_got;
+
+	tt_int_op(write(data->pair[0], "Hello world",
+		strlen("Hello world")), >, 0);
+	fd = data->pair[1];
+
+	dfd = dup(fd);
+	tt_int_op(dfd, >=, 0);
+
+	ev1 = event_new(base, fd, EV_READ|EV_PERSIST, dfd_cb, &ev1_got);
+	ev2 = event_new(base, dfd, EV_READ|EV_PERSIST, dfd_cb, &ev2_got);
+	ev1_got = ev2_got = 0;
+	event_add(ev1, NULL);
+	event_add(ev2, NULL);
+	event_base_loop(base, EVLOOP_ONCE);
+	tt_int_op(ev1_got, ==, EV_READ);
+	tt_int_op(ev2_got, ==, EV_READ);
+
+	/* Now close and delete dfd then dispatch.  We need to do the
+	 * dispatch here so that when we add it later, we think there
+	 * was an intermediate delete. */
+	close(dfd);
+	event_del(ev2);
+	ev1_got = ev2_got = 0;
+	event_base_loop(base, EVLOOP_ONCE);
+	tt_want_int_op(ev1_got, ==, EV_READ);
+	tt_int_op(ev2_got, ==, 0);
+
+	/* Re-duplicate the fd.  We need to get the same duplicated
+	 * value that we closed to provoke the epoll quirk.  Also, we
+	 * need to change the events to write, or else the old lingering
+	 * read event will make the test pass whether the change was
+	 * successful or not. */
+	tt_int_op(dup2(fd, dfd), ==, dfd);
+	event_free(ev2);
+	ev2 = event_new(base, dfd, EV_WRITE|EV_PERSIST, dfd_cb, &ev2_got);
+	event_add(ev2, NULL);
+	ev1_got = ev2_got = 0;
+	event_base_loop(base, EVLOOP_ONCE);
+	tt_want_int_op(ev1_got, ==, EV_READ);
+	tt_int_op(ev2_got, ==, EV_WRITE);
+
+end:
+	if (ev1)
+		event_free(ev1);
+	if (ev2)
+		event_free(ev2);
+	close(dfd);
+}
+#endif
+
 #ifdef _EVENT_DISABLE_MM_REPLACEMENT
 static void
 test_mm_functions(void *arg)
@ -2229,6 +2299,9 @@ struct testcase_t main_testcases[] = {
 	{ "event_once", test_event_once, TT_ISOLATED, &basic_setup, NULL },
 	{ "event_pending", test_event_pending, TT_ISOLATED, &basic_setup,
 	  NULL },
+#ifndef WIN32
+	{ "dup_fd", test_dup_fd, TT_ISOLATED, &basic_setup, NULL },
+#endif
 	{ "mm_functions", test_mm_functions, TT_FORK, NULL, NULL },
 	BASIC(many_events, TT_ISOLATED),