Fix a nasty bug related to use of dup() with epoll on Linux

Current versions of the Linux kernel don't seem to remove the struct
epitem for a given (file,fd) combo when the fd is closed unless the
file itself is also completely closed.  This means that if you do:
   fd = dup(fd_orig);
   add(fd);
   close(fd);
   dup2(fd_orig, fd);
   add(fd);
you will get an EEXIST when you should have gotten a success.  This
could cause warnings and dropped events when using dup and epoll.

The solution is pretty simple: when we get an EEXIST from
EPOLL_CTL_ADD, we retry with EPOLL_CTL_MOD.

Unit test included to demonstrate the bug.

Found due to the patient efforts of Gilad Benjamini; diagnosed with
help from Nicholas Marriott.
This commit is contained in:
Nick Mathewson 2010-10-24 11:38:29 -04:00
parent bf11e7ddf7
commit c281aba30e
2 changed files with 92 additions and 13 deletions

30
epoll.c
View File

@ -155,7 +155,6 @@ epoll_apply_changes(struct event_base *base)
int op, events; int op, events;
for (i = 0; i < changelist->n_changes; ++i) { for (i = 0; i < changelist->n_changes; ++i) {
int precautionary_add = 0;
ch = &changelist->changes[i]; ch = &changelist->changes[i];
events = 0; events = 0;
@ -203,13 +202,12 @@ epoll_apply_changes(struct event_base *base)
on the fd, we need to try the ADD anyway, in on the fd, we need to try the ADD anyway, in
case the fd was closed at some in the case the fd was closed at some in the
middle. If it wasn't, the ADD operation middle. If it wasn't, the ADD operation
will fail with; that's okay. will fail with EEXIST; and we retry it as a
*/ MOD. */
precautionary_add = 1; op = EPOLL_CTL_ADD;
} else if (ch->old_events) { } else if (ch->old_events) {
op = EPOLL_CTL_MOD; op = EPOLL_CTL_MOD;
} }
} else if ((ch->read_change & EV_CHANGE_DEL) || } else if ((ch->read_change & EV_CHANGE_DEL) ||
(ch->write_change & EV_CHANGE_DEL)) { (ch->write_change & EV_CHANGE_DEL)) {
/* If we're deleting anything, we'll want to do a MOD /* If we're deleting anything, we'll want to do a MOD
@ -255,14 +253,22 @@ epoll_apply_changes(struct event_base *base)
(int)epev.events, (int)epev.events,
ch->fd)); ch->fd));
} }
} else if (op == EPOLL_CTL_ADD && errno == EEXIST && } else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
precautionary_add) { /* If an ADD operation fails with EEXIST,
/* If a precautionary ADD operation fails with * either the operation was redundant (as with a
EEXIST, that's fine too. * precautionary add), or we ran into a fun
*/ * kernel bug where using dup*() to duplicate the
event_debug(("Epoll ADD(%d) on fd %d gave %s: ADD was redundant", * same file into the same fd gives you the same epitem
* rather than a fresh one. For the second case,
* we must retry with MOD. */
if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
(int)epev.events, ch->fd);
} else {
event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
(int)epev.events, (int)epev.events,
ch->fd, strerror(errno))); ch->fd));
}
} else if (op == EPOLL_CTL_DEL && } else if (op == EPOLL_CTL_DEL &&
(errno == ENOENT || errno == EBADF || (errno == ENOENT || errno == EBADF ||
errno == EPERM)) { errno == EPERM)) {

View File

@ -2061,6 +2061,76 @@ end:
} }
} }
#ifndef WIN32
/* You can't do this test on windows, since dup2 doesn't work on sockets */
static void
dfd_cb(evutil_socket_t fd, short e, void *data)
{
*(int*)data = (int)e;
}
/* Regression test for our workaround for a fun epoll/linux related bug
* where fd2 = dup(fd1); add(fd2); close(fd2); dup2(fd1,fd2); add(fd2)
* will get you an EEXIST */
static void
test_dup_fd(void *arg)
{
struct basic_test_data *data = arg;
struct event_base *base = data->base;
struct event *ev1=NULL, *ev2=NULL;
int fd, dfd=-1;
int ev1_got, ev2_got;
tt_int_op(write(data->pair[0], "Hello world",
strlen("Hello world")), >, 0);
fd = data->pair[1];
dfd = dup(fd);
tt_int_op(dfd, >=, 0);
ev1 = event_new(base, fd, EV_READ|EV_PERSIST, dfd_cb, &ev1_got);
ev2 = event_new(base, dfd, EV_READ|EV_PERSIST, dfd_cb, &ev2_got);
ev1_got = ev2_got = 0;
event_add(ev1, NULL);
event_add(ev2, NULL);
event_base_loop(base, EVLOOP_ONCE);
tt_int_op(ev1_got, ==, EV_READ);
tt_int_op(ev2_got, ==, EV_READ);
/* Now close and delete dfd then dispatch. We need to do the
* dispatch here so that when we add it later, we think there
* was an intermediate delete. */
close(dfd);
event_del(ev2);
ev1_got = ev2_got = 0;
event_base_loop(base, EVLOOP_ONCE);
tt_want_int_op(ev1_got, ==, EV_READ);
tt_int_op(ev2_got, ==, 0);
/* Re-duplicate the fd. We need to get the same duplicated
* value that we closed to provoke the epoll quirk. Also, we
* need to change the events to write, or else the old lingering
* read event will make the test pass whether the change was
* successful or not. */
tt_int_op(dup2(fd, dfd), ==, dfd);
event_free(ev2);
ev2 = event_new(base, dfd, EV_WRITE|EV_PERSIST, dfd_cb, &ev2_got);
event_add(ev2, NULL);
ev1_got = ev2_got = 0;
event_base_loop(base, EVLOOP_ONCE);
tt_want_int_op(ev1_got, ==, EV_READ);
tt_int_op(ev2_got, ==, EV_WRITE);
end:
if (ev1)
event_free(ev1);
if (ev2)
event_free(ev2);
close(dfd);
}
#endif
#ifdef _EVENT_DISABLE_MM_REPLACEMENT #ifdef _EVENT_DISABLE_MM_REPLACEMENT
static void static void
test_mm_functions(void *arg) test_mm_functions(void *arg)
@ -2229,6 +2299,9 @@ struct testcase_t main_testcases[] = {
{ "event_once", test_event_once, TT_ISOLATED, &basic_setup, NULL }, { "event_once", test_event_once, TT_ISOLATED, &basic_setup, NULL },
{ "event_pending", test_event_pending, TT_ISOLATED, &basic_setup, { "event_pending", test_event_pending, TT_ISOLATED, &basic_setup,
NULL }, NULL },
#ifndef WIN32
{ "dup_fd", test_dup_fd, TT_ISOLATED, &basic_setup, NULL },
#endif
{ "mm_functions", test_mm_functions, TT_FORK, NULL, NULL }, { "mm_functions", test_mm_functions, TT_FORK, NULL, NULL },
BASIC(many_events, TT_ISOLATED), BASIC(many_events, TT_ISOLATED),