Fix a nasty bug related to use of dup() with epoll on Linux

Current versions of the Linux kernel don't seem to remove the struct
epitem for a given (file,fd) combo when the fd is closed unless the
file itself is also completely closed.  This means that if you do:
   fd = dup(fd_orig);
   add(fd);
   close(fd);
   dup2(fd_orig, fd);
   add(fd);
you will get an EEXIST when you should have gotten a success.  This
could cause warnings and dropped events when using dup and epoll.

The solution is pretty simple: when we get an EEXIST from
EPOLL_CTL_ADD, we retry with EPOLL_CTL_MOD.

Unit test included to demonstrate the bug.

Found due to the patient efforts of Gilad Benjamini; diagnosed with
help from Nicholas Marriott.
This commit is contained in:
Nick Mathewson 2010-10-24 11:38:29 -04:00
parent bf11e7ddf7
commit c281aba30e
2 changed files with 92 additions and 13 deletions

32
epoll.c
View File

@ -155,7 +155,6 @@ epoll_apply_changes(struct event_base *base)
int op, events;
for (i = 0; i < changelist->n_changes; ++i) {
int precautionary_add = 0;
ch = &changelist->changes[i];
events = 0;
@ -203,13 +202,12 @@ epoll_apply_changes(struct event_base *base)
on the fd, we need to try the ADD anyway, in
case the fd was closed at some in the
middle. If it wasn't, the ADD operation
will fail with; that's okay.
*/
precautionary_add = 1;
will fail with EEXIST; and we retry it as a
MOD. */
op = EPOLL_CTL_ADD;
} else if (ch->old_events) {
op = EPOLL_CTL_MOD;
}
} else if ((ch->read_change & EV_CHANGE_DEL) ||
(ch->write_change & EV_CHANGE_DEL)) {
/* If we're deleting anything, we'll want to do a MOD
@ -255,14 +253,22 @@ epoll_apply_changes(struct event_base *base)
(int)epev.events,
ch->fd));
}
} else if (op == EPOLL_CTL_ADD && errno == EEXIST &&
precautionary_add) {
/* If a precautionary ADD operation fails with
EEXIST, that's fine too.
*/
event_debug(("Epoll ADD(%d) on fd %d gave %s: ADD was redundant",
(int)epev.events,
ch->fd, strerror(errno)));
} else if (op == EPOLL_CTL_ADD && errno == EEXIST) {
/* If an ADD operation fails with EEXIST,
* either the operation was redundant (as with a
* precautionary add), or we ran into a fun
* kernel bug where using dup*() to duplicate the
* same file into the same fd gives you the same epitem
* rather than a fresh one. For the second case,
* we must retry with MOD. */
if (epoll_ctl(epollop->epfd, EPOLL_CTL_MOD, ch->fd, &epev) == -1) {
event_warn("Epoll ADD(%d) on %d retried as MOD; that failed too",
(int)epev.events, ch->fd);
} else {
event_debug(("Epoll ADD(%d) on %d retried as MOD; succeeded.",
(int)epev.events,
ch->fd));
}
} else if (op == EPOLL_CTL_DEL &&
(errno == ENOENT || errno == EBADF ||
errno == EPERM)) {

View File

@ -2061,6 +2061,76 @@ end:
}
}
#ifndef WIN32
/* You can't do this test on windows, since dup2 doesn't work on sockets */
static void
dfd_cb(evutil_socket_t fd, short e, void *data)
{
*(int*)data = (int)e;
}
/* Regression test for our workaround for a fun epoll/linux related bug
* where fd2 = dup(fd1); add(fd2); close(fd2); dup2(fd1,fd2); add(fd2)
* will get you an EEXIST */
static void
test_dup_fd(void *arg)
{
struct basic_test_data *data = arg;
struct event_base *base = data->base;
struct event *ev1=NULL, *ev2=NULL;
int fd, dfd=-1;
int ev1_got, ev2_got;
tt_int_op(write(data->pair[0], "Hello world",
strlen("Hello world")), >, 0);
fd = data->pair[1];
dfd = dup(fd);
tt_int_op(dfd, >=, 0);
ev1 = event_new(base, fd, EV_READ|EV_PERSIST, dfd_cb, &ev1_got);
ev2 = event_new(base, dfd, EV_READ|EV_PERSIST, dfd_cb, &ev2_got);
ev1_got = ev2_got = 0;
event_add(ev1, NULL);
event_add(ev2, NULL);
event_base_loop(base, EVLOOP_ONCE);
tt_int_op(ev1_got, ==, EV_READ);
tt_int_op(ev2_got, ==, EV_READ);
/* Now close and delete dfd then dispatch. We need to do the
* dispatch here so that when we add it later, we think there
* was an intermediate delete. */
close(dfd);
event_del(ev2);
ev1_got = ev2_got = 0;
event_base_loop(base, EVLOOP_ONCE);
tt_want_int_op(ev1_got, ==, EV_READ);
tt_int_op(ev2_got, ==, 0);
/* Re-duplicate the fd. We need to get the same duplicated
* value that we closed to provoke the epoll quirk. Also, we
* need to change the events to write, or else the old lingering
* read event will make the test pass whether the change was
* successful or not. */
tt_int_op(dup2(fd, dfd), ==, dfd);
event_free(ev2);
ev2 = event_new(base, dfd, EV_WRITE|EV_PERSIST, dfd_cb, &ev2_got);
event_add(ev2, NULL);
ev1_got = ev2_got = 0;
event_base_loop(base, EVLOOP_ONCE);
tt_want_int_op(ev1_got, ==, EV_READ);
tt_int_op(ev2_got, ==, EV_WRITE);
end:
if (ev1)
event_free(ev1);
if (ev2)
event_free(ev2);
close(dfd);
}
#endif
#ifdef _EVENT_DISABLE_MM_REPLACEMENT
static void
test_mm_functions(void *arg)
@ -2229,6 +2299,9 @@ struct testcase_t main_testcases[] = {
{ "event_once", test_event_once, TT_ISOLATED, &basic_setup, NULL },
{ "event_pending", test_event_pending, TT_ISOLATED, &basic_setup,
NULL },
#ifndef WIN32
{ "dup_fd", test_dup_fd, TT_ISOLATED, &basic_setup, NULL },
#endif
{ "mm_functions", test_mm_functions, TT_FORK, NULL, NULL },
BASIC(many_events, TT_ISOLATED),