diff --git a/minix/include/minix/safecopies.h b/minix/include/minix/safecopies.h index 53580ff3d..c1214a285 100644 --- a/minix/include/minix/safecopies.h +++ b/minix/include/minix/safecopies.h @@ -7,21 +7,20 @@ #include typedef struct { - int cp_flags; /* CPF_* below */ - union ixfer_cp_u{ + int cp_flags; /* CPF_* below */ + int cp_seq; /* sequence number */ + union ixfer_cp_u { struct { /* CPF_DIRECT */ endpoint_t cp_who_to; /* grantee */ vir_bytes cp_start; /* memory */ size_t cp_len; /* size in bytes */ - char cp_reserved[8]; /* future use */ } cp_direct; struct { /* CPF_INDIRECT */ endpoint_t cp_who_to; /* grantee */ endpoint_t cp_who_from; /* previous granter */ cp_grant_id_t cp_grant; /* previous grant */ - char cp_reserved[8];/* future use */ } cp_indirect; struct { /* CPF_MAGIC */ @@ -29,15 +28,13 @@ typedef struct { endpoint_t cp_who_to; /* grantee */ vir_bytes cp_start; /* memory */ size_t cp_len; /* size in bytes */ - char cp_reserved[8]; /* future use */ } cp_magic; struct { /* (free slot) */ int cp_next; /* next free or -1 */ } cp_free; } cp_u; - int cp_seq; /* sequence number */ - char cp_reserved[4]; /* future use */ + cp_grant_id_t cp_faulted; /* soft fault marker (CPF_TRY only) */ } cp_grant_t; /* Vectored safecopy. */ @@ -45,7 +42,6 @@ struct vscp_vec { /* Exactly one of the following must be SELF. */ endpoint_t v_from; /* source */ endpoint_t v_to; /* destination */ - cp_grant_id_t v_gid; /* grant id of other process */ size_t v_offset; /* offset in other grant */ vir_bytes v_addr; /* address in copier's space */ @@ -78,6 +74,9 @@ struct vscp_vec { #define CPF_MAGIC 0x000800 /* Grant from any to any. */ #define CPF_VALID 0x001000 /* Grant slot contains valid grant. */ +/* Special cpf_revoke() return values. */ +#define GRANT_FAULTED 1 /* CPF_TRY: a soft fault occurred */ + /* Prototypes for functions in libsys. */ cp_grant_id_t cpf_grant_direct(endpoint_t, vir_bytes, size_t, int); cp_grant_id_t cpf_grant_indirect(endpoint_t, endpoint_t, cp_grant_id_t); diff --git a/minix/kernel/proto.h b/minix/kernel/proto.h index 209fae7f9..44c99e8b4 100644 --- a/minix/kernel/proto.h +++ b/minix/kernel/proto.h @@ -156,8 +156,9 @@ void hook_ipc_clear(struct proc *proc); #endif /* system/do_safecopy.c */ +struct cp_sfinfo; /* external callers may only provide NULL */ int verify_grant(endpoint_t, endpoint_t, cp_grant_id_t, vir_bytes, int, - vir_bytes, vir_bytes *, endpoint_t *, u32_t *); + vir_bytes, vir_bytes *, endpoint_t *, struct cp_sfinfo *); /* system/do_diagctl.c */ int do_diagctl(struct proc * caller, message *m); diff --git a/minix/kernel/system/do_safecopy.c b/minix/kernel/system/do_safecopy.c index 5284741a5..6002a9011 100644 --- a/minix/kernel/system/do_safecopy.c +++ b/minix/kernel/system/do_safecopy.c @@ -29,6 +29,13 @@ static int safecopy(struct proc *, endpoint_t, endpoint_t, #define HASGRANTTABLE(gr) \ (priv(gr) && priv(gr)->s_grant_table) +struct cp_sfinfo { /* information for handling soft faults */ + int try; /* if nonzero, try copy only, stop on fault */ + endpoint_t endpt; /* endpoint owning grant with CPF_TRY flag */ + vir_bytes addr; /* address to write mark upon soft fault */ + cp_grant_id_t value; /* grant ID to use as mark value to write */ +}; + /*===========================================================================* * verify_grant * *===========================================================================*/ @@ -41,7 +48,7 @@ int verify_grant( vir_bytes offset_in, /* copy offset within grant */ vir_bytes *offset_result, /* copy offset within virtual address space */ endpoint_t *e_granter, /* new granter (magic grants) */ - u32_t *flags /* CPF_* */ + struct cp_sfinfo *sfinfo /* storage for soft fault information */ ) { cp_grant_t g; @@ -120,8 +127,6 @@ int verify_grant( return EPERM; } - if(flags) *flags = g.cp_flags; - /* Check validity: flags and sequence number. */ if((g.cp_flags & (CPF_USED | CPF_VALID)) != (CPF_USED | CPF_VALID)) { @@ -250,6 +255,14 @@ int verify_grant( return EPERM; } + /* If requested, store information regarding soft faults. */ + if (sfinfo != NULL && (sfinfo->try = !!(g.cp_flags & CPF_TRY))) { + sfinfo->endpt = granter; + sfinfo->addr = priv(granter_proc)->s_grant_table + + sizeof(g) * grant_idx + offsetof(cp_grant_t, cp_faulted); + sfinfo->value = grant; + } + return OK; } @@ -274,7 +287,7 @@ static int safecopy( endpoint_t new_granter, *src, *dst; struct proc *granter_p; int r; - u32_t flags; + struct cp_sfinfo sfinfo; #if PERF_USE_COW_SAFECOPY vir_bytes size; #endif @@ -295,7 +308,7 @@ static int safecopy( /* Verify permission exists. */ if((r=verify_grant(granter, grantee, grantid, bytes, access, - g_offset, &v_offset, &new_granter, &flags)) != OK) { + g_offset, &v_offset, &new_granter, &sfinfo)) != OK) { if(r == ENOTREADY) return r; printf( "grant %d verify to copy %d->%d by %d failed: err %d\n", @@ -325,11 +338,39 @@ static int safecopy( } /* Do the regular copy. */ - if(flags & CPF_TRY) { - int r; - /* Try copy without transparently faulting in pages. */ + if (sfinfo.try) { + /* + * Try copying without transparently faulting in pages. + * TODO: while CPF_TRY is meant to protect against deadlocks on + * memory-mapped files in file systems, it seems that this case + * triggers faults a whole lot more often, resulting in extra + * overhead due to retried file system operations. It might be + * a good idea to go through VM even in this case, and have VM + * fail (only) if the affected page belongs to a file mapping. + */ r = virtual_copy(&v_src, &v_dst, bytes); - if(r == EFAULT_SRC || r == EFAULT_DST) return EFAULT; + if (r == EFAULT_SRC || r == EFAULT_DST) { + /* + * Mark the magic grant as having experienced a soft + * fault during its lifetime. The exact value does not + * matter, but we use the grant ID (including its + * sequence number) as a form of protection in the + * light of CPU concurrency. + */ + r = data_copy(KERNEL, (vir_bytes)&sfinfo.value, + sfinfo.endpt, sfinfo.addr, sizeof(sfinfo.value)); + /* + * Failure means the creator of the magic grant messed + * up, which can only be unintentional, so report.. + */ + if (r != OK) + printf("Kernel: writing soft fault marker %d " + "into %d at 0x%lx failed (%d)\n", + sfinfo.value, sfinfo.endpt, sfinfo.addr, + r); + + return EFAULT; + } return r; } return virtual_copy_vmcheck(caller, &v_src, &v_dst, bytes); diff --git a/minix/lib/libsys/safecopies.c b/minix/lib/libsys/safecopies.c index cec339f6a..e22620ff1 100644 --- a/minix/lib/libsys/safecopies.c +++ b/minix/lib/libsys/safecopies.c @@ -152,6 +152,7 @@ cpf_grant_direct(endpoint_t who_to, vir_bytes addr, size_t bytes, int access) grants[g].cp_u.cp_direct.cp_who_to = who_to; grants[g].cp_u.cp_direct.cp_start = addr; grants[g].cp_u.cp_direct.cp_len = bytes; + grants[g].cp_faulted = GRANT_INVALID; __insn_barrier(); grants[g].cp_flags = access | CPF_DIRECT | CPF_USED | CPF_VALID; @@ -174,6 +175,7 @@ cpf_grant_indirect(endpoint_t who_to, endpoint_t who_from, cp_grant_id_t gr) grants[g].cp_u.cp_indirect.cp_who_to = who_to; grants[g].cp_u.cp_indirect.cp_who_from = who_from; grants[g].cp_u.cp_indirect.cp_grant = gr; + grants[g].cp_faulted = GRANT_INVALID; __insn_barrier(); grants[g].cp_flags = CPF_USED | CPF_INDIRECT | CPF_VALID; @@ -198,23 +200,39 @@ cpf_grant_magic(endpoint_t who_to, endpoint_t who_from, grants[g].cp_u.cp_magic.cp_who_from = who_from; grants[g].cp_u.cp_magic.cp_start = addr; grants[g].cp_u.cp_magic.cp_len = bytes; + grants[g].cp_faulted = GRANT_INVALID; __insn_barrier(); grants[g].cp_flags = CPF_USED | CPF_MAGIC | CPF_VALID | access; return GRANT_ID(g, grants[g].cp_seq); } +/* + * Revoke previously granted access, identified by grant ID. Return -1 on + * error, with errno set as appropriate. Return 0 on success, with one + * exception: return GRANT_FAULTED (1) if a grant was created with CPF_TRY and + * during its lifetime, a copy from or to the grant experienced a soft fault. + */ int cpf_revoke(cp_grant_id_t grant) { -/* Revoke previously granted access, identified by grant id. */ - int g; + int r, g; GID_CHECK_USED(grant); g = GRANT_IDX(grant); - /* Make grant invalid by setting flags to 0, clearing CPF_USED. + /* + * If a safecopy action on a (direct or magic) grant with the CPF_TRY + * flag failed on a soft fault, the kernel will have set the cp_faulted + * field to the grant identifier. Here, we test this and return + * GRANT_FAULTED (1) on a match. + */ + r = ((grants[g].cp_flags & CPF_TRY) && + grants[g].cp_faulted == grant) ? GRANT_FAULTED : 0; + + /* + * Make grant invalid by setting flags to 0, clearing CPF_USED. * This invalidates the grant. */ grants[g].cp_flags = 0; @@ -239,7 +257,7 @@ cpf_revoke(cp_grant_id_t grant) grants[g].cp_u.cp_free.cp_next = freelist; freelist = g; - return 0; + return r; } /* diff --git a/minix/servers/vfs/comm.c b/minix/servers/vfs/comm.c index 9381cc323..e21e62159 100644 --- a/minix/servers/vfs/comm.c +++ b/minix/servers/vfs/comm.c @@ -161,7 +161,10 @@ int fs_sendrec(endpoint_t fs_e, message *reqmp) assert(self->w_sendrec == NULL); - return(reqmp->m_type); + r = reqmp->m_type; + if (r == ERESTART) /* ERESTART is used internally, so make sure it is.. */ + r = EIO; /* ..not delivered as a result from a file system. */ + return(r); } /*===========================================================================* diff --git a/minix/servers/vfs/pipe.c b/minix/servers/vfs/pipe.c index 3cba587e2..2aee4ac5c 100644 --- a/minix/servers/vfs/pipe.c +++ b/minix/servers/vfs/pipe.c @@ -483,7 +483,7 @@ void revive(endpoint_t proc_e, int returned) * it again now that I/O is done. */ if (GRANT_VALID(rfp->fp_grant)) { - if(cpf_revoke(rfp->fp_grant)) { + if(cpf_revoke(rfp->fp_grant) == -1) { panic("VFS: revoke failed for grant: %d", rfp->fp_grant); } diff --git a/minix/servers/vfs/request.c b/minix/servers/vfs/request.c index ec019fbcb..e34874d35 100644 --- a/minix/servers/vfs/request.c +++ b/minix/servers/vfs/request.c @@ -49,7 +49,9 @@ static int req_breadwrite_actual(endpoint_t fs_e, endpoint_t user_e, dev_t dev, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); + if (r != OK) return(r); /* Fill in response structure */ @@ -68,7 +70,7 @@ int req_breadwrite(endpoint_t fs_e, endpoint_t user_e, dev_t dev, off_t pos, r = req_breadwrite_actual(fs_e, user_e, dev, pos, num_of_bytes, user_addr, rw_flag, new_pos, cum_iop, CPF_TRY); - if(r == EFAULT) { + if (r == ERESTART) { if((r=vm_vfs_procctl_handlemem(user_e, user_addr, num_of_bytes, rw_flag == READING)) != OK) { return r; @@ -324,7 +326,8 @@ static int req_getdents_actual( } r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); if (r == OK) { *new_pos = m.m_fs_vfs_getdents.seek_pos; @@ -351,7 +354,9 @@ int req_getdents( r = req_getdents_actual(fs_e, inode_nr, pos, buf, size, new_pos, direct, CPF_TRY); - if(r == EFAULT && !direct) { + if (r == ERESTART) { + assert(!direct); + if((r=vm_vfs_procctl_handlemem(who_e, buf, size, 1)) != OK) { return r; } @@ -736,7 +741,8 @@ static int req_rdlink_actual(endpoint_t fs_e, ino_t inode_nr, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); if (r == OK) r = m.m_fs_vfs_rdlink.nbytes; @@ -756,7 +762,9 @@ int req_rdlink(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e, r = req_rdlink_actual(fs_e, inode_nr, proc_e, buf, len, direct, CPF_TRY); - if(r == EFAULT && !direct) { + if (r == ERESTART) { + assert(!direct); + if((r=vm_vfs_procctl_handlemem(proc_e, buf, len, 1)) != OK) { return r; } @@ -854,7 +862,8 @@ static int req_readwrite_actual(endpoint_t fs_e, ino_t inode_nr, off_t pos, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); if (r == OK) { /* Fill in response structure */ @@ -877,9 +886,9 @@ int req_readwrite(endpoint_t fs_e, ino_t inode_nr, off_t pos, r = req_readwrite_actual(fs_e, inode_nr, pos, rw_flag, user_e, user_addr, num_of_bytes, new_posp, cum_iop, CPF_TRY); - if(r == EFAULT) { - if((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr, num_of_bytes, - rw_flag == READING)) != OK) { + if (r == ERESTART) { + if ((r=vm_vfs_procctl_handlemem(user_e, (vir_bytes) user_addr, + num_of_bytes, rw_flag == READING)) != OK) { return r; } @@ -1034,8 +1043,9 @@ static int req_slink_actual( /* Send/rec request */ r = fs_sendrec(fs_e, &m); + cpf_revoke(gid_name); - cpf_revoke(gid_buf); + if (cpf_revoke(gid_buf) == GRANT_FAULTED) return(ERESTART); return(r); } @@ -1059,7 +1069,7 @@ int req_slink( r = req_slink_actual(fs_e, inode_nr, lastc, proc_e, path_addr, path_length, uid, gid, CPF_TRY); - if(r == EFAULT) { + if (r == ERESTART) { if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) path_addr, path_length, 0)) != OK) { return r; @@ -1096,7 +1106,8 @@ int req_stat_actual(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e, /* Send/rec request */ r = fs_sendrec(fs_e, &m); - cpf_revoke(grant_id); + + if (cpf_revoke(grant_id) == GRANT_FAULTED) return(ERESTART); return(r); } @@ -1112,7 +1123,7 @@ int req_stat(endpoint_t fs_e, ino_t inode_nr, endpoint_t proc_e, r = req_stat_actual(fs_e, inode_nr, proc_e, buf, CPF_TRY); - if(r == EFAULT) { + if (r == ERESTART) { if((r=vm_vfs_procctl_handlemem(proc_e, (vir_bytes) buf, sizeof(struct stat), 1)) != OK) { return r; diff --git a/minix/tests/blocktest/blocktest.c b/minix/tests/blocktest/blocktest.c index 136bbefa6..e7e235ade 100644 --- a/minix/tests/blocktest/blocktest.c +++ b/minix/tests/blocktest/blocktest.c @@ -318,7 +318,7 @@ static void raw_xfer(dev_t minor, u64_t pos, iovec_s_t *iovec, int nr_req, r = sendrec_driver(&m, exp, res); - if (cpf_revoke(grant) != OK) + if (cpf_revoke(grant) == -1) panic("unable to revoke grant"); if (r != RESULT_OK) @@ -359,7 +359,7 @@ static void vir_xfer(dev_t minor, u64_t pos, iovec_t *iovec, int nr_req, for (i = 0; i < nr_req; i++) { iovec[i].iov_size = iov_s[i].iov_size; - if (cpf_revoke(iov_s[i].iov_grant) != OK) + if (cpf_revoke(iov_s[i].iov_grant) == -1) panic("unable to revoke grant"); } } @@ -1181,7 +1181,7 @@ static int vir_ioctl(dev_t minor, unsigned long req, void *ptr, ssize_t exp, r = sendrec_driver(&m, exp, res); - if (cpf_revoke(grant) != OK) + if (cpf_revoke(grant) == -1) panic("unable to revoke grant"); return r; diff --git a/minix/tests/test74.c b/minix/tests/test74.c index 5c5119e81..8478e037b 100644 --- a/minix/tests/test74.c +++ b/minix/tests/test74.c @@ -791,6 +791,90 @@ hole_regression(void) close(fd); } +/* + * Test that soft faults during file system I/O do not cause functions to + * return partial I/O results. + * + * We refer to the faults that are caused internally within the operating + * system as a result of the deadlock mitigation described at the top of this + * file, as a particular class of "soft faults". Such soft faults may occur in + * the middle of an I/O operation, and general I/O semantics dictate that upon + * partial success, the partial success is returned (and *not* an error). As a + * result, these soft faults, if not handled as special cases, may cause even + * common file system operations such as read(2) on a regular file to return + * fewer bytes than requested. Such unexpected short reads are typically not + * handled well by userland, and the OS must prevent them from occurring if it + * can. Note that read(2) is the most problematic, but certainly not the only, + * case where this problem can occur. + * + * Unfortunately, several file system services are not following the proper + * general I/O semantics - and this includes MFS. Therefore, for now, we have + * to test this case using block device I/O, which does do the right thing. + * In this test we hope that the root file system is mounted on a block device + * usable for (read-only!) testing purposes. + */ +static void +softfault_partial(void) +{ + struct statvfs stf; + struct stat st; + char *buf, *buf2; + ssize_t size; + int fd; + + if (statvfs("/", &stf) != 0) e(0); + + /* + * If the root file system is not mounted off a block device, or if we + * cannot open that device ourselves, simply skip this subtest. + */ + if (stat(stf.f_mntfromname, &st) != 0 || !S_ISBLK(st.st_mode)) + return; /* skip subtest */ + + if ((fd = open(stf.f_mntfromname, O_RDONLY)) == -1) + return; /* skip subtest */ + + /* + * See if we can read in the first two full blocks, or two pages worth + * of data, whichever is larger. If that fails, there is no point in + * continuing the test. + */ + size = MAX(stf.f_bsize, PAGE_SIZE) * 2; + + if ((buf = mmap(NULL, size, PROT_READ | PROT_READ, + MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) e(0); + + if (read(fd, buf, size) != size) { + munmap(buf, size); + close(fd); + return; /* skip subtest */ + } + + lseek(fd, 0, SEEK_SET); + + /* + * Now attempt a read to a partially faulted-in buffer. The first time + * around, the I/O transfer will generate a fault and return partial + * success. In that case, the entire I/O transfer should be retried + * after faulting in the missing page(s), thus resulting in the read + * succeeding in full. + */ + if ((buf2 = mmap(NULL, size, PROT_READ | PROT_READ, + MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) e(0); + buf2[0] = '\0'; /* fault in the first page */ + + if (read(fd, buf2, size) != size) e(0); + + /* The result should be correct, too. */ + if (memcmp(buf, buf2, size)) e(0); + + /* Clean up. */ + munmap(buf2, size); + munmap(buf, size); + + close(fd); +} + int main(int argc, char *argv[]) { @@ -814,6 +898,8 @@ main(int argc, char *argv[]) test_memory_types_vs_operations(); + softfault_partial(); + makefiles(MAXFILES); cachequiet(!bigflag);