diff --git a/minix/fs/ext2/read.c b/minix/fs/ext2/read.c
index 544db1ce2..782f9c064 100644
--- a/minix/fs/ext2/read.c
+++ b/minix/fs/ext2/read.c
@@ -148,8 +148,12 @@ int *completed;	/* number of bytes copied */
 			printf("ext2fs: fsdriver_zero failed\n");
 		}
 		return r;
+	} else if (call == FSC_PEEK) {
+		/* Peeking a nonexistent block. Report to VM. */
+		lmfs_zero_block_ino(dev, ino, ino_off);
+		return OK;
 	} else {
-		/* Writing to or peeking a nonexistent block.
+		/* Writing to a nonexistent block.
 		 * Create and enter in inode.
 		 */
 		if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
diff --git a/minix/fs/mfs/read.c b/minix/fs/mfs/read.c
index 784fa251e..4f92fd954 100644
--- a/minix/fs/mfs/read.c
+++ b/minix/fs/mfs/read.c
@@ -159,8 +159,12 @@ int *completed;	/* number of bytes copied */
 			printf("MFS: fsdriver_zero failed\n");
 		}
 		return r;
+	} else if (call == FSC_PEEK) {
+		/* Peeking a nonexistent block. Report to VM. */
+		lmfs_zero_block_ino(dev, ino, ino_off);
+		return OK;
 	} else {
-		/* Writing to or peeking a nonexistent block.
+		/* Writing to a nonexistent block.
 		 * Create and enter in inode.
 		 */
 		if ((bp = new_block(rip, (off_t) ex64lo(position))) == NULL)
diff --git a/minix/include/minix/libminixfs.h b/minix/include/minix/libminixfs.h
index 207ca83d3..15fa1630c 100644
--- a/minix/include/minix/libminixfs.h
+++ b/minix/include/minix/libminixfs.h
@@ -47,6 +47,7 @@ struct buf *lmfs_get_block_ino(dev_t dev, block64_t block,int only_search,
 	ino_t ino, u64_t off);
 void lmfs_put_block(struct buf *bp, int block_type);
 void lmfs_free_block(dev_t dev, block64_t block);
+void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t off);
 void lmfs_invalidate(dev_t device);
 void lmfs_rw_scattered(dev_t, struct buf **, int, int);
 void lmfs_setquiet(int q);
diff --git a/minix/lib/libminixfs/cache.c b/minix/lib/libminixfs/cache.c
index 0b2ad2b1b..b6cb4564d 100644
--- a/minix/lib/libminixfs/cache.c
+++ b/minix/lib/libminixfs/cache.c
@@ -445,7 +445,7 @@ void lmfs_put_block(
  */
   dev_t dev;
   uint64_t dev_off;
-  int r;
+  int r, setflags;
 
   if (bp == NULL) return;	/* it is easier to check here than in caller */
 
@@ -487,9 +487,10 @@ void lmfs_put_block(
 
   /* block has sensible content - if necesary, identify it to VM */
   if(vmcache && bp->lmfs_needsetcache && dev != NO_DEV) {
-	if((r=vm_set_cacheblock(bp->data, dev, dev_off,
-		bp->lmfs_inode, bp->lmfs_inode_offset,
-		&bp->lmfs_flags, fs_block_size, 0)) != OK) {
+	setflags = (block_type & ONE_SHOT) ? VMSF_ONCE : 0;
+	if ((r = vm_set_cacheblock(bp->data, dev, dev_off, bp->lmfs_inode,
+	    bp->lmfs_inode_offset, &bp->lmfs_flags, fs_block_size,
+	    setflags)) != OK) {
 		if(r == ENOSYS) {
 			printf("libminixfs: ENOSYS, disabling VM calls\n");
 			vmcache = 0;
@@ -500,6 +501,14 @@ void lmfs_put_block(
 	}
   }
   bp->lmfs_needsetcache = 0;
+
+  /* Now that we (may) have given the block to VM, invalidate the block if it
+   * is a one-shot block. Otherwise, it may still be reobtained immediately
+   * after, which could be a problem if VM already forgot the block and we are
+   * expected to pass it to VM again, which then wouldn't happen.
+   */
+  if (block_type & ONE_SHOT)
+	bp->lmfs_dev = NO_DEV;
 }
 
 /*===========================================================================*
@@ -544,6 +553,62 @@ void lmfs_free_block(dev_t dev, block64_t block)
  */
 }
 
+/*===========================================================================*
+ *			lmfs_zero_block_ino				     *
+ *===========================================================================*/
+void lmfs_zero_block_ino(dev_t dev, ino_t ino, u64_t ino_off)
+{
+/* Files may have holes. From an application perspective, these are just file
+ * regions filled with zeroes. From a file system perspective however, holes
+ * may represent unallocated regions on disk. Thus, these holes do not have
+ * corresponding blocks on the disk, and therefore also no block number.
+ * Therefore, we cannot simply use lmfs_get_block_ino() for them. For reads,
+ * this is not a problem, since the file system can just zero out the target
+ * application buffer instead. For mapped pages however, this *is* a problem,
+ * since the VM cache needs to be told about the corresponding block, and VM
+ * does not accept blocks without a device offset. The role of this function is
+ * therefore to tell VM about the hole using a fake device offset. The device
+ * offsets are picked so that the VM cache will see a block memory-mapped for
+ * the hole in the file, while the same block is not visible when
+ * memory-mapping the block device.
+ */
+  struct buf *bp;
+  static block64_t fake_block = 0;
+
+  if (!vmcache)
+	return;
+
+  assert(fs_block_size > 0);
+
+  /* Pick a block number which is above the threshold of what can possibly be
+   * mapped in by mmap'ing the device, since off_t is signed, and it is safe to
+   * say that it will take a while before we have 8-exabyte devices. Pick a
+   * different block number each time to avoid possible concurrency issues.
+   * FIXME: it does not seem like VM actually verifies mmap offsets though..
+   */
+  if (fake_block == 0 || ++fake_block >= UINT64_MAX / fs_block_size)
+	fake_block = ((uint64_t)INT64_MAX + 1) / fs_block_size;
+
+  /* Obtain a block. */
+  bp = lmfs_get_block_ino(dev, fake_block, NO_READ, ino, ino_off);
+  assert(bp != NULL);
+  assert(bp->lmfs_dev != NO_DEV);
+
+  /* The block is already zeroed, as it has just been allocated with mmap. File
+   * systems do not rely on this assumption yet, so if VM ever gets changed to
+   * not clear the blocks we allocate (e.g., by recycling pages in the VM cache
+   * for the same process, which would be safe), we need to add a memset here.
+   */
+
+  /* Release the block. We don't expect it to be accessed ever again. Moreover,
+   * if we keep the block around in the VM cache, it may erroneously be mapped
+   * in beyond the file end later. Hence, use VMSF_ONCE when passing it to VM.
+   * TODO: tell VM that it is an all-zeroes block, so that VM can deduplicate
+   * all such pages in its cache.
+   */
+  lmfs_put_block(bp, ONE_SHOT);
+}
+
 void lmfs_cache_reevaluate(dev_t dev)
 {
   if(bufs_in_use == 0 && dev != NO_DEV) {
diff --git a/minix/tests/test74.c b/minix/tests/test74.c
index b8cd36283..d29e969b1 100644
--- a/minix/tests/test74.c
+++ b/minix/tests/test74.c
@@ -690,6 +690,107 @@ corruption_regression(void)
 	free(buf);
 }
 
+/*
+ * Test mmap on file holes. Holes are a tricky case with the current VM
+ * implementation. There are two main issues. First, whenever a file data
+ * block is freed, VM has to know about this, or it will later blindly map in
+ * the old data. Thus, file systems explicitly tell VM (through libminixfs)
+ * whenever a block is freed, upon which VM cache forgets the block. Second,
+ * blocks are accessed primarily by a <dev,dev_off> pair and only additionally
+ * by a <ino,ino_off> pair. Holes have no meaningful value for the first pair,
+ * but do need to be registered in VM with the second pair, or accessing them
+ * will generate a segmentation fault. Thus, file systems explicitly tell VM
+ * (through libminixfs) when a hole is being peeked; libminixfs currently fakes
+ * a device offset to make this work.
+ */
+static void
+hole_regression(void)
+{
+	struct statvfs st;
+	size_t block_size;
+	char *buf;
+	int fd;
+
+	if (statvfs(".", &st) < 0) e(1);
+
+	block_size = st.f_bsize;
+
+	if ((buf = malloc(block_size)) == NULL) e(2);
+
+	if ((fd = open("testfile", O_CREAT | O_TRUNC | O_RDWR, 0644)) < 0) e(3);
+
+	if (unlink("testfile") != 0) e(4);
+
+	/*
+	 * We perform the test twice, in a not-so-perfect attempt to test the
+	 * two aspects independently. The first part immediately creates a
+	 * hole, and is supposed to fail only if reporting holes to VM does not
+	 * work. However, it may also fail if a page for a previous file with
+	 * the same inode number as "testfile" is still in the VM cache.
+	 */
+	memset(buf, 12, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(5);
+
+	if (lseek(fd, block_size * 2, SEEK_CUR) != block_size * 3) e(6);
+
+	memset(buf, 78, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(7);
+
+	free(buf);
+
+	if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
+	    fd, 0)) == MAP_FAILED) e(8);
+
+	if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(9);
+	if (buf[1 * block_size] != 0 || buf[2 * block_size - 1] != 0) e(10);
+	if (buf[2 * block_size] != 0 || buf[3 * block_size - 1] != 0) e(11);
+	if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(12);
+
+	if (munmap(buf, 4 * block_size) != 0) e(13);
+
+	/*
+	 * The second part first creates file content and only turns part of it
+	 * into a file hole, thus ensuring that VM has previously cached pages
+	 * for the blocks that are freed. The test will fail if VM keeps the
+	 * pages around in its cache.
+	 */
+	if ((buf = malloc(block_size)) == NULL) e(14);
+
+	if (lseek(fd, block_size, SEEK_SET) != block_size) e(15);
+
+	memset(buf, 34, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(16);
+
+	memset(buf, 56, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(17);
+
+	if (ftruncate(fd, block_size) != 0) e(18);
+
+	if (lseek(fd, block_size * 3, SEEK_SET) != block_size * 3) e(19);
+
+	memset(buf, 78, block_size);
+
+	if (write(fd, buf, block_size) != block_size) e(20);
+
+	free(buf);
+
+	if ((buf = mmap(NULL, 4 * block_size, PROT_READ, MAP_SHARED | MAP_FILE,
+	    fd, 0)) == MAP_FAILED) e(21);
+
+	if (buf[0 * block_size] != 12 || buf[1 * block_size - 1] != 12) e(22);
+	if (buf[1 * block_size] != 0 || buf[2 * block_size - 1] != 0) e(23);
+	if (buf[2 * block_size] != 0 || buf[3 * block_size - 1] != 0) e(24);
+	if (buf[3 * block_size] != 78 || buf[4 * block_size - 1] != 78) e(25);
+
+	if (munmap(buf, 4 * block_size) != 0) e(26);
+
+	close(fd);
+}
+
 int
 main(int argc, char *argv[])
 {
@@ -709,6 +810,8 @@ main(int argc, char *argv[])
 	for (i = 0; i < 10; i++)
 		corruption_regression();
 
+	hole_regression();
+
 	test_memory_types_vs_operations();
 
 	makefiles(MAXFILES);
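
Note on the fake device offsets used by lmfs_zero_block_ino() above: the patch starts the fake block counter at ((uint64_t)INT64_MAX + 1) / fs_block_size and resets it before it reaches UINT64_MAX / fs_block_size, so every fake block corresponds to a byte offset larger than INT64_MAX, i.e. beyond anything a signed off_t can address when memory-mapping the block device itself. The standalone sketch below is not part of the patch; the 4096-byte block size and the first/last variable names are illustrative assumptions, used only to check that invariant at both ends of the range.

/*
 * Standalone sketch (not from the patch) of the fake device offset scheme
 * used by lmfs_zero_block_ino(). The 4096-byte block size is an assumed
 * example value; MINIX file systems may use other block sizes.
 */
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t block_size = 4096;	/* assumed example block size */
	/* First fake block number handed out, per the patch's reset value. */
	uint64_t first = ((uint64_t)INT64_MAX + 1) / block_size;
	/* Last fake block number before the counter is reset to 'first'. */
	uint64_t last = UINT64_MAX / block_size - 1;

	/* Every fake block starts past INT64_MAX bytes into the "device", so
	 * a signed off_t used to mmap the block device can never reach it.
	 */
	assert(first * block_size > (uint64_t)INT64_MAX);
	assert(last * block_size > (uint64_t)INT64_MAX);

	printf("fake block numbers: [%" PRIu64 ", %" PRIu64 ")\n",
	    first, UINT64_MAX / block_size);
	return 0;
}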