 4472b590c7
			
		
	
	
		4472b590c7
		
	
	
	
	
		
			
			This patch changes the prefetch API so that file systems must now provide a set of block numbers, rather than a set of buffers. The result is a leaner and more well-defined API; linear computation of the range of blocks to prefetch; duplicates no longer interfering with the prefetch process; guaranteed inclusion of the block needed next into the prefetch range; and, limits and policy decisions better established by libminixfs now actually being moved into libminixfs. Change-Id: I7e44daf2d2d164bc5e2f1473ad717f3ff0f0a77f
		
			
				
	
	
		
			264 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			264 lines
		
	
	
		
			7.9 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * This file provides an implementation for block I/O functions as expected by
 | |
|  * libfsdriver for root file systems.  In particular, the lmfs_driver function
 | |
|  * can be used to implement fdr_driver, the lmfs_bio function can be used to
 | |
|  * implement the fdr_bread, fdr_bwrite, and fdr_bpeek hooks, and the
 | |
|  * lmfs_bflush function can be used to implement the fdr_bflush hook.  At the
 | |
|  * very least, a file system that makes use of the provided functionality
 | |
|  * must adhere to the following rules:
 | |
|  *
 | |
|  *   o  it must initialize this library in order to set up a buffer pool for
 | |
|  *      use by these functions, using the lmfs_buf_pool function; the
 | |
|  *      recommended number of blocks for *non*-disk-backed file systems is
 | |
|  *      LMFS_MAX_PREFETCH buffers (disk-backed file systems typically use many
 | |
|  *      more);
 | |
|  *   o  it must enable VM caching in order to support memory mapping of block
 | |
|  *      devices, using the lmfs_may_use_vmcache function;
 | |
|  *   o  it must either use lmfs_flushall as implementation for the fdr_sync
 | |
|  *      hook, or call lmfs_flushall as part of its own fdr_sync implementation.
 | |
|  *
 | |
|  * In addition, a disk-backed file system (as opposed to e.g. a networked file
 | |
|  * system that intends to be able to serve as a root file system) should
 | |
|  * consider the following points:
 | |
|  *
 | |
|  *   o  it may restrict calls to fdr_bwrite on the mounted partition, for
 | |
|  *      example to the partition's first 1024 bytes; it should generally not
 | |
|  *      prevent that area from being written even if the file system is mounted
 | |
|  *      read-only;
 | |
|  *   o  it is free to set its own block size, although the default block size
 | |
|  *      works fine for raw block I/O as well.
 | |
|  */
 | |
| 
 | |
| #include <minix/drivers.h>
 | |
| #include <minix/libminixfs.h>
 | |
| #include <minix/fsdriver.h>
 | |
| #include <minix/bdev.h>
 | |
| #include <minix/partition.h>
 | |
| #include <sys/ioctl.h>
 | |
| #include <assert.h>
 | |
| 
 | |
| #include "inc.h"
 | |
| 
 | |
| /*
 | |
|  * Set the driver label of the device identified by 'dev' to 'label'.  While
 | |
|  * 'dev' is a full device number, only its major device number is to be used.
 | |
|  * This is a very thin wrapper right now, but eventually we will want to hide
 | |
|  * all of libbdev from file systems that use this library, so it is a start.
 | |
|  */
 | |
| void
 | |
| lmfs_driver(dev_t dev, char *label)
 | |
| {
 | |
| 
 | |
| 	bdev_driver(dev, label);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Prefetch up to "nblocks" blocks on "dev" starting from block number "block".
 | |
|  * The size to be used for the last block in the range is given as "last_size".
 | |
|  * Stop early when either the I/O request fills up or when a block is already
 | |
|  * found to be in the cache.  The latter is likely to happen often, since this
 | |
|  * function is called before getting each block for reading.  Prefetching is a
 | |
|  * strictly best-effort operation, and may fail silently.
 | |
|  * TODO: limit according to the number of available buffers.
 | |
|  */
 | |
| static void
 | |
| block_prefetch(dev_t dev, block64_t block, unsigned int nblocks,
 | |
| 	size_t block_size, size_t last_size)
 | |
| {
 | |
| 	struct buf *bp;
 | |
| 	unsigned int count, limit;
 | |
| 	int r;
 | |
| 
 | |
| 	limit = lmfs_readahead_limit();
 | |
| 	assert(limit >= 1 && limit <= LMFS_MAX_PREFETCH);
 | |
| 
 | |
| 	if (nblocks > limit) {
 | |
| 		nblocks = limit;
 | |
| 
 | |
| 		last_size = block_size;
 | |
| 	}
 | |
| 
 | |
| 	for (count = 0; count < nblocks; count++) {
 | |
| 		if (count == nblocks - 1 && last_size < block_size)
 | |
| 			r = lmfs_get_partial_block(&bp, dev, block + count,
 | |
| 			    PEEK, last_size);
 | |
| 		else
 | |
| 			r = lmfs_get_block(&bp, dev, block + count, PEEK);
 | |
| 
 | |
| 		if (r == OK) {
 | |
| 			lmfs_put_block(bp);
 | |
| 
 | |
| 			last_size = block_size;
 | |
| 
 | |
| 			break;
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if (count > 0)
 | |
| 		lmfs_readahead(dev, block, count, last_size);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Perform block I/O, on "dev", starting from offset "pos", for a total of
 | |
|  * "bytes" bytes.  Reading, writing, and peeking are highly similar, and thus,
 | |
|  * this function implements all of them.  The "call" parameter indicates the
 | |
|  * call type (one of FSC_READ, FSC_WRITE, FSC_PEEK).  For read and write calls,
 | |
|  * "data" will identify the user buffer to use; for peek calls, "data" is set
 | |
|  * to NULL.  In all cases, this function returns the number of bytes
 | |
|  * successfully transferred, 0 on end-of-file conditions, and a negative error
 | |
|  * code if no bytes could be transferred due to an error.  Dirty data is not
 | |
|  * flushed immediately, and thus, a successful write only indicates that the
 | |
|  * data have been taken in by the cache (for immediate I/O, a character device
 | |
|  * would have to be used, but MINIX3 no longer supports this), which may be
 | |
|  * follwed later by silent failures.  End-of-file conditions are always
 | |
|  * reported immediately, though.
 | |
|  */
 | |
| ssize_t
 | |
| lmfs_bio(dev_t dev, struct fsdriver_data * data, size_t bytes, off_t pos,
 | |
| 	int call)
 | |
| {
 | |
| 	block64_t block;
 | |
| 	struct part_geom part;
 | |
| 	size_t block_size, off, block_off, last_size, size, chunk;
 | |
| 	unsigned int blocks_left;
 | |
| 	struct buf *bp;
 | |
| 	int r, write, how;
 | |
| 
 | |
| 	if (dev == NO_DEV)
 | |
| 		return EINVAL;
 | |
| 
 | |
| 	block_size = lmfs_fs_block_size();
 | |
| 	write = (call == FSC_WRITE);
 | |
| 
 | |
| 	assert(block_size > 0);
 | |
| 
 | |
| 	if (bytes == 0)
 | |
| 		return 0; /* just in case */
 | |
| 
 | |
| 	if (pos < 0 || bytes > SSIZE_MAX || pos > INT64_MAX - bytes + 1)
 | |
| 		return EINVAL;
 | |
| 
 | |
| 	/*
 | |
| 	 * Get the partition size, so that we can handle EOF ourselves.
 | |
| 	 * Unfortunately, we cannot cache the results between calls, since we
 | |
| 	 * do not get to see DIOCSETP ioctls--see also repartition(8).
 | |
| 	 */
 | |
| 	if ((r = bdev_ioctl(dev, DIOCGETP, &part, NONE /*user_endpt*/)) != OK)
 | |
| 		return r;
 | |
| 
 | |
| 	if ((uint64_t)pos >= part.size)
 | |
| 		return 0; /* EOF */
 | |
| 
 | |
| 	if ((uint64_t)pos > part.size - bytes)
 | |
| 		bytes = part.size - pos;
 | |
| 
 | |
| 	off = 0;
 | |
| 	block = pos / block_size;
 | |
| 	block_off = (size_t)(pos % block_size);
 | |
| 	blocks_left = howmany(block_off + bytes, block_size);
 | |
| 
 | |
| 	assert(blocks_left > 0);
 | |
| 
 | |
| 	/*
 | |
| 	 * If the last block we need is also the last block of the device,
 | |
| 	 * see how many bytes we should actually transfer for that block.
 | |
| 	 */
 | |
| 	if (block + blocks_left - 1 == part.size / block_size)
 | |
| 		last_size = part.size % block_size;
 | |
| 	else
 | |
| 		last_size = block_size;
 | |
| 
 | |
| 	r = OK;
 | |
| 
 | |
| 	for (off = 0; off < bytes && blocks_left > 0; off += chunk) {
 | |
| 		size = (blocks_left == 1) ? last_size : block_size;
 | |
| 
 | |
| 		chunk = size - block_off;
 | |
| 		if (chunk > bytes - off)
 | |
| 			chunk = bytes - off;
 | |
| 
 | |
| 		assert(chunk > 0 && chunk <= size);
 | |
| 
 | |
| 		/*
 | |
| 		 * For read requests, help the block driver form larger I/O
 | |
| 		 * requests.
 | |
| 		 */
 | |
| 		if (!write)
 | |
| 			block_prefetch(dev, block, blocks_left, block_size,
 | |
| 			    last_size);
 | |
| 
 | |
| 		/*
 | |
| 		 * Do not read the block from disk if we will end up
 | |
| 		 * overwriting all of its contents.
 | |
| 		 */
 | |
| 		how = (write && chunk == size) ? NO_READ : NORMAL;
 | |
| 
 | |
| 		if (size < block_size)
 | |
| 			r = lmfs_get_partial_block(&bp, dev, block, how, size);
 | |
| 		else
 | |
| 			r = lmfs_get_block(&bp, dev, block, how);
 | |
| 
 | |
| 		if (r != OK) {
 | |
| 			printf("libminixfs: error getting block <%"PRIx64","
 | |
| 			    "%"PRIu64"> for device I/O (%d)\n", dev, block, r);
 | |
| 
 | |
| 			break;
 | |
| 		}
 | |
| 
 | |
| 		/* Perform the actual copy. */
 | |
| 		if (r == OK && data != NULL) {
 | |
| 			if (write) {
 | |
| 				r = fsdriver_copyin(data, off,
 | |
| 				    (char *)bp->data + block_off, chunk);
 | |
| 
 | |
| 				/*
 | |
| 				 * Mark the block as dirty even if the copy
 | |
| 				 * failed, since the copy may in fact have
 | |
| 				 * succeeded partially.  This is an interface
 | |
| 				 * issue that should be resolved at some point,
 | |
| 				 * but for now we do not want the cache to be
 | |
| 				 * desynchronized from the disk contents.
 | |
| 				 */
 | |
| 				lmfs_markdirty(bp);
 | |
| 			} else
 | |
| 				r = fsdriver_copyout(data, off,
 | |
| 				    (char *)bp->data + block_off, chunk);
 | |
| 		}
 | |
| 
 | |
| 		lmfs_put_block(bp);
 | |
| 
 | |
| 		if (r != OK)
 | |
| 			break;
 | |
| 
 | |
| 		block++;
 | |
| 		block_off = 0;
 | |
| 		blocks_left--;
 | |
| 	}
 | |
| 
 | |
| 	/*
 | |
| 	 * If we were not able to do any I/O, return the error.  Otherwise,
 | |
| 	 * return how many bytes we did manage to transfer.
 | |
| 	 */
 | |
| 	if (r != OK && off == 0)
 | |
| 		return r;
 | |
| 
 | |
| 	return off;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Perform a flush request on a block device, flushing and invalidating all
 | |
|  * blocks associated with this device, both in the local cache and in VM.
 | |
|  * This operation is called after a block device is closed and must prevent
 | |
|  * that stale copies of blocks remain in any cache.
 | |
|  */
 | |
| void
 | |
| lmfs_bflush(dev_t dev)
 | |
| {
 | |
| 
 | |
| 	/* First flush any dirty blocks on this device to disk. */
 | |
| 	lmfs_flushdev(dev);
 | |
| 
 | |
| 	/* Then purge any blocks associated with the device. */
 | |
| 	lmfs_invalidate(dev);
 | |
| }
 |