+ * We could optimize this by keeping track of where the last read-ahead
+ * was, but it would involve adding fields to the vnode. For now, let's
+ * just get it working.
+ *
+ * This replaces bread. If this is a bread at the beginning of a file and
+ * v_lastr is 0, we assume this is the first read and we'll read up to two
+ * blocks if they are sequential. After that, we'll do regular read-ahead
+ * in clustered chunks.
+ *
+ * There are 4 or 5 cases depending on how you count:
+ * Desired block is in the cache:
+ * 1 Not sequential access (0 I/Os).
+ * 2 Access is sequential, do read-ahead (1 ASYNC).
+ * Desired block is not in cache:
+ * 3 Not sequential access (1 SYNC).
+ * 4 Sequential access, next block is contiguous (1 SYNC).
+ *	5	Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC).
+ *
+ * There are potentially two buffers that require I/O.
+ * bp is the block requested.
+ * rbp is the read-ahead block.
+ * If either is NULL, then you don't have to do the I/O.
+ */
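+/*
+ * Example walk-through: a process reads block 10 of a cold file.  The
+ * block is not cached and the access is not sequential, so we do one
+ * synchronous read (case 3).  The process then reads block 11; v_lastr
+ * makes the access sequential, so block 11 and any blocks contiguous
+ * with it on disk are read in a single synchronous clustered I/O
+ * (case 4); if the next block is not contiguous, a separate ASYNC
+ * read-ahead is issued for it instead (case 5).
+ */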
+cluster_read(vp, filesize, lblkno, size, cred, bpp)
+ struct vnode *vp;
+ u_quad_t filesize;
+ daddr_t lblkno;
+ long size;
+ struct ucred *cred;
+ struct buf **bpp;
+{
+ struct buf *bp, *rbp;
+ daddr_t blkno, ioblkno;
+ long flags;
+ int error, num_ra, alreadyincore;
+
+#ifdef DIAGNOSTIC
+ if (size == 0)
+ panic("cluster_read: size = 0");
+#endif
+
+ error = 0;
+ flags = B_READ;
+ *bpp = bp = getblk(vp, lblkno, size, 0, 0);
+ if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
+ /*
+ * Desired block is in cache; do any readahead ASYNC.
+ * Case 1, 2.
+ */
+ trace(TR_BREADHIT, pack(vp, size), lblkno);
+ flags |= B_ASYNC;
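+		/*
+		 * Aim the read-ahead half a window past lblkno while we
+		 * are still inside the first window of the file, a full
+		 * window past it once we are beyond that.
+		 */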
+ ioblkno = lblkno +
+ (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
+		alreadyincore = incore(vp, ioblkno) != NULL;
+ bp = NULL;
+ } else {
+ /* Block wasn't in cache, case 3, 4, 5. */
+ trace(TR_BREADMISS, pack(vp, size), lblkno);
+ ioblkno = lblkno;
+ bp->b_flags |= flags;
+ alreadyincore = 0;
+ curproc->p_stats->p_ru.ru_inblock++; /* XXX */
+ }
+ /*
+ * XXX
+ * Replace 1 with a window size based on some permutation of
+ * maxcontig and rot_delay. This will let you figure out how
+ * many blocks you should read-ahead (case 2, 4, 5).
+ *
+ * If the access isn't sequential, cut the window size in half.
+ */
+ rbp = NULL;
+ if (lblkno != vp->v_lastr + 1 && lblkno != 0)
+ vp->v_ralen = max(vp->v_ralen >> 1, 1);
+ else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
+ !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
+ /*
+ * Reading sequentially, and the next block is not in the
+ * cache. We are going to try reading ahead. If this is
+ * the first read of a file, then limit read-ahead to a
+ * single block, else read as much as we're allowed.
+ */
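+		/*
+		 * num_ra is the number of blocks contiguous with ioblkno
+		 * on disk, as reported by VOP_BMAP.  If the run is longer
+		 * than the current window, clamp it and double the window
+		 * (up to MAXPHYS / size blocks); otherwise size the window
+		 * to the run.
+		 */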
+ if (num_ra > vp->v_ralen) {
+ num_ra = vp->v_ralen;
+ vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
+ } else
+ vp->v_ralen = num_ra + 1;
+
+ if (num_ra) /* case 2, 4 */
+ rbp = cluster_rbuild(vp, filesize,
+ bp, ioblkno, blkno, size, num_ra, flags);
+ else if (lblkno != 0 && ioblkno == lblkno) {
+ /* Case 5: check how many blocks to read ahead */
+ ++ioblkno;
+ if ((ioblkno + 1) * size > filesize ||
+ (error = VOP_BMAP(vp,
+ ioblkno, NULL, &blkno, &num_ra)))
+ goto skip_readahead;
+ flags |= B_ASYNC;
+ if (num_ra)
+ rbp = cluster_rbuild(vp, filesize,
+ NULL, ioblkno, blkno, size, num_ra, flags);
+ else {
+ rbp = getblk(vp, ioblkno, size, 0, 0);
+ rbp->b_flags |= flags;
+ rbp->b_blkno = blkno;
+ }
+ } else if (lblkno != 0) {
+ /* case 2; read ahead single block */
+ rbp = getblk(vp, ioblkno, size, 0, 0);
+ rbp->b_flags |= flags;
+ rbp->b_blkno = blkno;
+ } else if (bp) /* case 1, 3, block 0 */
+ bp->b_blkno = blkno;
+ /* Case 1 on block 0; not really doing sequential I/O */
+
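+		/*
+		 * In case 4, cluster_rbuild folded the read-ahead into bp
+		 * itself, so there is no separate read-ahead buffer to
+		 * start.
+		 */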
+ if (rbp == bp) /* case 4 */
+ rbp = NULL;
+ else if (rbp) { /* case 2, 5 */
+ trace(TR_BREADMISSRA,
+ pack(vp, (num_ra + 1) * size), ioblkno);
+ curproc->p_stats->p_ru.ru_inblock++; /* XXX */
+ }
+ }
+
+ /* XXX Kirk, do we need to make sure the bp has creds? */
+skip_readahead:
+	if (bp) {
+		if (bp->b_flags & (B_DONE | B_DELWRI))
+			panic("cluster_read: DONE bp");
+		else
+			error = VOP_STRATEGY(bp);
+	}
+
+	if (rbp) {
+		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
+			rbp->b_flags &= ~(B_ASYNC | B_READ);
+			brelse(rbp);
+		} else
+			(void) VOP_STRATEGY(rbp);
+	}
+
+ if (bp)
+ return(biowait(bp));
+ return(error);
+}
+
+/*
+ * If blocks are contiguous on disk, use this to provide clustered
+ * read ahead. We will read as many blocks as possible sequentially
+ * and then parcel them up into logical blocks in the buffer hash table.
+ */
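+/*
+ * For example, with 8K logical blocks and run == 3, the four blocks are
+ * fetched in a single 32K transfer into one buffer and then parceled
+ * back out into four 8K buffers by cluster_callback() when the I/O
+ * completes.
+ */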
+struct buf *
+cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
+ struct vnode *vp;
+ u_quad_t filesize;
+ struct buf *bp;
+ daddr_t lbn;
+ daddr_t blkno;
+ long size;
+ int run;
+ long flags;
+{
+ struct cluster_save *b_save;
+ struct buf *tbp;
+ daddr_t bn;
+ int i, inc;
+
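+	/* Don't let the cluster run past the end of the file. */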
+ if (size * (lbn + run + 1) > filesize)
+ --run;
+ if (run == 0) {
+ if (!bp) {
+ bp = getblk(vp, lbn, size, 0, 0);
+ bp->b_blkno = blkno;
+ bp->b_flags |= flags;
+ }
+ return(bp);
+ }
+
+ bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
+ if (bp->b_flags & (B_DONE | B_DELWRI))
+ return (bp);
+
+ b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
+ M_SEGMENT, M_WAITOK);
+ b_save->bs_bufsize = b_save->bs_bcount = size;
+ b_save->bs_nchildren = 0;
+ b_save->bs_children = (struct buf **)(b_save + 1);
+ b_save->bs_saveaddr = bp->b_saveaddr;
+ bp->b_saveaddr = (caddr_t) b_save;
+
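+	/* inc is the number of DEV_BSIZE disk blocks per logical file block. */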
+ inc = size / DEV_BSIZE;
+ for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
+ if (incore(vp, lbn + i)) {
+ if (i == 1) {
+ bp->b_saveaddr = b_save->bs_saveaddr;
+ bp->b_flags &= ~B_CALL;
+ bp->b_iodone = NULL;
+ allocbuf(bp, size);
+ free(b_save, M_SEGMENT);
+ } else
+ allocbuf(bp, size * i);
+ break;
+ }
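+		/*
+		 * Get a buffer header with no data area of its own; the
+		 * data is read into the cluster buffer and moved into the
+		 * component buffers by cluster_callback().
+		 */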
+ tbp = getblk(vp, lbn + i, 0, 0, 0);
+ tbp->b_bcount = tbp->b_bufsize = size;
+ tbp->b_blkno = bn;
+ tbp->b_flags |= flags | B_READ | B_ASYNC;
+ ++b_save->bs_nchildren;
+ b_save->bs_children[i - 1] = tbp;
+ }
+ if (!(bp->b_flags & B_ASYNC))
+ vp->v_ralen = max(vp->v_ralen - 1, 1);
+ return(bp);
+}
+
+/*
+ * Either get a new buffer or grow the existing one.
+ */
+struct buf *
+cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
+ struct vnode *vp;
+ struct buf *bp;
+ long flags;
+ daddr_t blkno;
+ daddr_t lblkno;
+ long size;
+ int run;
+{
+ if (!bp) {
+ bp = getblk(vp, lblkno, size, 0, 0);
+ if (bp->b_flags & (B_DONE | B_DELWRI)) {
+ bp->b_blkno = blkno;
+ return(bp);
+ }
+ }
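+	/* Grow the buffer to span the entire run of contiguous blocks. */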
+ allocbuf(bp, run * size);
+ bp->b_blkno = blkno;
+ bp->b_iodone = cluster_callback;
+ bp->b_flags |= flags | B_CALL;
+ return(bp);
+}
+
+/*
+ * Cleanup after a clustered read or write: parcel the data in the
+ * cluster buffer back out to the component buffers and release them.
+ */
+void
+cluster_callback(bp)
+ struct buf *bp;
+{
+ struct cluster_save *b_save;
+ struct buf **tbp;
+ long bsize;
+	caddr_t cp;
+
+	b_save = (struct cluster_save *)(bp->b_saveaddr);
+ bp->b_saveaddr = b_save->bs_saveaddr;
+
+ cp = bp->b_un.b_addr + b_save->bs_bufsize;
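+	/*
+	 * The first block's data stays in bp; move each remaining block's
+	 * data out of the cluster buffer into its own buffer and mark that
+	 * I/O done.
+	 */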
+ for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
+ pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
+ cp += (*tbp)->b_bufsize;
+ bp->b_bufsize -= (*tbp)->b_bufsize;
+ biodone(*tbp);
+ }
+#ifdef DIAGNOSTIC
+ if (bp->b_bufsize != b_save->bs_bufsize)
+		panic("cluster_callback: more space to reclaim");
+#endif
+ bp->b_bcount = bp->b_bufsize;
+ bp->b_iodone = NULL;
+ free(b_save, M_SEGMENT);
+ if (bp->b_flags & B_ASYNC)
+ brelse(bp);
+ else
+ wakeup((caddr_t)bp);
+}
+
+/*
+ * Synchronous write.
+ * Release buffer on completion.