fs/xfs/xfs_buf.c

   1 /*
   2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include <linux/stddef.h>
  20 #include <linux/errno.h>
  21 #include <linux/gfp.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/init.h>
  24 #include <linux/vmalloc.h>
  25 #include <linux/bio.h>
  26 #include <linux/sysctl.h>
  27 #include <linux/proc_fs.h>
  28 #include <linux/workqueue.h>
  29 #include <linux/percpu.h>
  30 #include <linux/blkdev.h>
  31 #include <linux/hash.h>
  32 #include <linux/kthread.h>
  33 #include <linux/migrate.h>
  34 #include <linux/backing-dev.h>
  35 #include <linux/freezer.h>
  36
  37 #include "xfs_sb.h"
  38 #include "xfs_inum.h"
  39 #include "xfs_log.h"
  40 #include "xfs_ag.h"
  41 #include "xfs_mount.h"
  42 #include "xfs_trace.h"
  43
  44 static kmem_zone_t *xfs_buf_zone;
  45
  46 static struct workqueue_struct *xfslogd_workqueue;
  47
  48 #ifdef XFS_BUF_LOCK_TRACKING
  49 # define XB_SET_OWNER(bp)       ((bp)->b_last_holder = current->pid)
  50 # define XB_CLEAR_OWNER(bp)     ((bp)->b_last_holder = -1)
  51 # define XB_GET_OWNER(bp)       ((bp)->b_last_holder)
  52 #else
  53 # define XB_SET_OWNER(bp)       do { } while (0)
  54 # define XB_CLEAR_OWNER(bp)     do { } while (0)
  55 # define XB_GET_OWNER(bp)       do { } while (0)
  56 #endif
  57
  58 #define xb_to_gfp(flags) \
  59         ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
  60           ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
  61
  62 #define xb_to_km(flags) \
  63          (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
  64
  65
  66 static inline int
  67 xfs_buf_is_vmapped(
  68         struct xfs_buf  *bp)
  69 {
  70         /*
  71          * Return true if the buffer is vmapped.
  72          *
  73          * The XBF_MAPPED flag is set if the buffer should be mapped, but the
  74          * code is clever enough to know it doesn't have to map a single page,
  75          * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
  76          */
  77         return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
  78 }
  79
  80 static inline int
  81 xfs_buf_vmap_len(
  82         struct xfs_buf  *bp)
  83 {
  84         return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
  85 }
  86
  87 /*
  88  * xfs_buf_lru_add - add a buffer to the LRU.
  89  *
  90  * The LRU takes a new reference to the buffer so that it will only be freed
  91  * once the shrinker takes the buffer off the LRU.
  92  */
  93 STATIC void
  94 xfs_buf_lru_add(
  95         struct xfs_buf  *bp)
  96 {
  97         struct xfs_buftarg *btp = bp->b_target;
  98
  99         spin_lock(&btp->bt_lru_lock);
 100         if (list_empty(&bp->b_lru)) {
 101                 atomic_inc(&bp->b_hold);
 102                 list_add_tail(&bp->b_lru, &btp->bt_lru);
 103                 btp->bt_lru_nr++;
 104         }
 105         spin_unlock(&btp->bt_lru_lock);
 106 }
 107
 108 /*
 109  * xfs_buf_lru_del - remove a buffer from the LRU
 110  *
 111  * The unlocked check is safe here because it only occurs when there are not
 112  * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
 113  * to optimise the shrinker removing the buffer from the LRU and calling
 114  * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
 115  * bt_lru_lock.
 116  */
 117 STATIC void
 118 xfs_buf_lru_del(
 119         struct xfs_buf  *bp)
 120 {
 121         struct xfs_buftarg *btp = bp->b_target;
 122
 123         if (list_empty(&bp->b_lru))
 124                 return;
 125
 126         spin_lock(&btp->bt_lru_lock);
 127         if (!list_empty(&bp->b_lru)) {
 128                 list_del_init(&bp->b_lru);
 129                 btp->bt_lru_nr--;
 130         }
 131         spin_unlock(&btp->bt_lru_lock);
 132 }
 133
 134 /*
 135  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 136  * b_lru_ref count so that the buffer is freed immediately when the buffer
 137  * reference count falls to zero. If the buffer is already on the LRU, we need
 138  * to remove the reference that LRU holds on the buffer.
 139  *
 140  * This prevents build-up of stale buffers on the LRU.
 141  */
 142 void
 143 xfs_buf_stale(
 144         struct xfs_buf  *bp)
 145 {
 146         ASSERT(xfs_buf_islocked(bp));
 147
 148         bp->b_flags |= XBF_STALE;
 149
 150         /*
 151          * Clear the delwri status so that a delwri queue walker will not
 152          * flush this buffer to disk now that it is stale. The delwri queue has
 153          * a reference to the buffer, so this is safe to do.
 154          */
 155         bp->b_flags &= ~_XBF_DELWRI_Q;
 156
 157         atomic_set(&(bp)->b_lru_ref, 0);
 158         if (!list_empty(&bp->b_lru)) {
 159                 struct xfs_buftarg *btp = bp->b_target;
 160
 161                 spin_lock(&btp->bt_lru_lock);
 162                 if (!list_empty(&bp->b_lru)) {
 163                         list_del_init(&bp->b_lru);
 164                         btp->bt_lru_nr--;
 165                         atomic_dec(&bp->b_hold);
 166                 }
 167                 spin_unlock(&btp->bt_lru_lock);
 168         }
 169         ASSERT(atomic_read(&bp->b_hold) >= 1);
 170 }
 171
 172 struct xfs_buf *
 173 xfs_buf_alloc(
 174         struct xfs_buftarg      *target,
 175         xfs_off_t               range_base,
 176         size_t                  range_length,
 177         xfs_buf_flags_t         flags)
 178 {
 179         struct xfs_buf          *bp;
 180
 181         bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
 182         if (unlikely(!bp))
 183                 return NULL;
 184
 185         /*
 186          * We don't want certain flags to appear in b_flags.
 187          */
 188         flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
 189
 190         memset(bp, 0, sizeof(xfs_buf_t));
 191         atomic_set(&bp->b_hold, 1);
 192         atomic_set(&bp->b_lru_ref, 1);
 193         init_completion(&bp->b_iowait);
 194         INIT_LIST_HEAD(&bp->b_lru);
 195         INIT_LIST_HEAD(&bp->b_list);
 196         RB_CLEAR_NODE(&bp->b_rbnode);
 197         sema_init(&bp->b_sema, 0); /* held, no waiters */
 198         XB_SET_OWNER(bp);
 199         bp->b_target = target;
 200         bp->b_file_offset = range_base;
 201         /*
 202          * Set buffer_length and count_desired to the same value initially.
 203          * I/O routines should use count_desired, which will be the same in
 204          * most cases but may be reset (e.g. XFS recovery).
 205          */
 206         bp->b_buffer_length = bp->b_count_desired = range_length;
 207         bp->b_flags = flags;
 208         bp->b_bn = XFS_BUF_DADDR_NULL;
 209         atomic_set(&bp->b_pin_count, 0);
 210         init_waitqueue_head(&bp->b_waiters);
 211
 212         XFS_STATS_INC(xb_create);
 213         trace_xfs_buf_init(bp, _RET_IP_);
 214
 215         return bp;
 216 }
 217
 218 /*
 219  *      Allocate a page array capable of holding a specified number
 220  *      of pages, and point the page buf at it.
 221  */
 222 STATIC int
 223 _xfs_buf_get_pages(
 224         xfs_buf_t               *bp,
 225         int                     page_count,
 226         xfs_buf_flags_t         flags)
 227 {
 228         /* Make sure that we have a page list */
 229         if (bp->b_pages == NULL) {
 230                 bp->b_offset = xfs_buf_poff(bp->b_file_offset);
 231                 bp->b_page_count = page_count;
 232                 if (page_count <= XB_PAGES) {
 233                         bp->b_pages = bp->b_page_array;
 234                 } else {
 235                         bp->b_pages = kmem_alloc(sizeof(struct page *) *
 236                                         page_count, xb_to_km(flags));
 237                         if (bp->b_pages == NULL)
 238                                 return -ENOMEM;
 239                 }
 240                 memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
 241         }
 242         return 0;
 243 }
 244
 245 /*
 246  *      Frees b_pages if it was allocated.
 247  */
 248 STATIC void
 249 _xfs_buf_free_pages(
 250         xfs_buf_t       *bp)
 251 {
 252         if (bp->b_pages != bp->b_page_array) {
 253                 kmem_free(bp->b_pages);
 254                 bp->b_pages = NULL;
 255         }
 256 }
 257
 258 /*
 259  *      Releases the specified buffer.
 260  *
 261  *      The modification state of any associated pages is left unchanged.
 262  *      The buffer most not be on any hash - use xfs_buf_rele instead for
 263  *      hashed and refcounted buffers
 264  */
 265 void
 266 xfs_buf_free(
 267         xfs_buf_t               *bp)
 268 {
 269         trace_xfs_buf_free(bp, _RET_IP_);
 270
 271         ASSERT(list_empty(&bp->b_lru));
 272
 273         if (bp->b_flags & _XBF_PAGES) {
 274                 uint            i;
 275
 276                 if (xfs_buf_is_vmapped(bp))
 277                         vm_unmap_ram(bp->b_addr - bp->b_offset,
 278                                         bp->b_page_count);
 279
 280                 for (i = 0; i < bp->b_page_count; i++) {
 281                         struct page     *page = bp->b_pages[i];
 282
 283                         __free_page(page);
 284                 }
 285         } else if (bp->b_flags & _XBF_KMEM)
 286                 kmem_free(bp->b_addr);
 287         _xfs_buf_free_pages(bp);
 288         kmem_zone_free(xfs_buf_zone, bp);
 289 }
 290
 291 /*
 292  * Allocates all the pages for buffer in question and builds it's page list.
 293  */
 294 STATIC int
 295 xfs_buf_allocate_memory(
 296         xfs_buf_t               *bp,
 297         uint                    flags)
 298 {
 299         size_t                  size = bp->b_count_desired;
 300         size_t                  nbytes, offset;
 301         gfp_t                   gfp_mask = xb_to_gfp(flags);
 302         unsigned short          page_count, i;
 303         xfs_off_t               end;
 304         int                     error;
 305
 306         /*
 307          * for buffers that are contained within a single page, just allocate
 308          * the memory from the heap - there's no need for the complexity of
 309          * page arrays to keep allocation down to order 0.
 310          */
 311         if (bp->b_buffer_length < PAGE_SIZE) {
 312                 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
 313                 if (!bp->b_addr) {
 314                         /* low memory - use alloc_page loop instead */
 315                         goto use_alloc_page;
 316                 }
 317
 318                 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
 319                                                                 PAGE_MASK) !=
 320                     ((unsigned long)bp->b_addr & PAGE_MASK)) {
 321                         /* b_addr spans two pages - use alloc_page instead */
 322                         kmem_free(bp->b_addr);
 323                         bp->b_addr = NULL;
 324                         goto use_alloc_page;
 325                 }
 326                 bp->b_offset = offset_in_page(bp->b_addr);
 327                 bp->b_pages = bp->b_page_array;
 328                 bp->b_pages[0] = virt_to_page(bp->b_addr);
 329                 bp->b_page_count = 1;
 330                 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
 331                 return 0;
 332         }
 333
 334 use_alloc_page:
 335         end = bp->b_file_offset + bp->b_buffer_length;
 336         page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
 337         error = _xfs_buf_get_pages(bp, page_count, flags);
 338         if (unlikely(error))
 339                 return error;
 340
 341         offset = bp->b_offset;
 342         bp->b_flags |= _XBF_PAGES;
 343
 344         for (i = 0; i < bp->b_page_count; i++) {
 345                 struct page     *page;
 346                 uint            retries = 0;
 347 retry:
 348                 page = alloc_page(gfp_mask);
 349                 if (unlikely(page == NULL)) {
 350                         if (flags & XBF_READ_AHEAD) {
 351                                 bp->b_page_count = i;
 352                                 error = ENOMEM;
 353                                 goto out_free_pages;
 354                         }
 355
 356                         /*
 357                          * This could deadlock.
 358                          *
 359                          * But until all the XFS lowlevel code is revamped to
 360                          * handle buffer allocation failures we can't do much.
 361                          */
 362                         if (!(++retries % 100))
 363                                 xfs_err(NULL,
 364                 "possible memory allocation deadlock in %s (mode:0x%x)",
 365                                         __func__, gfp_mask);
 366
 367                         XFS_STATS_INC(xb_page_retries);
 368                         congestion_wait(BLK_RW_ASYNC, HZ/50);
 369                         goto retry;
 370                 }
 371
 372                 XFS_STATS_INC(xb_page_found);
 373
 374                 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
 375                 size -= nbytes;
 376                 bp->b_pages[i] = page;
 377                 offset = 0;
 378         }
 379         return 0;
 380
 381 out_free_pages:
 382         for (i = 0; i < bp->b_page_count; i++)
 383                 __free_page(bp->b_pages[i]);
 384         return error;
 385 }
 386
 387 /*
 388  *      Map buffer into kernel address-space if necessary.
 389  */
 390 STATIC int
 391 _xfs_buf_map_pages(
 392         xfs_buf_t               *bp,
 393         uint                    flags)
 394 {
 395         ASSERT(bp->b_flags & _XBF_PAGES);
 396         if (bp->b_page_count == 1) {
 397                 /* A single page buffer is always mappable */
 398                 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 399                 bp->b_flags |= XBF_MAPPED;
 400         } else if (flags & XBF_MAPPED) {
 401                 int retried = 0;
 402
 403                 do {
 404                         bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
 405                                                 -1, PAGE_KERNEL);
 406                         if (bp->b_addr)
 407                                 break;
 408                         vm_unmap_aliases();
 409                 } while (retried++ <= 1);
 410
 411                 if (!bp->b_addr)
 412                         return -ENOMEM;
 413                 bp->b_addr += bp->b_offset;
 414                 bp->b_flags |= XBF_MAPPED;
 415         }
 416
 417         return 0;
 418 }
 419
 420 /*
 421  *      Finding and Reading Buffers
 422  */
 423
 424 /*
 425  *      Look up, and creates if absent, a lockable buffer for
 426  *      a given range of an inode.  The buffer is returned
 427  *      locked. No I/O is implied by this call.
 428  */
 429 xfs_buf_t *
 430 _xfs_buf_find(
 431         xfs_buftarg_t           *btp,   /* block device target          */
 432         xfs_off_t               ioff,   /* starting offset of range     */
 433         size_t                  isize,  /* length of range              */
 434         xfs_buf_flags_t         flags,
 435         xfs_buf_t               *new_bp)
 436 {
 437         xfs_off_t               range_base;
 438         size_t                  range_length;
 439         struct xfs_perag        *pag;
 440         struct rb_node          **rbp;
 441         struct rb_node          *parent;
 442         xfs_buf_t               *bp;
 443
 444         range_base = (ioff << BBSHIFT);
 445         range_length = (isize << BBSHIFT);
 446
 447         /* Check for IOs smaller than the sector size / not sector aligned */
 448         ASSERT(!(range_length < (1 << btp->bt_sshift)));
 449         ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 450
 451         /* get tree root */
 452         pag = xfs_perag_get(btp->bt_mount,
 453                                 xfs_daddr_to_agno(btp->bt_mount, ioff));
 454
 455         /* walk tree */
 456         spin_lock(&pag->pag_buf_lock);
 457         rbp = &pag->pag_buf_tree.rb_node;
 458         parent = NULL;
 459         bp = NULL;
 460         while (*rbp) {
 461                 parent = *rbp;
 462                 bp = rb_entry(parent, struct xfs_buf, b_rbnode);
 463
 464                 if (range_base < bp->b_file_offset)
 465                         rbp = &(*rbp)->rb_left;
 466                 else if (range_base > bp->b_file_offset)
 467                         rbp = &(*rbp)->rb_right;
 468                 else {
 469                         /*
 470                          * found a block offset match. If the range doesn't
 471                          * match, the only way this is allowed is if the buffer
 472                          * in the cache is stale and the transaction that made
 473                          * it stale has not yet committed. i.e. we are
 474                          * reallocating a busy extent. Skip this buffer and
 475                          * continue searching to the right for an exact match.
 476                          */
 477                         if (bp->b_buffer_length != range_length) {
 478                                 ASSERT(bp->b_flags & XBF_STALE);
 479                                 rbp = &(*rbp)->rb_right;
 480                                 continue;
 481                         }
 482                         atomic_inc(&bp->b_hold);
 483                         goto found;
 484                 }
 485         }
 486
 487         /* No match found */
 488         if (new_bp) {
 489                 rb_link_node(&new_bp->b_rbnode, parent, rbp);
 490                 rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
 491                 /* the buffer keeps the perag reference until it is freed */
 492                 new_bp->b_pag = pag;
 493                 spin_unlock(&pag->pag_buf_lock);
 494         } else {
 495                 XFS_STATS_INC(xb_miss_locked);
 496                 spin_unlock(&pag->pag_buf_lock);
 497                 xfs_perag_put(pag);
 498         }
 499         return new_bp;
 500
 501 found:
 502         spin_unlock(&pag->pag_buf_lock);
 503         xfs_perag_put(pag);
 504
 505         if (!xfs_buf_trylock(bp)) {
 506                 if (flags & XBF_TRYLOCK) {
 507                         xfs_buf_rele(bp);
 508                         XFS_STATS_INC(xb_busy_locked);
 509                         return NULL;
 510                 }
 511                 xfs_buf_lock(bp);
 512                 XFS_STATS_INC(xb_get_locked_waited);
 513         }
 514
 515         /*
 516          * if the buffer is stale, clear all the external state associated with
 517          * it. We need to keep flags such as how we allocated the buffer memory
 518          * intact here.
 519          */
 520         if (bp->b_flags & XBF_STALE) {
 521                 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
 522                 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
 523         }
 524
 525         trace_xfs_buf_find(bp, flags, _RET_IP_);
 526         XFS_STATS_INC(xb_get_locked);
 527         return bp;
 528 }
 529
 530 /*
 531  * Assembles a buffer covering the specified range. The code is optimised for
 532  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 533  * more hits than misses.
 534  */
 535 struct xfs_buf *
 536 xfs_buf_get(
 537         xfs_buftarg_t           *target,/* target for buffer            */
 538         xfs_off_t               ioff,   /* starting offset of range     */
 539         size_t                  isize,  /* length of range              */
 540         xfs_buf_flags_t         flags)
 541 {
 542         struct xfs_buf          *bp;
 543         struct xfs_buf          *new_bp;
 544         int                     error = 0;
 545
 546         bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
 547         if (likely(bp))
 548                 goto found;
 549
 550         new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
 551                                flags);
 552         if (unlikely(!new_bp))
 553                 return NULL;
 554
 555         bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
 556         if (!bp) {
 557                 kmem_zone_free(xfs_buf_zone, new_bp);
 558                 return NULL;
 559         }
 560
 561         if (bp == new_bp) {
 562                 error = xfs_buf_allocate_memory(bp, flags);
 563                 if (error)
 564                         goto no_buffer;
 565         } else
 566                 kmem_zone_free(xfs_buf_zone, new_bp);
 567
 568         /*
 569          * Now we have a workable buffer, fill in the block number so
 570          * that we can do IO on it.
 571          */
 572         bp->b_bn = ioff;
 573         bp->b_count_desired = bp->b_buffer_length;
 574
 575 found:
 576         if (!(bp->b_flags & XBF_MAPPED)) {
 577                 error = _xfs_buf_map_pages(bp, flags);
 578                 if (unlikely(error)) {
 579                         xfs_warn(target->bt_mount,
 580                                 "%s: failed to map pages\n", __func__);
 581                         goto no_buffer;
 582                 }
 583         }
 584
 585         XFS_STATS_INC(xb_get);
 586         trace_xfs_buf_get(bp, flags, _RET_IP_);
 587         return bp;
 588
 589 no_buffer:
 590         if (flags & (XBF_LOCK | XBF_TRYLOCK))
 591                 xfs_buf_unlock(bp);
 592         xfs_buf_rele(bp);
 593         return NULL;
 594 }
 595
 596 STATIC int
 597 _xfs_buf_read(
 598         xfs_buf_t               *bp,
 599         xfs_buf_flags_t         flags)
 600 {
 601         int                     status;
 602
 603         ASSERT(!(flags & XBF_WRITE));
 604         ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 605
 606         bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
 607         bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 608
 609         status = xfs_buf_iorequest(bp);
 610         if (status || bp->b_error || (flags & XBF_ASYNC))
 611                 return status;
 612         return xfs_buf_iowait(bp);
 613 }
 614
 615 xfs_buf_t *
 616 xfs_buf_read(
 617         xfs_buftarg_t           *target,
 618         xfs_off_t               ioff,
 619         size_t                  isize,
 620         xfs_buf_flags_t         flags)
 621 {
 622         xfs_buf_t               *bp;
 623
 624         flags |= XBF_READ;
 625
 626         bp = xfs_buf_get(target, ioff, isize, flags);
 627         if (bp) {
 628                 trace_xfs_buf_read(bp, flags, _RET_IP_);
 629
 630                 if (!XFS_BUF_ISDONE(bp)) {
 631                         XFS_STATS_INC(xb_get_read);
 632                         _xfs_buf_read(bp, flags);
 633                 } else if (flags & XBF_ASYNC) {
 634                         /*
 635                          * Read ahead call which is already satisfied,
 636                          * drop the buffer
 637                          */
 638                         goto no_buffer;
 639                 } else {
 640                         /* We do not want read in the flags */
 641                         bp->b_flags &= ~XBF_READ;
 642                 }
 643         }
 644
 645         return bp;
 646
 647  no_buffer:
 648         if (flags & (XBF_LOCK | XBF_TRYLOCK))
 649                 xfs_buf_unlock(bp);
 650         xfs_buf_rele(bp);
 651         return NULL;
 652 }
 653
 654 /*
 655  *      If we are not low on memory then do the readahead in a deadlock
 656  *      safe manner.
 657  */
 658 void
 659 xfs_buf_readahead(
 660         xfs_buftarg_t           *target,
 661         xfs_off_t               ioff,
 662         size_t                  isize)
 663 {
 664         if (bdi_read_congested(target->bt_bdi))
 665                 return;
 666
 667         xfs_buf_read(target, ioff, isize,
 668                      XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
 669 }
 670
 671 /*
 672  * Read an uncached buffer from disk. Allocates and returns a locked
 673  * buffer containing the disk contents or nothing.
 674  */
 675 struct xfs_buf *
 676 xfs_buf_read_uncached(
 677         struct xfs_mount        *mp,
 678         struct xfs_buftarg      *target,
 679         xfs_daddr_t             daddr,
 680         size_t                  length,
 681         int                     flags)
 682 {
 683         xfs_buf_t               *bp;
 684         int                     error;
 685
 686         bp = xfs_buf_get_uncached(target, length, flags);
 687         if (!bp)
 688                 return NULL;
 689
 690         /* set up the buffer for a read IO */
 691         XFS_BUF_SET_ADDR(bp, daddr);
 692         XFS_BUF_READ(bp);
 693
 694         xfsbdstrat(mp, bp);
 695         error = xfs_buf_iowait(bp);
 696         if (error || bp->b_error) {
 697                 xfs_buf_relse(bp);
 698                 return NULL;
 699         }
 700         return bp;
 701 }
 702
 703 /*
 704  * Return a buffer allocated as an empty buffer and associated to external
 705  * memory via xfs_buf_associate_memory() back to it's empty state.
 706  */
 707 void
 708 xfs_buf_set_empty(
 709         struct xfs_buf          *bp,
 710         size_t                  len)
 711 {
 712         if (bp->b_pages)
 713                 _xfs_buf_free_pages(bp);
 714
 715         bp->b_pages = NULL;
 716         bp->b_page_count = 0;
 717         bp->b_addr = NULL;
 718         bp->b_file_offset = 0;
 719         bp->b_buffer_length = bp->b_count_desired = len;
 720         bp->b_bn = XFS_BUF_DADDR_NULL;
 721         bp->b_flags &= ~XBF_MAPPED;
 722 }
 723
 724 static inline struct page *
 725 mem_to_page(
 726         void                    *addr)
 727 {
 728         if ((!is_vmalloc_addr(addr))) {
 729                 return virt_to_page(addr);
 730         } else {
 731                 return vmalloc_to_page(addr);
 732         }
 733 }
 734
 735 int
 736 xfs_buf_associate_memory(
 737         xfs_buf_t               *bp,
 738         void                    *mem,
 739         size_t                  len)
 740 {
 741         int                     rval;
 742         int                     i = 0;
 743         unsigned long           pageaddr;
 744         unsigned long           offset;
 745         size_t                  buflen;
 746         int                     page_count;
 747
 748         pageaddr = (unsigned long)mem & PAGE_MASK;
 749         offset = (unsigned long)mem - pageaddr;
 750         buflen = PAGE_ALIGN(len + offset);
 751         page_count = buflen >> PAGE_SHIFT;
 752
 753         /* Free any previous set of page pointers */
 754         if (bp->b_pages)
 755                 _xfs_buf_free_pages(bp);
 756
 757         bp->b_pages = NULL;
 758         bp->b_addr = mem;
 759
 760         rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
 761         if (rval)
 762                 return rval;
 763
 764         bp->b_offset = offset;
 765
 766         for (i = 0; i < bp->b_page_count; i++) {
 767                 bp->b_pages[i] = mem_to_page((void *)pageaddr);
 768                 pageaddr += PAGE_SIZE;
 769         }
 770
 771         bp->b_count_desired = len;
 772         bp->b_buffer_length = buflen;
 773         bp->b_flags |= XBF_MAPPED;
 774
 775         return 0;
 776 }
 777
 778 xfs_buf_t *
 779 xfs_buf_get_uncached(
 780         struct xfs_buftarg      *target,
 781         size_t                  len,
 782         int                     flags)
 783 {
 784         unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
 785         int                     error, i;
 786         xfs_buf_t               *bp;
 787
 788         bp = xfs_buf_alloc(target, 0, len, 0);
 789         if (unlikely(bp == NULL))
 790                 goto fail;
 791
 792         error = _xfs_buf_get_pages(bp, page_count, 0);
 793         if (error)
 794                 goto fail_free_buf;
 795
 796         for (i = 0; i < page_count; i++) {
 797                 bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
 798                 if (!bp->b_pages[i])
 799                         goto fail_free_mem;
 800         }
 801         bp->b_flags |= _XBF_PAGES;
 802
 803         error = _xfs_buf_map_pages(bp, XBF_MAPPED);
 804         if (unlikely(error)) {
 805                 xfs_warn(target->bt_mount,
 806                         "%s: failed to map pages\n", __func__);
 807                 goto fail_free_mem;
 808         }
 809
 810         trace_xfs_buf_get_uncached(bp, _RET_IP_);
 811         return bp;
 812
 813  fail_free_mem:
 814         while (--i >= 0)
 815                 __free_page(bp->b_pages[i]);
 816         _xfs_buf_free_pages(bp);
 817  fail_free_buf:
 818         kmem_zone_free(xfs_buf_zone, bp);
 819  fail:
 820         return NULL;
 821 }
 822
 823 /*
 824  *      Increment reference count on buffer, to hold the buffer concurrently
 825  *      with another thread which may release (free) the buffer asynchronously.
 826  *      Must hold the buffer already to call this function.
 827  */
 828 void
 829 xfs_buf_hold(
 830         xfs_buf_t               *bp)
 831 {
 832         trace_xfs_buf_hold(bp, _RET_IP_);
 833         atomic_inc(&bp->b_hold);
 834 }
 835
 836 /*
 837  *      Releases a hold on the specified buffer.  If the
 838  *      the hold count is 1, calls xfs_buf_free.
 839  */
 840 void
 841 xfs_buf_rele(
 842         xfs_buf_t               *bp)
 843 {
 844         struct xfs_perag        *pag = bp->b_pag;
 845
 846         trace_xfs_buf_rele(bp, _RET_IP_);
 847
 848         if (!pag) {
 849                 ASSERT(list_empty(&bp->b_lru));
 850                 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 851                 if (atomic_dec_and_test(&bp->b_hold))
 852                         xfs_buf_free(bp);
 853                 return;
 854         }
 855
 856         ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 857
 858         ASSERT(atomic_read(&bp->b_hold) > 0);
 859         if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
 860                 if (!(bp->b_flags & XBF_STALE) &&
 861                            atomic_read(&bp->b_lru_ref)) {
 862                         xfs_buf_lru_add(bp);
 863                         spin_unlock(&pag->pag_buf_lock);
 864                 } else {
 865                         xfs_buf_lru_del(bp);
 866                         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 867                         rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 868                         spin_unlock(&pag->pag_buf_lock);
 869                         xfs_perag_put(pag);
 870                         xfs_buf_free(bp);
 871                 }
 872         }
 873 }
 874
 875
 876 /*
 877  *      Lock a buffer object, if it is not already locked.
 878  *
 879  *      If we come across a stale, pinned, locked buffer, we know that we are
 880  *      being asked to lock a buffer that has been reallocated. Because it is
 881  *      pinned, we know that the log has not been pushed to disk and hence it
 882  *      will still be locked.  Rather than continuing to have trylock attempts
 883  *      fail until someone else pushes the log, push it ourselves before
 884  *      returning.  This means that the xfsaild will not get stuck trying
 885  *      to push on stale inode buffers.
 886  */
 887 int
 888 xfs_buf_trylock(
 889         struct xfs_buf          *bp)
 890 {
 891         int                     locked;
 892
 893         locked = down_trylock(&bp->b_sema) == 0;
 894         if (locked)
 895                 XB_SET_OWNER(bp);
 896         else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 897                 xfs_log_force(bp->b_target->bt_mount, 0);
 898
 899         trace_xfs_buf_trylock(bp, _RET_IP_);
 900         return locked;
 901 }
 902
 903 /*
 904  *      Lock a buffer object.
 905  *
 906  *      If we come across a stale, pinned, locked buffer, we know that we
 907  *      are being asked to lock a buffer that has been reallocated. Because
 908  *      it is pinned, we know that the log has not been pushed to disk and
 909  *      hence it will still be locked. Rather than sleeping until someone
 910  *      else pushes the log, push it ourselves before trying to get the lock.
 911  */
 912 void
 913 xfs_buf_lock(
 914         struct xfs_buf          *bp)
 915 {
 916         trace_xfs_buf_lock(bp, _RET_IP_);
 917
 918         if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 919                 xfs_log_force(bp->b_target->bt_mount, 0);
 920         down(&bp->b_sema);
 921         XB_SET_OWNER(bp);
 922
 923         trace_xfs_buf_lock_done(bp, _RET_IP_);
 924 }
 925
 926 void
 927 xfs_buf_unlock(
 928         struct xfs_buf          *bp)
 929 {
 930         XB_CLEAR_OWNER(bp);
 931         up(&bp->b_sema);
 932
 933         trace_xfs_buf_unlock(bp, _RET_IP_);
 934 }
 935
 936 STATIC void
 937 xfs_buf_wait_unpin(
 938         xfs_buf_t               *bp)
 939 {
 940         DECLARE_WAITQUEUE       (wait, current);
 941
 942         if (atomic_read(&bp->b_pin_count) == 0)
 943                 return;
 944
 945         add_wait_queue(&bp->b_waiters, &wait);
 946         for (;;) {
 947                 set_current_state(TASK_UNINTERRUPTIBLE);
 948                 if (atomic_read(&bp->b_pin_count) == 0)
 949                         break;
 950                 io_schedule();
 951         }
 952         remove_wait_queue(&bp->b_waiters, &wait);
 953         set_current_state(TASK_RUNNING);
 954 }
 955
 956 /*
 957  *      Buffer Utility Routines
 958  */
 959
 960 STATIC void
 961 xfs_buf_iodone_work(
 962         struct work_struct      *work)
 963 {
 964         xfs_buf_t               *bp =
 965                 container_of(work, xfs_buf_t, b_iodone_work);
 966
 967         if (bp->b_iodone)
 968                 (*(bp->b_iodone))(bp);
 969         else if (bp->b_flags & XBF_ASYNC)
 970                 xfs_buf_relse(bp);
 971 }
 972
 973 void
 974 xfs_buf_ioend(
 975         xfs_buf_t               *bp,
 976         int                     schedule)
 977 {
 978         trace_xfs_buf_iodone(bp, _RET_IP_);
 979
 980         bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 981         if (bp->b_error == 0)
 982                 bp->b_flags |= XBF_DONE;
 983
 984         if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
 985                 if (schedule) {
 986                         INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
 987                         queue_work(xfslogd_workqueue, &bp->b_iodone_work);
 988                 } else {
 989                         xfs_buf_iodone_work(&bp->b_iodone_work);
 990                 }
 991         } else {
 992                 complete(&bp->b_iowait);
 993         }
 994 }
 995
 996 void
 997 xfs_buf_ioerror(
 998         xfs_buf_t               *bp,
 999         int                     error)
1000 {
1001         ASSERT(error >= 0 && error <= 0xffff);
1002         bp->b_error = (unsigned short)error;
1003         trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1004 }
1005
1006 void
1007 xfs_buf_ioerror_alert(
1008         struct xfs_buf          *bp,
1009         const char              *func)
1010 {
1011         xfs_alert(bp->b_target->bt_mount,
1012 "metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
1013                 (__uint64_t)XFS_BUF_ADDR(bp), func,
1014                 bp->b_error, XFS_BUF_COUNT(bp));
1015 }
1016
1017 int
1018 xfs_bwrite(
1019         struct xfs_buf          *bp)
1020 {
1021         int                     error;
1022
1023         ASSERT(xfs_buf_islocked(bp));
1024
1025         bp->b_flags |= XBF_WRITE;
1026         bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
1027
1028         xfs_bdstrat_cb(bp);
1029
1030         error = xfs_buf_iowait(bp);
1031         if (error) {
1032                 xfs_force_shutdown(bp->b_target->bt_mount,
1033                                    SHUTDOWN_META_IO_ERROR);
1034         }
1035         return error;
1036 }
1037
1038 /*
1039  * Called when we want to stop a buffer from getting written or read.
1040  * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1041  * so that the proper iodone callbacks get called.
1042  */
1043 STATIC int
1044 xfs_bioerror(
1045         xfs_buf_t *bp)
1046 {
1047 #ifdef XFSERRORDEBUG
1048         ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1049 #endif
1050
1051         /*
1052          * No need to wait until the buffer is unpinned, we aren't flushing it.
1053          */
1054         xfs_buf_ioerror(bp, EIO);
1055
1056         /*
1057          * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1058          */
1059         XFS_BUF_UNREAD(bp);
1060         XFS_BUF_UNDONE(bp);
1061         xfs_buf_stale(bp);
1062
1063         xfs_buf_ioend(bp, 0);
1064
1065         return EIO;
1066 }
1067
1068 /*
1069  * Same as xfs_bioerror, except that we are releasing the buffer
1070  * here ourselves, and avoiding the xfs_buf_ioend call.
1071  * This is meant for userdata errors; metadata bufs come with
1072  * iodone functions attached, so that we can track down errors.
1073  */
1074 STATIC int
1075 xfs_bioerror_relse(
1076         struct xfs_buf  *bp)
1077 {
1078         int64_t         fl = bp->b_flags;
1079         /*
1080          * No need to wait until the buffer is unpinned.
1081          * We aren't flushing it.
1082          *
1083          * chunkhold expects B_DONE to be set, whether
1084          * we actually finish the I/O or not. We don't want to
1085          * change that interface.
1086          */
1087         XFS_BUF_UNREAD(bp);
1088         XFS_BUF_DONE(bp);
1089         xfs_buf_stale(bp);
1090         bp->b_iodone = NULL;
1091         if (!(fl & XBF_ASYNC)) {
1092                 /*
1093                  * Mark b_error and B_ERROR _both_.
1094                  * Lot's of chunkcache code assumes that.
1095                  * There's no reason to mark error for
1096                  * ASYNC buffers.
1097                  */
1098                 xfs_buf_ioerror(bp, EIO);
1099                 complete(&bp->b_iowait);
1100         } else {
1101                 xfs_buf_relse(bp);
1102         }
1103
1104         return EIO;
1105 }
1106
1107
1108 /*
1109  * All xfs metadata buffers except log state machine buffers
1110  * get this attached as their b_bdstrat callback function.
1111  * This is so that we can catch a buffer
1112  * after prematurely unpinning it to forcibly shutdown the filesystem.
1113  */
1114 int
1115 xfs_bdstrat_cb(
1116         struct xfs_buf  *bp)
1117 {
1118         if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1119                 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1120                 /*
1121                  * Metadata write that didn't get logged but
1122                  * written delayed anyway. These aren't associated
1123                  * with a transaction, and can be ignored.
1124                  */
1125                 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1126                         return xfs_bioerror_relse(bp);
1127                 else
1128                         return xfs_bioerror(bp);
1129         }
1130
1131         xfs_buf_iorequest(bp);
1132         return 0;
1133 }
1134
1135 /*
1136  * Wrapper around bdstrat so that we can stop data from going to disk in case
1137  * we are shutting down the filesystem.  Typically user data goes thru this
1138  * path; one of the exceptions is the superblock.
1139  */
1140 void
1141 xfsbdstrat(
1142         struct xfs_mount        *mp,
1143         struct xfs_buf          *bp)
1144 {
1145         if (XFS_FORCED_SHUTDOWN(mp)) {
1146                 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1147                 xfs_bioerror_relse(bp);
1148                 return;
1149         }
1150
1151         xfs_buf_iorequest(bp);
1152 }
1153
1154 STATIC void
1155 _xfs_buf_ioend(
1156         xfs_buf_t               *bp,
1157         int                     schedule)
1158 {
1159         if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1160                 xfs_buf_ioend(bp, schedule);
1161 }
1162
1163 STATIC void
1164 xfs_buf_bio_end_io(
1165         struct bio              *bio,
1166         int                     error)
1167 {
1168         xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
1169
1170         xfs_buf_ioerror(bp, -error);
1171
1172         if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1173                 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1174
1175         _xfs_buf_ioend(bp, 1);
1176         bio_put(bio);
1177 }
1178
1179 STATIC void
1180 _xfs_buf_ioapply(
1181         xfs_buf_t               *bp)
1182 {
1183         int                     rw, map_i, total_nr_pages, nr_pages;
1184         struct bio              *bio;
1185         int                     offset = bp->b_offset;
1186         int                     size = bp->b_count_desired;
1187         sector_t                sector = bp->b_bn;
1188
1189         total_nr_pages = bp->b_page_count;
1190         map_i = 0;
1191
1192         if (bp->b_flags & XBF_WRITE) {
1193                 if (bp->b_flags & XBF_SYNCIO)
1194                         rw = WRITE_SYNC;
1195                 else
1196                         rw = WRITE;
1197                 if (bp->b_flags & XBF_FUA)
1198                         rw |= REQ_FUA;
1199                 if (bp->b_flags & XBF_FLUSH)
1200                         rw |= REQ_FLUSH;
1201         } else if (bp->b_flags & XBF_READ_AHEAD) {
1202                 rw = READA;
1203         } else {
1204                 rw = READ;
1205         }
1206
1207         /* we only use the buffer cache for meta-data */
1208         rw |= REQ_META;
1209
1210 next_chunk:
1211         atomic_inc(&bp->b_io_remaining);
1212         nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1213         if (nr_pages > total_nr_pages)
1214                 nr_pages = total_nr_pages;
1215
1216         bio = bio_alloc(GFP_NOIO, nr_pages);
1217         bio->bi_bdev = bp->b_target->bt_bdev;
1218         bio->bi_sector = sector;
1219         bio->bi_end_io = xfs_buf_bio_end_io;
1220         bio->bi_private = bp;
1221
1222
1223         for (; size && nr_pages; nr_pages--, map_i++) {
1224                 int     rbytes, nbytes = PAGE_SIZE - offset;
1225
1226                 if (nbytes > size)
1227                         nbytes = size;
1228
1229                 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
1230                 if (rbytes < nbytes)
1231                         break;
1232
1233                 offset = 0;
1234                 sector += nbytes >> BBSHIFT;
1235                 size -= nbytes;
1236                 total_nr_pages--;
1237         }
1238
1239         if (likely(bio->bi_size)) {
1240                 if (xfs_buf_is_vmapped(bp)) {
1241                         flush_kernel_vmap_range(bp->b_addr,
1242                                                 xfs_buf_vmap_len(bp));
1243                 }
1244                 submit_bio(rw, bio);
1245                 if (size)
1246                         goto next_chunk;
1247         } else {
1248                 xfs_buf_ioerror(bp, EIO);
1249                 bio_put(bio);
1250         }
1251 }
1252
1253 int
1254 xfs_buf_iorequest(
1255         xfs_buf_t               *bp)
1256 {
1257         trace_xfs_buf_iorequest(bp, _RET_IP_);
1258
1259         ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1260
1261         if (bp->b_flags & XBF_WRITE)
1262                 xfs_buf_wait_unpin(bp);
1263         xfs_buf_hold(bp);
1264
1265         /* Set the count to 1 initially, this will stop an I/O
1266          * completion callout which happens before we have started
1267          * all the I/O from calling xfs_buf_ioend too early.
1268          */
1269         atomic_set(&bp->b_io_remaining, 1);
1270         _xfs_buf_ioapply(bp);
1271         _xfs_buf_ioend(bp, 0);
1272
1273         xfs_buf_rele(bp);
1274         return 0;
1275 }
1276
1277 /*
1278  *      Waits for I/O to complete on the buffer supplied.
1279  *      It returns immediately if no I/O is pending.
1280  *      It returns the I/O error code, if any, or 0 if there was no error.
1281  */
1282 int
1283 xfs_buf_iowait(
1284         xfs_buf_t               *bp)
1285 {
1286         trace_xfs_buf_iowait(bp, _RET_IP_);
1287
1288         wait_for_completion(&bp->b_iowait);
1289
1290         trace_xfs_buf_iowait_done(bp, _RET_IP_);
1291         return bp->b_error;
1292 }
1293
1294 xfs_caddr_t
1295 xfs_buf_offset(
1296         xfs_buf_t               *bp,
1297         size_t                  offset)
1298 {
1299         struct page             *page;
1300
1301         if (bp->b_flags & XBF_MAPPED)
1302                 return bp->b_addr + offset;
1303
1304         offset += bp->b_offset;
1305         page = bp->b_pages[offset >> PAGE_SHIFT];
1306         return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1307 }
1308
1309 /*
1310  *      Move data into or out of a buffer.
1311  */
1312 void
1313 xfs_buf_iomove(
1314         xfs_buf_t               *bp,    /* buffer to process            */
1315         size_t                  boff,   /* starting buffer offset       */
1316         size_t                  bsize,  /* length to copy               */
1317         void                    *data,  /* data address                 */
1318         xfs_buf_rw_t            mode)   /* read/write/zero flag         */
1319 {
1320         size_t                  bend, cpoff, csize;
1321         struct page             *page;
1322
1323         bend = boff + bsize;
1324         while (boff < bend) {
1325                 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1326                 cpoff = xfs_buf_poff(boff + bp->b_offset);
1327                 csize = min_t(size_t,
1328                               PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1329
1330                 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1331
1332                 switch (mode) {
1333                 case XBRW_ZERO:
1334                         memset(page_address(page) + cpoff, 0, csize);
1335                         break;
1336                 case XBRW_READ:
1337                         memcpy(data, page_address(page) + cpoff, csize);
1338                         break;
1339                 case XBRW_WRITE:
1340                         memcpy(page_address(page) + cpoff, data, csize);
1341                 }
1342
1343                 boff += csize;
1344                 data += csize;
1345         }
1346 }
1347
1348 /*
1349  *      Handling of buffer targets (buftargs).
1350  */
1351
1352 /*
1353  * Wait for any bufs with callbacks that have been submitted but have not yet
1354  * returned. These buffers will have an elevated hold count, so wait on those
1355  * while freeing all the buffers only held by the LRU.
1356  */
1357 void
1358 xfs_wait_buftarg(
1359         struct xfs_buftarg      *btp)
1360 {
1361         struct xfs_buf          *bp;
1362
1363 restart:
1364         spin_lock(&btp->bt_lru_lock);
1365         while (!list_empty(&btp->bt_lru)) {
1366                 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1367                 if (atomic_read(&bp->b_hold) > 1) {
1368                         spin_unlock(&btp->bt_lru_lock);
1369                         delay(100);
1370                         goto restart;
1371                 }
1372                 /*
1373                  * clear the LRU reference count so the buffer doesn't get
1374                  * ignored in xfs_buf_rele().
1375                  */
1376                 atomic_set(&bp->b_lru_ref, 0);
1377                 spin_unlock(&btp->bt_lru_lock);
1378                 xfs_buf_rele(bp);
1379                 spin_lock(&btp->bt_lru_lock);
1380         }
1381         spin_unlock(&btp->bt_lru_lock);
1382 }
1383
1384 int
1385 xfs_buftarg_shrink(
1386         struct shrinker         *shrink,
1387         struct shrink_control   *sc)
1388 {
1389         struct xfs_buftarg      *btp = container_of(shrink,
1390                                         struct xfs_buftarg, bt_shrinker);
1391         struct xfs_buf          *bp;
1392         int nr_to_scan = sc->nr_to_scan;
1393         LIST_HEAD(dispose);
1394
1395         if (!nr_to_scan)
1396                 return btp->bt_lru_nr;
1397
1398         spin_lock(&btp->bt_lru_lock);
1399         while (!list_empty(&btp->bt_lru)) {
1400                 if (nr_to_scan-- <= 0)
1401                         break;
1402
1403                 bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1404
1405                 /*
1406                  * Decrement the b_lru_ref count unless the value is already
1407                  * zero. If the value is already zero, we need to reclaim the
1408                  * buffer, otherwise it gets another trip through the LRU.
1409                  */
1410                 if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1411                         list_move_tail(&bp->b_lru, &btp->bt_lru);
1412                         continue;
1413                 }
1414
1415                 /*
1416                  * remove the buffer from the LRU now to avoid needing another
1417                  * lock round trip inside xfs_buf_rele().
1418                  */
1419                 list_move(&bp->b_lru, &dispose);
1420                 btp->bt_lru_nr--;
1421         }
1422         spin_unlock(&btp->bt_lru_lock);
1423
1424         while (!list_empty(&dispose)) {
1425                 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1426                 list_del_init(&bp->b_lru);
1427                 xfs_buf_rele(bp);
1428         }
1429
1430         return btp->bt_lru_nr;
1431 }
1432
1433 void
1434 xfs_free_buftarg(
1435         struct xfs_mount        *mp,
1436         struct xfs_buftarg      *btp)
1437 {
1438         unregister_shrinker(&btp->bt_shrinker);
1439
1440         if (mp->m_flags & XFS_MOUNT_BARRIER)
1441                 xfs_blkdev_issue_flush(btp);
1442
1443         kmem_free(btp);
1444 }
1445
1446 STATIC int
1447 xfs_setsize_buftarg_flags(
1448         xfs_buftarg_t           *btp,
1449         unsigned int            blocksize,
1450         unsigned int            sectorsize,
1451         int                     verbose)
1452 {
1453         btp->bt_bsize = blocksize;
1454         btp->bt_sshift = ffs(sectorsize) - 1;
1455         btp->bt_smask = sectorsize - 1;
1456
1457         if (set_blocksize(btp->bt_bdev, sectorsize)) {
1458                 char name[BDEVNAME_SIZE];
1459
1460                 bdevname(btp->bt_bdev, name);
1461
1462                 xfs_warn(btp->bt_mount,
1463                         "Cannot set_blocksize to %u on device %s\n",
1464                         sectorsize, name);
1465                 return EINVAL;
1466         }
1467
1468         return 0;
1469 }
1470
1471 /*
1472  *      When allocating the initial buffer target we have not yet
1473  *      read in the superblock, so don't know what sized sectors
1474  *      are being used is at this early stage.  Play safe.
1475  */
1476 STATIC int
1477 xfs_setsize_buftarg_early(
1478         xfs_buftarg_t           *btp,
1479         struct block_device     *bdev)
1480 {
1481         return xfs_setsize_buftarg_flags(btp,
1482                         PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1483 }
1484
1485 int
1486 xfs_setsize_buftarg(
1487         xfs_buftarg_t           *btp,
1488         unsigned int            blocksize,
1489         unsigned int            sectorsize)
1490 {
1491         return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1492 }
1493
1494 xfs_buftarg_t *
1495 xfs_alloc_buftarg(
1496         struct xfs_mount        *mp,
1497         struct block_device     *bdev,
1498         int                     external,
1499         const char              *fsname)
1500 {
1501         xfs_buftarg_t           *btp;
1502
1503         btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1504
1505         btp->bt_mount = mp;
1506         btp->bt_dev =  bdev->bd_dev;
1507         btp->bt_bdev = bdev;
1508         btp->bt_bdi = blk_get_backing_dev_info(bdev);
1509         if (!btp->bt_bdi)
1510                 goto error;
1511
1512         INIT_LIST_HEAD(&btp->bt_lru);
1513         spin_lock_init(&btp->bt_lru_lock);
1514         if (xfs_setsize_buftarg_early(btp, bdev))
1515                 goto error;
1516         btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1517         btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1518         register_shrinker(&btp->bt_shrinker);
1519         return btp;
1520
1521 error:
1522         kmem_free(btp);
1523         return NULL;
1524 }
1525
1526 /*
1527  * Add a buffer to the delayed write list.
1528  *
1529  * This queues a buffer for writeout if it hasn't already been.  Note that
1530  * neither this routine nor the buffer list submission functions perform
1531  * any internal synchronization.  It is expected that the lists are thread-local
1532  * to the callers.
1533  *
1534  * Returns true if we queued up the buffer, or false if it already had
1535  * been on the buffer list.
1536  */
1537 bool
1538 xfs_buf_delwri_queue(
1539         struct xfs_buf          *bp,
1540         struct list_head        *list)
1541 {
1542         ASSERT(xfs_buf_islocked(bp));
1543         ASSERT(!(bp->b_flags & XBF_READ));
1544
1545         /*
1546          * If the buffer is already marked delwri it already is queued up
1547          * by someone else for imediate writeout.  Just ignore it in that
1548          * case.
1549          */
1550         if (bp->b_flags & _XBF_DELWRI_Q) {
1551                 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
1552                 return false;
1553         }
1554
1555         trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1556
1557         /*
1558          * If a buffer gets written out synchronously or marked stale while it
1559          * is on a delwri list we lazily remove it. To do this, the other party
1560          * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
1561          * It remains referenced and on the list.  In a rare corner case it
1562          * might get readded to a delwri list after the synchronous writeout, in
1563          * which case we need just need to re-add the flag here.
1564          */
1565         bp->b_flags |= _XBF_DELWRI_Q;
1566         if (list_empty(&bp->b_list)) {
1567                 atomic_inc(&bp->b_hold);
1568                 list_add_tail(&bp->b_list, list);
1569         }
1570
1571         return true;
1572 }
1573
1574 /*
1575  * Compare function is more complex than it needs to be because
1576  * the return value is only 32 bits and we are doing comparisons
1577  * on 64 bit values
1578  */
1579 static int
1580 xfs_buf_cmp(
1581         void            *priv,
1582         struct list_head *a,
1583         struct list_head *b)
1584 {
1585         struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
1586         struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
1587         xfs_daddr_t             diff;
1588
1589         diff = ap->b_bn - bp->b_bn;
1590         if (diff < 0)
1591                 return -1;
1592         if (diff > 0)
1593                 return 1;
1594         return 0;
1595 }
1596
1597 static int
1598 __xfs_buf_delwri_submit(
1599         struct list_head        *buffer_list,
1600         struct list_head        *io_list,
1601         bool                    wait)
1602 {
1603         struct blk_plug         plug;
1604         struct xfs_buf          *bp, *n;
1605         int                     pinned = 0;
1606
1607         list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1608                 if (!wait) {
1609                         if (xfs_buf_ispinned(bp)) {
1610                                 pinned++;
1611                                 continue;
1612                         }
1613                         if (!xfs_buf_trylock(bp))
1614                                 continue;
1615                 } else {
1616                         xfs_buf_lock(bp);
1617                 }
1618
1619                 /*
1620                  * Someone else might have written the buffer synchronously or
1621                  * marked it stale in the meantime.  In that case only the
1622                  * _XBF_DELWRI_Q flag got cleared, and we have to drop the
1623                  * reference and remove it from the list here.
1624                  */
1625                 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
1626                         list_del_init(&bp->b_list);
1627                         xfs_buf_relse(bp);
1628                         continue;
1629                 }
1630
1631                 list_move_tail(&bp->b_list, io_list);
1632                 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1633         }
1634
1635         list_sort(NULL, io_list, xfs_buf_cmp);
1636
1637         blk_start_plug(&plug);
1638         list_for_each_entry_safe(bp, n, io_list, b_list) {
1639                 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
1640                 bp->b_flags |= XBF_WRITE;
1641
1642                 if (!wait) {
1643                         bp->b_flags |= XBF_ASYNC;
1644                         list_del_init(&bp->b_list);
1645                 }
1646                 xfs_bdstrat_cb(bp);
1647         }
1648         blk_finish_plug(&plug);
1649
1650         return pinned;
1651 }
1652
1653 /*
1654  * Write out a buffer list asynchronously.
1655  *
1656  * This will take the @buffer_list, write all non-locked and non-pinned buffers
1657  * out and not wait for I/O completion on any of the buffers.  This interface
1658  * is only safely useable for callers that can track I/O completion by higher
1659  * level means, e.g. AIL pushing as the @buffer_list is consumed in this
1660  * function.
1661  */
1662 int
1663 xfs_buf_delwri_submit_nowait(
1664         struct list_head        *buffer_list)
1665 {
1666         LIST_HEAD               (io_list);
1667         return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1668 }
1669
1670 /*
1671  * Write out a buffer list synchronously.
1672  *
1673  * This will take the @buffer_list, write all buffers out and wait for I/O
1674  * completion on all of the buffers. @buffer_list is consumed by the function,
1675  * so callers must have some other way of tracking buffers if they require such
1676  * functionality.
1677  */
1678 int
1679 xfs_buf_delwri_submit(
1680         struct list_head        *buffer_list)
1681 {
1682         LIST_HEAD               (io_list);
1683         int                     error = 0, error2;
1684         struct xfs_buf          *bp;
1685
1686         __xfs_buf_delwri_submit(buffer_list, &io_list, true);
1687
1688         /* Wait for IO to complete. */
1689         while (!list_empty(&io_list)) {
1690                 bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1691
1692                 list_del_init(&bp->b_list);
1693                 error2 = xfs_buf_iowait(bp);
1694                 xfs_buf_relse(bp);
1695                 if (!error)
1696                         error = error2;
1697         }
1698
1699         return error;
1700 }
1701
1702 int __init
1703 xfs_buf_init(void)
1704 {
1705         xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1706                                                 KM_ZONE_HWALIGN, NULL);
1707         if (!xfs_buf_zone)
1708                 goto out;
1709
1710         xfslogd_workqueue = alloc_workqueue("xfslogd",
1711                                         WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1712         if (!xfslogd_workqueue)
1713                 goto out_free_buf_zone;
1714
1715         return 0;
1716
1717  out_free_buf_zone:
1718         kmem_zone_destroy(xfs_buf_zone);
1719  out:
1720         return -ENOMEM;
1721 }
1722
1723 void
1724 xfs_buf_terminate(void)
1725 {
1726         destroy_workqueue(xfslogd_workqueue);
1727         kmem_zone_destroy(xfs_buf_zone);
1728 }