Merge branch 'drm-intel-next' of git://people.freedesktop.org/~danvet/drm-intel into...

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4c65c639f7721d315a1b5875fe4f32478471fab3..b851bd34ca18376af5becaf346e1cbd960c0432a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
 static __must_check int i915_gem_object_flush_gpu_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj);
-static __must_check int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj,
-                                                         bool write);
-static __must_check int i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
-                                                                 uint64_t offset,
-                                                                 uint64_t size);
-static void i915_gem_object_set_to_full_cpu_read_domain(struct drm_i915_gem_object *obj);
 static __must_check int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
                                                    unsigned alignment,
                                                    bool map_and_fenceable);
@@ -125,25 +119,6 @@ i915_gem_object_is_inactive(struct drm_i915_gem_object *obj)
        return obj->gtt_space && !obj->active && obj->pin_count == 0;
 }
 
-void i915_gem_do_init(struct drm_device *dev,
-                     unsigned long start,
-                     unsigned long mappable_end,
-                     unsigned long end)
-{
-       drm_i915_private_t *dev_priv = dev->dev_private;
-
-       drm_mm_init(&dev_priv->mm.gtt_space, start, end - start);
-
-       dev_priv->mm.gtt_start = start;
-       dev_priv->mm.gtt_mappable_end = mappable_end;
-       dev_priv->mm.gtt_end = end;
-       dev_priv->mm.gtt_total = end - start;
-       dev_priv->mm.mappable_gtt_total = min(end, mappable_end) - start;
-
-       /* Take over this portion of the GTT */
-       intel_gtt_clear_range(start / PAGE_SIZE, (end-start) / PAGE_SIZE);
-}
-
 int
 i915_gem_init_ioctl(struct drm_device *dev, void *data,
                    struct drm_file *file)
@@ -154,8 +129,13 @@ i915_gem_init_ioctl(struct drm_device *dev, void *data,
            (args->gtt_end | args->gtt_start) & (PAGE_SIZE - 1))
                return -EINVAL;
 
+       /* GEM with user mode setting was never supported on ilk and later. */
+       if (INTEL_INFO(dev)->gen >= 5)
+               return -ENODEV;
+
        mutex_lock(&dev->struct_mutex);
-       i915_gem_do_init(dev, args->gtt_start, args->gtt_end, args->gtt_end);
+       i915_gem_init_global_gtt(dev, args->gtt_start,
+                                args->gtt_end, args->gtt_end);
        mutex_unlock(&dev->struct_mutex);
 
        return 0;
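
The init ioctl now refuses to run on Ironlake (gen 5) and later, where GEM with user mode setting was never supported, and it still requires both GTT bounds to be page aligned: (gtt_start | gtt_end) must have no bits set below PAGE_SIZE. For illustration only, a minimal userspace sketch of the caller, not part of this patch; it assumes libdrm/uapi headers, DRM master rights, a device node at /dev/dri/card0 and pre-gen5 hardware running user mode setting:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

int main(void)
{
        int fd = open("/dev/dri/card0", O_RDWR);
        if (fd < 0)
                return 1;

        /* Both bounds must be multiples of PAGE_SIZE; the kernel checks
         * (gtt_start | gtt_end) & (PAGE_SIZE - 1) and fails with EINVAL
         * otherwise, or with ENODEV on gen5+ hardware. */
        struct drm_i915_gem_init init = {
                .gtt_start = 1ull << 20,        /* 1 MiB, page aligned */
                .gtt_end   = 16ull << 20,       /* 16 MiB, page aligned */
        };

        if (ioctl(fd, DRM_IOCTL_I915_GEM_INIT, &init))
                perror("DRM_IOCTL_I915_GEM_INIT");

        close(fd);
        return 0;
}
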
@@ -259,66 +239,6 @@ static int i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
                obj->tiling_mode != I915_TILING_NONE;
 }
 
-/**
- * This is the fast shmem pread path, which attempts to copy_from_user directly
- * from the backing pages of the object to the user's address space.  On a
- * fault, it fails so we can fall back to i915_gem_shmem_pwrite_slow().
- */
-static int
-i915_gem_shmem_pread_fast(struct drm_device *dev,
-                         struct drm_i915_gem_object *obj,
-                         struct drm_i915_gem_pread *args,
-                         struct drm_file *file)
-{
-       struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
-       ssize_t remain;
-       loff_t offset;
-       char __user *user_data;
-       int page_offset, page_length;
-
-       user_data = (char __user *) (uintptr_t) args->data_ptr;
-       remain = args->size;
-
-       offset = args->offset;
-
-       while (remain > 0) {
-               struct page *page;
-               char *vaddr;
-               int ret;
-
-               /* Operation in this page
-                *
-                * page_offset = offset within page
-                * page_length = bytes to copy for this page
-                */
-               page_offset = offset_in_page(offset);
-               page_length = remain;
-               if ((page_offset + remain) > PAGE_SIZE)
-                       page_length = PAGE_SIZE - page_offset;
-
-               page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-               if (IS_ERR(page))
-                       return PTR_ERR(page);
-
-               vaddr = kmap_atomic(page);
-               ret = __copy_to_user_inatomic(user_data,
-                                             vaddr + page_offset,
-                                             page_length);
-               kunmap_atomic(vaddr);
-
-               mark_page_accessed(page);
-               page_cache_release(page);
-               if (ret)
-                       return -EFAULT;
-
-               remain -= page_length;
-               user_data += page_length;
-               offset += page_length;
-       }
-
-       return 0;
-}
-
 static inline int
 __copy_to_user_swizzled(char __user *cpu_vaddr,
                        const char *gpu_vaddr, int gpu_offset,
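
Both the removed fast path above and the unified read/write loops introduced below walk a request one page at a time, clamping each chunk so it never crosses a page boundary (page_offset = offset_in_page(offset), then page_length = min(remain, PAGE_SIZE - page_offset)). A standalone sketch of that split, with illustrative names not taken from the patch:

#include <stdio.h>

#define PAGE_SIZE 4096u

/* Bytes that can be copied at 'offset' without crossing a page boundary;
 * mirrors the page_offset/page_length clamping in the pread/pwrite loops. */
static unsigned long chunk_length(unsigned long offset, unsigned long remain)
{
        unsigned long page_offset = offset & (PAGE_SIZE - 1); /* offset_in_page() */
        unsigned long length = PAGE_SIZE - page_offset;

        return remain < length ? remain : length;
}

int main(void)
{
        /* A 48-byte read starting 16 bytes before a page boundary is split
         * into a 16-byte chunk and then a 32-byte chunk. */
        printf("%lu\n", chunk_length(0x0ff0, 48));      /* 16 */
        printf("%lu\n", chunk_length(0x1000, 32));      /* 32 */
        return 0;
}
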
@@ -371,37 +291,121 @@ __copy_from_user_swizzled(char __user *gpu_vaddr, int gpu_offset,
        return 0;
 }
 
-/**
- * This is the fallback shmem pread path, which allocates temporary storage
- * in kernel space to copy_to_user into outside of the struct_mutex, so we
- * can copy out of the object's backing pages while holding the struct mutex
- * and not take page faults.
- */
+/* Per-page copy function for the shmem pread fastpath.
+ * Flushes invalid cachelines before reading the target if
+ * needs_clflush is set. */
 static int
-i915_gem_shmem_pread_slow(struct drm_device *dev,
-                         struct drm_i915_gem_object *obj,
-                         struct drm_i915_gem_pread *args,
-                         struct drm_file *file)
+shmem_pread_fast(struct page *page, int shmem_page_offset, int page_length,
+                char __user *user_data,
+                bool page_do_bit17_swizzling, bool needs_clflush)
+{
+       char *vaddr;
+       int ret;
+
+       if (unlikely(page_do_bit17_swizzling))
+               return -EINVAL;
+
+       vaddr = kmap_atomic(page);
+       if (needs_clflush)
+               drm_clflush_virt_range(vaddr + shmem_page_offset,
+                                      page_length);
+       ret = __copy_to_user_inatomic(user_data,
+                                     vaddr + shmem_page_offset,
+                                     page_length);
+       kunmap_atomic(vaddr);
+
+       return ret;
+}
+
+static void
+shmem_clflush_swizzled_range(char *addr, unsigned long length,
+                            bool swizzled)
+{
+       if (unlikely(swizzled)) {
+               unsigned long start = (unsigned long) addr;
+               unsigned long end = (unsigned long) addr + length;
+
+               /* For swizzling simply ensure that we always flush both
+                * channels. Lame, but simple and it works. Swizzled
+                * pwrite/pread is far from a hotpath - current userspace
+                * doesn't use it at all. */
+               start = round_down(start, 128);
+               end = round_up(end, 128);
+
+               drm_clflush_virt_range((void *)start, end - start);
+       } else {
+               drm_clflush_virt_range(addr, length);
+       }
+
+}
+
+/* Only difference to the fast-path function is that this can handle bit17
+ * and uses non-atomic copy and kmap functions. */
+static int
+shmem_pread_slow(struct page *page, int shmem_page_offset, int page_length,
+                char __user *user_data,
+                bool page_do_bit17_swizzling, bool needs_clflush)
+{
+       char *vaddr;
+       int ret;
+
+       vaddr = kmap(page);
+       if (needs_clflush)
+               shmem_clflush_swizzled_range(vaddr + shmem_page_offset,
+                                            page_length,
+                                            page_do_bit17_swizzling);
+
+       if (page_do_bit17_swizzling)
+               ret = __copy_to_user_swizzled(user_data,
+                                             vaddr, shmem_page_offset,
+                                             page_length);
+       else
+               ret = __copy_to_user(user_data,
+                                    vaddr + shmem_page_offset,
+                                    page_length);
+       kunmap(page);
+
+       return ret;
+}
+
+static int
+i915_gem_shmem_pread(struct drm_device *dev,
+                    struct drm_i915_gem_object *obj,
+                    struct drm_i915_gem_pread *args,
+                    struct drm_file *file)
 {
        struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
        char __user *user_data;
        ssize_t remain;
        loff_t offset;
-       int shmem_page_offset, page_length, ret;
+       int shmem_page_offset, page_length, ret = 0;
        int obj_do_bit17_swizzling, page_do_bit17_swizzling;
+       int hit_slowpath = 0;
+       int prefaulted = 0;
+       int needs_clflush = 0;
+       int release_page;
 
        user_data = (char __user *) (uintptr_t) args->data_ptr;
        remain = args->size;
 
        obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
 
-       offset = args->offset;
+       if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) {
+               /* If we're not in the cpu read domain, set ourselves into the gtt
+                * read domain and manually flush cachelines (if required). This
+                * optimizes for the case when the gpu will dirty the data
+                * anyway again before the next pread happens. */
+               if (obj->cache_level == I915_CACHE_NONE)
+                       needs_clflush = 1;
+               ret = i915_gem_object_set_to_gtt_domain(obj, false);
+               if (ret)
+                       return ret;
+       }
 
-       mutex_unlock(&dev->struct_mutex);
+       offset = args->offset;
 
        while (remain > 0) {
                struct page *page;
-               char *vaddr;
 
                /* Operation in this page
                 *
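
shmem_clflush_swizzled_range() above copes with bit-17 swizzling by widening the flush window to 128-byte alignment, so both 64-byte channels of a swizzled cacheline pair are always covered. A standalone sketch of the window computation, using simplified power-of-two-only stand-ins for the kernel's round_down()/round_up() helpers:

#include <stdio.h>

/* Power-of-two-only versions of the kernel's round_down()/round_up(). */
#define round_down(x, y)        ((x) & ~((unsigned long)(y) - 1))
#define round_up(x, y)          round_down((x) + (y) - 1, (y))

int main(void)
{
        unsigned long addr = 0x12345a0, len = 0x30;

        /* Widen the flush to 128-byte alignment so both 64-byte halves of
         * a bit-17-swizzled pair are covered. */
        unsigned long start = round_down(addr, 128);
        unsigned long end = round_up(addr + len, 128);

        printf("flush %#lx..%#lx (%lu bytes)\n", start, end, end - start);
        /* -> flush 0x1234580..0x1234600 (128 bytes) */
        return 0;
}
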
@@ -413,28 +417,51 @@ i915_gem_shmem_pread_slow(struct drm_device *dev,
                if ((shmem_page_offset + page_length) > PAGE_SIZE)
                        page_length = PAGE_SIZE - shmem_page_offset;
 
-               page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-               if (IS_ERR(page)) {
-                       ret = PTR_ERR(page);
-                       goto out;
+               if (obj->pages) {
+                       page = obj->pages[offset >> PAGE_SHIFT];
+                       release_page = 0;
+               } else {
+                       page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+                       if (IS_ERR(page)) {
+                               ret = PTR_ERR(page);
+                               goto out;
+                       }
+                       release_page = 1;
                }
 
                page_do_bit17_swizzling = obj_do_bit17_swizzling &&
                        (page_to_phys(page) & (1 << 17)) != 0;
 
-               vaddr = kmap(page);
-               if (page_do_bit17_swizzling)
-                       ret = __copy_to_user_swizzled(user_data,
-                                                     vaddr, shmem_page_offset,
-                                                     page_length);
-               else
-                       ret = __copy_to_user(user_data,
-                                            vaddr + shmem_page_offset,
-                                            page_length);
-               kunmap(page);
+               ret = shmem_pread_fast(page, shmem_page_offset, page_length,
+                                      user_data, page_do_bit17_swizzling,
+                                      needs_clflush);
+               if (ret == 0)
+                       goto next_page;
 
-               mark_page_accessed(page);
+               hit_slowpath = 1;
+               page_cache_get(page);
+               mutex_unlock(&dev->struct_mutex);
+
+               if (!prefaulted) {
+                       ret = fault_in_multipages_writeable(user_data, remain);
+                       /* Userspace is tricking us, but we've already clobbered
+                        * its pages with the prefault and promised to write the
+                        * data up to the first fault. Hence ignore any errors
+                        * and just continue. */
+                       (void)ret;
+                       prefaulted = 1;
+               }
+
+               ret = shmem_pread_slow(page, shmem_page_offset, page_length,
+                                      user_data, page_do_bit17_swizzling,
+                                      needs_clflush);
+
+               mutex_lock(&dev->struct_mutex);
                page_cache_release(page);
+next_page:
+               mark_page_accessed(page);
+               if (release_page)
+                       page_cache_release(page);
 
                if (ret) {
                        ret = -EFAULT;
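
When the atomic fast path faults, the loop above drops struct_mutex, prefaults the whole destination with fault_in_multipages_writeable() and retries the page with the sleeping slow path, so later iterations are unlikely to fault again. A rough userspace analogue of the prefault step, illustrative only:

#include <stddef.h>

#define PAGE_SIZE 4096

/* Userspace analogue of the prefault step: touch one byte per page so the
 * destination is resident before entering a section that must not sleep on
 * page faults.  (The kernel helper fault_in_multipages_writeable() writes a
 * zero byte into each page, which is why the comment above accepts having
 * "clobbered" the user pages.) */
static void prefault_writeable(volatile char *buf, size_t len)
{
        for (size_t i = 0; i < len; i += PAGE_SIZE)
                buf[i] = buf[i];
        if (len)
                buf[len - 1] = buf[len - 1];
}
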
@@ -447,10 +474,11 @@ i915_gem_shmem_pread_slow(struct drm_device *dev,
        }
 
 out:
-       mutex_lock(&dev->struct_mutex);
-       /* Fixup: Kill any reinstated backing storage pages */
-       if (obj->madv == __I915_MADV_PURGED)
-               i915_gem_object_truncate(obj);
+       if (hit_slowpath) {
+               /* Fixup: Kill any reinstated backing storage pages */
+               if (obj->madv == __I915_MADV_PURGED)
+                       i915_gem_object_truncate(obj);
+       }
 
        return ret;
 }
@@ -476,11 +504,6 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
                       args->size))
                return -EFAULT;
 
-       ret = fault_in_pages_writeable((char __user *)(uintptr_t)args->data_ptr,
-                                      args->size);
-       if (ret)
-               return -EFAULT;
-
        ret = i915_mutex_lock_interruptible(dev);
        if (ret)
                return ret;
@@ -500,17 +523,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 
        trace_i915_gem_object_pread(obj, args->offset, args->size);
 
-       ret = i915_gem_object_set_cpu_read_domain_range(obj,
-                                                       args->offset,
-                                                       args->size);
-       if (ret)
-               goto out;
-
-       ret = -EFAULT;
-       if (!i915_gem_object_needs_bit17_swizzle(obj))
-               ret = i915_gem_shmem_pread_fast(dev, obj, args, file);
-       if (ret == -EFAULT)
-               ret = i915_gem_shmem_pread_slow(dev, obj, args, file);
+       ret = i915_gem_shmem_pread(dev, obj, args, file);
 
 out:
        drm_gem_object_unreference(&obj->base);
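
The pread ioctl no longer prefaults the user buffer up front or moves the object into the CPU read domain; i915_gem_shmem_pread() now makes those decisions per call. The userspace interface is unchanged. A minimal sketch of the caller side, assuming libdrm/uapi headers and an already-created GEM handle:

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Read 'size' bytes at 'offset' from GEM object 'handle' into 'dst'.
 * Returns 0 on success, -1 with errno set otherwise. */
static int gem_pread(int fd, uint32_t handle, uint64_t offset,
                     void *dst, uint64_t size)
{
        struct drm_i915_gem_pread pread = {
                .handle   = handle,
                .offset   = offset,
                .size     = size,
                .data_ptr = (uintptr_t)dst,
        };

        return ioctl(fd, DRM_IOCTL_I915_GEM_PREAD, &pread);
}
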
@@ -539,30 +552,6 @@ fast_user_write(struct io_mapping *mapping,
        return unwritten;
 }
 
-/* Here's the write path which can sleep for
- * page faults
- */
-
-static inline void
-slow_kernel_write(struct io_mapping *mapping,
-                 loff_t gtt_base, int gtt_offset,
-                 struct page *user_page, int user_offset,
-                 int length)
-{
-       char __iomem *dst_vaddr;
-       char *src_vaddr;
-
-       dst_vaddr = io_mapping_map_wc(mapping, gtt_base);
-       src_vaddr = kmap(user_page);
-
-       memcpy_toio(dst_vaddr + gtt_offset,
-                   src_vaddr + user_offset,
-                   length);
-
-       kunmap(user_page);
-       io_mapping_unmap(dst_vaddr);
-}
-
 /**
  * This is the fast pwrite path, where we copy the data directly from the
  * user into the GTT, uncached.
@@ -577,7 +566,19 @@ i915_gem_gtt_pwrite_fast(struct drm_device *dev,
        ssize_t remain;
        loff_t offset, page_base;
        char __user *user_data;
-       int page_offset, page_length;
+       int page_offset, page_length, ret;
+
+       ret = i915_gem_object_pin(obj, 0, true);
+       if (ret)
+               goto out;
+
+       ret = i915_gem_object_set_to_gtt_domain(obj, true);
+       if (ret)
+               goto out_unpin;
+
+       ret = i915_gem_object_put_fence(obj);
+       if (ret)
+               goto out_unpin;
 
        user_data = (char __user *) (uintptr_t) args->data_ptr;
        remain = args->size;
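
The fast GTT pwrite path now pins the object, moves it to the GTT write domain and drops any fence itself before writing through the mappable aperture with fast_user_write(). The userspace counterpart of writing through the aperture is a GTT mmap; a minimal sketch, not part of this patch, assuming libdrm/uapi headers:

#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <drm/i915_drm.h>

/* Map a GEM object write-combined through the GTT aperture and return a
 * CPU pointer to it; roughly the userspace view of what the kernel's GTT
 * pwrite path does internally with io_mapping. */
static void *gem_mmap_gtt(int fd, uint32_t handle, size_t size)
{
        struct drm_i915_gem_mmap_gtt arg = { .handle = handle };

        if (ioctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &arg))
                return NULL;

        void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd, arg.offset);
        return ptr == MAP_FAILED ? NULL : ptr;
}

Writes through such a mapping are write-combined and bypass the CPU cache, which is why neither the kernel fast path above nor this userspace counterpart needs a clflush.
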
@@ -602,214 +603,133 @@ i915_gem_gtt_pwrite_fast(struct drm_device *dev,
                 * retry in the slow path.
                 */
                if (fast_user_write(dev_priv->mm.gtt_mapping, page_base,
-                                   page_offset, user_data, page_length))
-                       return -EFAULT;
+                                   page_offset, user_data, page_length)) {
+                       ret = -EFAULT;
+                       goto out_unpin;
+               }
 
                remain -= page_length;
                user_data += page_length;
                offset += page_length;
        }
 
-       return 0;
+out_unpin:
+       i915_gem_object_unpin(obj);
+out:
+       return ret;
 }
 
-/**
- * This is the fallback GTT pwrite path, which uses get_user_pages to pin
- * the memory and maps it using kmap_atomic for copying.
- *
- * This code resulted in x11perf -rgb10text consuming about 10% more CPU
- * than using i915_gem_gtt_pwrite_fast on a G45 (32-bit).
- */
+/* Per-page copy function for the shmem pwrite fastpath.
+ * Flushes invalid cachelines before writing to the target if
+ * needs_clflush_before is set and flushes out any written cachelines after
+ * writing if needs_clflush is set. */
 static int
-i915_gem_gtt_pwrite_slow(struct drm_device *dev,
-                        struct drm_i915_gem_object *obj,
-                        struct drm_i915_gem_pwrite *args,
-                        struct drm_file *file)
+shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length,
+                 char __user *user_data,
+                 bool page_do_bit17_swizzling,
+                 bool needs_clflush_before,
+                 bool needs_clflush_after)
 {
-       drm_i915_private_t *dev_priv = dev->dev_private;
-       ssize_t remain;
-       loff_t gtt_page_base, offset;
-       loff_t first_data_page, last_data_page, num_pages;
-       loff_t pinned_pages, i;
-       struct page **user_pages;
-       struct mm_struct *mm = current->mm;
-       int gtt_page_offset, data_page_offset, data_page_index, page_length;
+       char *vaddr;
        int ret;
-       uint64_t data_ptr = args->data_ptr;
-
-       remain = args->size;
-
-       /* Pin the user pages containing the data.  We can't fault while
-        * holding the struct mutex, and all of the pwrite implementations
-        * want to hold it while dereferencing the user data.
-        */
-       first_data_page = data_ptr / PAGE_SIZE;
-       last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
-       num_pages = last_data_page - first_data_page + 1;
-
-       user_pages = drm_malloc_ab(num_pages, sizeof(struct page *));
-       if (user_pages == NULL)
-               return -ENOMEM;
-
-       mutex_unlock(&dev->struct_mutex);
-       down_read(&mm->mmap_sem);
-       pinned_pages = get_user_pages(current, mm, (uintptr_t)args->data_ptr,
-                                     num_pages, 0, 0, user_pages, NULL);
-       up_read(&mm->mmap_sem);
-       mutex_lock(&dev->struct_mutex);
-       if (pinned_pages < num_pages) {
-               ret = -EFAULT;
-               goto out_unpin_pages;
-       }
-
-       ret = i915_gem_object_set_to_gtt_domain(obj, true);
-       if (ret)
-               goto out_unpin_pages;
-
-       ret = i915_gem_object_put_fence(obj);
-       if (ret)
-               goto out_unpin_pages;
-
-       offset = obj->gtt_offset + args->offset;
 
-       while (remain > 0) {
-               /* Operation in this page
-                *
-                * gtt_page_base = page offset within aperture
-                * gtt_page_offset = offset within page in aperture
-                * data_page_index = page number in get_user_pages return
-                * data_page_offset = offset with data_page_index page.
-                * page_length = bytes to copy for this page
-                */
-               gtt_page_base = offset & PAGE_MASK;
-               gtt_page_offset = offset_in_page(offset);
-               data_page_index = data_ptr / PAGE_SIZE - first_data_page;
-               data_page_offset = offset_in_page(data_ptr);
-
-               page_length = remain;
-               if ((gtt_page_offset + page_length) > PAGE_SIZE)
-                       page_length = PAGE_SIZE - gtt_page_offset;
-               if ((data_page_offset + page_length) > PAGE_SIZE)
-                       page_length = PAGE_SIZE - data_page_offset;
-
-               slow_kernel_write(dev_priv->mm.gtt_mapping,
-                                 gtt_page_base, gtt_page_offset,
-                                 user_pages[data_page_index],
-                                 data_page_offset,
-                                 page_length);
-
-               remain -= page_length;
-               offset += page_length;
-               data_ptr += page_length;
-       }
+       if (unlikely(page_do_bit17_swizzling))
+               return -EINVAL;
 
-out_unpin_pages:
-       for (i = 0; i < pinned_pages; i++)
-               page_cache_release(user_pages[i]);
-       drm_free_large(user_pages);
+       vaddr = kmap_atomic(page);
+       if (needs_clflush_before)
+               drm_clflush_virt_range(vaddr + shmem_page_offset,
+                                      page_length);
+       ret = __copy_from_user_inatomic_nocache(vaddr + shmem_page_offset,
+                                               user_data,
+                                               page_length);
+       if (needs_clflush_after)
+               drm_clflush_virt_range(vaddr + shmem_page_offset,
+                                      page_length);
+       kunmap_atomic(vaddr);
 
        return ret;
 }
 
-/**
- * This is the fast shmem pwrite path, which attempts to directly
- * copy_from_user into the kmapped pages backing the object.
- */
+/* Only difference to the fast-path function is that this can handle bit17
+ * and uses non-atomic copy and kmap functions. */
 static int
-i915_gem_shmem_pwrite_fast(struct drm_device *dev,
-                          struct drm_i915_gem_object *obj,
-                          struct drm_i915_gem_pwrite *args,
-                          struct drm_file *file)
+shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length,
+                 char __user *user_data,
+                 bool page_do_bit17_swizzling,
+                 bool needs_clflush_before,
+                 bool needs_clflush_after)
 {
-       struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
-       ssize_t remain;
-       loff_t offset;
-       char __user *user_data;
-       int page_offset, page_length;
-
-       user_data = (char __user *) (uintptr_t) args->data_ptr;
-       remain = args->size;
-
-       offset = args->offset;
-       obj->dirty = 1;
-
-       while (remain > 0) {
-               struct page *page;
-               char *vaddr;
-               int ret;
-
-               /* Operation in this page
-                *
-                * page_offset = offset within page
-                * page_length = bytes to copy for this page
-                */
-               page_offset = offset_in_page(offset);
-               page_length = remain;
-               if ((page_offset + remain) > PAGE_SIZE)
-                       page_length = PAGE_SIZE - page_offset;
-
-               page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-               if (IS_ERR(page))
-                       return PTR_ERR(page);
+       char *vaddr;
+       int ret;
 
-               vaddr = kmap_atomic(page);
-               ret = __copy_from_user_inatomic(vaddr + page_offset,
+       vaddr = kmap(page);
+       if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
+               shmem_clflush_swizzled_range(vaddr + shmem_page_offset,
+                                            page_length,
+                                            page_do_bit17_swizzling);
+       if (page_do_bit17_swizzling)
+               ret = __copy_from_user_swizzled(vaddr, shmem_page_offset,
                                                user_data,
                                                page_length);
-               kunmap_atomic(vaddr);
-
-               set_page_dirty(page);
-               mark_page_accessed(page);
-               page_cache_release(page);
-
-               /* If we get a fault while copying data, then (presumably) our
-                * source page isn't available.  Return the error and we'll
-                * retry in the slow path.
-                */
-               if (ret)
-                       return -EFAULT;
-
-               remain -= page_length;
-               user_data += page_length;
-               offset += page_length;
-       }
+       else
+               ret = __copy_from_user(vaddr + shmem_page_offset,
+                                      user_data,
+                                      page_length);
+       if (needs_clflush_after)
+               shmem_clflush_swizzled_range(vaddr + shmem_page_offset,
+                                            page_length,
+                                            page_do_bit17_swizzling);
+       kunmap(page);
 
-       return 0;
+       return ret;
 }
 
-/**
- * This is the fallback shmem pwrite path, which uses get_user_pages to pin
- * the memory and maps it using kmap_atomic for copying.
- *
- * This avoids taking mmap_sem for faulting on the user's address while the
- * struct_mutex is held.
- */
 static int
-i915_gem_shmem_pwrite_slow(struct drm_device *dev,
-                          struct drm_i915_gem_object *obj,
-                          struct drm_i915_gem_pwrite *args,
-                          struct drm_file *file)
+i915_gem_shmem_pwrite(struct drm_device *dev,
+                     struct drm_i915_gem_object *obj,
+                     struct drm_i915_gem_pwrite *args,
+                     struct drm_file *file)
 {
        struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
        ssize_t remain;
        loff_t offset;
        char __user *user_data;
-       int shmem_page_offset, page_length, ret;
+       int shmem_page_offset, page_length, ret = 0;
        int obj_do_bit17_swizzling, page_do_bit17_swizzling;
+       int hit_slowpath = 0;
+       int needs_clflush_after = 0;
+       int needs_clflush_before = 0;
+       int release_page;
 
        user_data = (char __user *) (uintptr_t) args->data_ptr;
        remain = args->size;
 
        obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
 
+       if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
+               /* If we're not in the cpu write domain, set ourselves into the gtt
+                * write domain and manually flush cachelines (if required). This
+                * optimizes for the case when the gpu will use the data
+                * right away and we therefore have to clflush anyway. */
+               if (obj->cache_level == I915_CACHE_NONE)
+                       needs_clflush_after = 1;
+               ret = i915_gem_object_set_to_gtt_domain(obj, true);
+               if (ret)
+                       return ret;
+       }
+       /* Same trick applies to invalidating partially written cachelines
+        * before writing. */
+       if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)
+           && obj->cache_level == I915_CACHE_NONE)
+               needs_clflush_before = 1;
+
        offset = args->offset;
        obj->dirty = 1;
 
-       mutex_unlock(&dev->struct_mutex);
-
        while (remain > 0) {
                struct page *page;
-               char *vaddr;
+               int partial_cacheline_write;
 
                /* Operation in this page
                 *
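
The write path mirrors the read path: instead of a full move to the CPU domain, an uncached object is switched to the GTT write domain and the affected cachelines are flushed by hand, always after the copy, and before it only when stale lines might still sit in the CPU caches. A distilled restatement of that decision as a standalone helper, with illustrative names not taken from the patch:

#include <stdbool.h>

struct clflush_policy {
        bool before;    /* invalidate stale lines before a partial write */
        bool after;     /* flush written lines out to memory afterwards  */
};

/* Distilled restatement of the decision made at the top of
 * i915_gem_shmem_pwrite(); parameter names are illustrative. */
static struct clflush_policy
pwrite_clflush_policy(bool in_cpu_write_domain, bool in_cpu_read_domain,
                      bool cache_coherent /* obj->cache_level != I915_CACHE_NONE */)
{
        struct clflush_policy p = { false, false };

        if (!in_cpu_write_domain && !cache_coherent)
                p.after = true;         /* GPU must see the new data */
        if (!in_cpu_read_domain && !cache_coherent)
                p.before = true;        /* partially written lines may be stale */
        return p;
}
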
@@ -822,29 +742,51 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev,
                if ((shmem_page_offset + page_length) > PAGE_SIZE)
                        page_length = PAGE_SIZE - shmem_page_offset;
 
-               page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-               if (IS_ERR(page)) {
-                       ret = PTR_ERR(page);
-                       goto out;
+               /* If we don't overwrite a cacheline completely we need to be
+                * careful to have up-to-date data by first clflushing. Don't
+                * overcomplicate things and flush the entire range. */
+               partial_cacheline_write = needs_clflush_before &&
+                       ((shmem_page_offset | page_length)
+                               & (boot_cpu_data.x86_clflush_size - 1));
+
+               if (obj->pages) {
+                       page = obj->pages[offset >> PAGE_SHIFT];
+                       release_page = 0;
+               } else {
+                       page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+                       if (IS_ERR(page)) {
+                               ret = PTR_ERR(page);
+                               goto out;
+                       }
+                       release_page = 1;
                }
 
                page_do_bit17_swizzling = obj_do_bit17_swizzling &&
                        (page_to_phys(page) & (1 << 17)) != 0;
 
-               vaddr = kmap(page);
-               if (page_do_bit17_swizzling)
-                       ret = __copy_from_user_swizzled(vaddr, shmem_page_offset,
-                                                       user_data,
-                                                       page_length);
-               else
-                       ret = __copy_from_user(vaddr + shmem_page_offset,
-                                              user_data,
-                                              page_length);
-               kunmap(page);
+               ret = shmem_pwrite_fast(page, shmem_page_offset, page_length,
+                                       user_data, page_do_bit17_swizzling,
+                                       partial_cacheline_write,
+                                       needs_clflush_after);
+               if (ret == 0)
+                       goto next_page;
 
+               hit_slowpath = 1;
+               page_cache_get(page);
+               mutex_unlock(&dev->struct_mutex);
+
+               ret = shmem_pwrite_slow(page, shmem_page_offset, page_length,
+                                       user_data, page_do_bit17_swizzling,
+                                       partial_cacheline_write,
+                                       needs_clflush_after);
+
+               mutex_lock(&dev->struct_mutex);
+               page_cache_release(page);
+next_page:
                set_page_dirty(page);
                mark_page_accessed(page);
-               page_cache_release(page);
+               if (release_page)
+                       page_cache_release(page);
 
                if (ret) {
                        ret = -EFAULT;
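
partial_cacheline_write relies on an OR trick: a write touches only whole cachelines exactly when both its start offset and its length are multiples of the clflush size, so OR-ing the two and masking with (clflush size - 1) exposes any partial coverage. A tiny standalone check, assuming 64-byte cachelines:

#include <stdio.h>

#define CLFLUSH_SIZE 64u        /* boot_cpu_data.x86_clflush_size on most x86 */

static int is_partial_cacheline_write(unsigned int offset, unsigned int length)
{
        return ((offset | length) & (CLFLUSH_SIZE - 1)) != 0;
}

int main(void)
{
        printf("%d\n", is_partial_cacheline_write(0, 4096));   /* 0: whole lines */
        printf("%d\n", is_partial_cacheline_write(64, 128));   /* 0: whole lines */
        printf("%d\n", is_partial_cacheline_write(16, 64));    /* 1: straddles   */
        printf("%d\n", is_partial_cacheline_write(0, 100));    /* 1: partial tail */
        return 0;
}
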
@@ -857,17 +799,21 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev,
        }
 
 out:
-       mutex_lock(&dev->struct_mutex);
-       /* Fixup: Kill any reinstated backing storage pages */
-       if (obj->madv == __I915_MADV_PURGED)
-               i915_gem_object_truncate(obj);
-       /* and flush dirty cachelines in case the object isn't in the cpu write
-        * domain anymore. */
-       if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
-               i915_gem_clflush_object(obj);
-               intel_gtt_chipset_flush();
+       if (hit_slowpath) {
+               /* Fixup: Kill any reinstated backing storage pages */
+               if (obj->madv == __I915_MADV_PURGED)
+                       i915_gem_object_truncate(obj);
+               /* and flush dirty cachelines in case the object isn't in the cpu write
+                * domain anymore. */
+               if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
+                       i915_gem_clflush_object(obj);
+                       intel_gtt_chipset_flush();
+               }
        }
 
+       if (needs_clflush_after)
+               intel_gtt_chipset_flush();
+
        return ret;
 }
 
@@ -892,8 +838,8 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
                       args->size))
                return -EFAULT;
 
-       ret = fault_in_pages_readable((char __user *)(uintptr_t)args->data_ptr,
-                                     args->size);
+       ret = fault_in_multipages_readable((char __user *)(uintptr_t)args->data_ptr,
+                                          args->size);
        if (ret)
                return -EFAULT;
 
@@ -916,6 +862,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 
        trace_i915_gem_object_pwrite(obj, args->offset, args->size);
 
+       ret = -EFAULT;
        /* We can only do the GTT pwrite on untiled buffers, as otherwise
         * it would end up going through the fenced access, and we'll get
         * different detiling behavior between reading and writing.
@@ -928,42 +875,17 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
        }
 
        if (obj->gtt_space &&
+           obj->cache_level == I915_CACHE_NONE &&
+           obj->map_and_fenceable &&
            obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
-               ret = i915_gem_object_pin(obj, 0, true);
-               if (ret)
-                       goto out;
-
-               ret = i915_gem_object_set_to_gtt_domain(obj, true);
-               if (ret)
-                       goto out_unpin;
-
-               ret = i915_gem_object_put_fence(obj);
-               if (ret)
-                       goto out_unpin;
-
                ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file);
-               if (ret == -EFAULT)
-                       ret = i915_gem_gtt_pwrite_slow(dev, obj, args, file);
-
-out_unpin:
-               i915_gem_object_unpin(obj);
-
-               if (ret != -EFAULT)
-                       goto out;
-               /* Fall through to the shmfs paths because the gtt paths might
-                * fail with non-page-backed user pointers (e.g. gtt mappings
-                * when moving data between textures). */
+               /* Note that the gtt paths might fail with non-page-backed user
+                * pointers (e.g. gtt mappings when moving data between
+                * textures). Fallback to the shmem path in that case. */
        }
 
-       ret = i915_gem_object_set_to_cpu_domain(obj, 1);
-       if (ret)
-               goto out;
-
-       ret = -EFAULT;
-       if (!i915_gem_object_needs_bit17_swizzle(obj))
-               ret = i915_gem_shmem_pwrite_fast(dev, obj, args, file);
        if (ret == -EFAULT)
-               ret = i915_gem_shmem_pwrite_slow(dev, obj, args, file);
+               ret = i915_gem_shmem_pwrite(dev, obj, args, file);
 
 out:
        drm_gem_object_unreference(&obj->base);
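
At the ioctl level the GTT fast path is now attempted only for objects that are already bound, uncached and mappable, and an -EFAULT from it simply falls through to the unified shmem path. The userspace side is again unchanged; a minimal sketch of the caller, assuming libdrm/uapi headers:

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Write 'size' bytes from 'src' into GEM object 'handle' at 'offset'.
 * Returns 0 on success, -1 with errno set otherwise. */
static int gem_pwrite(int fd, uint32_t handle, uint64_t offset,
                      const void *src, uint64_t size)
{
        struct drm_i915_gem_pwrite pwrite = {
                .handle   = handle,
                .offset   = offset,
                .size     = size,
                .data_ptr = (uintptr_t)src,
        };

        return ioctl(fd, DRM_IOCTL_I915_GEM_PWRITE, &pwrite);
}
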
@@ -1153,6 +1075,9 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                        goto unlock;
        }
 
+       if (!obj->has_global_gtt_mapping)
+               i915_gem_gtt_bind_object(obj, obj->cache_level);
+
        if (obj->tiling_mode == I915_TILING_NONE)
                ret = i915_gem_object_put_fence(obj);
        else
@@ -1546,6 +1471,9 @@ i915_gem_object_truncate(struct drm_i915_gem_object *obj)
        inode = obj->base.filp->f_path.dentry->d_inode;
        shmem_truncate_range(inode, 0, (loff_t)-1);
 
+       if (obj->base.map_list.map)
+               drm_gem_free_mmap_offset(&obj->base);
+
        obj->madv = __I915_MADV_PURGED;
 }
 
@@ -1954,6 +1882,8 @@ i915_wait_request(struct intel_ring_buffer *ring,
        if (!i915_seqno_passed(ring->get_seqno(ring), seqno)) {
                if (HAS_PCH_SPLIT(ring->dev))
                        ier = I915_READ(DEIER) | I915_READ(GTIER);
+               else if (IS_VALLEYVIEW(ring->dev))
+                       ier = I915_READ(GTIER) | I915_READ(VLV_IER);
                else
                        ier = I915_READ(IER);
                if (!ier) {
@@ -2100,11 +2030,13 @@ i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 
        trace_i915_gem_object_unbind(obj);
 
-       i915_gem_gtt_unbind_object(obj);
+       if (obj->has_global_gtt_mapping)
+               i915_gem_gtt_unbind_object(obj);
        if (obj->has_aliasing_ppgtt_mapping) {
                i915_ppgtt_unbind_object(dev_priv->mm.aliasing_ppgtt, obj);
                obj->has_aliasing_ppgtt_mapping = 0;
        }
+       i915_gem_gtt_finish_object(obj);
 
        i915_gem_object_put_pages_gtt(obj);
 
@@ -2749,7 +2681,7 @@ i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
                return ret;
        }
 
-       ret = i915_gem_gtt_bind_object(obj);
+       ret = i915_gem_gtt_prepare_object(obj);
        if (ret) {
                i915_gem_object_put_pages_gtt(obj);
                drm_mm_put_block(obj->gtt_space);
@@ -2761,6 +2693,9 @@ i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
                goto search_free;
        }
 
+       if (!dev_priv->mm.aliasing_ppgtt)
+               i915_gem_gtt_bind_object(obj, obj->cache_level);
+
        list_add_tail(&obj->gtt_list, &dev_priv->mm.gtt_list);
        list_add_tail(&obj->mm_list, &dev_priv->mm.inactive_list);
 
@@ -2953,7 +2888,8 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
                                return ret;
                }
 
-               i915_gem_gtt_rebind_object(obj, cache_level);
+               if (obj->has_global_gtt_mapping)
+                       i915_gem_gtt_bind_object(obj, cache_level);
                if (obj->has_aliasing_ppgtt_mapping)
                        i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
                                               obj, cache_level);
@@ -3082,7 +3018,7 @@ i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj)
  * This function returns when the move is complete, including waiting on
  * flushes to occur.
  */
-static int
+int
 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 {
        uint32_t old_write_domain, old_read_domains;
@@ -3101,11 +3037,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 
        i915_gem_object_flush_gtt_write_domain(obj);
 
-       /* If we have a partially-valid cache of the object in the CPU,
-        * finish invalidating it and free the per-page flags.
-        */
-       i915_gem_object_set_to_full_cpu_read_domain(obj);
-
        old_write_domain = obj->base.write_domain;
        old_read_domains = obj->base.read_domains;
 
@@ -3136,113 +3067,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
        return 0;
 }
 
-/**
- * Moves the object from a partially CPU read to a full one.
- *
- * Note that this only resolves i915_gem_object_set_cpu_read_domain_range(),
- * and doesn't handle transitioning from !(read_domains & I915_GEM_DOMAIN_CPU).
- */
-static void
-i915_gem_object_set_to_full_cpu_read_domain(struct drm_i915_gem_object *obj)
-{
-       if (!obj->page_cpu_valid)
-               return;
-
-       /* If we're partially in the CPU read domain, finish moving it in.
-        */
-       if (obj->base.read_domains & I915_GEM_DOMAIN_CPU) {
-               int i;
-
-               for (i = 0; i <= (obj->base.size - 1) / PAGE_SIZE; i++) {
-                       if (obj->page_cpu_valid[i])
-                               continue;
-                       drm_clflush_pages(obj->pages + i, 1);
-               }
-       }
-
-       /* Free the page_cpu_valid mappings which are now stale, whether
-        * or not we've got I915_GEM_DOMAIN_CPU.
-        */
-       kfree(obj->page_cpu_valid);
-       obj->page_cpu_valid = NULL;
-}
-
-/**
- * Set the CPU read domain on a range of the object.
- *
- * The object ends up with I915_GEM_DOMAIN_CPU in its read flags although it's
- * not entirely valid.  The page_cpu_valid member of the object flags which
- * pages have been flushed, and will be respected by
- * i915_gem_object_set_to_cpu_domain() if it's called on to get a valid mapping
- * of the whole object.
- *
- * This function returns when the move is complete, including waiting on
- * flushes to occur.
- */
-static int
-i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
-                                         uint64_t offset, uint64_t size)
-{
-       uint32_t old_read_domains;
-       int i, ret;
-
-       if (offset == 0 && size == obj->base.size)
-               return i915_gem_object_set_to_cpu_domain(obj, 0);
-
-       ret = i915_gem_object_flush_gpu_write_domain(obj);
-       if (ret)
-               return ret;
-
-       ret = i915_gem_object_wait_rendering(obj);
-       if (ret)
-               return ret;
-
-       i915_gem_object_flush_gtt_write_domain(obj);
-
-       /* If we're already fully in the CPU read domain, we're done. */
-       if (obj->page_cpu_valid == NULL &&
-           (obj->base.read_domains & I915_GEM_DOMAIN_CPU) != 0)
-               return 0;
-
-       /* Otherwise, create/clear the per-page CPU read domain flag if we're
-        * newly adding I915_GEM_DOMAIN_CPU
-        */
-       if (obj->page_cpu_valid == NULL) {
-               obj->page_cpu_valid = kzalloc(obj->base.size / PAGE_SIZE,
-                                             GFP_KERNEL);
-               if (obj->page_cpu_valid == NULL)
-                       return -ENOMEM;
-       } else if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
-               memset(obj->page_cpu_valid, 0, obj->base.size / PAGE_SIZE);
-
-       /* Flush the cache on any pages that are still invalid from the CPU's
-        * perspective.
-        */
-       for (i = offset / PAGE_SIZE; i <= (offset + size - 1) / PAGE_SIZE;
-            i++) {
-               if (obj->page_cpu_valid[i])
-                       continue;
-
-               drm_clflush_pages(obj->pages + i, 1);
-
-               obj->page_cpu_valid[i] = 1;
-       }
-
-       /* It should now be out of any other write domains, and we can update
-        * the domain values for our changes.
-        */
-       BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
-
-       old_read_domains = obj->base.read_domains;
-       obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
-
-       trace_i915_gem_object_change_domain(obj,
-                                           old_read_domains,
-                                           obj->base.write_domain);
-
-       return 0;
-}
-
 /* Throttle our rendering by waiting until the ring has completed our requests
  * emitted over 20 msec ago.
  *
@@ -3343,6 +3167,9 @@ i915_gem_object_pin(struct drm_i915_gem_object *obj,
                        return ret;
        }
 
+       if (!obj->has_global_gtt_mapping && map_and_fenceable)
+               i915_gem_gtt_bind_object(obj, obj->cache_level);
+
        if (obj->pin_count++ == 0) {
                if (!obj->active)
                        list_move_tail(&obj->mm_list,
@@ -3664,7 +3491,6 @@ static void i915_gem_free_object_tail(struct drm_i915_gem_object *obj)
        drm_gem_object_release(&obj->base);
        i915_gem_info_remove_obj(dev_priv, obj->base.size);
 
-       kfree(obj->page_cpu_valid);
        kfree(obj->bit_17);
        kfree(obj);
 }