[PATCHv5 3/8] gpu: host1x: Add channel support

Terje Bergstrom <tbergstrom@xxxxxxxxxx> · Tue, 15 Jan 2013 13:26:17 +0200

Add support for host1x client modules, and host1x channels to submit
work to the clients. The work is submitted in GEM CMA buffers, so
this patch adds support for them.

Signed-off-by: Terje Bergstrom <tbergstrom@xxxxxxxxxx>
---
 drivers/gpu/host1x/Kconfig                  |   25 +-
 drivers/gpu/host1x/Makefile                 |    5 +
 drivers/gpu/host1x/cdma.c                   |  439 +++++++++++++++++++
 drivers/gpu/host1x/cdma.h                   |  107 +++++
 drivers/gpu/host1x/channel.c                |  140 ++++++
 drivers/gpu/host1x/channel.h                |   58 +++
 drivers/gpu/host1x/cma.c                    |  116 +++++
 drivers/gpu/host1x/cma.h                    |   43 ++
 drivers/gpu/host1x/dev.c                    |   13 +
 drivers/gpu/host1x/dev.h                    |   59 +++
 drivers/gpu/host1x/host1x.h                 |   29 ++
 drivers/gpu/host1x/hw/cdma_hw.c             |  475 +++++++++++++++++++++
 drivers/gpu/host1x/hw/cdma_hw.h             |   37 ++
 drivers/gpu/host1x/hw/channel_hw.c          |  148 +++++++
 drivers/gpu/host1x/hw/host1x01.c            |    6 +
 drivers/gpu/host1x/hw/host1x01_hardware.h   |  124 ++++++
 drivers/gpu/host1x/hw/hw_host1x01_channel.h |  102 +++++
 drivers/gpu/host1x/hw/hw_host1x01_sync.h    |   12 +
 drivers/gpu/host1x/hw/hw_host1x01_uclass.h  |  168 ++++++++
 drivers/gpu/host1x/hw/syncpt_hw.c           |   10 +
 drivers/gpu/host1x/intr.c                   |   29 +-
 drivers/gpu/host1x/intr.h                   |    6 +
 drivers/gpu/host1x/job.c                    |  612 +++++++++++++++++++++++++++
 drivers/gpu/host1x/job.h                    |  164 +++++++
 drivers/gpu/host1x/memmgr.c                 |  173 ++++++++
 drivers/gpu/host1x/memmgr.h                 |   72 ++++
 drivers/gpu/host1x/syncpt.c                 |   11 +
 drivers/gpu/host1x/syncpt.h                 |    4 +
 include/trace/events/host1x.h               |  211 +++++++++
 29 files changed, 3396 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/host1x/cdma.c
 create mode 100644 drivers/gpu/host1x/cdma.h
 create mode 100644 drivers/gpu/host1x/channel.c
 create mode 100644 drivers/gpu/host1x/channel.h
 create mode 100644 drivers/gpu/host1x/cma.c
 create mode 100644 drivers/gpu/host1x/cma.h
 create mode 100644 drivers/gpu/host1x/host1x.h
 create mode 100644 drivers/gpu/host1x/hw/cdma_hw.c
 create mode 100644 drivers/gpu/host1x/hw/cdma_hw.h
 create mode 100644 drivers/gpu/host1x/hw/channel_hw.c
 create mode 100644 drivers/gpu/host1x/hw/hw_host1x01_channel.h
 create mode 100644 drivers/gpu/host1x/hw/hw_host1x01_uclass.h
 create mode 100644 drivers/gpu/host1x/job.c
 create mode 100644 drivers/gpu/host1x/job.h
 create mode 100644 drivers/gpu/host1x/memmgr.c
 create mode 100644 drivers/gpu/host1x/memmgr.h

diff --git a/drivers/gpu/host1x/Kconfig b/drivers/gpu/host1x/Kconfig
index e89fb2b..57680a6 100644
--- a/drivers/gpu/host1x/Kconfig
+++ b/drivers/gpu/host1x/Kconfig
@@ -3,4 +3,27 @@ config TEGRA_HOST1X
 	help
 	  Driver for the Tegra host1x hardware.
 
-	  Required for enabling tegradrm.
+	  Required for enabling tegradrm and 2D acceleration.
+
+if TEGRA_HOST1X
+
+config TEGRA_HOST1X_CMA
+	bool "Support DRM CMA buffers"
+	depends on DRM
+	default y
+	select DRM_GEM_CMA_HELPER
+	select DRM_KMS_CMA_HELPER
+	help
+	  Say yes if you wish to use DRM CMA buffers.
+
+	  If unsure, choose Y.
+
+config TEGRA_HOST1X_FIREWALL
+	bool "Enable HOST1X security firewall"
+	default y
+	help
+	  Say yes if kernel should protect command streams from tampering.
+
+	  If unsure, choose Y.
+
+endif
diff --git a/drivers/gpu/host1x/Makefile b/drivers/gpu/host1x/Makefile
index 5ef47ff..cdd87c8 100644
--- a/drivers/gpu/host1x/Makefile
+++ b/drivers/gpu/host1x/Makefile
@@ -4,6 +4,11 @@ host1x-y = \
 	syncpt.o \
 	dev.o \
 	intr.o \
+	cdma.o \
+	channel.o \
+	job.o \
+	memmgr.o \
 	hw/host1x01.o
 
+host1x-$(CONFIG_TEGRA_HOST1X_CMA) += cma.o
 obj-$(CONFIG_TEGRA_HOST1X) += host1x.o
diff --git a/drivers/gpu/host1x/cdma.c b/drivers/gpu/host1x/cdma.c
new file mode 100644
index 0000000..d6a38d2
--- /dev/null
+++ b/drivers/gpu/host1x/cdma.c
@@ -0,0 +1,439 @@
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "cdma.h"
+#include "channel.h"
+#include "dev.h"
+#include "memmgr.h"
+#include "job.h"
+#include <asm/cacheflush.h>
+
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/interrupt.h>
+#include <trace/events/host1x.h>
+
+#define TRACE_MAX_LENGTH 128U
+
+/*
+ * Add an entry to the sync queue.
+ */
+static void add_to_sync_queue(struct host1x_cdma *cdma,
+			      struct host1x_job *job,
+			      u32 nr_slots,
+			      u32 first_get)
+{
+	if (job->syncpt_id == NVSYNCPT_INVALID) {
+		dev_warn(&job->ch->dev->dev, "%s: Invalid syncpt\n",
+				__func__);
+		return;
+	}
+
+	job->first_get = first_get;
+	job->num_slots = nr_slots;
+	host1x_job_get(job);
+	list_add_tail(&job->list, &cdma->sync_queue);
+}
+
+/*
+ * Return the status of the cdma's sync queue or push buffer for the given event
+ *  - sq empty: returns 1 for empty, 0 for not empty (as in "1 empty queue" :-)
+ *  - pb space: returns the number of free slots in the channel's push buffer
+ * Must be called with the cdma lock held.
+ */
+static unsigned int cdma_status_locked(struct host1x_cdma *cdma,
+		enum cdma_event event)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	switch (event) {
+	case CDMA_EVENT_SYNC_QUEUE_EMPTY:
+		return list_empty(&cdma->sync_queue) ? 1 : 0;
+	case CDMA_EVENT_PUSH_BUFFER_SPACE: {
+		struct push_buffer *pb = &cdma->push_buffer;
+		return host1x->cdma_pb_op.space(pb);
+	}
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Sleep (if necessary) until the requested event happens
+ *   - CDMA_EVENT_SYNC_QUEUE_EMPTY : sync queue is completely empty.
+ *     - Returns 1
+ *   - CDMA_EVENT_PUSH_BUFFER_SPACE : there is space in the push buffer
+ *     - Return the amount of space (> 0)
+ * Must be called with the cdma lock held.
+ */
+unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma,
+		enum cdma_event event)
+{
+	for (;;) {
+		unsigned int space = cdma_status_locked(cdma, event);
+		if (space)
+			return space;
+
+		trace_host1x_wait_cdma(cdma_to_channel(cdma)->dev->name,
+				event);
+
+		/* If somebody has managed to already start waiting, yield */
+		if (cdma->event != CDMA_EVENT_NONE) {
+			mutex_unlock(&cdma->lock);
+			schedule();
+			mutex_lock(&cdma->lock);
+			continue;
+		}
+		cdma->event = event;
+
+		mutex_unlock(&cdma->lock);
+		down(&cdma->sem);
+		mutex_lock(&cdma->lock);
+	}
+	return 0;
+}
+
+/*
+ * Start timer for a buffer submition that has completed yet.
+ * Must be called with the cdma lock held.
+ */
+static void cdma_start_timer_locked(struct host1x_cdma *cdma,
+		struct host1x_job *job)
+{
+	struct host1x *host = cdma_to_host1x(cdma);
+
+	if (cdma->timeout.clientid) {
+		/* timer already started */
+		return;
+	}
+
+	cdma->timeout.clientid = job->clientid;
+	cdma->timeout.syncpt = host1x_syncpt_get(host, job->syncpt_id);
+	cdma->timeout.syncpt_val = job->syncpt_end;
+	cdma->timeout.start_ktime = ktime_get();
+
+	schedule_delayed_work(&cdma->timeout.wq,
+			msecs_to_jiffies(job->timeout));
+}
+
+/*
+ * Stop timer when a buffer submition completes.
+ * Must be called with the cdma lock held.
+ */
+static void stop_cdma_timer_locked(struct host1x_cdma *cdma)
+{
+	cancel_delayed_work(&cdma->timeout.wq);
+	cdma->timeout.clientid = 0;
+}
+
+/*
+ * For all sync queue entries that have already finished according to the
+ * current sync point registers:
+ *  - unpin & unref their mems
+ *  - pop their push buffer slots
+ *  - remove them from the sync queue
+ * This is normally called from the host code's worker thread, but can be
+ * called manually if necessary.
+ * Must be called with the cdma lock held.
+ */
+static void update_cdma_locked(struct host1x_cdma *cdma)
+{
+	bool signal = false;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_job *job, *n;
+
+	/* If CDMA is stopped, queue is cleared and we can return */
+	if (!cdma->running)
+		return;
+
+	/*
+	 * Walk the sync queue, reading the sync point registers as necessary,
+	 * to consume as many sync queue entries as possible without blocking
+	 */
+	list_for_each_entry_safe(job, n, &cdma->sync_queue, list) {
+		struct host1x_syncpt *sp = host1x->syncpt + job->syncpt_id;
+
+		/* Check whether this syncpt has completed, and bail if not */
+		if (!host1x_syncpt_is_expired(sp, job->syncpt_end)) {
+			/* Start timer on next pending syncpt */
+			if (job->timeout)
+				cdma_start_timer_locked(cdma, job);
+			break;
+		}
+
+		/* Cancel timeout, when a buffer completes */
+		if (cdma->timeout.clientid)
+			stop_cdma_timer_locked(cdma);
+
+		/* Unpin the memory */
+		host1x_job_unpin(job);
+
+		/* Pop push buffer slots */
+		if (job->num_slots) {
+			struct push_buffer *pb = &cdma->push_buffer;
+			host1x->cdma_pb_op.pop_from(pb, job->num_slots);
+			if (cdma->event == CDMA_EVENT_PUSH_BUFFER_SPACE)
+				signal = true;
+		}
+
+		list_del(&job->list);
+		host1x_job_put(job);
+	}
+
+	if (list_empty(&cdma->sync_queue) &&
+				cdma->event == CDMA_EVENT_SYNC_QUEUE_EMPTY)
+			signal = true;
+
+	/* Wake up CdmaWait() if the requested event happened */
+	if (signal) {
+		cdma->event = CDMA_EVENT_NONE;
+		up(&cdma->sem);
+	}
+}
+
+void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
+		struct platform_device *dev)
+{
+	u32 get_restart;
+	u32 syncpt_incrs;
+	struct host1x_job *job = NULL;
+	u32 syncpt_val;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	syncpt_val = host1x_syncpt_load_min(cdma->timeout.syncpt);
+
+	dev_dbg(&dev->dev,
+		"%s: starting cleanup (thresh %d)\n",
+		__func__, syncpt_val);
+
+	/*
+	 * Move the sync_queue read pointer to the first entry that hasn't
+	 * completed based on the current HW syncpt value. It's likely there
+	 * won't be any (i.e. we're still at the head), but covers the case
+	 * where a syncpt incr happens just prior/during the teardown.
+	 */
+
+	dev_dbg(&dev->dev,
+		"%s: skip completed buffers still in sync_queue\n",
+		__func__);
+
+	list_for_each_entry(job, &cdma->sync_queue, list) {
+		if (syncpt_val < job->syncpt_end)
+			break;
+
+		host1x_job_dump(&dev->dev, job);
+	}
+
+	/*
+	 * Walk the sync_queue, first incrementing with the CPU syncpts that
+	 * are partially executed (the first buffer) or fully skipped while
+	 * still in the current context (slots are also NOP-ed).
+	 *
+	 * At the point contexts are interleaved, syncpt increments must be
+	 * done inline with the pushbuffer from a GATHER buffer to maintain
+	 * the order (slots are modified to be a GATHER of syncpt incrs).
+	 *
+	 * Note: save in get_restart the location where the timed out buffer
+	 * started in the PB, so we can start the refetch from there (with the
+	 * modified NOP-ed PB slots). This lets things appear to have completed
+	 * properly for this buffer and resources are freed.
+	 */
+
+	dev_dbg(&dev->dev,
+		"%s: perform CPU incr on pending same ctx buffers\n",
+		__func__);
+
+	get_restart = cdma->last_put;
+	if (!list_empty(&cdma->sync_queue))
+		get_restart = job->first_get;
+
+	/* do CPU increments as long as this context continues */
+	list_for_each_entry_from(job, &cdma->sync_queue, list) {
+		/* different context, gets us out of this loop */
+		if (job->clientid != cdma->timeout.clientid)
+			break;
+
+		/* won't need a timeout when replayed */
+		job->timeout = 0;
+
+		syncpt_incrs = job->syncpt_end - syncpt_val;
+		dev_dbg(&dev->dev,
+			"%s: CPU incr (%d)\n", __func__, syncpt_incrs);
+
+		host1x_job_dump(&dev->dev, job);
+
+		/* safe to use CPU to incr syncpts */
+		host1x->cdma_op.timeout_cpu_incr(cdma,
+				job->first_get,
+				syncpt_incrs,
+				job->syncpt_end,
+				job->num_slots);
+
+		syncpt_val += syncpt_incrs;
+	}
+
+	list_for_each_entry_from(job, &cdma->sync_queue, list)
+		if (job->clientid == cdma->timeout.clientid)
+			job->timeout = 500;
+
+	dev_dbg(&dev->dev,
+		"%s: finished sync_queue modification\n", __func__);
+
+	/* roll back DMAGET and start up channel again */
+	host1x->cdma_op.timeout_teardown_end(cdma, get_restart);
+}
+
+/*
+ * Create a cdma
+ */
+int host1x_cdma_init(struct host1x_cdma *cdma)
+{
+	int err;
+	struct push_buffer *pb = &cdma->push_buffer;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	mutex_init(&cdma->lock);
+	sema_init(&cdma->sem, 0);
+
+	INIT_LIST_HEAD(&cdma->sync_queue);
+
+	cdma->event = CDMA_EVENT_NONE;
+	cdma->running = false;
+	cdma->torndown = false;
+
+	err = host1x->cdma_pb_op.init(pb);
+	if (err)
+		return err;
+	return 0;
+}
+
+/*
+ * Destroy a cdma
+ */
+void host1x_cdma_deinit(struct host1x_cdma *cdma)
+{
+	struct push_buffer *pb = &cdma->push_buffer;
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	if (cdma->running) {
+		pr_warn("%s: CDMA still running\n",
+				__func__);
+	} else {
+		host1x->cdma_pb_op.destroy(pb);
+		host1x->cdma_op.timeout_destroy(cdma);
+	}
+}
+
+/*
+ * Begin a cdma submit
+ */
+int host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	mutex_lock(&cdma->lock);
+
+	if (job->timeout) {
+		/* init state on first submit with timeout value */
+		if (!cdma->timeout.initialized) {
+			int err;
+			err = host1x->cdma_op.timeout_init(cdma,
+					job->syncpt_id);
+			if (err) {
+				mutex_unlock(&cdma->lock);
+				return err;
+			}
+		}
+	}
+	if (!cdma->running)
+		host1x->cdma_op.start(cdma);
+
+	cdma->slots_free = 0;
+	cdma->slots_used = 0;
+	cdma->first_get = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	trace_host1x_cdma_begin(job->ch->dev->name);
+	return 0;
+}
+
+/*
+ * Push two words into a push buffer slot
+ * Blocks as necessary if the push buffer is full.
+ */
+void host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2)
+{
+	host1x_cdma_push_gather(cdma, NULL, 0, op1, op2);
+}
+
+/*
+ * Push two words into a push buffer slot
+ * Blocks as necessary if the push buffer is full.
+ */
+void host1x_cdma_push_gather(struct host1x_cdma *cdma,
+		struct mem_handle *handle,
+		u32 offset, u32 op1, u32 op2)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	u32 slots_free = cdma->slots_free;
+	struct push_buffer *pb = &cdma->push_buffer;
+
+	if (slots_free == 0) {
+		host1x->cdma_op.kick(cdma);
+		slots_free = host1x_cdma_wait_locked(cdma,
+				CDMA_EVENT_PUSH_BUFFER_SPACE);
+	}
+	cdma->slots_free = slots_free - 1;
+	cdma->slots_used++;
+	host1x->cdma_pb_op.push_to(pb, handle, op1, op2);
+}
+
+/*
+ * End a cdma submit
+ * Kick off DMA, add job to the sync queue, and a number of slots to be freed
+ * from the pushbuffer. The handles for a submit must all be pinned at the same
+ * time, but they can be unpinned in smaller chunks.
+ */
+void host1x_cdma_end(struct host1x_cdma *cdma,
+		struct host1x_job *job)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	bool was_idle = list_empty(&cdma->sync_queue);
+
+	host1x->cdma_op.kick(cdma);
+
+	add_to_sync_queue(cdma,
+			job,
+			cdma->slots_used,
+			cdma->first_get);
+
+	/* start timer on idle -> active transitions */
+	if (job->timeout && was_idle)
+		cdma_start_timer_locked(cdma, job);
+
+	trace_host1x_cdma_end(job->ch->dev->name);
+	mutex_unlock(&cdma->lock);
+}
+
+/*
+ * Update cdma state according to current sync point values
+ */
+void host1x_cdma_update(struct host1x_cdma *cdma)
+{
+	mutex_lock(&cdma->lock);
+	update_cdma_locked(cdma);
+	mutex_unlock(&cdma->lock);
+}
diff --git a/drivers/gpu/host1x/cdma.h b/drivers/gpu/host1x/cdma.h
new file mode 100644
index 0000000..d9cabef
--- /dev/null
+++ b/drivers/gpu/host1x/cdma.h
@@ -0,0 +1,107 @@
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CDMA_H
+#define __HOST1X_CDMA_H
+
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+
+#include <linux/list.h>
+
+struct host1x_syncpt;
+struct host1x_userctx_timeout;
+struct host1x_job;
+struct mem_handle;
+struct platform_device;
+
+/*
+ * cdma
+ *
+ * This is in charge of a host command DMA channel.
+ * Sends ops to a push buffer, and takes responsibility for unpinning
+ * (& possibly freeing) of memory after those ops have completed.
+ * Producer:
+ *	begin
+ *		push - send ops to the push buffer
+ *	end - start command DMA and enqueue handles to be unpinned
+ * Consumer:
+ *	update - call to update sync queue and push buffer, unpin memory
+ */
+
+struct push_buffer {
+	u32 *mapped;			/* mapped pushbuffer memory */
+	dma_addr_t phys;		/* physical address of pushbuffer */
+	u32 fence;			/* index we've written */
+	u32 cur;			/* index to write to */
+	struct mem_handle **handle;	/* handle for each opcode pair */
+};
+
+struct buffer_timeout {
+	struct delayed_work wq;		/* work queue */
+	bool initialized;		/* timer one-time setup flag */
+	struct host1x_syncpt *syncpt;	/* buffer completion syncpt */
+	u32 syncpt_val;			/* syncpt value when completed */
+	ktime_t start_ktime;		/* starting time */
+	/* context timeout information */
+	int clientid;
+};
+
+enum cdma_event {
+	CDMA_EVENT_NONE,		/* not waiting for any event */
+	CDMA_EVENT_SYNC_QUEUE_EMPTY,	/* wait for empty sync queue */
+	CDMA_EVENT_PUSH_BUFFER_SPACE	/* wait for space in push buffer */
+};
+
+struct host1x_cdma {
+	struct mutex lock;		/* controls access to shared state */
+	struct semaphore sem;		/* signalled when event occurs */
+	enum cdma_event event;		/* event that sem is waiting for */
+	unsigned int slots_used;	/* pb slots used in current submit */
+	unsigned int slots_free;	/* pb slots free in current submit */
+	unsigned int first_get;		/* DMAGET value, where submit begins */
+	unsigned int last_put;		/* last value written to DMAPUT */
+	struct push_buffer push_buffer;	/* channel's push buffer */
+	struct list_head sync_queue;	/* job queue */
+	struct buffer_timeout timeout;	/* channel's timeout state/wq */
+	bool running;
+	bool torndown;
+};
+
+#define cdma_to_channel(cdma) container_of(cdma, struct host1x_channel, cdma)
+#define cdma_to_host1x(cdma) host1x_get_host(cdma_to_channel(cdma)->dev)
+#define cdma_to_memmgr(cdma) ((cdma_to_host1x(cdma))->memmgr)
+#define pb_to_cdma(pb) container_of(pb, struct host1x_cdma, push_buffer)
+
+int	host1x_cdma_init(struct host1x_cdma *cdma);
+void	host1x_cdma_deinit(struct host1x_cdma *cdma);
+void	host1x_cdma_stop(struct host1x_cdma *cdma);
+int	host1x_cdma_begin(struct host1x_cdma *cdma, struct host1x_job *job);
+void	host1x_cdma_push(struct host1x_cdma *cdma, u32 op1, u32 op2);
+void	host1x_cdma_push_gather(struct host1x_cdma *cdma,
+		struct mem_handle *handle, u32 offset, u32 op1, u32 op2);
+void	host1x_cdma_end(struct host1x_cdma *cdma,
+		struct host1x_job *job);
+void	host1x_cdma_update(struct host1x_cdma *cdma);
+void	host1x_cdma_peek(struct host1x_cdma *cdma,
+		u32 dmaget, int slot, u32 *out);
+unsigned int host1x_cdma_wait_locked(struct host1x_cdma *cdma,
+		enum cdma_event event);
+void host1x_cdma_update_sync_queue(struct host1x_cdma *cdma,
+		struct platform_device *dev);
+#endif
diff --git a/drivers/gpu/host1x/channel.c b/drivers/gpu/host1x/channel.c
new file mode 100644
index 0000000..ff647ac
--- /dev/null
+++ b/drivers/gpu/host1x/channel.c
@@ -0,0 +1,140 @@
+/*
+ * Tegra host1x Channel
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "channel.h"
+#include "dev.h"
+#include "job.h"
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+/* Constructor for the host1x device list */
+void host1x_channel_list_init(struct host1x *host1x)
+{
+	INIT_LIST_HEAD(&host1x->chlist.list);
+	mutex_init(&host1x->chlist_mutex);
+}
+
+/*
+ * Iterator function for host1x device list
+ * It takes a fptr as an argument and calls that function for each
+ * device in the list
+ */
+void host1x_channel_for_all(struct host1x *host1x, void *data,
+	int (*fptr)(struct host1x_channel *ch, void *fdata))
+{
+	struct host1x_channel *ch;
+	int ret;
+
+	list_for_each_entry(ch, &host1x->chlist.list, list) {
+		if (ch && fptr) {
+			ret = fptr(ch, data);
+			if (ret) {
+				pr_info("%s: iterator error\n", __func__);
+				break;
+			}
+		}
+	}
+}
+
+
+int host1x_channel_submit(struct host1x_job *job)
+{
+	return host1x_get_host(job->ch->dev)->channel_op.submit(job);
+}
+
+struct host1x_channel *host1x_channel_get(struct host1x_channel *ch)
+{
+	int err = 0;
+
+	mutex_lock(&ch->reflock);
+	if (ch->refcount == 0)
+		err = host1x_cdma_init(&ch->cdma);
+	if (!err)
+		ch->refcount++;
+
+	mutex_unlock(&ch->reflock);
+
+	return err ? NULL : ch;
+}
+
+void host1x_channel_put(struct host1x_channel *ch)
+{
+	mutex_lock(&ch->reflock);
+	if (ch->refcount == 1) {
+		host1x_get_host(ch->dev)->cdma_op.stop(&ch->cdma);
+		host1x_cdma_deinit(&ch->cdma);
+	}
+	ch->refcount--;
+	mutex_unlock(&ch->reflock);
+}
+
+struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev)
+{
+	struct host1x_channel *ch = NULL;
+	struct host1x *host1x = host1x_get_host(pdev);
+	int chindex;
+	int max_channels = host1x->info.nb_channels;
+	int err;
+
+	mutex_lock(&host1x->chlist_mutex);
+
+	chindex = host1x->allocated_channels;
+	if (chindex > max_channels)
+		goto fail;
+
+	ch = kzalloc(sizeof(*ch), GFP_KERNEL);
+	if (ch == NULL)
+		goto fail;
+
+	/* Link platform_device to host1x_channel */
+	err = host1x->channel_op.init(ch, host1x, chindex);
+	if (err < 0)
+		goto fail;
+
+	ch->dev = pdev;
+
+	/* Add to channel list */
+	list_add_tail(&ch->list, &host1x->chlist.list);
+
+	host1x->allocated_channels++;
+
+	mutex_unlock(&host1x->chlist_mutex);
+	return ch;
+
+fail:
+	dev_err(&pdev->dev, "failed to init channel\n");
+	kfree(ch);
+	mutex_unlock(&host1x->chlist_mutex);
+	return NULL;
+}
+
+void host1x_channel_free(struct host1x_channel *ch)
+{
+	struct host1x *host1x = host1x_get_host(ch->dev);
+	struct host1x_channel *chiter, *tmp;
+	list_for_each_entry_safe(chiter, tmp, &host1x->chlist.list, list) {
+		if (chiter == ch) {
+			list_del(&chiter->list);
+			kfree(ch);
+			host1x->allocated_channels--;
+
+			return;
+		}
+	}
+}
diff --git a/drivers/gpu/host1x/channel.h b/drivers/gpu/host1x/channel.h
new file mode 100644
index 0000000..41eb01e
--- /dev/null
+++ b/drivers/gpu/host1x/channel.h
@@ -0,0 +1,58 @@
+/*
+ * Tegra host1x Channel
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CHANNEL_H
+#define __HOST1X_CHANNEL_H
+
+#include <linux/cdev.h>
+#include <linux/io.h>
+#include "cdma.h"
+
+struct host1x;
+struct platform_device;
+
+/*
+ * host1x device list in debug-fs dump of host1x and client device
+ * as well as channel state
+ */
+struct host1x_channel {
+	struct list_head list;
+
+	int refcount;
+	int chid;
+	struct mutex reflock;
+	struct mutex submitlock;
+	void __iomem *regs;
+	struct device *node;
+	struct platform_device *dev;
+	struct cdev cdev;
+	struct host1x_cdma cdma;
+};
+
+/* channel list operations */
+void host1x_channel_list_init(struct host1x *);
+void host1x_channel_for_all(struct host1x *, void *data,
+	int (*fptr)(struct host1x_channel *ch, void *fdata));
+
+struct host1x_channel *host1x_channel_alloc(struct platform_device *pdev);
+void host1x_channel_free(struct host1x_channel *ch);
+struct host1x_channel *host1x_channel_get(struct host1x_channel *ch);
+void host1x_channel_put(struct host1x_channel *ch);
+int host1x_channel_submit(struct host1x_job *job);
+
+#endif
diff --git a/drivers/gpu/host1x/cma.c b/drivers/gpu/host1x/cma.c
new file mode 100644
index 0000000..06b7959
--- /dev/null
+++ b/drivers/gpu/host1x/cma.c
@@ -0,0 +1,116 @@
+/*
+ * Tegra host1x CMA support
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <drm/drmP.h>
+#include <drm/drm.h>
+#include <drm/drm_gem_cma_helper.h>
+#include <linux/mutex.h>
+
+#include "cma.h"
+#include "memmgr.h"
+
+static inline struct drm_gem_cma_object *to_cma_obj(struct mem_handle *h)
+{
+	return (struct drm_gem_cma_object *)(((u32)h) & MEMMGR_ID_MASK);
+}
+
+struct mem_handle *host1x_cma_alloc(size_t size, size_t align, int flags)
+{
+	return NULL;
+}
+
+void host1x_cma_put(struct mem_handle *handle)
+{
+	struct drm_gem_cma_object *obj = to_cma_obj(handle);
+	struct mutex *struct_mutex = &obj->base.dev->struct_mutex;
+
+	mutex_lock(struct_mutex);
+	drm_gem_object_unreference(&obj->base);
+	mutex_unlock(struct_mutex);
+}
+
+struct sg_table *host1x_cma_pin(struct mem_handle *handle)
+{
+	return NULL;
+}
+
+void host1x_cma_unpin(struct mem_handle *handle, struct sg_table *sgt)
+{
+
+}
+
+
+void *host1x_cma_mmap(struct mem_handle *handle)
+{
+	return (to_cma_obj(handle))->vaddr;
+}
+
+void host1x_cma_munmap(struct mem_handle *handle, void *addr)
+{
+
+}
+
+void *host1x_cma_kmap(struct mem_handle *handle, unsigned int pagenum)
+{
+	return (to_cma_obj(handle))->vaddr + pagenum * PAGE_SIZE;
+}
+
+void host1x_cma_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr)
+{
+
+}
+
+struct mem_handle *host1x_cma_get(u32 id, struct platform_device *dev)
+{
+	struct drm_gem_cma_object *obj = to_cma_obj((void *)id);
+	struct mutex *struct_mutex = &obj->base.dev->struct_mutex;
+
+	mutex_lock(struct_mutex);
+	drm_gem_object_reference(&obj->base);
+	mutex_unlock(struct_mutex);
+
+	return (struct mem_handle *) ((u32)id | mem_mgr_type_cma);
+}
+
+int host1x_cma_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		long unsigned id_type_mask,
+		long unsigned id_type,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data,
+		dma_addr_t *phys_addr)
+{
+	int i;
+	int pin_count = 0;
+
+	for (i = 0; i < count; i++) {
+		struct mem_handle *handle;
+
+		if ((ids[i] & id_type_mask) != id_type)
+			continue;
+
+		handle = host1x_cma_get(ids[i], dev);
+
+		phys_addr[i] = (to_cma_obj(handle)->paddr);
+		unpin_data[pin_count].h = handle;
+
+		pin_count++;
+	}
+	return pin_count;
+}
diff --git a/drivers/gpu/host1x/cma.h b/drivers/gpu/host1x/cma.h
new file mode 100644
index 0000000..82ad710
--- /dev/null
+++ b/drivers/gpu/host1x/cma.h
@@ -0,0 +1,43 @@
+/*
+ * Tegra host1x cma memory manager
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CMA_H
+#define __HOST1X_CMA_H
+
+#include "memmgr.h"
+
+struct platform_device;
+
+struct mem_handle *host1x_cma_alloc(size_t size, size_t align, int flags);
+void host1x_cma_put(struct mem_handle *handle);
+struct sg_table *host1x_cma_pin(struct mem_handle *handle);
+void host1x_cma_unpin(struct mem_handle *handle, struct sg_table *sgt);
+void *host1x_cma_mmap(struct mem_handle *handle);
+void host1x_cma_munmap(struct mem_handle *handle, void *addr);
+void *host1x_cma_kmap(struct mem_handle *handle, unsigned int pagenum);
+void host1x_cma_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr);
+struct mem_handle *host1x_cma_get(u32 id, struct platform_device *dev);
+int host1x_cma_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		long unsigned id_type_mask,
+		long unsigned id_type,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data,
+		dma_addr_t *phys_addr);
+#endif
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c
index 7f9f389..80311ca 100644
--- a/drivers/gpu/host1x/dev.c
+++ b/drivers/gpu/host1x/dev.c
@@ -25,6 +25,7 @@
 #include <linux/io.h>
 #include "dev.h"
 #include "intr.h"
+#include "channel.h"
 #include "hw/host1x01.h"
 
 #define CREATE_TRACE_POINTS
@@ -46,6 +47,16 @@ u32 host1x_sync_readl(struct host1x *host1x, u32 r)
 	return readl(sync_regs + r);
 }
 
+void host1x_ch_writel(struct host1x_channel *ch, u32 v, u32 r)
+{
+	writel(v, ch->regs + r);
+}
+
+u32 host1x_ch_readl(struct host1x_channel *ch, u32 r)
+{
+	return readl(ch->regs + r);
+}
+
 static struct host1x_device_info host1x_info = {
 	.nb_channels	= 8,
 	.nb_pts		= 32,
@@ -135,6 +146,8 @@ static int host1x_probe(struct platform_device *dev)
 
 	host1x_syncpt_reset(host);
 
+	host1x_channel_list_init(host);
+
 	host1x_intr_start(&host->intr, clk_get_rate(host->clk));
 
 	dev_info(&dev->dev, "initialized\n");
diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h
index 8376092..2fefa78 100644
--- a/drivers/gpu/host1x/dev.h
+++ b/drivers/gpu/host1x/dev.h
@@ -18,11 +18,58 @@
 #define HOST1X_DEV_H
 
 #include <linux/platform_device.h>
+
+#include "channel.h"
 #include "syncpt.h"
 #include "intr.h"
 
 struct host1x;
+struct host1x_intr;
 struct host1x_syncpt;
+struct host1x_channel;
+struct host1x_cdma;
+struct host1x_job;
+struct push_buffer;
+struct dentry;
+struct mem_handle;
+struct platform_device;
+
+struct host1x_channel_ops {
+	int (*init)(struct host1x_channel *,
+		    struct host1x *,
+		    int chid);
+	int (*submit)(struct host1x_job *job);
+};
+
+struct host1x_cdma_ops {
+	void (*start)(struct host1x_cdma *);
+	void (*stop)(struct host1x_cdma *);
+	void (*kick)(struct  host1x_cdma *);
+	int (*timeout_init)(struct host1x_cdma *,
+			    u32 syncpt_id);
+	void (*timeout_destroy)(struct host1x_cdma *);
+	void (*timeout_teardown_begin)(struct host1x_cdma *);
+	void (*timeout_teardown_end)(struct host1x_cdma *,
+				     u32 getptr);
+	void (*timeout_cpu_incr)(struct host1x_cdma *,
+				 u32 getptr,
+				 u32 syncpt_incrs,
+				 u32 syncval,
+				 u32 nr_slots);
+};
+
+struct host1x_pushbuffer_ops {
+	void (*reset)(struct push_buffer *);
+	int (*init)(struct push_buffer *);
+	void (*destroy)(struct push_buffer *);
+	void (*push_to)(struct push_buffer *,
+			struct mem_handle *,
+			u32 op1, u32 op2);
+	void (*pop_from)(struct push_buffer *,
+			 unsigned int slots);
+	u32 (*space)(struct push_buffer *);
+	u32 (*putptr)(struct push_buffer *);
+};
 
 struct host1x_syncpt_ops {
 	void (*reset)(struct host1x_syncpt *);
@@ -64,9 +111,19 @@ struct host1x {
 	struct host1x_device_info info;
 	struct clk *clk;
 
+	/* Sync point dedicated to replacing waits for expired fences */
+	struct host1x_syncpt *nop_sp;
+
+	struct host1x_channel_ops channel_op;
+	struct host1x_cdma_ops cdma_op;
+	struct host1x_pushbuffer_ops cdma_pb_op;
 	struct host1x_syncpt_ops syncpt_op;
 	struct host1x_intr_ops intr_op;
 
+	struct mutex chlist_mutex;
+	struct host1x_channel chlist;
+	int allocated_channels;
+
 	struct dentry *debugfs;
 };
 
@@ -84,5 +141,7 @@ struct host1x *host1x_get_host(struct platform_device *_dev)
 
 void host1x_sync_writel(struct host1x *host1x, u32 r, u32 v);
 u32 host1x_sync_readl(struct host1x *host1x, u32 r);
+void host1x_ch_writel(struct host1x_channel *ch, u32 r, u32 v);
+u32 host1x_ch_readl(struct host1x_channel *ch, u32 r);
 
 #endif
diff --git a/drivers/gpu/host1x/host1x.h b/drivers/gpu/host1x/host1x.h
new file mode 100644
index 0000000..ded0660
--- /dev/null
+++ b/drivers/gpu/host1x/host1x.h
@@ -0,0 +1,29 @@
+/*
+ * Tegra host1x driver
+ *
+ * Copyright (c) 2009-2013, NVIDIA Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __LINUX_HOST1X_H
+#define __LINUX_HOST1X_H
+
+enum host1x_class {
+	NV_HOST1X_CLASS_ID		= 0x1,
+	NV_GRAPHICS_2D_CLASS_ID		= 0x51,
+};
+
+#endif
diff --git a/drivers/gpu/host1x/hw/cdma_hw.c b/drivers/gpu/host1x/hw/cdma_hw.c
new file mode 100644
index 0000000..7a44418
--- /dev/null
+++ b/drivers/gpu/host1x/hw/cdma_hw.c
@@ -0,0 +1,475 @@
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/dma-mapping.h>
+#include "cdma.h"
+#include "channel.h"
+#include "dev.h"
+#include "memmgr.h"
+
+#include "cdma_hw.h"
+
+static inline u32 host1x_channel_dmactrl(int stop, int get_rst, int init_get)
+{
+	return HOST1X_CHANNEL_DMACTRL_DMASTOP_F(stop)
+		| HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(get_rst)
+		| HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(init_get);
+}
+
+static void cdma_timeout_handler(struct work_struct *work);
+
+/*
+ * push_buffer
+ *
+ * The push buffer is a circular array of words to be fetched by command DMA.
+ * Note that it works slightly differently to the sync queue; fence == cur
+ * means that the push buffer is full, not empty.
+ */
+
+
+/**
+ * Reset to empty push buffer
+ */
+static void push_buffer_reset(struct push_buffer *pb)
+{
+	pb->fence = PUSH_BUFFER_SIZE - 8;
+	pb->cur = 0;
+}
+
+/**
+ * Init push buffer resources
+ */
+static void push_buffer_destroy(struct push_buffer *pb);
+static int push_buffer_init(struct push_buffer *pb)
+{
+	struct host1x_cdma *cdma = pb_to_cdma(pb);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	pb->mapped = NULL;
+	pb->phys = 0;
+	pb->handle = NULL;
+
+	host1x->cdma_pb_op.reset(pb);
+
+	/* allocate and map pushbuffer memory */
+	pb->mapped = dma_alloc_writecombine(&host1x->dev->dev,
+			PUSH_BUFFER_SIZE + 4, &pb->phys, GFP_KERNEL);
+	if (!pb->mapped)
+		goto fail;
+
+	/* memory for storing mem client and handles for each opcode pair */
+	pb->handle = kzalloc(HOST1X_GATHER_QUEUE_SIZE *
+				sizeof(struct mem_handle *),
+			GFP_KERNEL);
+	if (!pb->handle)
+		goto fail;
+
+	/* put the restart at the end of pushbuffer memory */
+	*(pb->mapped + (PUSH_BUFFER_SIZE >> 2)) =
+		host1x_opcode_restart(pb->phys);
+
+	return 0;
+
+fail:
+	push_buffer_destroy(pb);
+	return -ENOMEM;
+}
+
+/*
+ * Clean up push buffer resources
+ */
+static void push_buffer_destroy(struct push_buffer *pb)
+{
+	struct host1x_cdma *cdma = pb_to_cdma(pb);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	if (pb->phys != 0)
+		dma_free_writecombine(&host1x->dev->dev,
+				PUSH_BUFFER_SIZE + 4,
+				pb->mapped, pb->phys);
+
+	kfree(pb->handle);
+
+	pb->mapped = NULL;
+	pb->phys = 0;
+	pb->handle = NULL;
+}
+
+/*
+ * Push two words to the push buffer
+ * Caller must ensure push buffer is not full
+ */
+static void push_buffer_push_to(struct push_buffer *pb,
+		struct mem_handle *handle,
+		u32 op1, u32 op2)
+{
+	u32 cur = pb->cur;
+	u32 *p = (u32 *)((u32)pb->mapped + cur);
+	u32 cur_mem = (cur/8) & (HOST1X_GATHER_QUEUE_SIZE - 1);
+	WARN_ON(cur == pb->fence);
+	*(p++) = op1;
+	*(p++) = op2;
+	pb->handle[cur_mem] = handle;
+	pb->cur = (cur + 8) & (PUSH_BUFFER_SIZE - 1);
+}
+
+/*
+ * Pop a number of two word slots from the push buffer
+ * Caller must ensure push buffer is not empty
+ */
+static void push_buffer_pop_from(struct push_buffer *pb,
+		unsigned int slots)
+{
+	/* Clear the mem references for old items from pb */
+	unsigned int i;
+	u32 fence_mem = pb->fence/8;
+	for (i = 0; i < slots; i++) {
+		int cur_fence_mem = (fence_mem+i)
+				& (HOST1X_GATHER_QUEUE_SIZE - 1);
+		pb->handle[cur_fence_mem] = NULL;
+	}
+	/* Advance the next write position */
+	pb->fence = (pb->fence + slots * 8) & (PUSH_BUFFER_SIZE - 1);
+}
+
+/*
+ * Return the number of two word slots free in the push buffer
+ */
+static u32 push_buffer_space(struct push_buffer *pb)
+{
+	return ((pb->fence - pb->cur) & (PUSH_BUFFER_SIZE - 1)) / 8;
+}
+
+static u32 push_buffer_putptr(struct push_buffer *pb)
+{
+	return pb->phys + pb->cur;
+}
+
+/*
+ * The syncpt incr buffer is filled with methods to increment syncpts, which
+ * is later GATHER-ed into the mainline PB. It's used when a timed out context
+ * is interleaved with other work, so needs to inline the syncpt increments
+ * to maintain the count (but otherwise does no work).
+ */
+
+/*
+ * Init timeout resources
+ */
+static int cdma_timeout_init(struct host1x_cdma *cdma,
+				 u32 syncpt_id)
+{
+	if (syncpt_id == NVSYNCPT_INVALID)
+		return -EINVAL;
+
+	INIT_DELAYED_WORK(&cdma->timeout.wq, cdma_timeout_handler);
+	cdma->timeout.initialized = true;
+
+	return 0;
+}
+
+/*
+ * Clean up timeout resources
+ */
+static void cdma_timeout_destroy(struct host1x_cdma *cdma)
+{
+	if (cdma->timeout.initialized)
+		cancel_delayed_work(&cdma->timeout.wq);
+	cdma->timeout.initialized = false;
+}
+
+/*
+ * Increment timedout buffer's syncpt via CPU.
+ */
+static void cdma_timeout_cpu_incr(struct host1x_cdma *cdma, u32 getptr,
+				u32 syncpt_incrs, u32 syncval, u32 nr_slots)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct push_buffer *pb = &cdma->push_buffer;
+	u32 i, getidx;
+
+	for (i = 0; i < syncpt_incrs; i++)
+		host1x_syncpt_cpu_incr(cdma->timeout.syncpt);
+
+	/* after CPU incr, ensure shadow is up to date */
+	host1x_syncpt_load_min(cdma->timeout.syncpt);
+
+	/* NOP all the PB slots */
+	getidx = getptr - pb->phys;
+	while (nr_slots--) {
+		u32 *p = (u32 *)((u32)pb->mapped + getidx);
+		*(p++) = HOST1X_OPCODE_NOOP;
+		*(p++) = HOST1X_OPCODE_NOOP;
+		dev_dbg(&host1x->dev->dev, "%s: NOP at 0x%x\n",
+			__func__, pb->phys + getidx);
+		getidx = (getidx + 8) & (PUSH_BUFFER_SIZE - 1);
+	}
+	wmb();
+}
+
+/*
+ * Start channel DMA
+ */
+static void cdma_start(struct host1x_cdma *cdma)
+{
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	struct host1x *host1x = cdma_to_host1x(cdma);
+
+	if (cdma->running)
+		return;
+
+	cdma->last_put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	/* set base, put, end pointer (all of memory) */
+	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, cdma->last_put, HOST1X_CHANNEL_DMAPUT);
+	host1x_ch_writel(ch, 0xFFFFFFFF, HOST1X_CHANNEL_DMAEND);
+
+	/* reset GET */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, true, true),
+		HOST1X_CHANNEL_DMACTRL);
+
+	/* start the command DMA */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(false, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	cdma->running = true;
+}
+
+/*
+ * Similar to cdma_start(), but rather than starting from an idle
+ * state (where DMA GET is set to DMA PUT), on a timeout we restore
+ * DMA GET from an explicit value (so DMA may again be pending).
+ */
+static void cdma_timeout_restart(struct host1x_cdma *cdma, u32 getptr)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	if (cdma->running)
+		return;
+
+	cdma->last_put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	/* set base, end pointer (all of memory) */
+	host1x_ch_writel(ch, 0, HOST1X_CHANNEL_DMASTART);
+	host1x_ch_writel(ch, 0xFFFFFFFF, HOST1X_CHANNEL_DMAEND);
+
+	/* set GET, by loading the value in PUT (then reset GET) */
+	host1x_ch_writel(ch, getptr, HOST1X_CHANNEL_DMAPUT);
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, true, true),
+		HOST1X_CHANNEL_DMACTRL);
+
+	dev_dbg(&host1x->dev->dev,
+		"%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n",
+		__func__,
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET),
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT),
+		cdma->last_put);
+
+	/* deassert GET reset and set PUT */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+	host1x_ch_writel(ch, cdma->last_put, HOST1X_CHANNEL_DMAPUT);
+
+	/* start the command DMA */
+	host1x_ch_writel(ch, host1x_channel_dmactrl(false, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	cdma->running = true;
+}
+
+/*
+ * Kick channel DMA into action by writing its PUT offset (if it has changed)
+ */
+static void cdma_kick(struct host1x_cdma *cdma)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 put;
+
+	put = host1x->cdma_pb_op.putptr(&cdma->push_buffer);
+
+	if (put != cdma->last_put) {
+		host1x_ch_writel(ch, put, HOST1X_CHANNEL_DMAPUT);
+		cdma->last_put = put;
+	}
+}
+
+static void cdma_stop(struct host1x_cdma *cdma)
+{
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+
+	mutex_lock(&cdma->lock);
+	if (cdma->running) {
+		host1x_cdma_wait_locked(cdma, CDMA_EVENT_SYNC_QUEUE_EMPTY);
+		host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+			HOST1X_CHANNEL_DMACTRL);
+		cdma->running = false;
+	}
+	mutex_unlock(&cdma->lock);
+}
+
+/*
+ * Stops both channel's command processor and CDMA immediately.
+ * Also, tears down the channel and resets corresponding module.
+ */
+static void cdma_timeout_teardown_begin(struct host1x_cdma *cdma)
+{
+	struct host1x *dev = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 cmdproc_stop;
+
+	if (cdma->torndown && !cdma->running) {
+		dev_warn(&dev->dev->dev, "Already torn down\n");
+		return;
+	}
+
+	dev_dbg(&dev->dev->dev,
+		"begin channel teardown (channel id %d)\n", ch->chid);
+
+	cmdproc_stop = host1x_sync_readl(dev, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop |= BIT(ch->chid);
+	host1x_sync_writel(dev, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	dev_dbg(&dev->dev->dev,
+		"%s: DMA GET 0x%x, PUT HW 0x%x / shadow 0x%x\n",
+		__func__,
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAGET),
+		host1x_ch_readl(ch, HOST1X_CHANNEL_DMAPUT),
+		cdma->last_put);
+
+	host1x_ch_writel(ch, host1x_channel_dmactrl(true, false, false),
+		HOST1X_CHANNEL_DMACTRL);
+
+	host1x_sync_writel(dev, BIT(ch->chid), HOST1X_SYNC_CH_TEARDOWN);
+
+	cdma->running = false;
+	cdma->torndown = true;
+}
+
+static void cdma_timeout_teardown_end(struct host1x_cdma *cdma, u32 getptr)
+{
+	struct host1x *host1x = cdma_to_host1x(cdma);
+	struct host1x_channel *ch = cdma_to_channel(cdma);
+	u32 cmdproc_stop;
+
+	dev_dbg(&host1x->dev->dev,
+		"end channel teardown (id %d, DMAGET restart = 0x%x)\n",
+		ch->chid, getptr);
+
+	cmdproc_stop = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop &= ~(BIT(ch->chid));
+	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	cdma->torndown = false;
+	cdma_timeout_restart(cdma, getptr);
+}
+
+/*
+ * If this timeout fires, it indicates the current sync_queue entry has
+ * exceeded its TTL and the userctx should be timed out and remaining
+ * submits already issued cleaned up (future submits return an error).
+ */
+static void cdma_timeout_handler(struct work_struct *work)
+{
+	struct host1x_cdma *cdma;
+	struct host1x *host1x;
+	struct host1x_channel *ch;
+
+	u32 syncpt_val;
+
+	u32 prev_cmdproc, cmdproc_stop;
+
+	cdma = container_of(to_delayed_work(work), struct host1x_cdma,
+			    timeout.wq);
+	host1x = cdma_to_host1x(cdma);
+	ch = cdma_to_channel(cdma);
+
+	mutex_lock(&cdma->lock);
+
+	if (!cdma->timeout.clientid) {
+		dev_dbg(&host1x->dev->dev,
+			 "cdma_timeout: expired, but has no clientid\n");
+		mutex_unlock(&cdma->lock);
+		return;
+	}
+
+	/* stop processing to get a clean snapshot */
+	prev_cmdproc = host1x_sync_readl(host1x, HOST1X_SYNC_CMDPROC_STOP);
+	cmdproc_stop = prev_cmdproc | BIT(ch->chid);
+	host1x_sync_writel(host1x, cmdproc_stop, HOST1X_SYNC_CMDPROC_STOP);
+
+	dev_dbg(&host1x->dev->dev, "cdma_timeout: cmdproc was 0x%x is 0x%x\n",
+		prev_cmdproc, cmdproc_stop);
+
+	syncpt_val = host1x_syncpt_load_min(host1x->syncpt);
+
+	/* has buffer actually completed? */
+	if ((s32)(syncpt_val - cdma->timeout.syncpt_val) >= 0) {
+		dev_dbg(&host1x->dev->dev,
+			 "cdma_timeout: expired, but buffer had completed\n");
+		/* restore */
+		cmdproc_stop = prev_cmdproc & ~(BIT(ch->chid));
+		host1x_sync_writel(host1x, cmdproc_stop,
+			HOST1X_SYNC_CMDPROC_STOP);
+		mutex_unlock(&cdma->lock);
+		return;
+	}
+
+	dev_warn(&host1x->dev->dev,
+		"%s: timeout: %d (%s), HW thresh %d, done %d\n",
+		__func__,
+		cdma->timeout.syncpt->id, cdma->timeout.syncpt->name,
+		syncpt_val, cdma->timeout.syncpt_val);
+
+	/* stop HW, resetting channel/module */
+	host1x->cdma_op.timeout_teardown_begin(cdma);
+
+	host1x_cdma_update_sync_queue(cdma, ch->dev);
+	mutex_unlock(&cdma->lock);
+}
+
+static const struct host1x_cdma_ops host1x_cdma_ops = {
+	.start = cdma_start,
+	.stop = cdma_stop,
+	.kick = cdma_kick,
+
+	.timeout_init = cdma_timeout_init,
+	.timeout_destroy = cdma_timeout_destroy,
+	.timeout_teardown_begin = cdma_timeout_teardown_begin,
+	.timeout_teardown_end = cdma_timeout_teardown_end,
+	.timeout_cpu_incr = cdma_timeout_cpu_incr,
+};
+
+static const struct host1x_pushbuffer_ops host1x_pushbuffer_ops = {
+	.reset = push_buffer_reset,
+	.init = push_buffer_init,
+	.destroy = push_buffer_destroy,
+	.push_to = push_buffer_push_to,
+	.pop_from = push_buffer_pop_from,
+	.space = push_buffer_space,
+	.putptr = push_buffer_putptr,
+};
+
diff --git a/drivers/gpu/host1x/hw/cdma_hw.h b/drivers/gpu/host1x/hw/cdma_hw.h
new file mode 100644
index 0000000..80a085a
--- /dev/null
+++ b/drivers/gpu/host1x/hw/cdma_hw.h
@@ -0,0 +1,37 @@
+/*
+ * Tegra host1x Command DMA
+ *
+ * Copyright (c) 2011-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_CDMA_HW_H
+#define __HOST1X_CDMA_HW_H
+
+/*
+ * Size of the sync queue. If it is too small, we won't be able to queue up
+ * many command buffers. If it is too large, we waste memory.
+ */
+#define HOST1X_SYNC_QUEUE_SIZE 512
+
+/*
+ * Number of gathers we allow to be queued up per channel. Must be a
+ * power of two. Currently sized such that pushbuffer is 4KB (512*8B).
+ */
+#define HOST1X_GATHER_QUEUE_SIZE 512
+
+/* 8 bytes per slot. (This number does not include the final RESTART.) */
+#define PUSH_BUFFER_SIZE (HOST1X_GATHER_QUEUE_SIZE * 8)
+
+#endif
diff --git a/drivers/gpu/host1x/hw/channel_hw.c b/drivers/gpu/host1x/hw/channel_hw.c
new file mode 100644
index 0000000..905cfd2
--- /dev/null
+++ b/drivers/gpu/host1x/hw/channel_hw.c
@@ -0,0 +1,148 @@
+/*
+ * Tegra host1x Channel
+ *
+ * Copyright (c) 2010-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "host1x.h"
+#include "channel.h"
+#include "dev.h"
+#include <linux/slab.h>
+#include "intr.h"
+#include "job.h"
+#include <trace/events/host1x.h>
+
+static void submit_gathers(struct host1x_job *job)
+{
+	/* push user gathers */
+	int i;
+	for (i = 0 ; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		u32 op1 = host1x_opcode_gather(g->words);
+		u32 op2 = g->mem_base + g->offset;
+		host1x_cdma_push_gather(&job->ch->cdma,
+				job->gathers[i].ref,
+				job->gathers[i].offset,
+				op1, op2);
+	}
+}
+
+static int channel_submit(struct host1x_job *job)
+{
+	struct host1x_channel *ch = job->ch;
+	struct host1x_syncpt *sp;
+	u32 user_syncpt_incrs = job->syncpt_incrs;
+	u32 prev_max = 0;
+	u32 syncval;
+	int err;
+	void *completed_waiter = NULL;
+
+	sp = host1x_get_host(job->ch->dev)->syncpt + job->syncpt_id;
+	trace_host1x_channel_submit(ch->dev->name,
+			job->num_gathers, job->num_relocs, job->num_waitchk,
+			job->syncpt_id, job->syncpt_incrs);
+
+	/* before error checks, return current max */
+	prev_max = job->syncpt_end = host1x_syncpt_read_max(sp);
+
+	/* get submit lock */
+	err = mutex_lock_interruptible(&ch->submitlock);
+	if (err)
+		goto error;
+
+	completed_waiter = host1x_intr_alloc_waiter();
+	if (!completed_waiter) {
+		mutex_unlock(&ch->submitlock);
+		err = -ENOMEM;
+		goto error;
+	}
+
+	/* begin a CDMA submit */
+	err = host1x_cdma_begin(&ch->cdma, job);
+	if (err) {
+		mutex_unlock(&ch->submitlock);
+		goto error;
+	}
+
+	if (job->serialize) {
+		/*
+		 * Force serialization by inserting a host wait for the
+		 * previous job to finish before this one can commence.
+		 */
+		host1x_cdma_push(&ch->cdma,
+				host1x_opcode_setclass(NV_HOST1X_CLASS_ID,
+					host1x_uclass_wait_syncpt_r(),
+					1),
+				host1x_class_host_wait_syncpt(job->syncpt_id,
+					host1x_syncpt_read_max(sp)));
+	}
+
+	syncval = host1x_syncpt_incr_max(sp, user_syncpt_incrs);
+
+	job->syncpt_end = syncval;
+
+	/* add a setclass for modules that require it */
+	if (job->class)
+		host1x_cdma_push(&ch->cdma,
+			host1x_opcode_setclass(job->class, 0, 0),
+			HOST1X_OPCODE_NOOP);
+
+	submit_gathers(job);
+
+	/* end CDMA submit & stash pinned hMems into sync queue */
+	host1x_cdma_end(&ch->cdma, job);
+
+	trace_host1x_channel_submitted(ch->dev->name,
+			prev_max, syncval);
+
+	/* schedule a submit complete interrupt */
+	err = host1x_intr_add_action(&host1x_get_host(ch->dev)->intr,
+			job->syncpt_id, syncval,
+			HOST1X_INTR_ACTION_SUBMIT_COMPLETE, ch,
+			completed_waiter,
+			NULL);
+	completed_waiter = NULL;
+	WARN(err, "Failed to set submit complete interrupt");
+
+	mutex_unlock(&ch->submitlock);
+
+	return 0;
+
+error:
+	kfree(completed_waiter);
+	return err;
+}
+
+static inline void __iomem *host1x_channel_regs(void __iomem *p, int ndx)
+{
+	p += ndx * NV_HOST1X_CHANNEL_MAP_SIZE_BYTES;
+	return p;
+}
+
+static int host1x_channel_init(struct host1x_channel *ch,
+	struct host1x *dev, int index)
+{
+	ch->chid = index;
+	mutex_init(&ch->reflock);
+	mutex_init(&ch->submitlock);
+
+	ch->regs = host1x_channel_regs(dev->regs, index);
+	return 0;
+}
+
+static const struct host1x_channel_ops host1x_channel_ops = {
+	.init = host1x_channel_init,
+	.submit = channel_submit,
+};
diff --git a/drivers/gpu/host1x/hw/host1x01.c b/drivers/gpu/host1x/hw/host1x01.c
index 3d633a3..7569a1e 100644
--- a/drivers/gpu/host1x/hw/host1x01.c
+++ b/drivers/gpu/host1x/hw/host1x01.c
@@ -23,13 +23,19 @@
 
 #include "hw/host1x01.h"
 #include "dev.h"
+#include "channel.h"
 #include "hw/host1x01_hardware.h"
 
+#include "hw/channel_hw.c"
+#include "hw/cdma_hw.c"
 #include "hw/syncpt_hw.c"
 #include "hw/intr_hw.c"
 
 int host1x01_init(struct host1x *host)
 {
+	host->channel_op = host1x_channel_ops;
+	host->cdma_op = host1x_cdma_ops;
+	host->cdma_pb_op = host1x_pushbuffer_ops;
 	host->syncpt_op = host1x_syncpt_ops;
 	host->intr_op = host1x_intr_ops;
 
diff --git a/drivers/gpu/host1x/hw/host1x01_hardware.h b/drivers/gpu/host1x/hw/host1x01_hardware.h
index c1d5324..03873c0 100644
--- a/drivers/gpu/host1x/hw/host1x01_hardware.h
+++ b/drivers/gpu/host1x/hw/host1x01_hardware.h
@@ -21,6 +21,130 @@
 
 #include <linux/types.h>
 #include <linux/bitops.h>
+#include "hw_host1x01_channel.h"
 #include "hw_host1x01_sync.h"
+#include "hw_host1x01_uclass.h"
+
+/* channel registers */
+#define NV_HOST1X_CHANNEL_MAP_SIZE_BYTES 16384
+
+static inline u32 host1x_class_host_wait_syncpt(
+	unsigned indx, unsigned threshold)
+{
+	return host1x_uclass_wait_syncpt_indx_f(indx)
+		| host1x_uclass_wait_syncpt_thresh_f(threshold);
+}
+
+static inline u32 host1x_class_host_load_syncpt_base(
+	unsigned indx, unsigned threshold)
+{
+	return host1x_uclass_load_syncpt_base_base_indx_f(indx)
+		| host1x_uclass_load_syncpt_base_value_f(threshold);
+}
+
+static inline u32 host1x_class_host_wait_syncpt_base(
+	unsigned indx, unsigned base_indx, unsigned offset)
+{
+	return host1x_uclass_wait_syncpt_base_indx_f(indx)
+		| host1x_uclass_wait_syncpt_base_base_indx_f(base_indx)
+		| host1x_uclass_wait_syncpt_base_offset_f(offset);
+}
+
+static inline u32 host1x_class_host_incr_syncpt_base(
+	unsigned base_indx, unsigned offset)
+{
+	return host1x_uclass_incr_syncpt_base_base_indx_f(base_indx)
+		| host1x_uclass_incr_syncpt_base_offset_f(offset);
+}
+
+static inline u32 host1x_class_host_incr_syncpt(
+	unsigned cond, unsigned indx)
+{
+	return host1x_uclass_incr_syncpt_cond_f(cond)
+		| host1x_uclass_incr_syncpt_indx_f(indx);
+}
+
+static inline u32 host1x_class_host_indoff_reg_write(
+	unsigned mod_id, unsigned offset, bool auto_inc)
+{
+	u32 v = host1x_uclass_indoff_indbe_f(0xf)
+		| host1x_uclass_indoff_indmodid_f(mod_id)
+		| host1x_uclass_indoff_indroffset_f(offset);
+	if (auto_inc)
+		v |= host1x_uclass_indoff_autoinc_f(1);
+	return v;
+}
+
+static inline u32 host1x_class_host_indoff_reg_read(
+	unsigned mod_id, unsigned offset, bool auto_inc)
+{
+	u32 v = host1x_uclass_indoff_indmodid_f(mod_id)
+		| host1x_uclass_indoff_indroffset_f(offset)
+		| host1x_uclass_indoff_rwn_read_v();
+	if (auto_inc)
+		v |= host1x_uclass_indoff_autoinc_f(1);
+	return v;
+}
+
+
+/* cdma opcodes */
+static inline u32 host1x_opcode_setclass(
+	unsigned class_id, unsigned offset, unsigned mask)
+{
+	return (0 << 28) | (offset << 16) | (class_id << 6) | mask;
+}
+
+static inline u32 host1x_opcode_incr(unsigned offset, unsigned count)
+{
+	return (1 << 28) | (offset << 16) | count;
+}
+
+static inline u32 host1x_opcode_nonincr(unsigned offset, unsigned count)
+{
+	return (2 << 28) | (offset << 16) | count;
+}
+
+static inline u32 host1x_opcode_mask(unsigned offset, unsigned mask)
+{
+	return (3 << 28) | (offset << 16) | mask;
+}
+
+static inline u32 host1x_opcode_imm(unsigned offset, unsigned value)
+{
+	return (4 << 28) | (offset << 16) | value;
+}
+
+static inline u32 host1x_opcode_imm_incr_syncpt(unsigned cond, unsigned indx)
+{
+	return host1x_opcode_imm(host1x_uclass_incr_syncpt_r(),
+		host1x_class_host_incr_syncpt(cond, indx));
+}
+
+static inline u32 host1x_opcode_restart(unsigned address)
+{
+	return (5 << 28) | (address >> 4);
+}
+
+static inline u32 host1x_opcode_gather(unsigned count)
+{
+	return (6 << 28) | count;
+}
+
+static inline u32 host1x_opcode_gather_nonincr(unsigned offset,	unsigned count)
+{
+	return (6 << 28) | (offset << 16) | BIT(15) | count;
+}
+
+static inline u32 host1x_opcode_gather_incr(unsigned offset, unsigned count)
+{
+	return (6 << 28) | (offset << 16) | BIT(15) | BIT(14) | count;
+}
+
+#define HOST1X_OPCODE_NOOP host1x_opcode_nonincr(0, 0)
+
+static inline u32 host1x_mask2(unsigned x, unsigned y)
+{
+	return 1 | (1 << (y - x));
+}
 
 #endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_channel.h b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
new file mode 100644
index 0000000..dad4fee
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x01_channel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+  * Function naming determines intended use:
+  *
+  *     <x>_r(void) : Returns the offset for register <x>.
+  *
+  *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+  *
+  *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+  *
+  *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+  *         and masked to place it at field <y> of register <x>.  This value
+  *         can be |'d with others to produce a full register value for
+  *         register <x>.
+  *
+  *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+  *         value can be ~'d and then &'d to clear the value of field <y> for
+  *         register <x>.
+  *
+  *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+  *         to place it at field <y> of register <x>.  This value can be |'d
+  *         with others to produce a full register value for <x>.
+  *
+  *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+  *         <x> value 'r' after being shifted to place its LSB at bit 0.
+  *         This value is suitable for direct comparison with other unshifted
+  *         values appropriate for use in field <y> of register <x>.
+  *
+  *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+  *         field <y> of register <x>.  This value is suitable for direct
+  *         comparison with unshifted values appropriate for use in field <y>
+  *         of register <x>.
+  */
+
+#ifndef __hw_host1x_channel_host1x_h__
+#define __hw_host1x_channel_host1x_h__
+
+static inline u32 host1x_channel_dmastart_r(void)
+{
+	return 0x14;
+}
+#define HOST1X_CHANNEL_DMASTART \
+	host1x_channel_dmastart_r()
+static inline u32 host1x_channel_dmaput_r(void)
+{
+	return 0x18;
+}
+#define HOST1X_CHANNEL_DMAPUT \
+	host1x_channel_dmaput_r()
+static inline u32 host1x_channel_dmaget_r(void)
+{
+	return 0x1c;
+}
+#define HOST1X_CHANNEL_DMAGET \
+	host1x_channel_dmaget_r()
+static inline u32 host1x_channel_dmaend_r(void)
+{
+	return 0x20;
+}
+#define HOST1X_CHANNEL_DMAEND \
+	host1x_channel_dmaend_r()
+static inline u32 host1x_channel_dmactrl_r(void)
+{
+	return 0x24;
+}
+#define HOST1X_CHANNEL_DMACTRL \
+	host1x_channel_dmactrl_r()
+static inline u32 host1x_channel_dmactrl_dmastop_f(u32 v)
+{
+	return (v & 0x1) << 0;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMASTOP_F(v) \
+	host1x_channel_dmactrl_dmastop_f(v)
+static inline u32 host1x_channel_dmactrl_dmagetrst_f(u32 v)
+{
+	return (v & 0x1) << 1;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMAGETRST_F(v) \
+	host1x_channel_dmactrl_dmagetrst_f(v)
+static inline u32 host1x_channel_dmactrl_dmainitget_f(u32 v)
+{
+	return (v & 0x1) << 2;
+}
+#define HOST1X_CHANNEL_DMACTRL_DMAINITGET_F(v) \
+	host1x_channel_dmactrl_dmainitget_f(v)
+#endif
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_sync.h b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
index 5da9afb..3073d37 100644
--- a/drivers/gpu/host1x/hw/hw_host1x01_sync.h
+++ b/drivers/gpu/host1x/hw/hw_host1x01_sync.h
@@ -69,6 +69,18 @@ static inline u32 host1x_sync_syncpt_thresh_int_enable_cpu0_r(void)
 }
 #define HOST1X_SYNC_SYNCPT_THRESH_INT_ENABLE_CPU0 \
 	host1x_sync_syncpt_thresh_int_enable_cpu0_r()
+static inline u32 host1x_sync_cmdproc_stop_r(void)
+{
+	return 0xac;
+}
+#define HOST1X_SYNC_CMDPROC_STOP \
+	host1x_sync_cmdproc_stop_r()
+static inline u32 host1x_sync_ch_teardown_r(void)
+{
+	return 0xb0;
+}
+#define HOST1X_SYNC_CH_TEARDOWN \
+	host1x_sync_ch_teardown_r()
 static inline u32 host1x_sync_usec_clk_r(void)
 {
 	return 0x1a4;
diff --git a/drivers/gpu/host1x/hw/hw_host1x01_uclass.h b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
new file mode 100644
index 0000000..7af6609
--- /dev/null
+++ b/drivers/gpu/host1x/hw/hw_host1x01_uclass.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+ /*
+  * Function naming determines intended use:
+  *
+  *     <x>_r(void) : Returns the offset for register <x>.
+  *
+  *     <x>_w(void) : Returns the word offset for word (4 byte) element <x>.
+  *
+  *     <x>_<y>_s(void) : Returns size of field <y> of register <x> in bits.
+  *
+  *     <x>_<y>_f(u32 v) : Returns a value based on 'v' which has been shifted
+  *         and masked to place it at field <y> of register <x>.  This value
+  *         can be |'d with others to produce a full register value for
+  *         register <x>.
+  *
+  *     <x>_<y>_m(void) : Returns a mask for field <y> of register <x>.  This
+  *         value can be ~'d and then &'d to clear the value of field <y> for
+  *         register <x>.
+  *
+  *     <x>_<y>_<z>_f(void) : Returns the constant value <z> after being shifted
+  *         to place it at field <y> of register <x>.  This value can be |'d
+  *         with others to produce a full register value for <x>.
+  *
+  *     <x>_<y>_v(u32 r) : Returns the value of field <y> from a full register
+  *         <x> value 'r' after being shifted to place its LSB at bit 0.
+  *         This value is suitable for direct comparison with other unshifted
+  *         values appropriate for use in field <y> of register <x>.
+  *
+  *     <x>_<y>_<z>_v(void) : Returns the constant value for <z> defined for
+  *         field <y> of register <x>.  This value is suitable for direct
+  *         comparison with unshifted values appropriate for use in field <y>
+  *         of register <x>.
+  */
+
+#ifndef __hw_host1x_uclass_host1x_h__
+#define __hw_host1x_uclass_host1x_h__
+
+static inline u32 host1x_uclass_incr_syncpt_r(void)
+{
+	return 0x0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT \
+	host1x_uclass_incr_syncpt_r()
+static inline u32 host1x_uclass_incr_syncpt_cond_f(u32 v)
+{
+	return (v & 0xff) << 8;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_COND_F(v) \
+	host1x_uclass_incr_syncpt_cond_f(v)
+static inline u32 host1x_uclass_incr_syncpt_indx_f(u32 v)
+{
+	return (v & 0xff) << 0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_INDX_F(v) \
+	host1x_uclass_incr_syncpt_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_r(void)
+{
+	return 0x8;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT \
+	host1x_uclass_wait_syncpt_r()
+static inline u32 host1x_uclass_wait_syncpt_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_INDX_F(v) \
+	host1x_uclass_wait_syncpt_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_thresh_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_THRESH_F(v) \
+	host1x_uclass_wait_syncpt_thresh_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_INDX_F(v) \
+	host1x_uclass_wait_syncpt_base_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 16;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_wait_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_wait_syncpt_base_offset_f(u32 v)
+{
+	return (v & 0xffff) << 0;
+}
+#define HOST1X_UCLASS_WAIT_SYNCPT_BASE_OFFSET_F(v) \
+	host1x_uclass_wait_syncpt_base_offset_f(v)
+static inline u32 host1x_uclass_load_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_load_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_load_syncpt_base_value_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_LOAD_SYNCPT_BASE_VALUE_F(v) \
+	host1x_uclass_load_syncpt_base_value_f(v)
+static inline u32 host1x_uclass_incr_syncpt_base_base_indx_f(u32 v)
+{
+	return (v & 0xff) << 24;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_BASE_BASE_INDX_F(v) \
+	host1x_uclass_incr_syncpt_base_base_indx_f(v)
+static inline u32 host1x_uclass_incr_syncpt_base_offset_f(u32 v)
+{
+	return (v & 0xffffff) << 0;
+}
+#define HOST1X_UCLASS_INCR_SYNCPT_BASE_OFFSET_F(v) \
+	host1x_uclass_incr_syncpt_base_offset_f(v)
+static inline u32 host1x_uclass_indoff_r(void)
+{
+	return 0x2d;
+}
+#define HOST1X_UCLASS_INDOFF \
+	host1x_uclass_indoff_r()
+static inline u32 host1x_uclass_indoff_indbe_f(u32 v)
+{
+	return (v & 0xf) << 28;
+}
+#define HOST1X_UCLASS_INDOFF_INDBE_F(v) \
+	host1x_uclass_indoff_indbe_f(v)
+static inline u32 host1x_uclass_indoff_autoinc_f(u32 v)
+{
+	return (v & 0x1) << 27;
+}
+#define HOST1X_UCLASS_INDOFF_AUTOINC_F(v) \
+	host1x_uclass_indoff_autoinc_f(v)
+static inline u32 host1x_uclass_indoff_indmodid_f(u32 v)
+{
+	return (v & 0xff) << 18;
+}
+#define HOST1X_UCLASS_INDOFF_INDMODID_F(v) \
+	host1x_uclass_indoff_indmodid_f(v)
+static inline u32 host1x_uclass_indoff_indroffset_f(u32 v)
+{
+	return (v & 0xffff) << 2;
+}
+#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
+	host1x_uclass_indoff_indroffset_f(v)
+static inline u32 host1x_uclass_indoff_rwn_read_v(void)
+{
+	return 1;
+}
+#define HOST1X_UCLASS_INDOFF_INDROFFSET_F(v) \
+	host1x_uclass_indoff_indroffset_f(v)
+#endif
diff --git a/drivers/gpu/host1x/hw/syncpt_hw.c b/drivers/gpu/host1x/hw/syncpt_hw.c
index 16e3ada..ba48cee 100644
--- a/drivers/gpu/host1x/hw/syncpt_hw.c
+++ b/drivers/gpu/host1x/hw/syncpt_hw.c
@@ -97,6 +97,15 @@ static void syncpt_cpu_incr(struct host1x_syncpt *sp)
 	wmb();
 }
 
+/* remove a wait pointed to by patch_addr */
+static int syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr)
+{
+	u32 override = host1x_class_host_wait_syncpt(
+			NVSYNCPT_GRAPHICS_HOST, 0);
+	__raw_writel(override, patch_addr);
+	return 0;
+}
+
 static const char *syncpt_name(struct host1x_syncpt *sp)
 {
 	struct host1x_device_info *info = &sp->dev->info;
@@ -141,6 +150,7 @@ static const struct host1x_syncpt_ops host1x_syncpt_ops = {
 	.read_wait_base = syncpt_read_wait_base,
 	.load_min = syncpt_load_min,
 	.cpu_incr = syncpt_cpu_incr,
+	.patch_wait = syncpt_patch_wait,
 	.debug = syncpt_debug,
 	.name = syncpt_name,
 };
diff --git a/drivers/gpu/host1x/intr.c b/drivers/gpu/host1x/intr.c
index 26099b8..9d0b5f1 100644
--- a/drivers/gpu/host1x/intr.c
+++ b/drivers/gpu/host1x/intr.c
@@ -20,6 +20,8 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include <linux/irq.h>
+#include <trace/events/host1x.h>
+#include "channel.h"
 #include "dev.h"
 
 /* Wait list management */
@@ -74,7 +76,7 @@ static void remove_completed_waiters(struct list_head *head, u32 sync,
 			struct list_head completed[HOST1X_INTR_ACTION_COUNT])
 {
 	struct list_head *dest;
-	struct host1x_waitlist *waiter, *next;
+	struct host1x_waitlist *waiter, *next, *prev;
 
 	list_for_each_entry_safe(waiter, next, head, list) {
 		if ((s32)(waiter->thresh - sync) > 0)
@@ -82,6 +84,17 @@ static void remove_completed_waiters(struct list_head *head, u32 sync,
 
 		dest = completed + waiter->action;
 
+		/* consolidate submit cleanups */
+		if (waiter->action == HOST1X_INTR_ACTION_SUBMIT_COMPLETE
+			&& !list_empty(dest)) {
+			prev = list_entry(dest->prev,
+					struct host1x_waitlist, list);
+			if (prev->data == waiter->data) {
+				prev->count++;
+				dest = NULL;
+			}
+		}
+
 		/* PENDING->REMOVED or CANCELLED->HANDLED */
 		if (atomic_inc_return(&waiter->state) == WLS_HANDLED || !dest) {
 			list_del(&waiter->list);
@@ -104,6 +117,19 @@ static void reset_threshold_interrupt(struct host1x_intr *intr,
 	host1x->intr_op.enable_syncpt_intr(intr, id);
 }
 
+static void action_submit_complete(struct host1x_waitlist *waiter)
+{
+	struct host1x_channel *channel = waiter->data;
+	int nr_completed = waiter->count;
+
+	host1x_cdma_update(&channel->cdma);
+
+	/*  Add nr_completed to trace */
+	trace_host1x_channel_submit_complete(channel->dev->name,
+			nr_completed, waiter->thresh);
+
+}
+
 static void action_wakeup(struct host1x_waitlist *waiter)
 {
 	wait_queue_head_t *wq = waiter->data;
@@ -121,6 +147,7 @@ static void action_wakeup_interruptible(struct host1x_waitlist *waiter)
 typedef void (*action_handler)(struct host1x_waitlist *waiter);
 
 static action_handler action_handlers[HOST1X_INTR_ACTION_COUNT] = {
+	action_submit_complete,
 	action_wakeup,
 	action_wakeup_interruptible,
 };
diff --git a/drivers/gpu/host1x/intr.h b/drivers/gpu/host1x/intr.h
index 679a7b4..979b929 100644
--- a/drivers/gpu/host1x/intr.h
+++ b/drivers/gpu/host1x/intr.h
@@ -24,6 +24,12 @@
 
 enum host1x_intr_action {
 	/*
+	 * Perform cleanup after a submit has completed.
+	 * 'data' points to a channel
+	 */
+	HOST1X_INTR_ACTION_SUBMIT_COMPLETE = 0,
+
+	/*
 	 * Wake up a  task.
 	 * 'data' points to a wait_queue_head_t
 	 */
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
new file mode 100644
index 0000000..cc9c84a
--- /dev/null
+++ b/drivers/gpu/host1x/job.c
@@ -0,0 +1,612 @@
+/*
+ * Tegra host1x Job
+ *
+ * Copyright (c) 2010-2012, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/kref.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/scatterlist.h>
+#include <trace/events/host1x.h>
+#include <linux/dma-mapping.h>
+#include "job.h"
+#include "channel.h"
+#include "syncpt.h"
+#include "dev.h"
+#include "memmgr.h"
+
+#ifdef CONFIG_TEGRA_HOST1X_FIREWALL
+static int host1x_firewall = 1;
+#else
+static int host1x_firewall;
+#endif
+
+struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
+		u32 num_cmdbufs, u32 num_relocs, u32 num_waitchks)
+{
+	struct host1x_job *job = NULL;
+	int num_unpins = num_cmdbufs + num_relocs;
+	s64 total;
+	void *mem;
+
+	/* Check that we're not going to overflow */
+	total = sizeof(struct host1x_job)
+			+ num_relocs * sizeof(struct host1x_reloc)
+			+ num_unpins * sizeof(struct host1x_job_unpin_data)
+			+ num_waitchks * sizeof(struct host1x_waitchk)
+			+ num_cmdbufs * sizeof(struct host1x_job_gather)
+			+ num_unpins * sizeof(dma_addr_t)
+			+ num_unpins * sizeof(u32 *);
+	if (total > ULONG_MAX)
+		return NULL;
+
+	mem = job = kzalloc(total, GFP_KERNEL);
+	if (!job)
+		return NULL;
+
+	kref_init(&job->ref);
+	job->ch = ch;
+
+	/* First init state to zero */
+
+	/*
+	 * Redistribute memory to the structs.
+	 * Overflows and negative conditions have
+	 * already been checked in job_alloc().
+	 */
+	mem += sizeof(struct host1x_job);
+	job->relocarray = num_relocs ? mem : NULL;
+	mem += num_relocs * sizeof(struct host1x_reloc);
+	job->unpins = num_unpins ? mem : NULL;
+	mem += num_unpins * sizeof(struct host1x_job_unpin_data);
+	job->waitchk = num_waitchks ? mem : NULL;
+	mem += num_waitchks * sizeof(struct host1x_waitchk);
+	job->gathers = num_cmdbufs ? mem : NULL;
+	mem += num_cmdbufs * sizeof(struct host1x_job_gather);
+	job->addr_phys = num_unpins ? mem : NULL;
+	mem += num_unpins * sizeof(dma_addr_t);
+	job->pin_ids = num_unpins ? mem : NULL;
+
+	job->reloc_addr_phys = job->addr_phys;
+	job->gather_addr_phys = &job->addr_phys[num_relocs];
+
+	return job;
+}
+
+void host1x_job_get(struct host1x_job *job)
+{
+	kref_get(&job->ref);
+}
+
+static void job_free(struct kref *ref)
+{
+	struct host1x_job *job = container_of(ref, struct host1x_job, ref);
+
+	kfree(job);
+}
+
+void host1x_job_put(struct host1x_job *job)
+{
+	kref_put(&job->ref, job_free);
+}
+
+void host1x_job_add_gather(struct host1x_job *job,
+		u32 mem_id, u32 words, u32 offset)
+{
+	struct host1x_job_gather *cur_gather =
+			&job->gathers[job->num_gathers];
+
+	cur_gather->words = words;
+	cur_gather->mem_id = mem_id;
+	cur_gather->offset = offset;
+	job->num_gathers++;
+}
+
+/*
+ * Check driver supplied waitchk structs for syncpt thresholds
+ * that have already been satisfied and NULL the comparison (to
+ * avoid a wrap condition in the HW).
+ */
+static int do_waitchks(struct host1x_job *job, struct host1x *host,
+		u32 patch_mem, struct mem_handle *h)
+{
+	int i;
+
+	/* compare syncpt vs wait threshold */
+	for (i = 0; i < job->num_waitchk; i++) {
+		struct host1x_waitchk *wait = &job->waitchk[i];
+		struct host1x_syncpt *sp =
+			host1x_syncpt_get(host, wait->syncpt_id);
+
+		/* validate syncpt id */
+		if (wait->syncpt_id > host1x_syncpt_nb_pts(host))
+			continue;
+
+		/* skip all other gathers */
+		if (patch_mem != wait->mem)
+			continue;
+
+		trace_host1x_syncpt_wait_check(wait->mem, wait->offset,
+				wait->syncpt_id, wait->thresh,
+				host1x_syncpt_read_min(sp));
+		if (host1x_syncpt_is_expired(
+			host1x_syncpt_get(host, wait->syncpt_id),
+			wait->thresh)) {
+			struct host1x_syncpt *sp =
+				host1x_syncpt_get(host, wait->syncpt_id);
+
+			void *patch_addr = NULL;
+
+			/*
+			 * NULL an already satisfied WAIT_SYNCPT host method,
+			 * by patching its args in the command stream. The
+			 * method data is changed to reference a reserved
+			 * (never given out or incr) NVSYNCPT_GRAPHICS_HOST
+			 * syncpt with a matching threshold value of 0, so
+			 * is guaranteed to be popped by the host HW.
+			 */
+			dev_dbg(&host->dev->dev,
+			    "drop WAIT id %d (%s) thresh 0x%x, min 0x%x\n",
+			    wait->syncpt_id, sp->name, wait->thresh,
+			    host1x_syncpt_read_min(sp));
+
+			/* patch the wait */
+			patch_addr = host1x_memmgr_kmap(h,
+					wait->offset >> PAGE_SHIFT);
+			if (patch_addr) {
+				host1x_syncpt_patch_wait(sp,
+					(patch_addr +
+						(wait->offset & ~PAGE_MASK)));
+				host1x_memmgr_kunmap(h,
+						wait->offset >> PAGE_SHIFT,
+						patch_addr);
+			} else {
+				pr_err("Couldn't map cmdbuf for wait check\n");
+			}
+		}
+
+		wait->mem = 0;
+	}
+	return 0;
+}
+
+
+static int pin_job_mem(struct host1x_job *job)
+{
+	int i;
+	int count = 0;
+	int result;
+
+	for (i = 0; i < job->num_relocs; i++) {
+		struct host1x_reloc *reloc = &job->relocarray[i];
+		job->pin_ids[count] = reloc->target;
+		count++;
+	}
+
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		job->pin_ids[count] = g->mem_id;
+		count++;
+	}
+
+	/* validate array and pin unique ids, get refs for unpinning */
+	result = host1x_memmgr_pin_array_ids(job->ch->dev,
+		job->pin_ids, job->addr_phys,
+		count,
+		job->unpins);
+
+	if (result > 0)
+		job->num_unpins = result;
+
+	return result;
+}
+
+static int do_relocs(struct host1x_job *job,
+		u32 cmdbuf_mem, struct mem_handle *h)
+{
+	int i = 0;
+	int last_page = -1;
+	void *cmdbuf_page_addr = NULL;
+
+	/* pin & patch the relocs for one gather */
+	while (i < job->num_relocs) {
+		struct host1x_reloc *reloc = &job->relocarray[i];
+
+		/* skip all other gathers */
+		if (cmdbuf_mem != reloc->cmdbuf_mem) {
+			i++;
+			continue;
+		}
+
+		if (last_page != reloc->cmdbuf_offset >> PAGE_SHIFT) {
+			if (cmdbuf_page_addr)
+				host1x_memmgr_kunmap(h,
+						last_page, cmdbuf_page_addr);
+
+			cmdbuf_page_addr = host1x_memmgr_kmap(h,
+					reloc->cmdbuf_offset >> PAGE_SHIFT);
+			last_page = reloc->cmdbuf_offset >> PAGE_SHIFT;
+
+			if (unlikely(!cmdbuf_page_addr)) {
+				pr_err("Couldn't map cmdbuf for relocation\n");
+				return -ENOMEM;
+			}
+		}
+
+		__raw_writel(
+			(job->reloc_addr_phys[i] +
+				reloc->target_offset) >> reloc->shift,
+			(cmdbuf_page_addr +
+				(reloc->cmdbuf_offset & ~PAGE_MASK)));
+
+		/* remove completed reloc from the job */
+		if (i != job->num_relocs - 1) {
+			struct host1x_reloc *reloc_last =
+				&job->relocarray[job->num_relocs - 1];
+			reloc->cmdbuf_mem	= reloc_last->cmdbuf_mem;
+			reloc->cmdbuf_offset	= reloc_last->cmdbuf_offset;
+			reloc->target		= reloc_last->target;
+			reloc->target_offset	= reloc_last->target_offset;
+			reloc->shift		= reloc_last->shift;
+			job->reloc_addr_phys[i] =
+				job->reloc_addr_phys[job->num_relocs - 1];
+			job->num_relocs--;
+		} else {
+			break;
+		}
+	}
+
+	if (cmdbuf_page_addr)
+		host1x_memmgr_kunmap(h, last_page, cmdbuf_page_addr);
+
+	return 0;
+}
+
+static int check_reloc(struct host1x_reloc *reloc,
+		u32 cmdbuf_id, int offset)
+{
+	int err = 0;
+	if (reloc->cmdbuf_mem != cmdbuf_id
+			|| reloc->cmdbuf_offset != offset * sizeof(u32))
+		err = -EINVAL;
+
+	return err;
+}
+
+static int check_mask(struct host1x_job *job,
+		struct platform_device *pdev,
+		struct host1x_reloc **reloc, int *num_relocs,
+		u32 cmdbuf_id, int *offset,
+		u32 *words, u32 class, u32 reg, u32 mask)
+{
+	while (mask) {
+		if (*words == 0)
+			return -EINVAL;
+
+		if (mask & 1) {
+			if (job->is_addr_reg(pdev, class, reg)) {
+				if (!*num_relocs ||
+					check_reloc(*reloc, cmdbuf_id, *offset))
+					return -EINVAL;
+				(*reloc)++;
+				(*num_relocs)--;
+			}
+			(*words)--;
+			(*offset)++;
+		}
+		mask >>= 1;
+		reg += 1;
+	}
+
+	return 0;
+}
+
+static int check_incr(struct host1x_job *job,
+		struct platform_device *pdev,
+		struct host1x_reloc **reloc, int *num_relocs,
+		u32 cmdbuf_id, int *offset,
+		u32 *words, u32 class, u32 reg, u32 count)
+{
+	while (count) {
+		if (*words == 0)
+			return -EINVAL;
+
+		if (job->is_addr_reg(pdev, class, reg)) {
+			if (!*num_relocs ||
+				check_reloc(*reloc, cmdbuf_id, *offset))
+				return -EINVAL;
+			(*reloc)++;
+			(*num_relocs)--;
+		}
+		reg += 1;
+		(*words)--;
+		(*offset)++;
+		count--;
+	}
+
+	return 0;
+}
+
+static int check_nonincr(struct host1x_job *job,
+		struct platform_device *pdev,
+		struct host1x_reloc **reloc, int *num_relocs,
+		u32 cmdbuf_id, int *offset,
+		u32 *words, u32 class, u32 reg, u32 count)
+{
+	int is_addr_reg = job->is_addr_reg(pdev, class, reg);
+
+	while (count) {
+		if (*words == 0)
+			return -EINVAL;
+
+		if (is_addr_reg) {
+			if (!*num_relocs ||
+				check_reloc(*reloc, cmdbuf_id, *offset))
+				return -EINVAL;
+			(*reloc)++;
+			(*num_relocs)--;
+		}
+		(*words)--;
+		(*offset)++;
+		count--;
+	}
+
+	return 0;
+}
+
+static int validate(struct host1x_job *job, struct platform_device *pdev,
+		struct host1x_job_gather *g)
+{
+	struct host1x_reloc *reloc = job->relocarray;
+	int num_relocs = job->num_relocs;
+	u32 *cmdbuf_base;
+	int offset = 0;
+	unsigned int words;
+	int err = 0;
+	int class = 0;
+
+	if (!job->is_addr_reg)
+		return 0;
+
+	cmdbuf_base = host1x_memmgr_mmap(g->ref);
+	if (!cmdbuf_base)
+		return -ENOMEM;
+
+	words = g->words;
+	while (words && !err) {
+		u32 word = cmdbuf_base[offset];
+		u32 opcode = (word & 0xf0000000) >> 28;
+		u32 mask = 0;
+		u32 reg = 0;
+		u32 count = 0;
+
+		words--;
+		offset++;
+
+		switch (opcode) {
+		case 0:
+			class = word >> 6 & 0x3ff;
+			mask = word & 0x3f;
+			reg = word >> 16 & 0xfff;
+			err = check_mask(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, mask);
+			if (err)
+				goto out;
+			break;
+		case 1:
+			reg = word >> 16 & 0xfff;
+			count = word & 0xffff;
+			err = check_incr(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, count);
+			if (err)
+				goto out;
+			break;
+
+		case 2:
+			reg = word >> 16 & 0xfff;
+			count = word & 0xffff;
+			err = check_nonincr(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, count);
+			if (err)
+				goto out;
+			break;
+
+		case 3:
+			mask = word & 0xffff;
+			reg = word >> 16 & 0xfff;
+			err = check_mask(job, pdev,
+					&reloc, &num_relocs, g->mem_id,
+					&offset, &words, class, reg, mask);
+			if (err)
+				goto out;
+			break;
+		case 4:
+		case 5:
+		case 14:
+			break;
+		default:
+			err = -EINVAL;
+			break;
+		}
+	}
+
+	/* No relocs should remain at this point */
+	if (num_relocs)
+		err = -EINVAL;
+
+out:
+	host1x_memmgr_munmap(g->ref, cmdbuf_base);
+
+	return err;
+}
+
+static inline int copy_gathers(struct host1x_job *job,
+		struct platform_device *pdev)
+{
+	size_t size = 0;
+	size_t offset = 0;
+	int i;
+
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		size += g->words * sizeof(u32);
+	}
+
+	job->gather_copy_mapped = dma_alloc_writecombine(&pdev->dev,
+			size, &job->gather_copy, GFP_KERNEL);
+	if (IS_ERR(job->gather_copy_mapped)) {
+		int err = PTR_ERR(job->gather_copy_mapped);
+		job->gather_copy_mapped = NULL;
+		return err;
+	}
+
+	job->gather_copy_size = size;
+
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+		void *gather = host1x_memmgr_mmap(g->ref);
+		memcpy(job->gather_copy_mapped + offset,
+				gather + g->offset,
+				g->words * sizeof(u32));
+
+		g->mem_base = job->gather_copy;
+		g->offset = offset;
+		g->mem_id = 0;
+		g->ref = 0;
+
+		host1x_memmgr_munmap(g->ref, gather);
+		offset += g->words * sizeof(u32);
+	}
+
+	return 0;
+}
+
+int host1x_job_pin(struct host1x_job *job, struct platform_device *pdev)
+{
+	int err = 0, i = 0, j = 0;
+	struct host1x *host = host1x_get_host(pdev);
+	DECLARE_BITMAP(waitchk_mask, host1x_syncpt_nb_pts(host));
+
+	bitmap_zero(waitchk_mask, host1x_syncpt_nb_pts(host));
+	for (i = 0; i < job->num_waitchk; i++) {
+		u32 syncpt_id = job->waitchk[i].syncpt_id;
+		if (syncpt_id < host1x_syncpt_nb_pts(host))
+			set_bit(syncpt_id, waitchk_mask);
+	}
+
+	/* get current syncpt values for waitchk */
+	for_each_set_bit(i, &waitchk_mask[0], sizeof(waitchk_mask))
+		host1x_syncpt_load_min(host->syncpt + i);
+
+	/* pin memory */
+	err = pin_job_mem(job);
+	if (err <= 0)
+		goto out;
+
+	/* patch gathers */
+	for (i = 0; i < job->num_gathers; i++) {
+		struct host1x_job_gather *g = &job->gathers[i];
+
+		/* process each gather mem only once */
+		if (!g->ref) {
+			g->ref = host1x_memmgr_get(g->mem_id, job->ch->dev);
+			if (IS_ERR(g->ref)) {
+				err = PTR_ERR(g->ref);
+				g->ref = NULL;
+				break;
+			}
+
+			g->mem_base = job->gather_addr_phys[i];
+
+			for (j = 0; j < job->num_gathers; j++) {
+				struct host1x_job_gather *tmp =
+					&job->gathers[j];
+				if (!tmp->ref && tmp->mem_id == g->mem_id) {
+					tmp->ref = g->ref;
+					tmp->mem_base = g->mem_base;
+				}
+			}
+			err = 0;
+			if (host1x_firewall)
+				err = validate(job, pdev, g);
+			if (err)
+				dev_err(&pdev->dev,
+					"Job validate returned %d\n", err);
+			if (!err)
+				err = do_relocs(job, g->mem_id,  g->ref);
+			if (!err)
+				err = do_waitchks(job, host,
+						g->mem_id, g->ref);
+			host1x_memmgr_put(g->ref);
+			if (err)
+				break;
+		}
+	}
+
+	if (host1x_firewall && !err) {
+		err = copy_gathers(job, pdev);
+		if (err) {
+			host1x_job_unpin(job);
+			return err;
+		}
+	}
+
+out:
+	wmb();
+
+	return err;
+}
+
+void host1x_job_unpin(struct host1x_job *job)
+{
+	int i;
+
+	for (i = 0; i < job->num_unpins; i++) {
+		struct host1x_job_unpin_data *unpin = &job->unpins[i];
+		host1x_memmgr_unpin(unpin->h, unpin->mem);
+		host1x_memmgr_put(unpin->h);
+	}
+	job->num_unpins = 0;
+
+	if (job->gather_copy_size)
+		dma_free_writecombine(&job->ch->dev->dev,
+			job->gather_copy_size,
+			job->gather_copy_mapped, job->gather_copy);
+}
+
+/*
+ * Debug routine used to dump job entries
+ */
+void host1x_job_dump(struct device *dev, struct host1x_job *job)
+{
+	dev_dbg(dev, "    SYNCPT_ID   %d\n",
+		job->syncpt_id);
+	dev_dbg(dev, "    SYNCPT_VAL  %d\n",
+		job->syncpt_end);
+	dev_dbg(dev, "    FIRST_GET   0x%x\n",
+		job->first_get);
+	dev_dbg(dev, "    TIMEOUT     %d\n",
+		job->timeout);
+	dev_dbg(dev, "    NUM_SLOTS   %d\n",
+		job->num_slots);
+	dev_dbg(dev, "    NUM_HANDLES %d\n",
+		job->num_unpins);
+}
diff --git a/drivers/gpu/host1x/job.h b/drivers/gpu/host1x/job.h
new file mode 100644
index 0000000..428c670
--- /dev/null
+++ b/drivers/gpu/host1x/job.h
@@ -0,0 +1,164 @@
+/*
+ * Tegra host1x Job
+ *
+ * Copyright (c) 2011-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HOST1X_JOB_H
+#define __HOST1X_JOB_H
+
+struct platform_device;
+
+struct host1x_job_gather {
+	u32 words;
+	dma_addr_t mem_base;
+	u32 mem_id;
+	int offset;
+	struct mem_handle *ref;
+};
+
+struct host1x_cmdbuf {
+	__u32 mem;
+	__u32 offset;
+	__u32 words;
+	__u32 pad;
+};
+
+struct host1x_reloc {
+	__u32 cmdbuf_mem;
+	__u32 cmdbuf_offset;
+	__u32 target;
+	__u32 target_offset;
+	__u32 shift;
+	__u32 pad;
+};
+
+struct host1x_waitchk {
+	__u32 mem;
+	__u32 offset;
+	__u32 syncpt_id;
+	__u32 thresh;
+};
+
+/*
+ * Each submit is tracked as a host1x_job.
+ */
+struct host1x_job {
+	/* When refcount goes to zero, job can be freed */
+	struct kref ref;
+
+	/* List entry */
+	struct list_head list;
+
+	/* Channel where job is submitted to */
+	struct host1x_channel *ch;
+
+	int clientid;
+
+	/* Gathers and their memory */
+	struct host1x_job_gather *gathers;
+	int num_gathers;
+
+	/* Wait checks to be processed at submit time */
+	struct host1x_waitchk *waitchk;
+	int num_waitchk;
+	u32 waitchk_mask;
+
+	/* Array of handles to be pinned & unpinned */
+	struct host1x_reloc *relocarray;
+	int num_relocs;
+	struct host1x_job_unpin_data *unpins;
+	int num_unpins;
+
+	dma_addr_t *addr_phys;
+	dma_addr_t *gather_addr_phys;
+	dma_addr_t *reloc_addr_phys;
+
+	/* Sync point id, number of increments and end related to the submit */
+	u32 syncpt_id;
+	u32 syncpt_incrs;
+	u32 syncpt_end;
+
+	/* Maximum time to wait for this job */
+	int timeout;
+
+	/* Null kickoff prevents submit from being sent to hardware */
+	bool null_kickoff;
+
+	/* Index and number of slots used in the push buffer */
+	int first_get;
+	int num_slots;
+
+	/* Copy of gathers */
+	size_t gather_copy_size;
+	dma_addr_t gather_copy;
+	u8 *gather_copy_mapped;
+
+	/* Temporary space for unpin ids */
+	long unsigned int *pin_ids;
+
+	/* Check if register is marked as an address reg */
+	int (*is_addr_reg)(struct platform_device *dev, u32 reg, u32 class);
+
+	/* Request a SETCLASS to this class */
+	u32 class;
+
+	/* Add a channel wait for previous ops to complete */
+	u32 serialize;
+};
+/*
+ * Allocate memory for a job. Just enough memory will be allocated to
+ * accomodate the submit.
+ */
+struct host1x_job *host1x_job_alloc(struct host1x_channel *ch,
+		u32 num_cmdbufs, u32 num_relocs, u32 num_waitchks);
+
+/*
+ * Add a gather to a job.
+ */
+void host1x_job_add_gather(struct host1x_job *job,
+		u32 mem_id, u32 words, u32 offset);
+
+/*
+ * Increment reference going to host1x_job.
+ */
+void host1x_job_get(struct host1x_job *job);
+
+/*
+ * Decrement reference job, free if goes to zero.
+ */
+void host1x_job_put(struct host1x_job *job);
+
+/*
+ * Pin memory related to job. This handles relocation of addresses to the
+ * host1x address space. Handles both the gather memory and any other memory
+ * referred to from the gather buffers.
+ *
+ * Handles also patching out host waits that would wait for an expired sync
+ * point value.
+ */
+int host1x_job_pin(struct host1x_job *job, struct platform_device *pdev);
+
+/*
+ * Unpin memory related to job.
+ */
+void host1x_job_unpin(struct host1x_job *job);
+
+/*
+ * Dump contents of job to debug output.
+ */
+void host1x_job_dump(struct device *dev, struct host1x_job *job);
+
+#endif
diff --git a/drivers/gpu/host1x/memmgr.c b/drivers/gpu/host1x/memmgr.c
new file mode 100644
index 0000000..eceb782
--- /dev/null
+++ b/drivers/gpu/host1x/memmgr.c
@@ -0,0 +1,173 @@
+/*
+ * Tegra host1x Memory Management Abstraction
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/err.h>
+
+#include "memmgr.h"
+#include "cma.h"
+
+struct mem_handle *host1x_memmgr_alloc(size_t size, size_t align, int flags)
+{
+	return NULL;
+}
+
+struct mem_handle *host1x_memmgr_get(u32 id, struct platform_device *dev)
+{
+	struct mem_handle *h = NULL;
+
+	switch (host1x_memmgr_type(id)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		h = (struct mem_handle *) host1x_cma_get(id, dev);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return h;
+}
+
+void host1x_memmgr_put(struct mem_handle *handle)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_put(handle);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+struct sg_table *host1x_memmgr_pin(struct mem_handle *handle)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		return host1x_cma_pin(handle);
+		break;
+#endif
+	default:
+		return NULL;
+		break;
+	}
+}
+
+void host1x_memmgr_unpin(struct mem_handle *handle, struct sg_table *sgt)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_unpin(handle, sgt);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+void *host1x_memmgr_mmap(struct mem_handle *handle)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		return host1x_cma_mmap(handle);
+		break;
+#endif
+	default:
+		return NULL;
+		break;
+	}
+}
+
+void host1x_memmgr_munmap(struct mem_handle *handle, void *addr)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_munmap(handle, addr);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+void *host1x_memmgr_kmap(struct mem_handle *handle, unsigned int pagenum)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		return host1x_cma_kmap(handle, pagenum);
+		break;
+#endif
+	default:
+		return NULL;
+		break;
+	}
+}
+
+void host1x_memmgr_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr)
+{
+	switch (host1x_memmgr_type((u32)handle)) {
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	case mem_mgr_type_cma:
+		host1x_cma_kunmap(handle, pagenum, addr);
+		break;
+#endif
+	default:
+		break;
+	}
+}
+
+int host1x_memmgr_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		dma_addr_t *phys_addr,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data)
+{
+	int pin_count = 0;
+
+#if defined(CONFIG_TEGRA_HOST1X_CMA)
+	{
+		int cma_count = host1x_cma_pin_array_ids(dev,
+			ids, MEMMGR_TYPE_MASK,
+			mem_mgr_type_cma,
+			count, &unpin_data[pin_count],
+			phys_addr);
+
+		if (cma_count < 0) {
+			/* clean up previous handles */
+			while (pin_count) {
+				pin_count--;
+				/* unpin, put */
+				host1x_memmgr_unpin(unpin_data[pin_count].h,
+						unpin_data[pin_count].mem);
+				host1x_memmgr_put(unpin_data[pin_count].h);
+			}
+			return cma_count;
+		}
+		pin_count += cma_count;
+	}
+#endif
+	return pin_count;
+}
diff --git a/drivers/gpu/host1x/memmgr.h b/drivers/gpu/host1x/memmgr.h
new file mode 100644
index 0000000..a265fe8
--- /dev/null
+++ b/drivers/gpu/host1x/memmgr.h
@@ -0,0 +1,72 @@
+/*
+ * Tegra host1x Memory Management Abstraction header
+ *
+ * Copyright (c) 2012-2013, NVIDIA Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HOST1X_MEM_MGR_H
+#define _HOST1X_MEM_MGR_H
+
+struct mem_handle;
+struct platform_device;
+
+struct host1x_job_unpin_data {
+	struct mem_handle *h;
+	struct sg_table *mem;
+};
+
+enum mem_mgr_flag {
+	mem_mgr_flag_uncacheable = 0,
+	mem_mgr_flag_write_combine = 1,
+};
+
+/* Buffer encapsulation */
+enum mem_mgr_type {
+	mem_mgr_type_cma = 2,
+};
+
+#define MEMMGR_TYPE_MASK	0x3
+#define MEMMGR_ID_MASK		~0x3
+
+static inline int host1x_memmgr_type(u32 id) { return id & MEMMGR_TYPE_MASK; }
+static inline int host1x_memmgr_id(u32 id) { return id & MEMMGR_ID_MASK; }
+static inline unsigned int host1x_memmgr_host1x_id(u32 type, u32 handle)
+{
+	if (host1x_memmgr_type(type) != type ||
+		host1x_memmgr_id(handle) != handle)
+		return 0;
+
+	return handle | type;
+}
+
+struct mem_handle *host1x_memmgr_alloc(size_t size, size_t align,
+		int flags);
+struct mem_handle *host1x_memmgr_get(u32 id, struct platform_device *dev);
+void host1x_memmgr_put(struct mem_handle *handle);
+struct sg_table *host1x_memmgr_pin(struct mem_handle *handle);
+void host1x_memmgr_unpin(struct mem_handle *handle, struct sg_table *sgt);
+void *host1x_memmgr_mmap(struct mem_handle *handle);
+void host1x_memmgr_munmap(struct mem_handle *handle, void *addr);
+void *host1x_memmgr_kmap(struct mem_handle *handle, unsigned int pagenum);
+void host1x_memmgr_kunmap(struct mem_handle *handle, unsigned int pagenum,
+		void *addr);
+
+int host1x_memmgr_pin_array_ids(struct platform_device *dev,
+		long unsigned *ids,
+		dma_addr_t *phys_addr,
+		u32 count,
+		struct host1x_job_unpin_data *unpin_data);
+
+#endif
diff --git a/drivers/gpu/host1x/syncpt.c b/drivers/gpu/host1x/syncpt.c
index 32e2b42..f21c688 100644
--- a/drivers/gpu/host1x/syncpt.c
+++ b/drivers/gpu/host1x/syncpt.c
@@ -287,6 +287,12 @@ void host1x_syncpt_debug(struct host1x_syncpt *sp)
 	sp->dev->syncpt_op.debug(sp);
 }
 
+/* remove a wait pointed to by patch_addr */
+int host1x_syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr)
+{
+	return sp->dev->syncpt_op.patch_wait(sp, patch_addr);
+}
+
 int host1x_syncpt_init(struct host1x *host)
 {
 	struct host1x_syncpt *syncpt, *sp;
@@ -305,6 +311,11 @@ int host1x_syncpt_init(struct host1x *host)
 
 	host->syncpt = syncpt;
 
+	/* Allocate sync point to use for clearing waits for expired fences */
+	host->nop_sp = _host1x_syncpt_alloc(host, NULL, 0);
+	if (!host->nop_sp)
+		return -ENOMEM;
+
 	return 0;
 }
 
diff --git a/drivers/gpu/host1x/syncpt.h b/drivers/gpu/host1x/syncpt.h
index b46d044..255a3a3 100644
--- a/drivers/gpu/host1x/syncpt.h
+++ b/drivers/gpu/host1x/syncpt.h
@@ -26,6 +26,7 @@
 struct host1x;
 
 #define NVSYNCPT_INVALID			(-1)
+#define NVSYNCPT_GRAPHICS_HOST			0
 
 struct host1x_syncpt {
 	int id;
@@ -145,6 +146,9 @@ static inline int host1x_syncpt_is_valid(struct host1x_syncpt *sp)
 		sp->id < host1x_syncpt_nb_pts(sp->dev);
 }
 
+/* Patch a wait by replacing it with a wait for syncpt 0 value 0 */
+int host1x_syncpt_patch_wait(struct host1x_syncpt *sp, void *patch_addr);
+
 /* Return id of the sync point */
 u32 host1x_syncpt_id(struct host1x_syncpt *sp);
 
diff --git a/include/trace/events/host1x.h b/include/trace/events/host1x.h
index 3c14cac..c63d75c 100644
--- a/include/trace/events/host1x.h
+++ b/include/trace/events/host1x.h
@@ -37,6 +37,190 @@ DECLARE_EVENT_CLASS(host1x,
 	TP_printk("name=%s", __entry->name)
 );
 
+DEFINE_EVENT(host1x, host1x_channel_open,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+DEFINE_EVENT(host1x, host1x_channel_release,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+DEFINE_EVENT(host1x, host1x_cdma_begin,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+DEFINE_EVENT(host1x, host1x_cdma_end,
+	TP_PROTO(const char *name),
+	TP_ARGS(name)
+);
+
+TRACE_EVENT(host1x,
+	TP_PROTO(const char *name, int timeout),
+
+	TP_ARGS(name, timeout),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(int, timeout)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->timeout = timeout;
+	),
+
+	TP_printk("name=%s, timeout=%d",
+		__entry->name, __entry->timeout)
+);
+
+TRACE_EVENT(host1x_cdma_push,
+	TP_PROTO(const char *name, u32 op1, u32 op2),
+
+	TP_ARGS(name, op1, op2),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, op1)
+		__field(u32, op2)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->op1 = op1;
+		__entry->op2 = op2;
+	),
+
+	TP_printk("name=%s, op1=%08x, op2=%08x",
+		__entry->name, __entry->op1, __entry->op2)
+);
+
+TRACE_EVENT(host1x_cdma_push_gather,
+	TP_PROTO(const char *name, u32 mem_id,
+			u32 words, u32 offset, void *cmdbuf),
+
+	TP_ARGS(name, mem_id, words, offset, cmdbuf),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, mem_id)
+		__field(u32, words)
+		__field(u32, offset)
+		__field(bool, cmdbuf)
+		__dynamic_array(u32, cmdbuf, words)
+	),
+
+	TP_fast_assign(
+		if (cmdbuf) {
+			memcpy(__get_dynamic_array(cmdbuf), cmdbuf+offset,
+					words * sizeof(u32));
+		}
+		__entry->cmdbuf = cmdbuf;
+		__entry->name = name;
+		__entry->mem_id = mem_id;
+		__entry->words = words;
+		__entry->offset = offset;
+	),
+
+	TP_printk("name=%s, mem_id=%08x, words=%u, offset=%d, contents=[%s]",
+	  __entry->name, __entry->mem_id,
+	  __entry->words, __entry->offset,
+	  __print_hex(__get_dynamic_array(cmdbuf),
+		  __entry->cmdbuf ? __entry->words * 4 : 0))
+);
+
+TRACE_EVENT(host1x_channel_submit,
+	TP_PROTO(const char *name, u32 cmdbufs, u32 relocs, u32 waitchks,
+			u32 syncpt_id, u32 syncpt_incrs),
+
+	TP_ARGS(name, cmdbufs, relocs, waitchks, syncpt_id, syncpt_incrs),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, cmdbufs)
+		__field(u32, relocs)
+		__field(u32, waitchks)
+		__field(u32, syncpt_id)
+		__field(u32, syncpt_incrs)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->cmdbufs = cmdbufs;
+		__entry->relocs = relocs;
+		__entry->waitchks = waitchks;
+		__entry->syncpt_id = syncpt_id;
+		__entry->syncpt_incrs = syncpt_incrs;
+	),
+
+	TP_printk("name=%s, cmdbufs=%u, relocs=%u, waitchks=%d,"
+		"syncpt_id=%u, syncpt_incrs=%u",
+	  __entry->name, __entry->cmdbufs, __entry->relocs, __entry->waitchks,
+	  __entry->syncpt_id, __entry->syncpt_incrs)
+);
+
+TRACE_EVENT(host1x_channel_submitted,
+	TP_PROTO(const char *name, u32 syncpt_base, u32 syncpt_max),
+
+	TP_ARGS(name, syncpt_base, syncpt_max),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, syncpt_base)
+		__field(u32, syncpt_max)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->syncpt_base = syncpt_base;
+		__entry->syncpt_max = syncpt_max;
+	),
+
+	TP_printk("name=%s, syncpt_base=%d, syncpt_max=%d",
+		__entry->name, __entry->syncpt_base, __entry->syncpt_max)
+);
+
+TRACE_EVENT(host1x_channel_submit_complete,
+	TP_PROTO(const char *name, int count, u32 thresh),
+
+	TP_ARGS(name, count, thresh),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(int, count)
+		__field(u32, thresh)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->count = count;
+		__entry->thresh = thresh;
+	),
+
+	TP_printk("name=%s, count=%d, thresh=%d",
+		__entry->name, __entry->count, __entry->thresh)
+);
+
+TRACE_EVENT(host1x_wait_cdma,
+	TP_PROTO(const char *name, u32 eventid),
+
+	TP_ARGS(name, eventid),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(u32, eventid)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->eventid = eventid;
+	),
+
+	TP_printk("name=%s, event=%d", __entry->name, __entry->eventid)
+);
+
 TRACE_EVENT(host1x_syncpt_load_min,
 	TP_PROTO(u32 id, u32 val),
 
@@ -55,6 +239,33 @@ TRACE_EVENT(host1x_syncpt_load_min,
 	TP_printk("id=%d, val=%d", __entry->id, __entry->val)
 );
 
+TRACE_EVENT(host1x_syncpt_wait_check,
+	TP_PROTO(u32 mem_id, u32 offset, u32 syncpt_id, u32 thresh, u32 min),
+
+	TP_ARGS(mem_id, offset, syncpt_id, thresh, min),
+
+	TP_STRUCT__entry(
+		__field(u32, mem_id)
+		__field(u32, offset)
+		__field(u32, syncpt_id)
+		__field(u32, thresh)
+		__field(u32, min)
+	),
+
+	TP_fast_assign(
+		__entry->mem_id = mem_id;
+		__entry->offset = offset;
+		__entry->syncpt_id = syncpt_id;
+		__entry->thresh = thresh;
+		__entry->min = min;
+	),
+
+	TP_printk("mem_id=%08x, offset=%05x, id=%d, thresh=%d, current=%d",
+		__entry->mem_id, __entry->offset,
+		__entry->syncpt_id, __entry->thresh,
+		__entry->min)
+);
+
 #endif /*  _TRACE_HOST1X_H */
 
 /* This part must be outside protection */
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-tegra" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html