mesa/r300-bufmgr.patch

commit 576ee3db94bd79092fe7f69b4ac8293367b6dfb1
Author: Dave Airlie <airlied@redhat.com>
Date:   Thu Oct 23 10:27:54 2008 +1000

    r300: fallback to sw rendering if we can't fit textures into aperture

commit bd018ad09b6225a6386c57d8ca8eb37f74025006
Author: Dave Airlie <airlied@redhat.com>
Date:   Thu Oct 23 10:13:28 2008 +1000

    r300: add bufmgr vram sizing to stop overflowing textures in VRAM
    
    a) needs to be redone for bufmgr in screen
    b) need to add GART aperture sizing also.

commit e768e7df0f6b7f61f82d70a55c7419c359b17cb2
Author: Dave Airlie <airlied@redhat.com>
Date:   Sun Oct 19 18:56:56 2008 +1000

    r300: fixup no kms case for map/unmap

commit 6b6995112d3223496c04e7afc338ffc9bcc5093e
Author: Dave Airlie <airlied@redhat.com>
Date:   Sun Oct 19 18:42:24 2008 +1000

    radeon: disable leak detection for now

commit 445d367fed8b55a1351788dbf3e2e303e56095b5
Author: Dave Airlie <airlied@redhat.com>
Date:   Sun Oct 19 18:28:45 2008 +1000

    r300: only enable set tex offset for non-kernel mm systems

commit 737b174d7e82cc277d877ff810ffe058e1aa4522
Author: Dave Airlie <airlied@redhat.com>
Date:   Mon Oct 13 15:40:58 2008 +1000

    radeon: fixup prototypes

commit a4fa4aaf6754e2e35cee42924dc141738b7edc27
Author: Dave Airlie <airlied@redhat.com>
Date:   Mon Oct 13 15:39:46 2008 +1000

    radeon: fix rounding to avoid kernel EFAULT

commit c6b6a65281b8e98f5050f142d284e6ba622bea1b
Author: Dave Airlie <airlied@redhat.com>
Date:   Mon Oct 13 15:39:20 2008 +1000

    r300: with real buffers - sw fallbacks to hardcoded maps
    
    so don't screw with virtual

commit c41a002914ab9cfd8f49199e4a4612a6aeecf0b5
Author: Dave Airlie <airlied@redhat.com>
Date:   Tue Sep 30 16:51:04 2008 +1000

    r300: make work again
    
    Conflicts:
    
    	src/mesa/drivers/dri/r300/r300_mem.c
    	src/mesa/drivers/dri/r300/r300_mem.h
    	src/mesa/drivers/dri/r300/r300_texstate.c
    	src/mesa/drivers/dri/r300/radeon_context.c
    	src/mesa/drivers/dri/radeon/radeon_buffer.h

commit 289b38b16b9e0aae4d51bac8c745359a36c0d54e
Author: Dave Airlie <airlied@redhat.com>
Date:   Fri Sep 5 14:11:25 2008 +1000

    drop r300 use of ttm flags

commit 1ca0e2ac03b4fd95ee9cd2dbe82bb7a0e9c1614d
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Mon Sep 29 09:40:50 2008 +0200

    r300: Fix crash in r300ClearBuffer
    
    That function can currently be called without a radeon_renderbuffer
    when only Z/stencil needs clearing.

commit 4d19a9b9ac84a972d0807ff931df596418d9a7f4
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Sep 28 19:37:52 2008 +0200

    r300_mem: Prefer using screen structures over context structures
    
    Eventually, bufmgr will have to live with only screen structures for
    correctness.

commit 8d42990244a196b85ceb1439ad2b5eef6c559d61
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Sep 28 19:20:40 2008 +0200

    r300: Remove radeon_bufmgr
    
    Since the bufmgr is now driver-specific anyway, there's no point in having
    the additional indirection around.

commit 3dba9178ec5dbf8183a5ceee701c9a36606ad464
Merge: 857362e... 2e5d717...
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Sep 28 18:28:05 2008 +0200

    Merge branch 'master' into develop
    
    Conflicts:
    
    	src/mesa/drivers/dri/r300/r300_tex.c
    	src/mesa/drivers/dri/r300/r300_texstate.c
    	src/mesa/drivers/dri/r300/radeon_context.c
    	src/mesa/drivers/dri/r300/radeon_context.h

commit 857362ef14fc4f175a9d425a74f9ca2b691971d6
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Tue Aug 19 22:02:41 2008 +0200

    r300: Reasonably hacked up texture images

commit b3153fc6f7823b434ab82faec4e966c3311f8ce8
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Fri Aug 29 17:55:37 2008 +0200

    Call FreeTexImageData unconditionally during mipmap generation.
    
    When drivers override texture image storage, Data may be null even though
    storage has been reserved, e.g. when the data is currently in video memory
    but not mapped into CPU-accessible memory.

commit d8d306220c19f0cc60d14876021543fe6e542775
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Fri Aug 29 18:40:05 2008 +0200

    r300: Remove some debug messages

commit b179525a87b5bf510628918406e3562e2d138ce7
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Tue Aug 26 23:21:24 2008 +0200

    r300: Fix various software fallback regressions
    
    Renderbuffers are now mapped and unmapped properly for software rendering,
    and various assorted fixes.

commit ebf556644058c748c7c621b1e5ae12dc900a2dd0
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Fri Aug 15 19:55:59 2008 +0200

    r300: Remove unused field pitch in r300_tex_obj

commit 7db556b55baa392a951f92074c058868ead2edb5
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Tue Aug 26 19:02:14 2008 +0200

    radeon: Add missing header radeon_buffer.h

commit 093023518b81c11d0c3242a27b90def9a7e2e1c0
Author: Adam Jackson <ajax@redhat.com>
Date:   Sun Aug 24 18:47:53 2008 +1000

    r300: move color/depth buffers to using fake buffers
    
    Emit state for color/depth properly
    
    This also moves the bufmgr into the radeon struct, as we
    need it from make current. It won't affect the other chips yet.
    
    can probably move fbLocation into the bufmgr soon

commit 004100f123ca3ea4b35e16ae5d91911ad02b327f
Author: Dave Airlie <airlied@linux.ie>
Date:   Sun Aug 24 18:33:06 2008 +1000

    r300: add static BO alloc to classic bufmgr

commit 6827a4d742cf2b815d2651dd7ffe002f46a07b1b
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Fri Aug 8 18:37:37 2008 +0200

    r300: Fix regression when redefining texture images
    
    We obviously have to destroy and recreate the miptree when incompatible
    changes to the texture objects are made.

commit b3b221caefbcbb749fea9dc1040e4313fc9ea717
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Mon Aug 4 01:04:15 2008 +0200

    r300: Texture memory managed by bufmgr
    
    The bufmgr makes use of the texture heap by wrapping the old texmem.c code.
    
    There are currently three copies of each texture image:
    - Mesa's storage
    - bufmgr backing store
    - VRAM (if the texture has recently been used)
    The first one should eventually be eliminated by making changes along the
    lines of the Intel driver.

commit 7eed94472049bba4e41bf7e51c51d0cc418cfc87
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 21:59:52 2008 +0200

    r300: Introduce r300_mipmap_tree

commit 6d8ee45f31c11cae2424e987a0577f9b7e26762d
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 17:23:50 2008 +0200

    r300: Start pushing driTextureObject down

commit f403a51e2aedfb04f845d03c1aa1023c7e518e45
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 15:59:04 2008 +0200

    r300: Remove unused texture object variables

commit 4e432ffb0a403c44898ea608a7a1d7a515b59a73
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 14:28:59 2008 +0200

    r300: Remove bo_use
    
    Validating buffers and marking them as pending is now done automatically
    based on relocations.

commit 604515f17698572be3a2ae61b6843b4acef02f2e
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 14:27:05 2008 +0200

    r300: Remove last traces of r300_dma_region

commit 656a1df66fbf5d2f748babf6be58524ee4b65bcc
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 14:18:42 2008 +0200

    r300: No longer use r300_dma_region for dma.current buffer

commit b1267a2c7b144f7286ae901cea02c4e81cb435ab
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 13:41:12 2008 +0200

    r300: Stop using r300_dma_region for elt_dma

commit 0ff9bcf22b7fc0c8a0fa55ec15c3c3ba176f2afd
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 12:37:51 2008 +0200

    r300: Remove unused variable r300_state.Elts

commit 723af37157ffa8be9be710c8e168386f02035909
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 12:26:28 2008 +0200

    r300: Fix interaction of ClearState emits with swtcl

commit ecedb19db194721c6d2030287e346ca6b3d50168
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 11:18:48 2008 +0200

    r300: Fix Clear state emission

commit e1bdd637c0740c8e3c00874ff3301a361df9c5b3
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 10:41:17 2008 +0200

    r300: Use explicit r300_aos structure instead of overloaded r300_dma_region

commit fa63e5eea06d6ac709465ef2cfef4e1760ce0a62
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sun Aug 3 00:23:06 2008 +0200

    r300: Remove unused variable

commit 168b082ec48cb6aedb3d7a45538e65fa24d07f1a
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:50:15 2008 +0200

    r300: Use relocations in texture blits

commit 723a451b5ecc2862701f0b1f7c518902c4675fe1
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:29:36 2008 +0200

    r300: Remove old emit helpers

commit 511a071823e58a5861f0afb9af086411fc3aa7eb
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:28:14 2008 +0200

    r300: Convert swtcl to new emit system

commit 4288e01990c66299830088942094772bb938e597
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:27:18 2008 +0200

    r300: Complete copnversion of r300ClearBuffer

commit 3acc8c5ac2c8aed11a4196387db2016a399ef12a
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:15:23 2008 +0200

    r300: Continue convering ClearState to new emit

commit 5e49bf6abf82bee702b51248dea65404267b1856
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:13:58 2008 +0200

    r300: Continue convering ClearState to new emit

commit b94bad12c3bc000e93e139cceaf761e0cb9805f6
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:10:35 2008 +0200

    r300: Convert ClearState to new emit

commit 8937fcc3c100bdd16a26a3a91953e8bfded6a17b
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:07:33 2008 +0200

    r300: Use new style emit in r300ClearBuffer

commit e231031ece3ee08618b781cf4b9c29b60258a8c4
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:05:15 2008 +0200

    r300: Prepare Clear implementation for new style emit

commit ed54357c566d7c7ed84cca33d45388d9986e1a75
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:02:38 2008 +0200

    r300: Rewrite cp_delay and cp_wait using new emit

commit 8ab20decadddac4f16a906a8411d1ff64be2a03f
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 23:01:30 2008 +0200

    r300: Rewrite end_3d using new emit

commit 7316a87cb95375cc79bb957695fc39668c55ca26
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:56:29 2008 +0200

    r300: Use new emit for r300EmitBlit and get rid of redundant r300EmitWait

commit e2e3062864f5fd7332e0ba3356f5047b66608ba6
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:55:15 2008 +0200

    r300: Use new emit in r300EmitCacheFlush

commit 021d49403d259bb43982235f8e47f04f104545cd
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:54:01 2008 +0200

    r300: Prepare texture blitting for new emit style

commit e3640e75a22e1647b8a4237212fd9091516cc0d0
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:48:59 2008 +0200

    r300: Use new emit style for memory fencing

commit 887c24496910df9d4efbb597a4e7abb3ad655f54
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:46:48 2008 +0200

    r300: Use new emit style in r300_render

commit 196a156ed067f00a38ef24cd89084392dbe58e95
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:41:31 2008 +0200

    r300: Add additional helper macros for emit

commit dad24c477db1db583cddfb407937e0f749e3a063
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:39:52 2008 +0200

    r300: Use new emit style for r300EmitAtoms

commit ebb3329c089ad8db18998e85135fb0eabe3f547c
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 22:34:23 2008 +0200

    r300: Start moving cmdbuf towards an emit system that is closer to DRM and DDX

commit 296d81a420e0831b2c795a050442acf7e5eaa4d6
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 20:01:27 2008 +0200

    r300: Move r300EmitWait and r300EmitBlit to r300_emit.c
    
    These functions don't really fit into r300_cmdbuf.c.

commit 3c7b7b60f5736698b0c68d1100baa3feba7dfa14
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 19:15:19 2008 +0200

    r300: Remove unused variable buf0_address

commit 1a25eb30d6f7567946b920fa58d760959db6726d
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 19:12:50 2008 +0200

    r300: Be more conservative about command buffer space allocation
    
    Having the main rendering sequence split up into separate command buffer is
    probably one of the worst offenders for hardware lockups, so this might help.
    It can't hurt in any case.

commit 88d3539f3a6cc6d81ec91b901bbd1b76bf38ba67
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 18:55:19 2008 +0200

    r300: Manage cmdbuf via the r300_mem dri_bufmgr
    
    Relocation code isn't actually used yet.

commit bab989ab8aba252b6a8a041a517308aeb22ed897
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 15:43:07 2008 +0200

    r300: Remove more GART-related legacy code

commit ccd0bf4dbabeb57fc6a496a76aa7258d4451b190
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 15:31:46 2008 +0200

    r300: Eliminate Gart- and ClientStorage-related functions
    
    These functions will be incorrect with a new memory manager. Furthermore,
    they seem to be no-ops right now, judging from the comments and a cursory
    inspection.
    
    Actually, it's possible that applications out there have been using the
    client_storage extension successfully, who knows - but they should continue
    to work, just with client_storage turned into a no-op.

commit 9fb9931fa7e4fe0d82d73b00f6347ea07f9afa32
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 15:17:44 2008 +0200

    r300: Start turning r300_mem into a dri_bufmgr-style manager
    
    This is only for DMA memory right now, textures do not go through this
    manager yet.

commit 990a685e1191b875ec7c7d2d481def37554877db
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 12:15:22 2008 +0200

    r300: Remove unused variable r300_state.swtcl_dma

commit 50ad5c9a8f1da5fdf51b1ca17259369f0a7dbab6
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 11:30:42 2008 +0200

    r300: Remove unused or ineffective and untested variables

commit 8f71ab684448f5caa0c649e52bb6ea1d0bb86077
Author: Nicolai Haehnle <nhaehnle@gmail.com>
Date:   Sat Aug 2 11:09:36 2008 +0200

    r300: Get rid of USER_BUFFERS define
diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index e9144ac..c26c43b 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -48,7 +48,9 @@ SYMLINKS = \
 COMMON_SYMLINKS = \
 	radeon_chipset.h \
 	radeon_screen.c \
-	radeon_screen.h
+	radeon_screen.h \
+	radeon_buffer.h \
+	radeon_dri_bufmgr.h
 
 ##### TARGETS #####
 
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 6ca9342..acadf1e 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -11,15 +11,6 @@ ifeq ($(USING_EGL), 1)
 EGL_SOURCES = server/radeon_egl.c
 endif
 
-COMMON_SOURCES = \
-	../../common/driverfuncs.c \
-	../common/mm.c \
-	../common/utils.c \
-	../common/texmem.c \
-	../common/vblank.c \
-	../common/xmlconfig.c \
-	../common/dri_util.c
-
 DRIVER_SOURCES = \
 		 radeon_screen.c \
 		 radeon_context.c \
@@ -36,6 +27,7 @@ DRIVER_SOURCES = \
 		 r300_texmem.c \
 		 r300_tex.c \
 		 r300_texstate.c \
+		 r300_mipmap_tree.c \
 		 radeon_program.c \
 		 radeon_program_alu.c \
 		 radeon_program_pair.c \
@@ -49,9 +41,10 @@ DRIVER_SOURCES = \
 		 r300_shader.c \
 		 r300_emit.c \
 		 r300_swtcl.c \
+		 radeon_dri_bufmgr.c \
 		 $(EGL_SOURCES)
 
-C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
+C_SOURCES = $(COMMON_SOURCES) $(COMMON_BM_SOURCES) $(DRIVER_SOURCES)
 
 DRIVER_DEFINES = -DCOMPILE_R300 -DR200_MERGED=0 \
 	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
@@ -68,7 +61,10 @@ COMMON_SYMLINKS = \
 	radeon_chipset.h \
 	radeon_screen.c \
 	radeon_screen.h \
-	radeon_span.h
+	radeon_span.h \
+	radeon_buffer.h \
+	radeon_dri_bufmgr.h \
+	radeon_dri_bufmgr.c
 
 ##### TARGETS #####
 
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
index c9e1dfe..421cb81 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
@@ -44,6 +44,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "radeon_drm.h"
 
+#include "radeon_buffer.h"
 #include "radeon_ioctl.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
@@ -51,11 +52,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_reg.h"
 #include "r300_cmdbuf.h"
 #include "r300_emit.h"
+#include "r300_mem.h"
+#include "r300_mipmap_tree.h"
 #include "r300_state.h"
 
 // Set this to 1 for extremely verbose debugging of command buffers
 #define DEBUG_CMDBUF		0
 
+/** # of dwords reserved for additional instructions that may need to be written
+ * during flushing.
+ */
+#define SPACE_FOR_FLUSHING	4
+
 /**
  * Send the current command buffer via ioctl to the hardware.
  */
@@ -66,24 +74,42 @@ int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
 	drm_radeon_cmd_buffer_t cmd;
 	int start;
 
+	if (r300->cmdbuf.flushing) {
+		fprintf(stderr, "Recursive call into r300FlushCmdBufLocked!\n");
+		exit(-1);
+	}
+	r300->cmdbuf.flushing = 1;
+
 	if (r300->radeon.lost_context) {
 		start = 0;
 		r300->radeon.lost_context = GL_FALSE;
 	} else
-		start = r300->cmdbuf.count_reemit;
+		start = r300->cmdbuf.reemit;
 
 	if (RADEON_DEBUG & DEBUG_IOCTL) {
 		fprintf(stderr, "%s from %s - %i cliprects\n",
 			__FUNCTION__, caller, r300->radeon.numClipRects);
 
-		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
-			for (i = start; i < r300->cmdbuf.count_used; ++i)
+		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE) {
+			fprintf(stderr, "written: %d  committed: %d\n", r300->cmdbuf.written, r300->cmdbuf.committed);
+			for (i = start; i < r300->cmdbuf.written; ++i)
 				fprintf(stderr, "%d: %08x\n", i,
-					r300->cmdbuf.cmd_buf[i]);
+					((uint32_t*)r300->cmdbuf.buf->virtual)[i]);
+		}
 	}
 
-	cmd.buf = (char *)(r300->cmdbuf.cmd_buf + start);
-	cmd.bufsz = (r300->cmdbuf.count_used - start) * 4;
+	if (r300->cmdbuf.written != r300->cmdbuf.committed) {
+		_mesa_problem(r300->radeon.glCtx,
+			"Command buffer contains %d uncommitted dwords\n"
+			"in r300FlushCmdBufLocked called from %s.\n",
+			r300->cmdbuf.written - r300->cmdbuf.committed, caller);
+	}
+
+	dri_bo_unmap(r300->cmdbuf.buf);
+	dri_process_relocs(r300->cmdbuf.buf, 1);
+
+	cmd.buf = (char *)r300->cmdbuf.buf->virtual + 4*start;
+	cmd.bufsz = (r300->cmdbuf.committed - start) * 4;
 
 	if (r300->radeon.state.scissor.enabled) {
 		cmd.nbox = r300->radeon.state.scissor.numClipRects;
@@ -103,9 +129,19 @@ int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
 		radeonWaitForIdleLocked(&r300->radeon);
 	}
 
+	dri_post_submit(r300->cmdbuf.buf, 0);
+	dri_bo_unreference(r300->cmdbuf.buf);
+
 	r300->dma.nr_released_bufs = 0;
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
+	r300->cmdbuf.buf = dri_bo_alloc(r300->radeon.bufmgr, "cmdbuf",
+		r300->cmdbuf.size*4, 16, DRM_BO_MEM_CMDBUF);
+	r300->cmdbuf.written = 0;
+	r300->cmdbuf.reserved = 0;
+	r300->cmdbuf.committed = 0;
+	r300->cmdbuf.reemit = 0;
+	dri_bo_map(r300->cmdbuf.buf, GL_TRUE);
+
+	r300->cmdbuf.flushing = 0;
 
 	return ret;
 }
@@ -115,9 +151,7 @@ int r300FlushCmdBuf(r300ContextPtr r300, const char *caller)
 	int ret;
 
 	LOCK_HARDWARE(&r300->radeon);
-
 	ret = r300FlushCmdBufLocked(r300, caller);
-
 	UNLOCK_HARDWARE(&r300->radeon);
 
 	if (ret) {
@@ -128,6 +162,44 @@ int r300FlushCmdBuf(r300ContextPtr r300, const char *caller)
 	return ret;
 }
 
+/**
+ * Make sure that enough space is available in the command buffer
+ * by flushing if necessary.
+ *
+ * \param dwords The number of dwords we need to be free on the command buffer
+ */
+void r300EnsureCmdBufSpace(r300ContextPtr r300, int dwords, const char *caller)
+{
+	assert(dwords < r300->cmdbuf.size);
+
+	if (!r300->cmdbuf.flushing)
+		dwords += SPACE_FOR_FLUSHING;
+
+	if (r300->cmdbuf.written + dwords > r300->cmdbuf.size)
+		r300FlushCmdBuf(r300, caller);
+}
+
+void r300BeginBatch(r300ContextPtr r300, int n, GLboolean autostate, const char* function, int line)
+{
+	assert(r300->cmdbuf.written == r300->cmdbuf.reserved);
+
+	r300EnsureCmdBufSpace(r300, n, function);
+
+	if (autostate && !r300->cmdbuf.written) {
+		if (RADEON_DEBUG & DEBUG_IOCTL)
+			fprintf(stderr,
+				"Reemit state after flush (from %s)\n", function);
+		r300EmitState(r300);
+	}
+
+	r300->cmdbuf.reserved += n;
+	assert(r300->cmdbuf.reserved < r300->cmdbuf.size);
+
+	if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "BEGIN_BATCH(%d) at %d, from %s:%i\n",
+			n, r300->cmdbuf.written, function, line);
+}
+
 static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *state)
 {
 	int i;
@@ -152,33 +224,18 @@ static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *stat
  */
 static INLINE void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
 {
+	BATCH_LOCALS(r300);
 	struct r300_state_atom *atom;
-	uint32_t *dest;
 	int dwords;
 
-	dest = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
-
-	/* Emit WAIT */
-	*dest = cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit cache flush */
-	*dest = cmdpacket0(R300_TX_INVALTAGS, 1);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	*dest = R300_TX_FLUSH;
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit END3D */
-	*dest = cmdpacify();
-	dest++;
-	r300->cmdbuf.count_used++;
+	BEGIN_BATCH_NO_AUTOSTATE(4);
+	OUT_BATCH(cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN));
+	OUT_BATCH(cmdpacket0(R300_TX_INVALTAGS, 1));
+	OUT_BATCH(R300_TX_FLUSH);
+	OUT_BATCH(cmdpacify());
+	END_BATCH();
 
 	/* Emit actual atoms */
-
 	foreach(atom, &r300->hw.atomlist) {
 		if ((atom->dirty || r300->hw.all_dirty) == dirty) {
 			dwords = (*atom->check) (r300, atom);
@@ -186,9 +243,13 @@ static INLINE void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
 				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
 					r300PrintStateAtom(r300, atom);
 				}
-				memcpy(dest, atom->cmd, dwords * 4);
-				dest += dwords;
-				r300->cmdbuf.count_used += dwords;
+				if (atom->emit) {
+					(*atom->emit)(r300);
+				} else {
+					BEGIN_BATCH_NO_AUTOSTATE(dwords);
+					OUT_BATCH_TABLE(atom->cmd, dwords);
+					END_BATCH();
+				}
 				atom->dirty = GL_FALSE;
 			} else {
 				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
@@ -198,6 +259,8 @@ static INLINE void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
 			}
 		}
 	}
+
+	COMMIT_BATCH();
 }
 
 /**
@@ -211,22 +274,21 @@ void r300EmitState(r300ContextPtr r300)
 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_PRIMS))
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	if (r300->cmdbuf.count_used && !r300->hw.is_dirty
+	if (r300->cmdbuf.written && !r300->hw.is_dirty
 	    && !r300->hw.all_dirty)
 		return;
 
 	/* To avoid going across the entire set of states multiple times, just check
-	 * for enough space for the case of emitting all state, and inline the
-	 * r300AllocCmdBuf code here without all the checks.
+	 * for enough space for the case of emitting all state.
 	 */
 	r300EnsureCmdBufSpace(r300, r300->hw.max_state_size, __FUNCTION__);
 
-	if (!r300->cmdbuf.count_used) {
+	if (!r300->cmdbuf.written) {
 		if (RADEON_DEBUG & DEBUG_STATE)
 			fprintf(stderr, "Begin reemit state\n");
 
 		r300EmitAtoms(r300, GL_FALSE);
-		r300->cmdbuf.count_reemit = r300->cmdbuf.count_used;
+		r300->cmdbuf.reemit = r300->cmdbuf.committed;
 	}
 
 	if (RADEON_DEBUG & DEBUG_STATE)
@@ -234,7 +296,7 @@ void r300EmitState(r300ContextPtr r300)
 
 	r300EmitAtoms(r300, GL_TRUE);
 
-	assert(r300->cmdbuf.count_used < r300->cmdbuf.size);
+	assert(r300->cmdbuf.written < r300->cmdbuf.size);
 
 	r300->hw.is_dirty = GL_FALSE;
 	r300->hw.all_dirty = GL_FALSE;
@@ -244,6 +306,79 @@ void r300EmitState(r300ContextPtr r300)
 #define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
 #define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
 
+static void emit_tex_offsets(r300ContextPtr r300)
+{
+	BATCH_LOCALS(r300);
+	int numtmus = packet0_count(r300->hw.tex.offset.cmd);
+
+	if (numtmus) {
+		int i;
+
+		BEGIN_BATCH(numtmus + 1);
+		OUT_BATCH_REGSEQ(R300_TX_OFFSET_0, numtmus);
+		for(i = 0; i < numtmus; ++i) {
+			r300TexObj *t = r300->hw.textures[i];
+			if (t && !t->image_override) {
+				OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0, DRM_RELOC_TXOFFSET);
+			} else if (!t) {
+				OUT_BATCH(r300->radeon.radeonScreen->texOffset[0]);
+			} else {
+				OUT_BATCH(t->override_offset);
+			}
+		}
+		END_BATCH();
+	}
+}
+
+static void emit_cb_offset(r300ContextPtr r300)
+{
+	BATCH_LOCALS(r300);
+	struct radeon_renderbuffer *rrb;
+	uint32_t cbpitch;
+
+	rrb = r300->radeon.state.color.rrb;
+	if (!rrb) {
+		fprintf(stderr, "no rrb\n");
+		return;
+	}
+
+	cbpitch = rrb->pitch;
+	if (rrb->cpp == 4)
+		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+	else
+		cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+	if (r300->radeon.sarea->tiling_enabled)
+		cbpitch |= R300_COLOR_TILE_ENABLE;
+
+	BEGIN_BATCH(4);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+	OUT_BATCH_RELOC(0, rrb->bo, 0, DRM_RELOC_TXOFFSET);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
+	OUT_BATCH(cbpitch);
+	END_BATCH();
+}
+
+static void emit_zb_offset(r300ContextPtr r300)
+{
+	BATCH_LOCALS(r300);
+	struct radeon_renderbuffer *rrb;
+	uint32_t zbpitch;
+
+	rrb = r300->radeon.state.depth_buffer;
+	if (!rrb)
+		return;
+
+	zbpitch = rrb->pitch;
+
+	BEGIN_BATCH(3);
+	OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 2);
+	OUT_BATCH_RELOC(0, rrb->bo, 0, DRM_RELOC_TXOFFSET);
+	OUT_BATCH(zbpitch);
+	END_BATCH();
+
+}
+
 static int check_always(r300ContextPtr r300, struct r300_state_atom *atom)
 {
 	return atom->cmd_size;
@@ -480,8 +615,7 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	ALLOC_STATE(rop, always, 2, 0);
 	r300->hw.rop.cmd[0] = cmdpacket0(R300_RB3D_ROPCNTL, 1);
 	ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
-	r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1);
-	r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1);
+	r300->hw.cb.emit = &emit_cb_offset;
 	ALLOC_STATE(rb3d_dither_ctl, always, 10, 0);
 	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(R300_RB3D_DITHER_CTL, 9);
 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
@@ -495,7 +629,7 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	r300->hw.zstencil_format.cmd[0] =
 	    cmdpacket0(R300_ZB_FORMAT, 4);
 	ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
-	r300->hw.zb.cmd[R300_ZB_CMD_0] = cmdpacket0(R300_ZB_DEPTHOFFSET, 2);
+	r300->hw.zb.emit = emit_zb_offset;
 	ALLOC_STATE(zb_depthclearvalue, always, 2, 0);
 	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(R300_ZB_DEPTHCLEARVALUE, 1);
 	ALLOC_STATE(unk4F30, always, 3, 0);
@@ -562,9 +696,10 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
 	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_FORMAT2_0, 0);
 
-	ALLOC_STATE(tex.offset, variable, mtu + 1, 0);
+	ALLOC_STATE(tex.offset, variable, 1, 0);
 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
 	    cmdpacket0(R300_TX_OFFSET_0, 0);
+	r300->hw.tex.offset.emit = &emit_tex_offsets;
 
 	ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
@@ -597,10 +732,14 @@ void r300InitCmdBuf(r300ContextPtr r300)
 			size * 4, r300->hw.max_state_size * 4);
 	}
 
+	r300->cmdbuf.buf = dri_bo_alloc(r300->radeon.bufmgr, "cmdbuf",
+		size*4, 16, DRM_BO_MEM_CMDBUF);
 	r300->cmdbuf.size = size;
-	r300->cmdbuf.cmd_buf = (uint32_t *) CALLOC(size * 4);
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
+	r300->cmdbuf.written = 0;
+	r300->cmdbuf.reserved = 0;
+	r300->cmdbuf.committed = 0;
+	r300->cmdbuf.reemit = 0;
+	dri_bo_map(r300->cmdbuf.buf, GL_TRUE);
 }
 
 /**
@@ -610,66 +749,10 @@ void r300DestroyCmdBuf(r300ContextPtr r300)
 {
 	struct r300_state_atom *atom;
 
-	FREE(r300->cmdbuf.cmd_buf);
+	dri_bo_unmap(r300->cmdbuf.buf);
+	dri_bo_unreference(r300->cmdbuf.buf);
 
 	foreach(atom, &r300->hw.atomlist) {
 		FREE(atom->cmd);
 	}
 }
-
-void r300EmitBlit(r300ContextPtr rmesa,
-		  GLuint color_fmt,
-		  GLuint src_pitch,
-		  GLuint src_offset,
-		  GLuint dst_pitch,
-		  GLuint dst_offset,
-		  GLint srcx, GLint srcy,
-		  GLint dstx, GLint dsty, GLuint w, GLuint h)
-{
-	drm_r300_cmd_header_t *cmd;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr,
-			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
-			dst_pitch, dst_offset, dstx, dsty, w, h);
-
-	assert((src_pitch & 63) == 0);
-	assert((dst_pitch & 63) == 0);
-	assert((src_offset & 1023) == 0);
-	assert((dst_offset & 1023) == 0);
-	assert(w < (1 << 16));
-	assert(h < (1 << 16));
-
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 8, __FUNCTION__);
-
-	cmd[0].header.cmd_type = R300_CMD_PACKET3;
-	cmd[0].header.pad0 = R300_CMD_PACKET3_RAW;
-	cmd[1].u = R300_CP_CMD_BITBLT_MULTI | (5 << 16);
-	cmd[2].u = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_BRUSH_NONE |
-		    (color_fmt << 8) |
-		    RADEON_GMC_SRC_DATATYPE_COLOR |
-		    RADEON_ROP3_S |
-		    RADEON_DP_SRC_SOURCE_MEMORY |
-		    RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
-
-	cmd[3].u = ((src_pitch / 64) << 22) | (src_offset >> 10);
-	cmd[4].u = ((dst_pitch / 64) << 22) | (dst_offset >> 10);
-	cmd[5].u = (srcx << 16) | srcy;
-	cmd[6].u = (dstx << 16) | dsty;	/* dst */
-	cmd[7].u = (w << 16) | h;
-}
-
-void r300EmitWait(r300ContextPtr rmesa, GLuint flags)
-{
-	drm_r300_cmd_header_t *cmd;
-
-	assert(!(flags & ~(R300_WAIT_2D | R300_WAIT_3D)));
-
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].u = 0;
-	cmd[0].wait.cmd_type = R300_CMD_WAIT;
-	cmd[0].wait.flags = flags;
-}
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.h b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
index a8eaa58..4708a4c 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
@@ -45,29 +45,88 @@ extern void r300EmitState(r300ContextPtr r300);
 
 extern void r300InitCmdBuf(r300ContextPtr r300);
 extern void r300DestroyCmdBuf(r300ContextPtr r300);
+extern void r300EnsureCmdBufSpace(r300ContextPtr r300, int dwords, const char *caller);
+
+extern void r300BeginBatch(r300ContextPtr r300, int n, GLboolean autostate, const char* function, int line);
 
 /**
- * Make sure that enough space is available in the command buffer
- * by flushing if necessary.
- *
- * \param dwords The number of dwords we need to be free on the command buffer
+ * Every function writing to the command buffer needs to declare this
+ * to get the necessary local variables.
  */
-static INLINE void r300EnsureCmdBufSpace(r300ContextPtr r300,
-					     int dwords, const char *caller)
-{
-	assert(dwords < r300->cmdbuf.size);
+#define BATCH_LOCALS(r300) \
+	const r300ContextPtr b_l_r300 = r300
 
-	if (r300->cmdbuf.count_used + dwords > r300->cmdbuf.size)
-		r300FlushCmdBuf(r300, caller);
-}
+/**
+ * Prepare writing n dwords to the command buffer,
+ * including producing any necessary state emits on buffer wraparound.
+ */
+#define BEGIN_BATCH(n) r300BeginBatch(b_l_r300, n, GL_TRUE, __FUNCTION__, __LINE__)
+
+/**
+ * Same as BEGIN_BATCH, but do not cause automatic state emits.
+ */
+#define BEGIN_BATCH_NO_AUTOSTATE(n) r300BeginBatch(b_l_r300, n, GL_FALSE, __FUNCTION__, __LINE__)
+
+/**
+ * Write one dword to the command buffer.
+ */
+#define OUT_BATCH(data) \
+	do { \
+		if (b_l_r300->cmdbuf.written < b_l_r300->cmdbuf.reserved) { \
+			((uint32_t*)b_l_r300->cmdbuf.buf->virtual)[b_l_r300->cmdbuf.written++] = data; \
+		} else { \
+			_mesa_problem(b_l_r300->radeon.glCtx, "%s:%i: OUT_BATCH mismatch", __FUNCTION__, __LINE__); \
+		} \
+	} while(0)
 
 /**
- * Allocate the given number of dwords in the command buffer and return
- * a pointer to the allocated area.
- * When necessary, these functions cause a flush. r300AllocCmdBuf() also
- * causes state reemission after a flush. This is necessary to ensure
- * correct hardware state after an unlock.
+ * Write a relocated dword to the command buffer.
  */
+#define OUT_BATCH_RELOC(data, bo, offset, flags) \
+	do { \
+		if (b_l_r300->cmdbuf.written < b_l_r300->cmdbuf.reserved) { \
+			dri_emit_reloc(b_l_r300->cmdbuf.buf, flags, offset, 4*b_l_r300->cmdbuf.written, bo); \
+			((uint32_t*)b_l_r300->cmdbuf.buf->virtual)[b_l_r300->cmdbuf.written++] = data; \
+		} else { \
+			_mesa_problem(b_l_r300->radeon.glCtx, "%s:%i: OUT_BATCH mismatch", __FUNCTION__, __LINE__); \
+		} \
+	} while(0)
+
+/**
+ * Write n dwords from ptr to the command buffer.
+ */
+#define OUT_BATCH_TABLE(ptr,n) \
+	do { \
+		int _n = n; \
+		if (b_l_r300->cmdbuf.written+_n <= b_l_r300->cmdbuf.reserved) { \
+			memcpy((uint32_t*)b_l_r300->cmdbuf.buf->virtual + b_l_r300->cmdbuf.written, (ptr), 4*_n); \
+			b_l_r300->cmdbuf.written += _n; \
+		} else { \
+			_mesa_problem(b_l_r300->radeon.glCtx, "%s:%i: OUT_BATCH_TABLE mismatch", __FUNCTION__, __LINE__); \
+		} \
+	} while(0)
+
+/**
+ * Finish writing dwords to the command buffer.
+ * The number of (direct or indirect) OUT_BATCH calls between the previous
+ * BEGIN_BATCH and END_BATCH must match the number specified at BEGIN_BATCH time.
+ */
+#define END_BATCH() \
+	do { \
+		if (b_l_r300->cmdbuf.written != b_l_r300->cmdbuf.reserved) \
+			_mesa_problem(b_l_r300->radeon.glCtx, "%s:%i: END_BATCH mismatch", __FUNCTION__, __LINE__); \
+	} while(0)
+
+/**
+ * After the last END_BATCH() of rendering, this indicates that flushing
+ * the command buffer now is okay.
+ */
+#define COMMIT_BATCH() \
+	do { \
+		assert(b_l_r300->cmdbuf.written == b_l_r300->cmdbuf.reserved); \
+		b_l_r300->cmdbuf.committed = b_l_r300->cmdbuf.written; \
+	} while(0)
+
 static INLINE uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
 					       int dwords, const char *caller)
 {
@@ -75,8 +134,9 @@ static INLINE uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
 
 	r300EnsureCmdBufSpace(r300, dwords, caller);
 
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
+	ptr = (uint32_t*)r300->cmdbuf.buf->virtual + r300->cmdbuf.written;
+	r300->cmdbuf.written += dwords;
+	r300->cmdbuf.reserved = r300->cmdbuf.committed = r300->cmdbuf.written;
 	return ptr;
 }
 
@@ -87,30 +147,17 @@ static INLINE uint32_t *r300AllocCmdBuf(r300ContextPtr r300,
 
 	r300EnsureCmdBufSpace(r300, dwords, caller);
 
-	if (!r300->cmdbuf.count_used) {
+	if (!r300->cmdbuf.written) {
 		if (RADEON_DEBUG & DEBUG_IOCTL)
 			fprintf(stderr,
 				"Reemit state after flush (from %s)\n", caller);
 		r300EmitState(r300);
 	}
 
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
+	ptr = (uint32_t*)r300->cmdbuf.buf->virtual + r300->cmdbuf.written;
+	r300->cmdbuf.written += dwords;
+	r300->cmdbuf.reserved = r300->cmdbuf.committed = r300->cmdbuf.written;
 	return ptr;
 }
 
-extern void r300EmitBlit(r300ContextPtr rmesa,
-			 GLuint color_fmt,
-			 GLuint src_pitch,
-			 GLuint src_offset,
-			 GLuint dst_pitch,
-			 GLuint dst_offset,
-			 GLint srcx, GLint srcy,
-			 GLint dstx, GLint dsty, GLuint w, GLuint h);
-
-extern void r300EmitWait(r300ContextPtr rmesa, GLuint flags);
-extern void r300EmitLOAD_VBPNTR(r300ContextPtr rmesa, int start);
-extern void r300EmitVertexShader(r300ContextPtr rmesa);
-extern void r300EmitPixelShader(r300ContextPtr rmesa);
-
 #endif				/* __R300_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index ee4a69d..75cbebe 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -59,15 +59,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_span.h"
 #include "r300_context.h"
 #include "r300_cmdbuf.h"
+#include "r300_mipmap_tree.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
 #include "r300_swtcl.h"
 
-#ifdef USER_BUFFERS
 #include "r300_mem.h"
-#endif
 
 #include "vblank.h"
 #include "utils.h"
@@ -190,7 +189,7 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	struct dd_function_table functions;
 	r300ContextPtr r300;
 	GLcontext *ctx;
-	int tcl_mode, i;
+	int tcl_mode;
 
 	assert(glVisual);
 	assert(driContextPriv);
@@ -222,10 +221,6 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	r300InitTextureFuncs(&functions);
 	r300InitShaderFuncs(&functions);
 
-#ifdef USER_BUFFERS
-	r300_mem_init(r300);
-#endif
-
 	if (!radeonInitContext(&r300->radeon, &functions,
 			       glVisual, driContextPriv,
 			       sharedContextPrivate)) {
@@ -233,34 +228,9 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 		return GL_FALSE;
 	}
 
+	r300->radeon.bufmgr = radeonBufmgrClassicInit(r300);
+
 	/* Init r300 context data */
-	r300->dma.buf0_address =
-	    r300->radeon.radeonScreen->buffers->list[0].address;
-
-	(void)memset(r300->texture_heaps, 0, sizeof(r300->texture_heaps));
-	make_empty_list(&r300->swapped);
-
-	r300->nr_heaps = 1 /* screen->numTexHeaps */ ;
-	assert(r300->nr_heaps < RADEON_NR_TEX_HEAPS);
-	for (i = 0; i < r300->nr_heaps; i++) {
-		/* *INDENT-OFF* */
-		r300->texture_heaps[i] = driCreateTextureHeap(i, r300,
-							       screen->
-							       texSize[i], 12,
-							       RADEON_NR_TEX_REGIONS,
-							       (drmTextureRegionPtr)
-							       r300->radeon.sarea->
-							       tex_list[i],
-							       &r300->radeon.sarea->
-							       tex_age[i],
-							       &r300->swapped,
-							       sizeof
-							       (r300TexObj),
-							       (destroy_texture_object_t
-								*)
-							       r300DestroyTexObj);
-		/* *INDENT-ON* */
-	}
 	r300->texture_depth = driQueryOptioni(&r300->radeon.optionCache,
 					      "texture_depth");
 	if (r300->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
@@ -299,13 +269,11 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
 	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
 
-#ifdef USER_BUFFERS
 	/* Needs further modifications */
 #if 0
 	ctx->Const.MaxArrayLockSize =
 	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
 #endif
-#endif
 
 	/* Initialize the software rasterizer and helper modules.
 	 */
@@ -407,72 +375,6 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	return GL_TRUE;
 }
 
-static void r300FreeGartAllocations(r300ContextPtr r300)
-{
-	int i, ret, tries = 0, done_age, in_use = 0;
-	drm_radeon_mem_free_t memfree;
-
-	memfree.region = RADEON_MEM_REGION_GART;
-
-#ifdef USER_BUFFERS
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
-
-		/* check whether this buffer is still in use */
-		if (r300->rmm->u_list[i].pending) {
-			in_use++;
-		}
-	}
-	/* Cannot flush/lock if no context exists. */
-	if (in_use)
-		r300FlushCmdBuf(r300, __FUNCTION__);
-
-	done_age = radeonGetAge((radeonContextPtr) r300);
-
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
-
-		/* check whether this buffer is still in use */
-		if (!r300->rmm->u_list[i].pending) {
-			continue;
-		}
-
-		assert(r300->rmm->u_list[i].h_pending == 0);
-
-		tries = 0;
-		while (r300->rmm->u_list[i].age > done_age && tries++ < 1000) {
-			usleep(10);
-			done_age = radeonGetAge((radeonContextPtr) r300);
-		}
-		if (tries >= 1000) {
-			WARN_ONCE("Failed to idle region!");
-		}
-
-		memfree.region_offset = (char *)r300->rmm->u_list[i].ptr -
-		    (char *)r300->radeon.radeonScreen->gartTextures.map;
-
-		ret = drmCommandWrite(r300->radeon.radeonScreen->driScreen->fd,
-				      DRM_RADEON_FREE, &memfree,
-				      sizeof(memfree));
-		if (ret) {
-			fprintf(stderr, "Failed to free at %p\nret = %s\n",
-				r300->rmm->u_list[i].ptr, strerror(-ret));
-		} else {
-			if (i == r300->rmm->u_last)
-				r300->rmm->u_last--;
-
-			r300->rmm->u_list[i].pending = 0;
-			r300->rmm->u_list[i].ptr = NULL;
-		}
-	}
-	r300->rmm->u_head = i;
-#endif				/* USER_BUFFERS */
-}
-
 /* Destroy the device specific context.
  */
 void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
@@ -496,23 +398,16 @@ void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
 	assert(r300);		/* should never be null */
 
 	if (r300) {
-		GLboolean release_texture_heaps;
-
-		release_texture_heaps =
-		    (r300->radeon.glCtx->Shared->RefCount == 1);
 		_swsetup_DestroyContext(r300->radeon.glCtx);
 		_tnl_DestroyContext(r300->radeon.glCtx);
 		_vbo_DestroyContext(r300->radeon.glCtx);
 		_swrast_DestroyContext(r300->radeon.glCtx);
 
-		if (r300->dma.current.buf) {
-			r300ReleaseDmaRegion(r300, &r300->dma.current,
-					     __FUNCTION__);
-#ifndef USER_BUFFERS
-			r300FlushCmdBuf(r300, __FUNCTION__);
-#endif
+		if (r300->dma.current) {
+			dri_bo_unreference(r300->dma.current);
+			r300->dma.current = 0;
 		}
-		r300FreeGartAllocations(r300);
+		r300FlushCmdBuf(r300, __FUNCTION__);
 		r300DestroyCmdBuf(r300);
 
 		if (radeon->state.scissor.pClipRects) {
@@ -520,28 +415,13 @@ void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
 			radeon->state.scissor.pClipRects = NULL;
 		}
 
-		if (release_texture_heaps) {
-			/* This share group is about to go away, free our private
-			 * texture object data.
-			 */
-			int i;
-
-			for (i = 0; i < r300->nr_heaps; i++) {
-				driDestroyTextureHeap(r300->texture_heaps[i]);
-				r300->texture_heaps[i] = NULL;
-			}
-
-			assert(is_empty_list(&r300->swapped));
-		}
-
 		radeonCleanupContext(&r300->radeon);
 
-#ifdef USER_BUFFERS
 		/* the memory manager might be accessed when Mesa frees the shared
 		 * state, so don't destroy it earlier
 		 */
-		r300_mem_destroy(r300);
-#endif
+		dri_bufmgr_destroy(r300->radeon.bufmgr);
+		r300->radeon.bufmgr = 0;
 
 		/* free the option cache */
 		driDestroyOptionCache(&r300->radeon.optionCache);
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index c15e9fa..037d0e6 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -40,6 +40,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_vertex.h"
 #include "drm.h"
 #include "radeon_drm.h"
+#include "radeon_dri_bufmgr.h"
 #include "dri_util.h"
 #include "texmem.h"
 
@@ -47,8 +48,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/mtypes.h"
 #include "main/colormac.h"
 
-#define USER_BUFFERS
-
 struct r300_context;
 typedef struct r300_context r300ContextRec;
 typedef struct r300_context *r300ContextPtr;
@@ -122,68 +121,71 @@ static INLINE uint32_t r300PackFloat24(float f)
 
 /************ DMA BUFFERS **************/
 
-/* Need refcounting on dma buffers:
- */
-struct r300_dma_buffer {
-	int refcount;		/**< the number of retained regions in buf */
-	drmBufPtr buf;
-	int id;
-};
-#undef GET_START
-#ifdef USER_BUFFERS
-#define GET_START(rvb) (r300GartOffsetFromVirtual(rmesa, (rvb)->address+(rvb)->start))
-#else
-#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-#endif
-/* A retained region, eg vertices for indexed vertices.
- */
-struct r300_dma_region {
-	struct r300_dma_buffer *buf;
-	char *address;		/* == buf->address */
-	int start, end, ptr;	/* offsets from start of buf */
-
-	int aos_offset;		/* address in GART memory */
-	int aos_stride;		/* distance between elements, in dwords */
-	int aos_size;		/* number of components (1-4) */
-};
-
 struct r300_dma {
 	/* Active dma region.  Allocations for vertices and retained
 	 * regions come from here.  Also used for emitting random vertices,
 	 * these may be flushed by calling flush_current();
 	 */
-	struct r300_dma_region current;
+	dri_bo *current; /** Buffer that DMA memory is allocated from */
+	int current_used; /** Number of bytes allocated and forgotten about */
+	int current_vertexptr; /** End of active vertex region */
 
+	/**
+	 * If current_vertexptr != current_used then flush must be non-zero.
+	 * flush must be called before non-active vertex allocations can be
+	 * performed.
+	 */
 	void (*flush) (r300ContextPtr);
 
-	char *buf0_address;	/* start of buf[0], for index calcs */
-
 	/* Number of "in-flight" DMA buffers, i.e. the number of buffers
 	 * for which a DISCARD command is currently queued in the command buffer.
 	 */
 	GLuint nr_released_bufs;
 };
 
-       /* Texture related */
-
+/* Texture related */
 typedef struct r300_tex_obj r300TexObj, *r300TexObjPtr;
+typedef struct _r300_texture_image r300_texture_image;
+
+
+struct _r300_texture_image {
+	struct gl_texture_image base;
+
+	/**
+	 * If mt != 0, the image is stored in hardware format in the
+	 * given mipmap tree. In this case, base.Data may point into the
+	 * mapping of the buffer object that contains the mipmap tree.
+	 *
+	 * If mt == 0, the image is stored in normal memory pointed to
+	 * by base.Data.
+	 */
+	struct _r300_mipmap_tree *mt;
+
+	int mtlevel; /** if mt != 0, this is the image's level in the mipmap tree */
+	int mtface; /** if mt != 0, this is the image's face in the mipmap tree */
+};
+
+static INLINE r300_texture_image *get_r300_texture_image(struct gl_texture_image *image)
+{
+	return (r300_texture_image*)image;
+}
+
 
 /* Texture object in locally shared texture space.
  */
 struct r300_tex_obj {
-	driTextureObject base;
+	struct gl_texture_object base;
+	struct _r300_mipmap_tree *mt;
 
-	GLuint bufAddr;		/* Offset to start of locally
-				   shared texture block */
-
-	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
-	/* Six, for the cube faces */
+	/**
+	 * This is true if we've verified that the mipmap tree above is complete
+	 * and so on.
+	 */
+	GLboolean validated;
 
 	GLboolean image_override;	/* Image overridden by GLX_EXT_tfp */
+	GLuint override_offset;
 
-	GLuint pitch;		/* this isn't sent to hardware just used in calculations */
 	/* hardware register values */
 	/* Note that R200 has 8 registers per texture and R300 only 7 */
 	GLuint filter;
@@ -191,30 +193,16 @@ struct r300_tex_obj {
 	GLuint pitch_reg;
 	GLuint size;		/* npot only */
 	GLuint format;
-	GLuint offset;		/* Image location in the card's address space.
-				   All cube faces follow. */
-	GLuint unknown4;
-	GLuint unknown5;
-	/* end hardware registers */
-
-	/* registers computed by r200 code - keep them here to
-	   compare against what is actually written.
-
-	   to be removed later.. */
 	GLuint pp_border_color;
-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
-	GLuint format_x;
-
-	GLboolean border_fallback;
+	/* end hardware registers */
 
 	GLuint tile_bits;	/* hw texture tile bits used on this texture */
 };
 
-struct r300_texture_env_state {
-	r300TexObjPtr texobj;
-	GLenum format;
-	GLenum envMode;
-};
+static INLINE r300TexObj* r300_tex_obj(struct gl_texture_object *texObj)
+{
+	return (r300TexObj*)texObj;
+}
 
 /* The blit width for texture uploads
  */
@@ -222,7 +210,6 @@ struct r300_texture_env_state {
 #define R300_MAX_TEXTURE_UNITS 8
 
 struct r300_texture_state {
-	struct r300_texture_env_state unit[R300_MAX_TEXTURE_UNITS];
 	int tc_count;		/* number of incoming texture coordinates from VAP */
 };
 
@@ -242,6 +229,7 @@ struct r300_state_atom {
 	GLboolean dirty;
 
 	int (*check) (r300ContextPtr, struct r300_state_atom * atom);
+	void (*emit) (r300ContextPtr);
 };
 
 #define R300_VPT_CMD_0		0
@@ -549,6 +537,8 @@ struct r300_hw_state {
 		struct r300_state_atom border_color;
 	} tex;
 	struct r300_state_atom txe;	/* tex enable (4104) */
+
+	r300TexObj *textures[R300_MAX_TEXTURE_UNITS];
 };
 
 /**
@@ -559,10 +549,14 @@ struct r300_hw_state {
  * otherwise.
  */
 struct r300_cmdbuf {
-	int size;		/* DWORDs allocated for buffer */
-	uint32_t *cmd_buf;
-	int count_used;		/* DWORDs filled so far */
-	int count_reemit;	/* size of re-emission batch */
+	dri_bo *buf;
+	int reemit; /** # of dwords in reemit sequence (is always <= committed) */
+	int size; /** # of dwords total */
+
+	int committed; /** # of dwords that we have committed to */
+	int written; /** # of dwords written (is always >= committed) */
+	int reserved; /** # of dwords reserved up to previous BEGIN_BATCH */
+	unsigned int flushing:1; /** whether we're currently in FlushCmdBufLocked */
 };
 
 /**
@@ -811,18 +805,25 @@ struct r500_fragment_program {
 #define REG_COLOR0	1
 #define REG_TEX0	2
 
+struct r300_aos {
+	dri_bo *bo; /** Buffer object where vertex data is stored */
+	int offset; /** Offset into buffer object, in bytes */
+	int components; /** Number of components per vertex */
+	int stride; /** Stride in dwords (may be 0 for repeating) */
+	int count; /** Number of vertices */
+};
+
 struct r300_state {
 	struct r300_depthbuffer_state depth;
 	struct r300_texture_state texture;
 	int sw_tcl_inputs[VERT_ATTRIB_MAX];
 	struct r300_vertex_shader_state vertex_shader;
-	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
+	struct r300_aos aos[R300_MAX_AOS_ARRAYS];
 	int aos_count;
 
-	GLuint *Elts;
-	struct r300_dma_region elt_dma;
+	dri_bo *elt_dma_bo; /** Buffer object that contains element indices */
+	int elt_dma_offset; /** Offset into this buffer object, in bytes */
 
-	struct r300_dma_region swtcl_dma;
 	DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
 							   They are the same as tnl->render_inputs for fixed pipeline */
 
@@ -880,13 +881,6 @@ struct r300_swtcl_info {
     * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
     */
    GLuint specoffset;
-
-   /**
-    * Should Mesa project vertex data or will the hardware do it?
-    */
-   GLboolean needproj;
-
-   struct r300_dma_region indexed_verts;
 };
 
 
@@ -905,25 +899,11 @@ struct r300_context {
 	/* Vertex buffers
 	 */
 	struct r300_dma dma;
-	GLboolean save_on_next_unlock;
 	GLuint NewGLState;
 
-	/* Texture object bookkeeping
-	 */
-	unsigned nr_heaps;
-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
-	driTextureObject swapped;
 	int texture_depth;
 	float initialMaxAnisotropy;
 
-	/* Clientdata textures;
-	 */
-	GLuint prefer_gart_client_texturing;
-
-#ifdef USER_BUFFERS
-	struct r300_memory_manager *rmm;
-#endif
-
 	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
 	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
 
diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
index 80bd338..6ed2638 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_emit.c
@@ -51,9 +51,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_emit.h"
 #include "r300_ioctl.h"
 
-#ifdef USER_BUFFERS
 #include "r300_mem.h"
-#endif
 
 #if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
     SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
@@ -86,11 +84,9 @@ do {						\
 } while (0)
 #endif
 
-static void r300EmitVec4(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
+static void r300EmitVec4(uint32_t *out, GLvoid * data, int stride, int count)
 {
 	int i;
-	int *out = (int *)(rvb->address + rvb->start);
 
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
@@ -106,11 +102,9 @@ static void r300EmitVec4(GLcontext * ctx, struct r300_dma_region *rvb,
 		}
 }
 
-static void r300EmitVec8(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
+static void r300EmitVec8(uint32_t *out, GLvoid * data, int stride, int count)
 {
 	int i;
-	int *out = (int *)(rvb->address + rvb->start);
 
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
@@ -127,11 +121,9 @@ static void r300EmitVec8(GLcontext * ctx, struct r300_dma_region *rvb,
 		}
 }
 
-static void r300EmitVec12(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
+static void r300EmitVec12(uint32_t *out, GLvoid * data, int stride, int count)
 {
 	int i;
-	int *out = (int *)(rvb->address + rvb->start);
 
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
@@ -149,11 +141,9 @@ static void r300EmitVec12(GLcontext * ctx, struct r300_dma_region *rvb,
 		}
 }
 
-static void r300EmitVec16(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
+static void r300EmitVec16(uint32_t *out, GLvoid * data, int stride, int count)
 {
 	int i;
-	int *out = (int *)(rvb->address + rvb->start);
 
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
@@ -172,35 +162,31 @@ static void r300EmitVec16(GLcontext * ctx, struct r300_dma_region *rvb,
 		}
 }
 
-static void r300EmitVec(GLcontext * ctx, struct r300_dma_region *rvb,
+
+static void r300EmitVec(GLcontext * ctx, struct r300_aos *aos,
 			GLvoid * data, int size, int stride, int count)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	uint32_t *out;
 
 	if (stride == 0) {
-		r300AllocDmaRegion(rmesa, rvb, size * 4, 4);
+		r300AllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
 		count = 1;
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = 0;
+		aos->stride = 0;
 	} else {
-		r300AllocDmaRegion(rmesa, rvb, size * count * 4, 4);
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = size;
+		r300AllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
+		aos->stride = size;
 	}
 
+	aos->components = size;
+	aos->count = count;
+
+	out = (uint32_t*)((char*)aos->bo->virtual + aos->offset);
 	switch (size) {
-	case 1:
-		r300EmitVec4(ctx, rvb, data, stride, count);
-		break;
-	case 2:
-		r300EmitVec8(ctx, rvb, data, stride, count);
-		break;
-	case 3:
-		r300EmitVec12(ctx, rvb, data, stride, count);
-		break;
-	case 4:
-		r300EmitVec16(ctx, rvb, data, stride, count);
-		break;
+	case 1: r300EmitVec4(out, data, stride, count); break;
+	case 2: r300EmitVec8(out, data, stride, count); break;
+	case 3: r300EmitVec12(out, data, stride, count); break;
+	case 4: r300EmitVec16(out, data, stride, count); break;
 	default:
 		assert(0);
 		break;
@@ -433,7 +419,7 @@ int r300EmitArrays(GLcontext * ctx)
 	}
 
 	for (i = 0; i < nr; i++) {
-		int ci, fix, found = 0;
+		int ci;
 
 		swizzle[i][0] = SWIZZLE_ZERO;
 		swizzle[i][1] = SWIZZLE_ZERO;
@@ -444,48 +430,10 @@ int r300EmitArrays(GLcontext * ctx)
 			swizzle[i][ci] = ci;
 		}
 
-		if (r300IsGartMemory(rmesa, vb->AttribPtr[tab[i]]->data, 4)) {
-			if (vb->AttribPtr[tab[i]]->stride % 4) {
-				return R300_FALLBACK_TCL;
-			}
-			rmesa->state.aos[i].address = (void *)(vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].start = 0;
-			rmesa->state.aos[i].aos_offset = r300GartOffsetFromVirtual(rmesa, vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].aos_stride = vb->AttribPtr[tab[i]]->stride / 4;
-			rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
-		} else {
-			r300EmitVec(ctx, &rmesa->state.aos[i],
-				    vb->AttribPtr[tab[i]]->data,
-				    vb->AttribPtr[tab[i]]->size,
-				    vb->AttribPtr[tab[i]]->stride, count);
-		}
-
-		rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
-
-		for (fix = 0; fix <= 4 - vb->AttribPtr[tab[i]]->size; fix++) {
-			if ((rmesa->state.aos[i].aos_offset - _mesa_sizeof_type(GL_FLOAT) * fix) % 4) {
-				continue;
-			}
-			found = 1;
-			break;
-		}
-
-		if (found) {
-			if (fix > 0) {
-				WARN_ONCE("Feeling lucky?\n");
-			}
-			rmesa->state.aos[i].aos_offset -= _mesa_sizeof_type(GL_FLOAT) * fix;
-			for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-				swizzle[i][ci] += fix;
-			}
-		} else {
-			WARN_ONCE
-			    ("Cannot handle offset %x with stride %d, comp %d\n",
-			     rmesa->state.aos[i].aos_offset,
-			     rmesa->state.aos[i].aos_stride,
-			     vb->AttribPtr[tab[i]]->size);
-			return R300_FALLBACK_TCL;
-		}
+		r300EmitVec(ctx, &rmesa->state.aos[i],
+				vb->AttribPtr[tab[i]]->data,
+				vb->AttribPtr[tab[i]]->size,
+				vb->AttribPtr[tab[i]]->stride, count);
 	}
 
 	/* Setup INPUT_ROUTE. */
@@ -515,45 +463,76 @@ int r300EmitArrays(GLcontext * ctx)
 	return R300_FALLBACK_NONE;
 }
 
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
-
-	if (rmesa->state.elt_dma.buf)
-		r300_mem_use(rmesa, rmesa->state.elt_dma.buf->id);
-
-	for (i = 0; i < rmesa->state.aos_count; i++) {
-		if (rmesa->state.aos[i].buf)
-			r300_mem_use(rmesa, rmesa->state.aos[i].buf->id);
-	}
-}
-#endif
-
 void r300ReleaseArrays(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	int i;
 
-	r300ReleaseDmaRegion(rmesa, &rmesa->state.elt_dma, __FUNCTION__);
+	if (rmesa->state.elt_dma_bo) {
+		dri_bo_unreference(rmesa->state.elt_dma_bo);
+		rmesa->state.elt_dma_bo = 0;
+	}
 	for (i = 0; i < rmesa->state.aos_count; i++) {
-		r300ReleaseDmaRegion(rmesa, &rmesa->state.aos[i], __FUNCTION__);
+		if (rmesa->state.aos[i].bo) {
+			dri_bo_unreference(rmesa->state.aos[i].bo);
+			rmesa->state.aos[i].bo = 0;
+		}
 	}
 }
 
 void r300EmitCacheFlush(r300ContextPtr rmesa)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
-	e32(R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
-	    R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+	BATCH_LOCALS(rmesa);
+
+	BEGIN_BATCH(4);
+	OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT,
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+	OUT_BATCH_REGVAL(R300_ZB_ZCACHE_CTLSTAT,
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+	END_BATCH();
+	COMMIT_BATCH();
+}
 
-	reg_start(R300_ZB_ZCACHE_CTLSTAT, 0);
-	e32(R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
-	    R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+void r300EmitBlit(r300ContextPtr rmesa,
+		  GLuint color_fmt,
+		  GLuint src_pitch,
+		  dri_bo *src_bo, int src_offset,
+		  GLuint dst_pitch,
+		  GLuint dst_offset,
+		  GLint srcx, GLint srcy,
+		  GLint dstx, GLint dsty, GLuint w, GLuint h)
+{
+	BATCH_LOCALS(rmesa);
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr,
+			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
+			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
+			dst_pitch, dst_offset, dstx, dsty, w, h);
+
+	assert((src_pitch & 63) == 0);
+	assert((dst_pitch & 63) == 0);
+	assert((src_offset & 1023) == 0);
+	assert((dst_offset & 1023) == 0);
+	assert(w < (1 << 16));
+	assert(h < (1 << 16));
+
+	BEGIN_BATCH(8);
+	OUT_BATCH_PACKET3(R300_CP_CMD_BITBLT_MULTI, 5);
+	OUT_BATCH(RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
+		  RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+		  RADEON_GMC_BRUSH_NONE |
+		  (color_fmt << 8) |
+		  RADEON_GMC_SRC_DATATYPE_COLOR |
+		  RADEON_ROP3_S |
+		  RADEON_DP_SRC_SOURCE_MEMORY |
+		  RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
+	OUT_BATCH_RELOC((src_pitch / 64) << 22, src_bo, src_offset, DRM_RELOC_BLITTER);
+	OUT_BATCH(((dst_pitch / 64) << 22) | (dst_offset >> 10));
+	OUT_BATCH((srcx << 16) | srcy);
+	OUT_BATCH((dstx << 16) | dsty);
+	OUT_BATCH((w << 16) | h);
+	END_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
index 89d7383..7e11604 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.h
+++ b/src/mesa/drivers/dri/r300/r300_emit.h
@@ -127,130 +127,62 @@ static INLINE uint32_t cmdpacify(void)
 	return cmd.u;
 }
 
-/**
- * Prepare to write a register value to register at address reg.
- * If num_extra > 0 then the following extra values are written
- * to registers with address +4, +8 and so on..
- */
-#define reg_start(reg, num_extra)					\
-	do {								\
-		int _n;							\
-		_n=(num_extra);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+2),				\
-					__FUNCTION__);			\
-		cmd_reserved=_n+2;					\
-		cmd_written=1;						\
-		cmd[0].i=cmdpacket0((reg), _n+1);			\
-	} while (0);
+
+/** Single register write to command buffer; requires 2 dwords. */
+#define OUT_BATCH_REGVAL(reg, val) \
+	OUT_BATCH(cmdpacket0((reg), 1)); \
+	OUT_BATCH((val))
+
+/** Continuous register range write to command buffer; requires 1 dword,
+ * expects count dwords afterwards for register contents. */
+#define OUT_BATCH_REGSEQ(reg, count) \
+	OUT_BATCH(cmdpacket0((reg), (count)));
+
+/** Write a 32 bit float to the ring; requires 1 dword. */
+#define OUT_BATCH_FLOAT32(f) \
+	OUT_BATCH(r300PackFloat32((f)));
 
 /**
- * Emit GLuint freestyle
+ * Write the header of a packet3 to the command buffer.
+ * Outputs 2 dwords and expects (num_extra+1) additional dwords afterwards.
  */
-#define e32(dword)							\
-	do {								\
-		if(cmd_written<cmd_reserved) {				\
-			cmd[cmd_written].i=(dword);			\
-			cmd_written++;					\
-		} else {						\
-			fprintf(stderr,					\
-				"e32 but no previous packet "		\
-				"declaration.\n"			\
-				"Aborting! in %s::%s at line %d, "	\
-				"cmd_written=%d cmd_reserved=%d\n",	\
-				__FILE__, __FUNCTION__, __LINE__,	\
-				cmd_written, cmd_reserved);		\
-			_mesa_exit(-1);					\
-		}							\
+#define OUT_BATCH_PACKET3(packet, num_extra) do {\
+	OUT_BATCH(cmdpacket3(R300_CMD_PACKET3_RAW)); \
+	OUT_BATCH(CP_PACKET3((packet), (num_extra))); \
 	} while(0)
 
-#define	efloat(f) e32(r300PackFloat32(f))
-
-#define vsf_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+2;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdvpu((dest), _n/4);			\
-	} while (0);
-
-#define r500fp_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+1;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdr500fp((dest), _n/6, 0, 0);		\
-	} while (0);
-
-#define start_packet3(packet, count)					\
-	{								\
-		int _n;							\
-		GLuint _p;						\
-		_n = (count);						\
-		_p = (packet);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+3),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+3;					\
-		cmd_written = 2;					\
-		if(_n > 0x3fff) {					\
-			fprintf(stderr,"Too big packet3 %08x: cannot "	\
-				"store %d dwords\n",			\
-				_p, _n);				\
-			_mesa_exit(-1);					\
-		}							\
-		cmd[0].i = cmdpacket3(R300_CMD_PACKET3_RAW);		\
-		cmd[1].i = _p | ((_n & 0x3fff)<<16);			\
-	}
-
 /**
  * Must be sent to switch to 2d commands
  */
 void static INLINE end_3d(r300ContextPtr rmesa)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(rmesa);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].header.cmd_type = R300_CMD_END3D;
+	BEGIN_BATCH(1);
+	OUT_BATCH(cmdpacify());
+	END_BATCH();
 }
 
 void static INLINE cp_delay(r300ContextPtr rmesa, unsigned short count)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(rmesa);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdcpdelay(count);
+	BEGIN_BATCH(1);
+	OUT_BATCH(cmdcpdelay(count));
+	END_BATCH();
 }
 
 void static INLINE cp_wait(r300ContextPtr rmesa, unsigned char flags)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(rmesa);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdwait(flags);
+	BEGIN_BATCH(1);
+	OUT_BATCH(cmdwait(flags));
+	END_BATCH();
 }
 
 extern int r300EmitArrays(GLcontext * ctx);
 
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx);
-#endif
-
 extern void r300ReleaseArrays(GLcontext * ctx);
 extern int r300PrimitiveType(r300ContextPtr rmesa, int prim);
 extern int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim);
@@ -265,4 +197,13 @@ extern GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead);
 extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten);
 extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten);
 
+extern void r300EmitBlit(r300ContextPtr rmesa,
+			 GLuint color_fmt,
+			 GLuint src_pitch,
+			 dri_bo *src_bo, int src_offset,
+			 GLuint dst_pitch,
+			 GLuint dst_offset,
+			 GLint srcx, GLint srcy,
+			 GLint dstx, GLint dsty, GLuint w, GLuint h);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index ee85e22..c3d0d72 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -46,6 +46,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "swrast/swrast.h"
 
+#include "radeon_buffer.h"
 #include "r300_context.h"
 #include "radeon_ioctl.h"
 #include "r300_ioctl.h"
@@ -55,6 +56,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_reg.h"
 #include "r300_emit.h"
 #include "r300_fragprog.h"
+#include "r300_mem.h"
 
 #include "vblank.h"
 
@@ -62,64 +64,56 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define CLEARBUFFER_DEPTH	0x2
 #define CLEARBUFFER_STENCIL	0x4
 
-static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
+static void r300ClearBuffer(r300ContextPtr r300, int flags,
+			    struct radeon_renderbuffer *rrb)
 {
+	BATCH_LOCALS(r300);
 	GLcontext *ctx = r300->radeon.glCtx;
 	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
-	GLuint cboffset, cbpitch;
-	drm_r300_cmd_header_t *cmd2;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	GLuint cbpitch = 0;
 	r300ContextPtr rmesa = r300;
 
 	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s: %s buffer (%i,%i %ix%i)\n",
-			__FUNCTION__, buffer ? "back" : "front",
-			dPriv->x, dPriv->y, dPriv->w, dPriv->h);
-
-	if (buffer) {
-		cboffset = r300->radeon.radeonScreen->backOffset;
-		cbpitch = r300->radeon.radeonScreen->backPitch;
-	} else {
-		cboffset = r300->radeon.radeonScreen->frontOffset;
-		cbpitch = r300->radeon.radeonScreen->frontPitch;
+		fprintf(stderr, "%s: buffer %p (%i,%i %ix%i)\n",
+			__FUNCTION__, rrb, dPriv->x, dPriv->y,
+			dPriv->w, dPriv->h);
+
+	if (rrb) {
+		cbpitch = rrb->pitch;
+		if (rrb->cpp == 4)
+			cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+		else
+			cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+		if (r300->radeon.sarea->tiling_enabled)
+			cbpitch |= R300_COLOR_TILE_ENABLE;
 	}
 
-	cboffset += r300->radeon.radeonScreen->fbLocation;
-
+	/* TODO in bufmgr */
 	cp_wait(r300, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
 	end_3d(rmesa);
 
-	R300_STATECHANGE(r300, cb);
-	reg_start(R300_RB3D_COLOROFFSET0, 0);
-	e32(cboffset);
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		cbpitch |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		cbpitch |= R300_COLOR_TILE_ENABLE;
-
-	reg_start(R300_RB3D_COLORPITCH0, 0);
-	e32(cbpitch);
-
-	R300_STATECHANGE(r300, cmk);
-	reg_start(RB3D_COLOR_CHANNEL_MASK, 0);
+	if (flags & CLEARBUFFER_COLOR) {
+		assert(rrb != 0);
+		BEGIN_BATCH(4);
+		OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+		OUT_BATCH_RELOC(0, rrb->bo, 0, DRM_RELOC_TXOFFSET);
+		OUT_BATCH_REGVAL(R300_RB3D_COLORPITCH0, cbpitch);
+		END_BATCH();
+	}
 
+	BEGIN_BATCH(15);
+	OUT_BATCH_REGSEQ(RB3D_COLOR_CHANNEL_MASK, 1);
 	if (flags & CLEARBUFFER_COLOR) {
-		e32((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
-		    (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
-		    (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
-		    (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
+		OUT_BATCH((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
+			  (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
+			  (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
+			  (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
 	} else {
-		e32(0x0);
+		OUT_BATCH(0);
 	}
 
-	R300_STATECHANGE(r300, zs);
-	reg_start(R300_ZB_CNTL, 2);
+	OUT_BATCH_REGSEQ(R300_ZB_CNTL, 3);
 
 	{
 		uint32_t t1, t2;
@@ -146,37 +140,37 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
 			     R300_S_FRONT_ZFAIL_OP_SHIFT);
 		}
 
-		e32(t1);
-		e32(t2);
-		e32(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
-		    (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+		OUT_BATCH(t1);
+		OUT_BATCH(t2);
+		OUT_BATCH(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
+			  (ctx->Stencil.Clear & R300_STENCILREF_MASK));
 	}
 
-	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
-	cmd2[0].packet3.cmd_type = R300_CMD_PACKET3;
-	cmd2[0].packet3.packet = R300_CMD_PACKET3_CLEAR;
-	cmd2[1].u = r300PackFloat32(dPriv->w / 2.0);
-	cmd2[2].u = r300PackFloat32(dPriv->h / 2.0);
-	cmd2[3].u = r300PackFloat32(ctx->Depth.Clear);
-	cmd2[4].u = r300PackFloat32(1.0);
-	cmd2[5].u = r300PackFloat32(ctx->Color.ClearColor[0]);
-	cmd2[6].u = r300PackFloat32(ctx->Color.ClearColor[1]);
-	cmd2[7].u = r300PackFloat32(ctx->Color.ClearColor[2]);
-	cmd2[8].u = r300PackFloat32(ctx->Color.ClearColor[3]);
+	OUT_BATCH(cmdpacket3(R300_CMD_PACKET3_CLEAR));
+	OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+	OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+	OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+	OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+	OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+	OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+	END_BATCH();
 
 	r300EmitCacheFlush(rmesa);
 	cp_wait(rmesa, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+
+	R300_STATECHANGE(r300, cb);
+	R300_STATECHANGE(r300, cmk);
+	R300_STATECHANGE(r300, zs);
 }
 
 static void r300EmitClearState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	r300ContextPtr rmesa = r300;
+	BATCH_LOCALS(r300);
 	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
 	int has_tcl = 1;
 	int is_r500 = 0;
 	GLuint vap_cntl;
@@ -184,35 +178,37 @@ static void r300EmitClearState(GLcontext * ctx)
 	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
 		has_tcl = 0;
 
-        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-                is_r500 = 1;
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+		is_r500 = 1;
 
-
-	/* FIXME: the values written to R300_VAP_INPUT_ROUTE_0_0 and
-	 * R300_VAP_INPUT_ROUTE_0_1 are in fact known, however, the values are
-	 * quite complex; see the functions in r300_emit.c.
+	/* State atom dirty tracking is a little subtle here.
+	 *
+	 * On the one hand, we need to make sure base state is emitted
+	 * here if we start with an empty batch buffer, otherwise clear
+	 * works incorrectly with multiple processes. Therefore, the first
+	 * BEGIN_BATCH cannot be a BEGIN_BATCH_NO_AUTOSTATE.
 	 *
-	 * I believe it would be a good idea to extend the functions in
-	 * r300_emit.c so that they can be used to setup the default values for
-	 * these registers, as well as the actual values used for rendering.
+	 * On the other hand, implicit state emission clears the state atom
+	 * dirty bits, so we have to call R300_STATECHANGE later than the
+	 * first BEGIN_BATCH.
+	 *
+	 * The final trickiness is that, because we change state, we need
+	 * to ensure that any stored swtcl primitives are flushed properly
+	 * before we start changing state. See the R300_NEWPRIM in r300Clear
+	 * for this.
 	 */
-	R300_STATECHANGE(r300, vir[0]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_0, 0);
+	BEGIN_BATCH(31);
+	OUT_BATCH_REGSEQ(R300_VAP_PROG_STREAM_CNTL_0, 1);
 	if (!has_tcl)
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 	else
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 
-	/* disable fog */
-	R300_STATECHANGE(r300, fogs);
-	reg_start(R300_FG_FOG_BLEND, 0);
-	e32(0x0);
-
-	R300_STATECHANGE(r300, vir[1]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_EXT_0, 0);
-	e32(((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+	OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
+	OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0,
+	   ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
 	       (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
@@ -226,238 +222,246 @@ static void r300EmitClearState(GLcontext * ctx)
 	      << R300_SWIZZLE1_SHIFT)));
 
 	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
-	R300_STATECHANGE(r300, vic);
-	reg_start(R300_VAP_VTX_STATE_CNTL, 1);
-	e32((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
-	e32(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+	OUT_BATCH_REGSEQ(R300_VAP_VTX_STATE_CNTL, 2);
+	OUT_BATCH((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
+	OUT_BATCH(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
 
-	R300_STATECHANGE(r300, vte);
 	/* comes from fglrx startup of clear */
-	reg_start(R300_SE_VTE_CNTL, 1);
-	e32(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
-	    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
-	    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
-	    R300_VPORT_Z_OFFSET_ENA);
-	e32(0x8);
+	OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
+	OUT_BATCH(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+		  R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+		  R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+		  R300_VPORT_Z_OFFSET_ENA);
+	OUT_BATCH(0x8);
 
-	reg_start(R300_VAP_PSC_SGN_NORM_CNTL, 0);
-	e32(0xaaaaaaaa);
+	OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
 
-	R300_STATECHANGE(r300, vof);
-	reg_start(R300_VAP_OUTPUT_VTX_FMT_0, 1);
-	e32(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
-	    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
-	e32(0x0);		/* no textures */
+	OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+	OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+		  R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
+	OUT_BATCH(0); /* no textures */
 
-	R300_STATECHANGE(r300, txe);
-	reg_start(R300_TX_ENABLE, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_TX_ENABLE, 0);
 
-	R300_STATECHANGE(r300, vpt);
-	reg_start(R300_SE_VPORT_XSCALE, 5);
-	efloat(1.0);
-	efloat(dPriv->x);
-	efloat(1.0);
-	efloat(dPriv->y);
-	efloat(1.0);
-	efloat(0.0);
+	OUT_BATCH_REGSEQ(R300_SE_VPORT_XSCALE, 6);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->x);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->y);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(0.0);
 
-	R300_STATECHANGE(r300, at);
-	reg_start(R300_FG_ALPHA_FUNC, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
 
+	OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2);
+	OUT_BATCH(0x0);
+	OUT_BATCH(0x0);
+	END_BATCH();
+
+	R300_STATECHANGE(r300, vir[0]);
+	R300_STATECHANGE(r300, fogs);
+	R300_STATECHANGE(r300, vir[1]);
+	R300_STATECHANGE(r300, vic);
+	R300_STATECHANGE(r300, vte);
+	R300_STATECHANGE(r300, vof);
+	R300_STATECHANGE(r300, txe);
+	R300_STATECHANGE(r300, vpt);
+	R300_STATECHANGE(r300, at);
 	R300_STATECHANGE(r300, bld);
-	reg_start(R300_RB3D_CBLEND, 1);
-	e32(0x0);
-	e32(0x0);
+	R300_STATECHANGE(r300, ps);
 
 	if (has_tcl) {
-	    R300_STATECHANGE(r300, vap_clip_cntl);
-	    reg_start(R300_VAP_CLIP_CNTL, 0);
-	    e32(R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		R300_STATECHANGE(r300, vap_clip_cntl);
+
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		END_BATCH();
         }
 
-	R300_STATECHANGE(r300, ps);
-	reg_start(R300_GA_POINT_SIZE, 0);
-	e32(((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
-	    ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH_REGVAL(R300_GA_POINT_SIZE,
+		((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+		((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	END_BATCH();
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R300_RS_IP_0, 7);
-		for (i = 0; i < 8; ++i) {
-			e32(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
-		}
-
 		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
 		R300_STATECHANGE(r300, rr);
-		reg_start(R300_RS_INST_0, 0);
-		e32(R300_RS_INST_COL_CN_WRITE);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R300_RS_IP_0, 8);
+		for (i = 0; i < 8; ++i)
+			OUT_BATCH(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
+
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
+
+		OUT_BATCH_REGVAL(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	} else {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R500_RS_IP_0, 7);
+		R300_STATECHANGE(r300, rc);
+		R300_STATECHANGE(r300, rr);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R500_RS_IP_0, 8);
 		for (i = 0; i < 8; ++i) {
-			e32((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-			    (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
+			OUT_BATCH((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+				  (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
 		}
 
-		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
-		R300_STATECHANGE(r300, rr);
-		reg_start(R500_RS_INST_0, 0);
-		e32(R500_RS_INST_COL_CN_WRITE);
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
 
+		OUT_BATCH_REGVAL(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	}
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, fp);
-		reg_start(R300_US_CONFIG, 2);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		reg_start(R300_US_CODE_ADDR_0, 3);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		e32(R300_RGBA_OUT);
-
 		R300_STATECHANGE(r300, fpi[0]);
 		R300_STATECHANGE(r300, fpi[1]);
 		R300_STATECHANGE(r300, fpi[2]);
 		R300_STATECHANGE(r300, fpi[3]);
 
-		reg_start(R300_US_ALU_RGB_INST_0, 0);
-		e32(FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
-
-		reg_start(R300_US_ALU_RGB_ADDR_0, 0);
-		e32(FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
-
-		reg_start(R300_US_ALU_ALPHA_INST_0, 0);
-		e32(FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
-
-		reg_start(R300_US_ALU_ALPHA_ADDR_0, 0);
-		e32(FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		BEGIN_BATCH(17);
+		OUT_BATCH_REGSEQ(R300_US_CONFIG, 3);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(R300_RGBA_OUT);
+
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_INST_0,
+			FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_ADDR_0,
+			FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_INST_0,
+			FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_ADDR_0,
+			FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		END_BATCH();
 	} else {
- 		R300_STATECHANGE(r300, fp);
- 		reg_start(R500_US_CONFIG, 1);
- 		e32(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
- 		e32(0x0);
- 		reg_start(R500_US_CODE_ADDR, 2);
- 		e32(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
- 		e32(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
- 		e32(R500_US_CODE_OFFSET_ADDR(0));
-
+		R300_STATECHANGE(r300, fp);
 		R300_STATECHANGE(r300, r500fp);
-		r500fp_start_fragment(0, 6);
-
-		e32(R500_INST_TYPE_OUT |
-		    R500_INST_TEX_SEM_WAIT |
-		    R500_INST_LAST |
-		    R500_INST_RGB_OMASK_R |
-		    R500_INST_RGB_OMASK_G |
-		    R500_INST_RGB_OMASK_B |
-		    R500_INST_ALPHA_OMASK |
-		    R500_INST_RGB_CLAMP |
-		    R500_INST_ALPHA_CLAMP);
-
-		e32(R500_RGB_ADDR0(0) |
-		    R500_RGB_ADDR1(0) |
-		    R500_RGB_ADDR1_CONST |
-		    R500_RGB_ADDR2(0) |
-		    R500_RGB_ADDR2_CONST);
-
-		e32(R500_ALPHA_ADDR0(0) |
-		    R500_ALPHA_ADDR1(0) |
-		    R500_ALPHA_ADDR1_CONST |
-		    R500_ALPHA_ADDR2(0) |
-		    R500_ALPHA_ADDR2_CONST);
-
-		e32(R500_ALU_RGB_SEL_A_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_A_R |
-		    R500_ALU_RGB_G_SWIZ_A_G |
-		    R500_ALU_RGB_B_SWIZ_A_B |
-		    R500_ALU_RGB_SEL_B_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_B_R |
-		    R500_ALU_RGB_B_SWIZ_B_G |
-		    R500_ALU_RGB_G_SWIZ_B_B);
-
-		e32(R500_ALPHA_OP_CMP |
-		    R500_ALPHA_SWIZ_A_A |
-		    R500_ALPHA_SWIZ_B_A);
-
-		e32(R500_ALU_RGBA_OP_CMP |
-		    R500_ALU_RGBA_R_SWIZ_0 |
-		    R500_ALU_RGBA_G_SWIZ_0 |
-		    R500_ALU_RGBA_B_SWIZ_0 |
-		    R500_ALU_RGBA_A_SWIZ_0);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
+		OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
+		OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
+		OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
+		OUT_BATCH(R500_US_CODE_OFFSET_ADDR(0));
+
+		OUT_BATCH(cmdr500fp(0, 1, 0, 0));
+		OUT_BATCH(R500_INST_TYPE_OUT |
+			  R500_INST_TEX_SEM_WAIT |
+			  R500_INST_LAST |
+			  R500_INST_RGB_OMASK_R |
+			  R500_INST_RGB_OMASK_G |
+			  R500_INST_RGB_OMASK_B |
+			  R500_INST_ALPHA_OMASK |
+			  R500_INST_RGB_CLAMP |
+			  R500_INST_ALPHA_CLAMP);
+		OUT_BATCH(R500_RGB_ADDR0(0) |
+			  R500_RGB_ADDR1(0) |
+			  R500_RGB_ADDR1_CONST |
+			  R500_RGB_ADDR2(0) |
+			  R500_RGB_ADDR2_CONST);
+		OUT_BATCH(R500_ALPHA_ADDR0(0) |
+			  R500_ALPHA_ADDR1(0) |
+			  R500_ALPHA_ADDR1_CONST |
+			  R500_ALPHA_ADDR2(0) |
+			  R500_ALPHA_ADDR2_CONST);
+		OUT_BATCH(R500_ALU_RGB_SEL_A_SRC0 |
+			  R500_ALU_RGB_R_SWIZ_A_R |
+			  R500_ALU_RGB_G_SWIZ_A_G |
+			  R500_ALU_RGB_B_SWIZ_A_B |
+			  R500_ALU_RGB_SEL_B_SRC0 |
+			  R500_ALU_RGB_R_SWIZ_B_R |
+			  R500_ALU_RGB_B_SWIZ_B_G |
+			  R500_ALU_RGB_G_SWIZ_B_B);
+		OUT_BATCH(R500_ALPHA_OP_CMP |
+			  R500_ALPHA_SWIZ_A_A |
+			  R500_ALPHA_SWIZ_B_A);
+		OUT_BATCH(R500_ALU_RGBA_OP_CMP |
+			  R500_ALU_RGBA_R_SWIZ_0 |
+			  R500_ALU_RGBA_G_SWIZ_0 |
+			  R500_ALU_RGBA_B_SWIZ_0 |
+			  R500_ALU_RGBA_A_SWIZ_0);
+		END_BATCH();
 	}
 
-	reg_start(R300_VAP_PVS_STATE_FLUSH_REG, 0);
-	e32(0x00000000);
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+	END_BATCH();
+
 	if (has_tcl) {
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(12 << R300_VF_MAX_VTX_NUM_SHIFT));
-	    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
-	} else
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+			vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
+	} else {
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(5 << R300_VF_MAX_VTX_NUM_SHIFT));
+	}
 
 	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
-	    vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
-	    vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
-	    vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
-	    vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
 	else
-	    vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
 
-	R300_STATECHANGE(rmesa, vap_cntl);
-	reg_start(R300_VAP_CNTL, 0);
-	e32(vap_cntl);
+	R300_STATECHANGE(r300, vap_cntl);
+
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_CNTL, vap_cntl);
+	END_BATCH();
 
 	if (has_tcl) {
 		R300_STATECHANGE(r300, pvs);
-		reg_start(R300_VAP_PVS_CODE_CNTL_0, 2);
-
-		e32((0 << R300_PVS_FIRST_INST_SHIFT) |
-		    (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-		    (1 << R300_PVS_LAST_INST_SHIFT));
-		e32((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-		    (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
-		e32(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-
 		R300_STATECHANGE(r300, vpi);
-		vsf_start_fragment(0x0, 8);
 
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 0, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
-
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
+		BEGIN_BATCH(13);
+		OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+		OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |
+			  (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			  (1 << R300_PVS_LAST_INST_SHIFT));
+		OUT_BATCH((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+			  (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
+		OUT_BATCH(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+
+		OUT_BATCH(cmdvpu(0, 2));
+		OUT_BATCH(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 0, 0xf, PVS_DST_REG_OUT));
+		OUT_BATCH(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+		OUT_BATCH(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+		OUT_BATCH(0x0);
+
+		OUT_BATCH(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf, PVS_DST_REG_OUT));
+		OUT_BATCH(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+		OUT_BATCH(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
+		OUT_BATCH(0x0);
+		END_BATCH();
 	}
 }
 
@@ -467,7 +471,10 @@ static void r300EmitClearState(GLcontext * ctx)
 static void r300Clear(GLcontext * ctx, GLbitfield mask)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(r300);
 	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	GLframebuffer *fb = dPriv->driverPrivate;
+	struct radeon_renderbuffer *rrb;
 	int flags = 0;
 	int bits = 0;
 	int swapped;
@@ -482,6 +489,12 @@ static void r300Clear(GLcontext * ctx, GLbitfield mask)
 			return;
 	}
 
+	/* Flush swtcl vertices if necessary, because we will change hardware
+	 * state during clear. See also the state-related comment in
+	 * r300EmitClearState.
+	 */
+	R300_NEWPRIM(r300);
+
 	if (mask & BUFFER_BIT_FRONT_LEFT) {
 		flags |= BUFFER_BIT_FRONT_LEFT;
 		mask &= ~BUFFER_BIT_FRONT_LEFT;
@@ -509,26 +522,27 @@ static void r300Clear(GLcontext * ctx, GLbitfield mask)
 		_swrast_Clear(ctx, mask);
 	}
 
-	swapped = r300->radeon.sarea->pfCurrentPage == 1;
-
 	/* Make sure it fits there. */
 	r300EnsureCmdBufSpace(r300, 421 * 3, __FUNCTION__);
 	if (flags || bits)
 		r300EmitClearState(ctx);
 
 	if (flags & BUFFER_BIT_FRONT_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped);
+		rrb = (void *)fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb);
 		bits = 0;
 	}
 
 	if (flags & BUFFER_BIT_BACK_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped ^ 1);
+		rrb = (void *)fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb);
 		bits = 0;
 	}
 
 	if (bits)
-		r300ClearBuffer(r300, bits, 0);
+		r300ClearBuffer(r300, bits, NULL);
 
+	COMMIT_BATCH();
 }
 
 void r300Flush(GLcontext * ctx)
@@ -541,16 +555,12 @@ void r300Flush(GLcontext * ctx)
 	if (rmesa->dma.flush)
 		rmesa->dma.flush( rmesa );
 
-	if (rmesa->cmdbuf.count_used > rmesa->cmdbuf.count_reemit)
+	if (rmesa->cmdbuf.committed > rmesa->cmdbuf.reemit)
 		r300FlushCmdBuf(rmesa, __FUNCTION__);
 }
 
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-
 void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
 {
-	struct r300_dma_buffer *dmabuf;
 	size = MAX2(size, RADEON_BUFFER_SIZE * 16);
 
 	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
@@ -560,216 +570,24 @@ void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
 		rmesa->dma.flush(rmesa);
 	}
 
-	if (rmesa->dma.current.buf) {
-#ifdef USER_BUFFERS
-		r300_mem_use(rmesa, rmesa->dma.current.buf->id);
-#endif
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+	if (rmesa->dma.current) {
+		dri_bo_unreference(rmesa->dma.current);
+		rmesa->dma.current = 0;
 	}
 	if (rmesa->dma.nr_released_bufs > 4)
 		r300FlushCmdBuf(rmesa, __FUNCTION__);
 
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = (void *)1;	/* hack */
-	dmabuf->refcount = 1;
-
-	dmabuf->id = r300_mem_alloc(rmesa, 4, size);
-	if (dmabuf->id == 0) {
-		LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-		r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		radeonWaitForIdleLocked(&rmesa->radeon);
-
-		dmabuf->id = r300_mem_alloc(rmesa, 4, size);
-
-		UNLOCK_HARDWARE(&rmesa->radeon);
-
-		if (dmabuf->id == 0) {
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
-		}
-	}
-
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = r300_mem_ptr(rmesa, dmabuf->id);
-	rmesa->dma.current.end = size;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		r300_mem_free(rmesa, region->buf->id);
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
-	}
-
-	region->buf = 0;
-	region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
-			int bytes, int alignment)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
-
-	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa, (bytes + 0x7) & ~0x7);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
-
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
-}
-
-#else
-static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa)
-{
-	struct r300_dma_buffer *dmabuf;
-	int fd = rmesa->radeon.dri.fd;
-	int index = 0;
-	int size = 0;
-	drmDMAReq dma;
-	int ret;
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush) {
-		rmesa->dma.flush(rmesa);
-	}
-
-	if (rmesa->dma.current.buf)
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
-
-	if (rmesa->dma.nr_released_bufs > 4)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	dma.context = rmesa->radeon.dri.hwContext;
-	dma.send_count = 0;
-	dma.send_list = NULL;
-	dma.send_sizes = NULL;
-	dma.flags = 0;
-	dma.request_count = 1;
-	dma.request_size = RADEON_BUFFER_SIZE;
-	dma.request_list = &index;
-	dma.request_sizes = &size;
-	dma.granted_count = 0;
-
-	LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-	ret = drmDMA(fd, &dma);
-
-	if (ret != 0) {
-		/* Try to release some buffers and wait until we can't get any more */
-		if (rmesa->dma.nr_released_bufs) {
-			r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		}
-
-		if (RADEON_DEBUG & DEBUG_DMA)
-			fprintf(stderr, "Waiting for buffers\n");
-
-		radeonWaitForIdleLocked(&rmesa->radeon);
-		ret = drmDMA(fd, &dma);
-
-		if (ret != 0) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
-		}
-	}
-
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (RADEON_DEBUG & DEBUG_DMA)
-		fprintf(stderr, "Allocated buffer %d\n", index);
-
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = &rmesa->radeon.radeonScreen->buffers->list[index];
-	dmabuf->refcount = 1;
-
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = dmabuf->buf->address;
-	rmesa->dma.current.end = dmabuf->buf->total;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		drm_radeon_cmd_header_t *cmd;
-
-		if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-			fprintf(stderr, "%s -- DISCARD BUF %d\n",
-				__FUNCTION__, region->buf->buf->idx);
-		cmd =
-		    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa,
-								sizeof
-								(*cmd) / 4,
-								__FUNCTION__);
-		cmd->dma.cmd_type = R300_CMD_DMA_DISCARD;
-		cmd->dma.buf_idx = region->buf->buf->idx;
-
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
-	}
-
-	region->buf = 0;
-	region->start = 0;
+	rmesa->dma.current = dri_bo_alloc(rmesa->radeon.bufmgr, "DMA regions",
+		size, 4, DRM_BO_MEM_DMA);
+	rmesa->dma.current_used = 0;
+	rmesa->dma.current_vertexptr = 0;
 }
 
 /* Allocates a region from rmesa->dma.current.  If there isn't enough
  * space in current, grab a new buffer (and discard what was left of current)
  */
 void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
+			dri_bo **pbo, int *poffset,
 			int bytes, int alignment)
 {
 	if (RADEON_DEBUG & DEBUG_IOCTL)
@@ -778,62 +596,23 @@ void r300AllocDmaRegion(r300ContextPtr rmesa,
 	if (rmesa->dma.flush)
 		rmesa->dma.flush(rmesa);
 
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
+	assert(rmesa->dma.current_used == rmesa->dma.current_vertexptr);
 
 	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
+	rmesa->dma.current_used = (rmesa->dma.current_used + alignment) & ~alignment;
 
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
-}
-
-#endif
-
-GLboolean r300IsGartMemory(r300ContextPtr rmesa, const GLvoid * pointer,
-			   GLint size)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	int valid = (size >= 0 && offset >= 0
-		     && offset + size <
-		     rmesa->radeon.radeonScreen->gartTextures.size);
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "r300IsGartMemory( %p ) : %d\n", pointer,
-			valid);
+	if (!rmesa->dma.current || rmesa->dma.current_used + bytes > rmesa->dma.current->size)
+		r300RefillCurrentDmaRegion(rmesa, (bytes + 15) & ~15);
 
-	return valid;
-}
+	*poffset = rmesa->dma.current_used;
+	*pbo = rmesa->dma.current;
+	dri_bo_reference(*pbo);
 
-GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa, const GLvoid * pointer)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
+	/* Always align to at least 16 bytes */
+	rmesa->dma.current_used = (rmesa->dma.current_used + bytes + 15) & ~15;
+	rmesa->dma.current_vertexptr = rmesa->dma.current_used;
 
-	//fprintf(stderr, "offset=%08x\n", offset);
-
-	if (offset < 0
-	    || offset > rmesa->radeon.radeonScreen->gartTextures.size)
-		return ~0;
-	else
-		return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
+	assert(rmesa->dma.current_used <= rmesa->dma.current->size);
 }
 
 void r300InitIoctlFuncs(struct dd_function_table *functions)
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.h b/src/mesa/drivers/dri/r300/r300_ioctl.h
index e1143fb..c743478 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.h
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.h
@@ -39,20 +39,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "radeon_drm.h"
 
-extern GLboolean r300IsGartMemory(r300ContextPtr rmesa,
-				  const GLvoid * pointer, GLint size);
-
-extern GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa,
-					const GLvoid * pointer);
-
 extern void r300Flush(GLcontext * ctx);
 
-extern void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-				 struct r300_dma_region *region,
-				 const char *caller);
 extern void r300AllocDmaRegion(r300ContextPtr rmesa,
-			       struct r300_dma_region *region, int bytes,
-			       int alignment);
+			       dri_bo **pbo, int *poffset,
+			       int bytes, int alignment);
 
 extern void r300InitIoctlFuncs(struct dd_function_table *functions);
 
diff --git a/src/mesa/drivers/dri/r300/r300_mem.c b/src/mesa/drivers/dri/r300/r300_mem.c
index f8f9d4f..3c0b055 100644
--- a/src/mesa/drivers/dri/r300/r300_mem.c
+++ b/src/mesa/drivers/dri/r300/r300_mem.c
@@ -27,359 +27,955 @@
 
 /**
  * \file
+ * Simulate a real memory manager for R300 in the old-style scheme.
+ *
+ * NOTE: Right now, this is DMA-only and really only a skeleton of a true bufmgr.
  *
  * \author Aapo Tahkola <aet@rasterburn.org>
  */
 
+#include "r300_mem.h"
+
+#include <errno.h>
 #include <unistd.h>
 
-#include "r300_context.h"
-#include "r300_cmdbuf.h"
-#include "r300_ioctl.h"
-#include "r300_mem.h"
+#include "main/simple_list.h"
+
+#include "radeon_buffer.h"
 #include "radeon_ioctl.h"
+#include "r300_cmdbuf.h"
 
-#ifdef USER_BUFFERS
+typedef struct _radeon_bufmgr_classic radeon_bufmgr_classic;
+typedef struct _radeon_bo_classic radeon_bo_classic;
+typedef struct _radeon_bo_functions radeon_bo_functions;
+typedef struct _radeon_reloc radeon_reloc;
+typedef struct _radeon_bo_vram radeon_bo_vram;
+
+struct _radeon_bufmgr_classic {
+	dri_bufmgr base;
+	radeonScreenPtr screen;
+	r300ContextPtr rmesa;
+
+	radeon_bo_classic *buffers; /** Unsorted linked list of all buffer objects */
+
+	radeon_bo_classic *pending; /** Age-sorted linked list of pending buffer objects */
+	radeon_bo_classic **pending_tail;
+
+	/* Texture heap bookkeeping */
+	driTexHeap *texture_heap;
+	GLuint texture_offset;
+	driTextureObject texture_swapped;
+
+  	GLuint total_vram_used;
+};
+
+struct _radeon_reloc {
+	uint64_t flags;
+	GLuint offset; /**< Offset (in bytes) into command buffer to relocated dword */
+	radeon_bo_classic *target;
+	GLuint delta;
+};
+
+struct _radeon_bo_functions {
+	/**
+	 * Free a buffer object. Caller has verified that the object is not
+	 * referenced or pending.
+	 */
+	void (*free)(radeon_bo_classic*);
+
+	/**
+	 * Validate the given buffer. Must set the validated flag to 1.
+	 *
+	 * May be null for buffer objects that are always valid.
+	 * Always called with lock held.
+	 * return -1 to restart validation
+	 */
+	int (*validate)(radeon_bo_classic*);
+
+	/**
+	 * Map the buffer for CPU access.
+	 * Only called when the buffer isn't already mapped.
+	 *
+	 * May be null.
+	 */
+	void (*map)(radeon_bo_classic*, GLboolean write);
+
+	/**
+	 * Unmap the buffer.
+	 * Only called on final unmap.
+	 *
+	 * May be null.
+	 */
+	void (*unmap)(radeon_bo_classic*);
+
+	/**
+	 * Indicate that the buffer object is now used by the hardware.
+	 *
+	 * May be null.
+	 */
+	void (*bind)(radeon_bo_classic*);
+
+	/**
+	 * Indicate that the buffer object is no longer used by the hardware.
+	 *
+	 * May be null.
+	 */
+	void (*unbind)(radeon_bo_classic*);
+};
 
-static void resize_u_list(r300ContextPtr rmesa)
-{
-	void *temp;
-	int nsize;
+/**
+ * A buffer object. There are three types of buffer objects:
+ *  1. cmdbuf: Ordinary malloc()ed memory, used for command buffers
+ *  2. dma: GART memory allocated via the DRM_RADEON_ALLOC ioctl.
+ *  3. vram: Objects with malloc()ed backing store that will be uploaded
+ *     into VRAM on demand; used for textures.
+ * There is a @ref functions table for operations that depend on the
+ * buffer object type.
+ *
+ * Fencing is handled the same way all buffer objects. During command buffer
+ * submission, the pending flag and corresponding variables are set accordingly.
+ */
+struct _radeon_bo_classic {
+	dri_bo base;
 
-	temp = rmesa->rmm->u_list;
-	nsize = rmesa->rmm->u_size * 2;
+	const radeon_bo_functions *functions;
 
-	rmesa->rmm->u_list = _mesa_malloc(nsize * sizeof(*rmesa->rmm->u_list));
-	_mesa_memset(rmesa->rmm->u_list, 0,
-		     nsize * sizeof(*rmesa->rmm->u_list));
+	radeon_bo_classic *next; /** Unsorted linked list of all buffer objects */
+	radeon_bo_classic **pprev;
 
-	if (temp) {
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+	/**
+	 * Number of software references to this buffer.
+	 * A buffer is freed automatically as soon as its reference count reaches 0
+	 * *and* it is no longer pending.
+	 */
+	unsigned int refcount;
+	unsigned int mapcount; /** mmap count; mutually exclusive to being pending */
 
-		_mesa_memcpy(rmesa->rmm->u_list, temp,
-			     rmesa->rmm->u_size * sizeof(*rmesa->rmm->u_list));
-		_mesa_free(temp);
-	}
+	unsigned int validated:1; /** whether the buffer is validated for hardware use right now */
+	unsigned int used:1; /* only for communication between process_relocs and post_submit */
+
+	unsigned int pending:1;
+	unsigned int space_accounted:1;
+	radeon_bo_classic *pending_next; /** Age-sorted linked list of pending buffer objects */
+	radeon_bo_classic **pending_pprev;
+
+	/* The following two variables are intricately linked to the DRM interface,
+	 * and must be in this physical memory order, or else chaos ensues.
+	 * See the DRM's implementation of R300_CMD_SCRATCH for details.
+	 */
+	uint32_t pending_age; /** Buffer object pending until this age is reached, written by the DRM */
+	uint32_t pending_count; /** Number of pending R300_CMD_SCRATCH references to this object */
+
+	radeon_reloc *relocs; /** Array of relocations in this buffer */
+	GLuint relocs_used; /** # of relocations in relocation array */
+	GLuint relocs_size; /** # of reloc records reserved in relocation array */
+};
+
+typedef struct _radeon_vram_wrapper radeon_vram_wrapper;
+
+/** Wrapper around heap object */
+struct _radeon_vram_wrapper {
+	driTextureObject base;
+	radeon_bo_vram *bo;
+};
+
+struct _radeon_bo_vram {
+	radeon_bo_classic base;
+
+	unsigned int backing_store_dirty:1; /** Backing store has changed, block must be reuploaded */
+
+	radeon_vram_wrapper *vram; /** Block in VRAM (if any) */
+};
+
+static radeon_bufmgr_classic* get_bufmgr_classic(dri_bufmgr *bufmgr_ctx)
+{
+	return (radeon_bufmgr_classic*)bufmgr_ctx;
+}
+
+static radeon_bo_classic* get_bo_classic(dri_bo *bo_base)
+{
+	return (radeon_bo_classic*)bo_base;
+}
 
-	rmesa->rmm->u_size = nsize;
+static radeon_bo_vram* get_bo_vram(radeon_bo_classic *bo_base)
+{
+	return (radeon_bo_vram*)bo_base;
 }
 
-void r300_mem_init(r300ContextPtr rmesa)
+/**
+ * Really free a given buffer object.
+ */
+static void bo_free(radeon_bo_classic *bo)
 {
-	rmesa->rmm = malloc(sizeof(struct r300_memory_manager));
-	memset(rmesa->rmm, 0, sizeof(struct r300_memory_manager));
+	assert(!bo->refcount);
+	assert(!bo->pending);
+	assert(!bo->mapcount);
+
+	if (bo->relocs) {
+		int i;
+		for(i = 0; i < bo->relocs_used; ++i)
+			dri_bo_unreference(&bo->relocs[i].target->base);
+		free(bo->relocs);
+		bo->relocs = 0;
+	}
 
-	rmesa->rmm->u_size = 128;
-	resize_u_list(rmesa);
+	*bo->pprev = bo->next;
+	if (bo->next)
+		bo->next->pprev = bo->pprev;
+
+	bo->functions->free(bo);
 }
 
-void r300_mem_destroy(r300ContextPtr rmesa)
+
+/**
+ * Keep track of which buffer objects are still pending, i.e. waiting for
+ * some hardware operation to complete.
+ */
+static void track_pending_buffers(radeon_bufmgr_classic *bufmgr)
 {
-	_mesa_free(rmesa->rmm->u_list);
-	rmesa->rmm->u_list = NULL;
+	uint32_t currentage = radeonGetAge((radeonContextPtr)bufmgr->rmesa);
+
+	while(bufmgr->pending) {
+		radeon_bo_classic *bo = bufmgr->pending;
+
+		assert(bo->pending);
 
-	_mesa_free(rmesa->rmm);
-	rmesa->rmm = NULL;
+		if (bo->pending_count ||
+		    bo->pending_age > currentage) // TODO: Age counter wraparound!
+			break;
+
+		bo->pending = 0;
+		bufmgr->pending = bo->pending_next;
+		if (bufmgr->pending)
+			bufmgr->pending->pending_pprev = &bufmgr->pending;
+		else
+			bufmgr->pending_tail = &bufmgr->pending;
+
+		if (bo->functions->unbind)
+			(*bo->functions->unbind)(bo);
+		if (!bo->refcount)
+			bo_free(bo);
+	}
 }
 
-void *r300_mem_ptr(r300ContextPtr rmesa, int id)
+/**
+ * Initialize common buffer object data.
+ */
+static void init_buffer(radeon_bufmgr_classic *bufmgr, radeon_bo_classic *bo, unsigned long size)
 {
-	assert(id <= rmesa->rmm->u_last);
-	return rmesa->rmm->u_list[id].ptr;
+	bo->base.bufmgr = &bufmgr->base;
+	bo->base.size = size;
+	bo->refcount = 1;
+
+	bo->pprev = &bufmgr->buffers;
+	bo->next = bufmgr->buffers;
+	if (bo->next)
+		bo->next->pprev = &bo->next;
+	bufmgr->buffers = bo;
 }
 
-int r300_mem_find(r300ContextPtr rmesa, void *ptr)
+
+/**
+ * Free a DMA-based buffer.
+ */
+static void dma_free(radeon_bo_classic *bo)
 {
-	int i;
+	radeon_bufmgr_classic* bufmgr = get_bufmgr_classic(bo->base.bufmgr);
+	drm_radeon_mem_free_t memfree;
+	int ret;
 
-	for (i = 1; i < rmesa->rmm->u_size + 1; i++)
-		if (rmesa->rmm->u_list[i].ptr &&
-		    ptr >= rmesa->rmm->u_list[i].ptr &&
-		    ptr <
-		    rmesa->rmm->u_list[i].ptr + rmesa->rmm->u_list[i].size)
-			break;
+	memfree.region = RADEON_MEM_REGION_GART;
+	memfree.region_offset = bo->base.offset;
+	memfree.region_offset -= bufmgr->screen->gart_texture_offset;
 
-	if (i < rmesa->rmm->u_size + 1)
-		return i;
+	ret = drmCommandWrite(bufmgr->screen->driScreen->fd,
+		DRM_RADEON_FREE, &memfree, sizeof(memfree));
+	if (ret) {
+		fprintf(stderr, "Failed to free bo[%p] at %08x\n", bo, memfree.region_offset);
+		fprintf(stderr, "ret = %s\n", strerror(-ret));
+		exit(1);
+	}
 
-	fprintf(stderr, "%p failed\n", ptr);
-	return 0;
+	free(bo);
 }
 
-//#define MM_DEBUG
-int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size)
+static const radeon_bo_functions dma_bo_functions = {
+	.free = &dma_free
+};
+
+/**
+ * Call the DRM to allocate GART memory for the given (incomplete)
+ * buffer object.
+ */
+static int try_dma_alloc(radeon_bufmgr_classic *bufmgr, radeon_bo_classic *bo,
+		unsigned long size, unsigned int alignment)
 {
 	drm_radeon_mem_alloc_t alloc;
-	int offset = 0, ret;
-	int i, free = -1;
-	int done_age;
-	drm_radeon_mem_free_t memfree;
-	int tries = 0;
-	static int bytes_wasted = 0, allocated = 0;
+	int baseoffset;
+	int ret;
 
-	if (size < 4096)
-		bytes_wasted += 4096 - size;
+	alloc.region = RADEON_MEM_REGION_GART;
+	alloc.alignment = alignment;
+	alloc.size = size;
+	alloc.region_offset = &baseoffset;
 
-	allocated += size;
+	ret = drmCommandWriteRead(bufmgr->screen->driScreen->fd,
+			DRM_RADEON_ALLOC, &alloc, sizeof(alloc));
+	if (ret) {
+		if (RADEON_DEBUG & DEBUG_MEMORY)
+			fprintf(stderr, "DRM_RADEON_ALLOC failed: %d\n", ret);
+		return 0;
+	}
+
+	bo->base.virtual = (char*)bufmgr->screen->gartTextures.map + baseoffset;
+	bo->base.offset = bufmgr->screen->gart_texture_offset + baseoffset;
 
-#if 0
-	static int t = 0;
-	if (t != time(NULL)) {
-		t = time(NULL);
-		fprintf(stderr, "slots used %d, wasted %d kb, allocated %d\n",
-			rmesa->rmm->u_last, bytes_wasted / 1024,
-			allocated / 1024);
+	return 1;
+}
+
+/**
+ * Allocate a DMA buffer.
+ */
+static dri_bo *dma_alloc(radeon_bufmgr_classic *bufmgr, const char *name,
+		unsigned long size, unsigned int alignment)
+{
+	radeon_bo_classic* bo = (radeon_bo_classic*)calloc(1, sizeof(radeon_bo_classic));
+
+	bo->functions = &dma_bo_functions;
+
+	track_pending_buffers(bufmgr);
+	if (!try_dma_alloc(bufmgr, bo, size, alignment)) {
+		if (RADEON_DEBUG & DEBUG_MEMORY)
+			fprintf(stderr, "Failed to allocate %ld bytes, finishing command buffer...\n", size);
+		radeonFinish(bufmgr->rmesa->radeon.glCtx);
+		track_pending_buffers(bufmgr);
+		if (!try_dma_alloc(bufmgr, bo, size, alignment)) {
+			WARN_ONCE(
+				"Ran out of GART memory (for %ld)!\n"
+				"Please consider adjusting GARTSize option.\n",
+				size);
+			free(bo);
+			return 0;
+		}
 	}
-#endif
 
-	memfree.region = RADEON_MEM_REGION_GART;
+	init_buffer(bufmgr, bo, size);
+	bo->validated = 1; /* DMA buffer offsets are always valid */
 
-      again:
+	return &bo->base;
+}
 
-	done_age = radeonGetAge((radeonContextPtr) rmesa);
+/**
+ * Free a command buffer
+ */
+static void cmdbuf_free(radeon_bo_classic *bo)
+{
+	free(bo->base.virtual);
+	free(bo);
+}
 
-	if (rmesa->rmm->u_last + 1 >= rmesa->rmm->u_size)
-		resize_u_list(rmesa);
+static const radeon_bo_functions cmdbuf_bo_functions = {
+	.free = cmdbuf_free
+};
 
-	for (i = rmesa->rmm->u_last + 1; i > 0; i--) {
-		if (rmesa->rmm->u_list[i].ptr == NULL) {
-			free = i;
-			continue;
-		}
+/**
+ * Allocate a command buffer.
+ *
+ * Command buffers are really just malloc'ed buffers. They are managed by
+ * the bufmgr to enable relocations.
+ */
+static dri_bo *cmdbuf_alloc(radeon_bufmgr_classic *bufmgr, const char *name,
+		unsigned long size)
+{
+	radeon_bo_classic* bo = (radeon_bo_classic*)calloc(1, sizeof(radeon_bo_classic));
 
-		if (rmesa->rmm->u_list[i].h_pending == 0 &&
-		    rmesa->rmm->u_list[i].pending
-		    && rmesa->rmm->u_list[i].age <= done_age) {
-			memfree.region_offset =
-			    (char *)rmesa->rmm->u_list[i].ptr -
-			    (char *)rmesa->radeon.radeonScreen->gartTextures.
-			    map;
+	bo->functions = &cmdbuf_bo_functions;
+	bo->base.virtual = malloc(size);
 
-			ret =
-			    drmCommandWrite(rmesa->radeon.radeonScreen->
-					    driScreen->fd, DRM_RADEON_FREE,
-					    &memfree, sizeof(memfree));
+	init_buffer(bufmgr, bo, size);
+	return &bo->base;
+}
 
-			if (ret) {
-				fprintf(stderr, "Failed to free at %p\n",
-					rmesa->rmm->u_list[i].ptr);
-				fprintf(stderr, "ret = %s\n", strerror(-ret));
-				exit(1);
-			} else {
-#ifdef MM_DEBUG
-				fprintf(stderr, "really freed %d at age %x\n",
-					i,
-					radeonGetAge((radeonContextPtr) rmesa));
-#endif
-				if (i == rmesa->rmm->u_last)
-					rmesa->rmm->u_last--;
-
-				if (rmesa->rmm->u_list[i].size < 4096)
-					bytes_wasted -=
-					    4096 - rmesa->rmm->u_list[i].size;
-
-				allocated -= rmesa->rmm->u_list[i].size;
-				rmesa->rmm->u_list[i].pending = 0;
-				rmesa->rmm->u_list[i].ptr = NULL;
-				free = i;
+/**
+ * Free a VRAM-based buffer object.
+ */
+static void vram_free(radeon_bo_classic *bo_base)
+{
+	radeon_bo_vram *bo = get_bo_vram(bo_base);
+
+	if (bo->vram) {
+		driDestroyTextureObject(&bo->vram->base);
+		bo->vram = 0;
+	}
+
+	free(bo->base.base.virtual);
+	free(bo);
+}
+
+/**
+ * Allocate/update the copy in vram.
+ *
+ * Note: Assume we're called with the DRI lock held.
+ */
+static int vram_validate(radeon_bo_classic *bo_base)
+{
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bo_base->base.bufmgr);
+	radeon_bo_vram *bo = get_bo_vram(bo_base);
+	int retry_count = 0, pending_retry = 0;
+
+	track_pending_buffers(bufmgr);
+	if (!bo->vram) {
+		bo->backing_store_dirty = 1;
+		bo->vram = (radeon_vram_wrapper*)calloc(1, sizeof(radeon_vram_wrapper));
+		bo->vram->bo = bo;
+		make_empty_list(&bo->vram->base);
+		bo->vram->base.totalSize = bo->base.base.size;
+retry:
+		if (driAllocateTexture(&bufmgr->texture_heap, 1, &bo->vram->base) < 0) {
+			pending_retry = 0;
+			while(bufmgr->pending && pending_retry++ < 10000)
+				track_pending_buffers(bufmgr);
+			retry_count++;
+			if (retry_count > 2) {
+				free(bo->vram);
+				bo->vram = NULL;
+				return -1;
 			}
+			goto retry;
 		}
 	}
-	rmesa->rmm->u_head = i;
-
-	if (free == -1) {
-		WARN_ONCE("Ran out of slots!\n");
-		//usleep(100);
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		tries++;
-		if (tries > 100) {
-			WARN_ONCE("Ran out of slots!\n");
-			exit(1);
+
+	assert(bo->vram->base.memBlock);
+
+	bo->base.base.offset = bufmgr->texture_offset + bo->vram->base.memBlock->ofs;
+
+	if (bo->backing_store_dirty) {
+		/* Copy to VRAM using a blit.
+		 * All memory is 4K aligned. We're using 1024 pixels wide blits.
+		 */
+		drm_radeon_texture_t tex;
+		drm_radeon_tex_image_t tmp;
+		int ret;
+
+		tex.offset = bo->base.base.offset;
+		tex.image = &tmp;
+
+		assert(!(tex.offset & 1023));
+
+		tmp.x = 0;
+		tmp.y = 0;
+		if (bo->base.base.size < 4096) {
+			tmp.width = (bo->base.base.size + 3) / 4;
+			tmp.height = 1;
+		} else {
+			tmp.width = 1024;
+			tmp.height = (bo->base.base.size + 4095) / 4096;
 		}
-		goto again;
+		tmp.data = bo->base.base.virtual;
+
+		tex.format = RADEON_TXFORMAT_ARGB8888;
+		tex.width = tmp.width;
+		tex.height = tmp.height;
+		tex.pitch = MAX2(tmp.width / 16, 1);
+
+		do {
+			ret = drmCommandWriteRead(bufmgr->screen->driScreen->fd,
+						DRM_RADEON_TEXTURE, &tex,
+						sizeof(drm_radeon_texture_t));
+			if (ret) {
+				if (RADEON_DEBUG & DEBUG_IOCTL)
+					fprintf(stderr,
+						"DRM_RADEON_TEXTURE:  again!\n");
+				usleep(1);
+			}
+		} while (ret == -EAGAIN);
+
+		bo->backing_store_dirty = 0;
 	}
 
-	alloc.region = RADEON_MEM_REGION_GART;
-	alloc.alignment = alignment;
-	alloc.size = size;
-	alloc.region_offset = &offset;
+	bo->base.validated = 1;
+	return 0;
+}
 
-	ret =
-	    drmCommandWriteRead(rmesa->radeon.dri.fd, DRM_RADEON_ALLOC, &alloc,
-				sizeof(alloc));
-	if (ret) {
-#if 0
-		WARN_ONCE("Ran out of mem!\n");
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		//usleep(100);
-		tries2++;
-		tries = 0;
-		if (tries2 > 100) {
-			WARN_ONCE("Ran out of GART memory!\n");
-			exit(1);
-		}
-		goto again;
-#else
-		WARN_ONCE
-		    ("Ran out of GART memory (for %d)!\nPlease consider adjusting GARTSize option.\n",
-		     size);
-		return 0;
-#endif
+/* No need for actual mmap actions since we have backing store,
+ * but mark buffer dirty when necessary */
+static void vram_map(radeon_bo_classic *bo_base, GLboolean write)
+{
+	radeon_bo_vram *bo = get_bo_vram(bo_base);
+
+	if (write) {
+		bo->base.validated = 0;
+		bo->backing_store_dirty = 1;
 	}
+}
+
+static void vram_bind(radeon_bo_classic *bo_base)
+{
+	radeon_bo_vram *bo = get_bo_vram(bo_base);
 
-	i = free;
+	if (bo->vram) {
+		bo->vram->base.bound = 1;
+		driUpdateTextureLRU(&bo->vram->base);
+	}
+}
 
-	if (i > rmesa->rmm->u_last)
-		rmesa->rmm->u_last = i;
+static void vram_unbind(radeon_bo_classic *bo_base)
+{
+	radeon_bo_vram *bo = get_bo_vram(bo_base);
 
-	rmesa->rmm->u_list[i].ptr =
-	    ((GLubyte *) rmesa->radeon.radeonScreen->gartTextures.map) + offset;
-	rmesa->rmm->u_list[i].size = size;
-	rmesa->rmm->u_list[i].age = 0;
-	//fprintf(stderr, "alloc %p at id %d\n", rmesa->rmm->u_list[i].ptr, i);
+	if (bo->vram)
+		bo->vram->base.bound = 0;
+}
 
-#ifdef MM_DEBUG
-	fprintf(stderr, "allocated %d at age %x\n", i,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
+/** Callback function called by the texture heap when a texture is evicted */
+static void destroy_vram_wrapper(void *data, driTextureObject *t)
+{
+	radeon_vram_wrapper *wrapper = (radeon_vram_wrapper*)t;
 
-	return i;
+	if (wrapper->bo && wrapper->bo->vram == wrapper) {
+		wrapper->bo->base.validated = 0;
+		wrapper->bo->vram = 0;
+	}
 }
 
-void r300_mem_use(r300ContextPtr rmesa, int id)
+static const radeon_bo_functions vram_bo_functions = {
+	.free = vram_free,
+	.validate = vram_validate,
+	.map = vram_map,
+	.bind = vram_bind,
+	.unbind = vram_unbind
+};
+
+/**
+ * Free a VRAM-based buffer object.
+ */
+static void static_free(radeon_bo_classic *bo_base)
 {
-	uint64_t ull;
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	drm_r300_cmd_header_t *cmd;
+	radeon_bo_vram *bo = get_bo_vram(bo_base);
 
-	assert(id <= rmesa->rmm->u_last);
+	free(bo);
+}
 
-	if (id == 0)
-		return;
+static void static_map(radeon_bo_classic *bo_base, GLboolean write)
+{
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bo_base->base.bufmgr);
+
+	/* don't map static for kernel mm  - we have hardcoded maps */
+	if (!bufmgr->screen->kernel_mm)
+		bo_base->base.virtual = bufmgr->screen->driScreen->pFB +
+			(bo_base->base.offset - bufmgr->screen->fbLocation);
+
+	/* Read the first pixel in the frame buffer.  This should
+	 * be a noop, right?  In fact without this conform fails as reading
+	 * from the framebuffer sometimes produces old results -- the
+	 * on-card read cache gets mixed up and doesn't notice that the
+	 * framebuffer has been updated.
+	 *
+	 * Note that we should probably be reading some otherwise unused
+	 * region of VRAM, otherwise we might get incorrect results when
+	 * reading pixels from the top left of the screen.
+	 *
+	 * I found this problem on an R420 with glean's texCube test.
+	 * Note that the R200 span code also *writes* the first pixel in the
+	 * framebuffer, but I've found this to be unnecessary.
+	 *  -- Nicolai Hähnle, June 2008
+	 */
+	{
+		int p;
+		volatile int *buf = (int*)bufmgr->screen->driScreen->pFB;
+		p = *buf;
+	}
+}
 
-	cmd =
-	    (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa,
-						      2 + sizeof(ull) / 4,
-						      __FUNCTION__);
-	cmd[0].scratch.cmd_type = R300_CMD_SCRATCH;
-	cmd[0].scratch.reg = R300_MEM_SCRATCH;
-	cmd[0].scratch.n_bufs = 1;
-	cmd[0].scratch.flags = 0;
-	cmd++;
+static void static_unmap(radeon_bo_classic *bo_base)
+{
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bo_base->base.bufmgr);
+	/* don't unmap for kernel mm we have hardcoded maps */
+	if (!bufmgr->screen->kernel_mm)
+		bo_base->base.virtual = 0;
+}
 
-	ull = (uint64_t) (intptr_t) & rmesa->rmm->u_list[id].age;
-	_mesa_memcpy(cmd, &ull, sizeof(ull));
-	cmd += sizeof(ull) / 4;
+static const radeon_bo_functions static_bo_functions = {
+	.free = static_free,
+	.map = static_map,
+	.unmap = static_unmap
+};
 
-	cmd[0].u = /*id */ 0;
+/**
+ * Allocate a backing store buffer object that is validated into VRAM.
+ */
+static dri_bo *vram_alloc(radeon_bufmgr_classic *bufmgr, const char *name,
+		unsigned long size, unsigned int alignment)
+{
+	radeon_bo_vram* bo = (radeon_bo_vram*)calloc(1, sizeof(radeon_bo_vram));
+	uint32_t pgsize = getpagesize() - 1;
+
+	size = (size + pgsize) & ~pgsize;
+	/* round size up to page size */
+	bo->base.functions = &vram_bo_functions;
+	bo->base.base.virtual = malloc(size);
+	init_buffer(bufmgr, &bo->base, size);
+	return &bo->base.base;
+}
 
-	LOCK_HARDWARE(&rmesa->radeon);	/* Protect from DRM. */
-	rmesa->rmm->u_list[id].h_pending++;
-	UNLOCK_HARDWARE(&rmesa->radeon);
+dri_bo *radeon_bufmgr_classic_bo_alloc(dri_bufmgr *bufmgr_ctx, const char *name,
+				       unsigned long size, unsigned int alignment,
+				       uint64_t location_mask)
+{
+	radeon_bufmgr_classic* bufmgr = get_bufmgr_classic(bufmgr_ctx);
+
+	if (location_mask & DRM_BO_MEM_CMDBUF) {
+		return cmdbuf_alloc(bufmgr, name, size);
+	} else if (location_mask & DRM_BO_MEM_DMA) {
+		return dma_alloc(bufmgr, name, size, alignment);
+	} else {
+		return vram_alloc(bufmgr, name, size, alignment);
+	}
 }
 
-unsigned long r300_mem_offset(r300ContextPtr rmesa, int id)
+static dri_bo *bufmgr_classic_bo_alloc_static(dri_bufmgr *bufmgr_ctx, const char *name,
+					      unsigned long offset, unsigned long size,
+					      void *virtual, uint64_t location_mask)
 {
-	unsigned long offset;
+  	radeon_bufmgr_classic* bufmgr = get_bufmgr_classic(bufmgr_ctx);
+	radeon_bo_vram* bo = (radeon_bo_vram*)calloc(1, sizeof(radeon_bo_vram));
 
-	assert(id <= rmesa->rmm->u_last);
+	bo->base.functions = &static_bo_functions;
+	bo->base.base.virtual = virtual;
+	bo->base.base.offset = offset + bufmgr->screen->fbLocation;
+	bo->base.validated = 1; /* Static buffer offsets are always valid */
 
-	offset = (char *)rmesa->rmm->u_list[id].ptr -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	offset += rmesa->radeon.radeonScreen->gart_texture_offset;
+	init_buffer(bufmgr, &bo->base, size);
+	return &bo->base.base;
 
-	return offset;
 }
 
-void *r300_mem_map(r300ContextPtr rmesa, int id, int access)
+static void bufmgr_classic_bo_reference(dri_bo *bo_base)
 {
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	void *ptr;
-	int tries = 0;
+	radeon_bo_classic *bo = get_bo_classic(bo_base);
+	bo->refcount++;
+	assert(bo->refcount > 0);
+}
+
+static void bufmgr_classic_bo_unreference(dri_bo *bo_base)
+{
+	radeon_bo_classic *bo = get_bo_classic(bo_base);
+
+	if (!bo_base)
+		return;
+
+	assert(bo->refcount > 0);
+	bo->refcount--;
+	if (!bo->refcount) {
+		// Ugly HACK - figure out whether this is really necessary
+		get_bufmgr_classic(bo_base->bufmgr)->rmesa->dma.nr_released_bufs++;
+
+		assert(!bo->mapcount);
+		if (!bo->pending)
+			bo_free(bo);
+	}
+}
+
+static int bufmgr_classic_bo_map(dri_bo *bo_base, int write_enable)
+{
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bo_base->bufmgr);
+	radeon_bo_classic *bo = get_bo_classic(bo_base);
+	assert(bo->refcount > 0);
+
+	if (bo->pending) {
+		track_pending_buffers(bufmgr);
+		if (bo->pending) {
+			// TODO: Better fence waiting
+			if (RADEON_DEBUG & DEBUG_MEMORY)
+				fprintf(stderr, "bo_map: buffer is pending. Flushing...\n");
+			radeonFinish(bufmgr->rmesa->radeon.glCtx);
+			track_pending_buffers(bufmgr);
+			if (bo->pending) {
+				fprintf(stderr, "Internal error or hardware lockup: bo_map: buffer is still pending.\n");
+				abort();
+			}
+		}
+	}
 
-	assert(id <= rmesa->rmm->u_last);
+	if (!bo->mapcount && bo->functions->map)
+		bo->functions->map(bo, write_enable);
 
-	if (access == R300_MEM_R) {
+	bo->mapcount++;
+	assert(bo->mapcount > 0);
+	return 0;
+}
 
-		if (rmesa->rmm->u_list[id].mapped == 1)
-			WARN_ONCE("buffer %d already mapped\n", id);
+static int bufmgr_classic_bo_unmap(dri_bo *buf)
+{
+	radeon_bo_classic *bo = get_bo_classic(buf);
+	assert(bo->refcount > 0);
+	assert(bo->mapcount > 0);
+	bo->mapcount--;
 
-		rmesa->rmm->u_list[id].mapped = 1;
-		ptr = r300_mem_ptr(rmesa, id);
+	if (!bo->mapcount && bo->functions->unmap)
+		bo->functions->unmap(bo);
 
-		return ptr;
+	return 0;
+}
+
+/**
+ * Mark the given buffer as pending and move it to the tail
+ * of the pending list.
+ * The caller is responsible for setting up pending_count and pending_age.
+ */
+static void move_to_pending_tail(radeon_bo_classic *bo)
+{
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bo->base.bufmgr);
+
+	if (bo->pending) {
+		*bo->pending_pprev = bo->pending_next;
+		if (bo->pending_next)
+			bo->pending_next->pending_pprev = bo->pending_pprev;
+		else
+			bufmgr->pending_tail = bo->pending_pprev;
 	}
 
-	if (rmesa->rmm->u_list[id].h_pending)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
+	bo->pending = 1;
+	bo->pending_pprev = bufmgr->pending_tail;
+	bo->pending_next = 0;
+	*bufmgr->pending_tail = bo;
+	bufmgr->pending_tail = &bo->pending_next;
+}
+
+/**
+ * Emit commands to the batch buffer that cause the guven buffer's
+ * pending_count and pending_age to be updated.
+ */
+static void emit_age_for_buffer(radeon_bo_classic* bo)
+{
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bo->base.bufmgr);
+	BATCH_LOCALS(bufmgr->rmesa);
+	drm_r300_cmd_header_t cmd;
+	uint64_t ull;
+
+	cmd.scratch.cmd_type = R300_CMD_SCRATCH;
+	cmd.scratch.reg = 2; /* Scratch register 2 corresponds to what radeonGetAge polls */
+	cmd.scratch.n_bufs = 1;
+	cmd.scratch.flags = 0;
+	ull = (uint64_t) (intptr_t) &bo->pending_age;
+
+	BEGIN_BATCH(4);
+	OUT_BATCH(cmd.u);
+	OUT_BATCH(ull & 0xffffffff);
+	OUT_BATCH(ull >> 32);
+	OUT_BATCH(0);
+	END_BATCH();
+	COMMIT_BATCH();
+
+	bo->pending_count++;
+}
 
-	if (rmesa->rmm->u_list[id].h_pending) {
-		return NULL;
+static int bufmgr_classic_emit_reloc(dri_bo *batch_buf, uint64_t flags, GLuint delta,
+				     GLuint offset, dri_bo *target)
+{
+	radeon_bo_classic *bo = get_bo_classic(batch_buf);
+	radeon_reloc *reloc;
+	
+	if (bo->relocs_used >= bo->relocs_size) {
+		bo->relocs_size *= 2;
+		if (bo->relocs_size < 32)
+			bo->relocs_size = 32;
+
+		bo->relocs = (radeon_reloc*)realloc(bo->relocs, bo->relocs_size*sizeof(radeon_reloc));
 	}
 
-	while (rmesa->rmm->u_list[id].age >
-	       radeonGetAge((radeonContextPtr) rmesa) && tries++ < 1000)
-		usleep(10);
+	reloc = &bo->relocs[bo->relocs_used++];
+	reloc->flags = flags;
+	reloc->offset = offset;
+	reloc->delta = delta;
+	reloc->target = get_bo_classic(target);
+	dri_bo_reference(target);
+	return 0;
+}
 
-	if (tries >= 1000) {
-		fprintf(stderr, "Idling failed (%x vs %x)\n",
-			rmesa->rmm->u_list[id].age,
-			radeonGetAge((radeonContextPtr) rmesa));
-		return NULL;
+static void bufmgr_kick_all_buffers(radeon_bufmgr_classic *bufmgr)
+{
+	radeon_bo_classic *bo;
+
+	bo = bufmgr->buffers;
+	while(bo) {
+		if (bo->functions == &vram_bo_functions) {
+			radeon_bo_vram *bo_vram = get_bo_vram(bo);
+			if (bo->validated) {
+				driDestroyTextureObject(&bo_vram->vram->base);
+				bo_vram->vram = 0;
+				bo->validated = 0;
+			}
+		}
+		bo = bo->next;
 	}
+}
 
-	if (rmesa->rmm->u_list[id].mapped == 1)
-		WARN_ONCE("buffer %d already mapped\n", id);
+/* process_relocs is called just before the given command buffer
+ * is executed. It ensures that all referenced buffers are in
+ * the right GPU domain.
+ */
+static void *bufmgr_classic_process_relocs(dri_bo *batch_buf)
+{
+	radeon_bo_classic *batch_bo = get_bo_classic(batch_buf);
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(batch_bo->base.bufmgr);
+	int i;
+	int ret = 0;
+	int retries = 0;
+
+	// Warning: At this point, we append something to the batch buffer
+	// during flush.
+	emit_age_for_buffer(batch_bo);
+
+	dri_bo_map(batch_buf, GL_TRUE);
+
+restart:
+	for(i = 0; i < batch_bo->relocs_used; ++i) {
+		radeon_reloc *reloc = &batch_bo->relocs[i];
+		uint32_t *dest = (uint32_t*)((char*)batch_buf->virtual + reloc->offset);
+		uint32_t offset;
+		
+		ret = 0;
+		if (!reloc->target->validated)
+		    ret = reloc->target->functions->validate(reloc->target);
+		
+		if (ret == -1) {
+			track_pending_buffers(bufmgr);
+			/* seriously not afraid of the police */
+		  	bufmgr_kick_all_buffers(bufmgr);
+		  	retries++;
+		  	if (retries == 2) {
+				fprintf(stderr,"r300: Failed to get relocations into aperture\n");
+				exit(-1);
+			}
+			goto restart;
+		}
 
-	rmesa->rmm->u_list[id].mapped = 1;
-	ptr = r300_mem_ptr(rmesa, id);
+		reloc->target->used = 1;
+		offset = reloc->target->base.offset + reloc->delta;
 
-	return ptr;
+		if (reloc->flags & DRM_RELOC_BLITTER)
+			*dest = (*dest & 0xffc00000) | (offset >> 10);
+		else if (reloc->flags & DRM_RELOC_TXOFFSET)
+			*dest = (*dest & 31) | (offset & ~31);
+		else
+			*dest = offset;
+	}
+	dri_bo_unmap(batch_buf);
+	return 0;
 }
 
-void r300_mem_unmap(r300ContextPtr rmesa, int id)
+/* post_submit is called just after the given command buffer
+ * is executed. It ensures that buffers are properly marked as
+ * pending.
+ */
+static void bufmgr_classic_post_submit(dri_bo *batch_buf)
 {
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
+	radeon_bo_classic *batch_bo = get_bo_classic(batch_buf);
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(batch_bo->base.bufmgr);
+	int i;
 
-	assert(id <= rmesa->rmm->u_last);
+	assert(!batch_bo->pending_count);
 
-	if (rmesa->rmm->u_list[id].mapped == 0)
-		WARN_ONCE("buffer %d not mapped\n", id);
+	for(i = 0; i < batch_bo->relocs_used; ++i) {
+		radeon_reloc *reloc = &batch_bo->relocs[i];
 
-	rmesa->rmm->u_list[id].mapped = 0;
+		if (reloc->target->used) {
+			reloc->target->used = 0;
+			assert(!reloc->target->pending_count);
+			reloc->target->pending_age = batch_bo->pending_age;
+			move_to_pending_tail(reloc->target);
+			if (reloc->target->functions->bind)
+				(*reloc->target->functions->bind)(reloc->target);
+		}
+		if (reloc->target->space_accounted)
+			reloc->target->space_accounted = 0;
+	}
+	bufmgr->total_vram_used = 0;
 }
 
-void r300_mem_free(r300ContextPtr rmesa, int id)
+static void bufmgr_classic_destroy(dri_bufmgr *bufmgr_ctx)
 {
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
+	radeon_bufmgr_classic* bufmgr = get_bufmgr_classic(bufmgr_ctx);
+
+	track_pending_buffers(bufmgr);
+	if (bufmgr->pending)
+		radeonFinish(bufmgr->rmesa->radeon.glCtx);
+	track_pending_buffers(bufmgr);
+
+	if (bufmgr->buffers) {
+		//fprintf(stderr, "Warning: Buffer objects have leaked\n");
+		while(bufmgr->buffers) {
+		//	fprintf(stderr, "  Leak of size %ld\n", bufmgr->buffers->base.size);
+			bufmgr->buffers->refcount = 0;
+			bufmgr->buffers->mapcount = 0;
+			bufmgr->buffers->pending = 0;
+			bo_free(bufmgr->buffers);
+		}
+	}
 
-	assert(id <= rmesa->rmm->u_last);
+	driDestroyTextureHeap(bufmgr->texture_heap);
+	bufmgr->texture_heap = 0;
+	assert(is_empty_list(&bufmgr->texture_swapped));
 
-	if (id == 0)
-		return;
+	free(bufmgr);
+}
 
-	if (rmesa->rmm->u_list[id].ptr == NULL) {
-		WARN_ONCE("Not allocated!\n");
-		return;
+static int bufmgr_check_aperture_space(dri_bo *buf)
+{
+	radeon_bo_classic *bo = get_bo_classic(buf);	
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bo->base.bufmgr);
+  	if (bo->space_accounted == 0) {
+		bo->space_accounted = 1;
+		if (bo->functions == &vram_bo_functions) {
+			bufmgr->total_vram_used += bo->base.size;
+		}
 	}
 
-	if (rmesa->rmm->u_list[id].pending) {
-		WARN_ONCE("%p already pended!\n", rmesa->rmm->u_list[id].ptr);
-		return;
+	if (bufmgr->total_vram_used >= bufmgr->texture_heap->size) {
+		bufmgr->total_vram_used -= bo->base.size;
+		bo->space_accounted = 0;
+		return -1;
 	}
 
-	rmesa->rmm->u_list[id].pending = 1;
+	return 0;
+}
+
+dri_bufmgr* radeonBufmgrClassicInit(r300ContextPtr rmesa)
+{
+	radeon_bufmgr_classic* bufmgr = (radeon_bufmgr_classic*)calloc(1, sizeof(radeon_bufmgr_classic));
+
+	bufmgr->screen = rmesa->radeon.radeonScreen;
+	bufmgr->rmesa = rmesa;
+	bufmgr->base.bo_alloc = &radeon_bufmgr_classic_bo_alloc;
+	bufmgr->base.bo_alloc_static = bufmgr_classic_bo_alloc_static;
+	bufmgr->base.bo_reference = &bufmgr_classic_bo_reference;
+	bufmgr->base.bo_unreference = &bufmgr_classic_bo_unreference;
+	bufmgr->base.bo_map = &bufmgr_classic_bo_map;
+	bufmgr->base.bo_unmap = &bufmgr_classic_bo_unmap;
+	bufmgr->base.emit_reloc = &bufmgr_classic_emit_reloc;
+	bufmgr->base.process_relocs = &bufmgr_classic_process_relocs;
+	bufmgr->base.post_submit = &bufmgr_classic_post_submit;
+	bufmgr->base.destroy = &bufmgr_classic_destroy;
+	bufmgr->base.check_aperture_space = &bufmgr_check_aperture_space;
+	bufmgr->pending_tail = &bufmgr->pending;
+
+	/* Init texture heap */
+	make_empty_list(&bufmgr->texture_swapped);
+	bufmgr->texture_heap = driCreateTextureHeap(0, bufmgr,
+			bufmgr->screen->texSize[0], 12, RADEON_NR_TEX_REGIONS,
+			(drmTextureRegionPtr)rmesa->radeon.sarea->tex_list[0],
+			&rmesa->radeon.sarea->tex_age[0],
+			&bufmgr->texture_swapped, sizeof(radeon_vram_wrapper),
+			&destroy_vram_wrapper);
+	bufmgr->texture_offset = bufmgr->screen->texOffset[0];
+
+	return &bufmgr->base;
+}
+
+void radeonBufmgrContendedLockTake(dri_bufmgr* bufmgr_ctx)
+{
+	radeon_bufmgr_classic *bufmgr = get_bufmgr_classic(bufmgr_ctx);
+
+	DRI_AGE_TEXTURES(bufmgr->texture_heap);
 }
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_mem.h b/src/mesa/drivers/dri/r300/r300_mem.h
index 625a7f6..a6788a8 100644
--- a/src/mesa/drivers/dri/r300/r300_mem.h
+++ b/src/mesa/drivers/dri/r300/r300_mem.h
@@ -1,37 +1,22 @@
 #ifndef __R300_MEM_H__
 #define __R300_MEM_H__
 
-//#define R300_MEM_PDL 0
-#define R300_MEM_UL 1
-
-#define R300_MEM_R 1
-#define R300_MEM_W 2
-#define R300_MEM_RW (R300_MEM_R | R300_MEM_W)
-
-#define R300_MEM_SCRATCH 2
-
-struct r300_memory_manager {
-	struct {
-		void *ptr;
-		uint32_t size;
-		uint32_t age;
-		uint32_t h_pending;
-		int pending;
-		int mapped;
-	} *u_list;
-	int u_head, u_size, u_last;
-
-};
-
-extern void r300_mem_init(r300ContextPtr rmesa);
-extern void r300_mem_destroy(r300ContextPtr rmesa);
-extern void *r300_mem_ptr(r300ContextPtr rmesa, int id);
-extern int r300_mem_find(r300ContextPtr rmesa, void *ptr);
-extern int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size);
-extern void r300_mem_use(r300ContextPtr rmesa, int id);
-extern unsigned long r300_mem_offset(r300ContextPtr rmesa, int id);
-extern void *r300_mem_map(r300ContextPtr rmesa, int id, int access);
-extern void r300_mem_unmap(r300ContextPtr rmesa, int id);
-extern void r300_mem_free(r300ContextPtr rmesa, int id);
+#include "main/glheader.h"
+
+#include "r300_context.h"
+
+
+/* Note: The following flags should probably be ultimately eliminated,
+ * or replaced by something else.
+ */
+#define DRM_BO_MEM_DMA (1 << 27) /** Use for transient buffers (texture upload, vertex buffers...) */
+#define DRM_BO_MEM_CMDBUF (1 << 28) /** Use for command buffers */
+
+#define DRM_RELOC_BLITTER (1 << 23) /** Offset overwrites lower 22 bits (used with blit packet3) */
+#define DRM_RELOC_TXOFFSET (1 << 24) /** Offset overwrites everything but low bits (used for texture offsets) */
+
+dri_bufmgr* radeonBufmgrClassicInit(r300ContextPtr rmesa);
+void radeonBufmgrContendedLockTake(dri_bufmgr* bufmgr_ctx);
+
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_mipmap_tree.c b/src/mesa/drivers/dri/r300/r300_mipmap_tree.c
new file mode 100644
index 0000000..49fc161
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_mipmap_tree.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "r300_mipmap_tree.h"
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "main/simple_list.h"
+#include "main/texcompress.h"
+#include "main/texformat.h"
+
+#include "radeon_buffer.h"
+#include "r300_mem.h"
+
+static GLuint r300_compressed_texture_size(GLcontext *ctx,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLuint mesaFormat)
+{
+	GLuint size = _mesa_compressed_texture_size(ctx, width, height, depth, mesaFormat);
+
+	if (mesaFormat == MESA_FORMAT_RGB_DXT1 ||
+	    mesaFormat == MESA_FORMAT_RGBA_DXT1) {
+		if (width + 3 < 8)	/* width one block */
+			size = size * 4;
+		else if (width + 3 < 16)
+			size = size * 2;
+	} else {
+		/* DXT3/5, 16 bytes per block */
+		WARN_ONCE("DXT 3/5 suffers from multitexturing problems!\n");
+		if (width + 3 < 8)
+			size = size * 2;
+	}
+
+	return size;
+}
+
+/**
+ * Compute sizes and fill in offset and blit information for the given
+ * image (determined by \p face and \p level).
+ *
+ * \param curOffset points to the offset at which the image is to be stored
+ * and is updated by this function according to the size of the image.
+ */
+static void compute_tex_image_offset(r300_mipmap_tree *mt,
+	GLuint face, GLuint level, GLuint* curOffset)
+{
+	r300_mipmap_level *lvl = &mt->levels[level];
+
+	/* Find image size in bytes */
+	if (mt->compressed) {
+		/* TODO: Is this correct? Need test cases for compressed textures! */
+		GLuint align;
+
+		if (mt->target == GL_TEXTURE_RECTANGLE_NV)
+			align = 64 / mt->bpp;
+		else
+			align = 32 / mt->bpp;
+		lvl->rowstride = (lvl->width + align - 1) & ~(align - 1);
+		lvl->size = r300_compressed_texture_size(mt->r300->radeon.glCtx,
+			lvl->width, lvl->height, lvl->depth, mt->compressed);
+	} else if (mt->target == GL_TEXTURE_RECTANGLE_NV) {
+		lvl->rowstride = (lvl->width * mt->bpp + 63) & ~63;
+		lvl->size = lvl->rowstride * lvl->height;
+	} else if (mt->tilebits & R300_TXO_MICRO_TILE) {
+		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+		 * though the actual offset may be different (if texture is less than
+		 * 32 bytes width) to the untiled case */
+		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
+		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
+	} else {
+		lvl->rowstride = (lvl->width * mt->bpp + 31) & ~31;
+		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
+	}
+	assert(lvl->size > 0);
+
+	/* All images are aligned to a 32-byte offset */
+	*curOffset = (*curOffset + 0x1f) & ~0x1f;
+	lvl->faces[face].offset = *curOffset;
+	*curOffset += lvl->size;
+}
+
+static GLuint minify(GLuint size, GLuint levels)
+{
+	size = size >> levels;
+	if (size < 1)
+		size = 1;
+	return size;
+}
+
+static void calculate_miptree_layout(r300_mipmap_tree *mt)
+{
+	GLuint curOffset;
+	GLuint numLevels;
+	GLuint i;
+
+	numLevels = mt->lastLevel - mt->firstLevel + 1;
+	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
+
+	curOffset = 0;
+	for(i = 0; i < numLevels; i++) {
+		GLuint face;
+
+		mt->levels[i].width = minify(mt->width0, i);
+		mt->levels[i].height = minify(mt->height0, i);
+		mt->levels[i].depth = minify(mt->depth0, i);
+
+		for(face = 0; face < mt->faces; face++)
+			compute_tex_image_offset(mt, face, i, &curOffset);
+	}
+
+	/* Note the required size in memory */
+	mt->totalsize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+}
+
+
+/**
+ * Create a new mipmap tree, calculate its layout and allocate memory.
+ */
+r300_mipmap_tree* r300_miptree_create(r300ContextPtr rmesa, r300TexObj *t,
+		GLenum target, GLuint firstLevel, GLuint lastLevel,
+		GLuint width0, GLuint height0, GLuint depth0,
+		GLuint bpp, GLuint tilebits, GLuint compressed)
+{
+	r300_mipmap_tree *mt = CALLOC_STRUCT(_r300_mipmap_tree);
+
+	mt->r300 = rmesa;
+	mt->refcount = 1;
+	mt->t = t;
+	mt->target = target;
+	mt->faces = (target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+	mt->firstLevel = firstLevel;
+	mt->lastLevel = lastLevel;
+	mt->width0 = width0;
+	mt->height0 = height0;
+	mt->depth0 = depth0;
+	mt->bpp = bpp;
+	mt->tilebits = tilebits;
+	mt->compressed = compressed;
+
+	calculate_miptree_layout(mt);
+
+	mt->bo = dri_bo_alloc(rmesa->radeon.bufmgr, "texture", mt->totalsize, 1024, 0);
+
+	return mt;
+}
+
+void r300_miptree_reference(r300_mipmap_tree *mt)
+{
+	mt->refcount++;
+	assert(mt->refcount > 0);
+}
+
+void r300_miptree_unreference(r300_mipmap_tree *mt)
+{
+	if (!mt)
+		return;
+
+	assert(mt->refcount > 0);
+	mt->refcount--;
+	if (!mt->refcount) {
+		dri_bo_unreference(mt->bo);
+		free(mt);
+	}
+}
+
+
+static void calculate_first_last_level(struct gl_texture_object *tObj,
+				       GLuint *pfirstLevel, GLuint *plastLevel)
+{
+	const struct gl_texture_image * const baseImage =
+		tObj->Image[0][tObj->BaseLevel];
+
+	/* These must be signed values.  MinLod and MaxLod can be negative numbers,
+	* and having firstLevel and lastLevel as signed prevents the need for
+	* extra sign checks.
+	*/
+	int   firstLevel;
+	int   lastLevel;
+
+	/* Yes, this looks overly complicated, but it's all needed.
+	*/
+	switch (tObj->Target) {
+	case GL_TEXTURE_1D:
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_3D:
+	case GL_TEXTURE_CUBE_MAP:
+		if (tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR) {
+			/* GL_NEAREST and GL_LINEAR only care about GL_TEXTURE_BASE_LEVEL.
+			*/
+			firstLevel = lastLevel = tObj->BaseLevel;
+		} else {
+			firstLevel = tObj->BaseLevel + (GLint)(tObj->MinLod + 0.5);
+			firstLevel = MAX2(firstLevel, tObj->BaseLevel);
+			firstLevel = MIN2(firstLevel, tObj->BaseLevel + baseImage->MaxLog2);
+			lastLevel = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
+			lastLevel = MAX2(lastLevel, tObj->BaseLevel);
+			lastLevel = MIN2(lastLevel, tObj->BaseLevel + baseImage->MaxLog2);
+			lastLevel = MIN2(lastLevel, tObj->MaxLevel);
+			lastLevel = MAX2(firstLevel, lastLevel); /* need at least one level */
+		}
+		break;
+	case GL_TEXTURE_RECTANGLE_NV:
+	case GL_TEXTURE_4D_SGIS:
+		firstLevel = lastLevel = 0;
+		break;
+	default:
+		return;
+	}
+
+	/* save these values */
+	*pfirstLevel = firstLevel;
+	*plastLevel = lastLevel;
+}
+
+
+/**
+ * Checks whether the given miptree can hold the given texture image at the
+ * given face and level.
+ */
+GLboolean r300_miptree_matches_image(r300_mipmap_tree *mt,
+		struct gl_texture_image *texImage, GLuint face, GLuint level)
+{
+	r300_mipmap_level *lvl;
+
+	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
+		return GL_FALSE;
+
+	if (texImage->TexFormat->TexelBytes != mt->bpp)
+		return GL_FALSE;
+
+	lvl = &mt->levels[level - mt->firstLevel];
+	if (lvl->width != texImage->Width ||
+	    lvl->height != texImage->Height ||
+	    lvl->depth != texImage->Depth)
+		return GL_FALSE;
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Checks whether the given miptree has the right format to store the given texture object.
+ */
+GLboolean r300_miptree_matches_texture(r300_mipmap_tree *mt, struct gl_texture_object *texObj)
+{
+	struct gl_texture_image *firstImage;
+	GLuint compressed;
+	GLuint numfaces = 1;
+	GLuint firstLevel, lastLevel;
+
+	calculate_first_last_level(texObj, &firstLevel, &lastLevel);
+	if (texObj->Target == GL_TEXTURE_CUBE_MAP)
+		numfaces = 6;
+
+	firstImage = texObj->Image[0][firstLevel];
+	compressed = firstImage->IsCompressed ? firstImage->TexFormat->MesaFormat : 0;
+
+	return (mt->firstLevel == firstLevel &&
+	        mt->lastLevel == lastLevel &&
+	        mt->width0 == firstImage->Width &&
+	        mt->height0 == firstImage->Height &&
+	        mt->depth0 == firstImage->Depth &&
+	        mt->bpp == firstImage->TexFormat->TexelBytes &&
+	        mt->compressed == compressed);
+}
+
+
+/**
+ * Try to allocate a mipmap tree for the given texture that will fit the
+ * given image in the given position.
+ */
+void r300_try_alloc_miptree(r300ContextPtr rmesa, r300TexObj *t,
+		struct gl_texture_image *texImage, GLuint face, GLuint level)
+{
+	GLuint compressed = texImage->IsCompressed ? texImage->TexFormat->MesaFormat : 0;
+	GLuint numfaces = 1;
+	GLuint firstLevel, lastLevel;
+
+	assert(!t->mt);
+
+	calculate_first_last_level(&t->base, &firstLevel, &lastLevel);
+	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
+		numfaces = 6;
+
+	if (level != firstLevel || face >= numfaces)
+		return;
+
+	t->mt = r300_miptree_create(rmesa, t, t->base.Target,
+		firstLevel, lastLevel,
+		texImage->Width, texImage->Height, texImage->Depth,
+		texImage->TexFormat->TexelBytes, t->tile_bits, compressed);
+}
diff --git a/src/mesa/drivers/dri/r300/r300_mipmap_tree.h b/src/mesa/drivers/dri/r300/r300_mipmap_tree.h
new file mode 100644
index 0000000..7705a4d
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_mipmap_tree.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_MIPMAP_TREE_H_
+#define __R300_MIPMAP_TREE_H_
+
+#include "r300_context.h"
+
+typedef struct _r300_mipmap_tree r300_mipmap_tree;
+typedef struct _r300_mipmap_level r300_mipmap_level;
+typedef struct _r300_mipmap_image r300_mipmap_image;
+
+struct _r300_mipmap_image {
+	GLuint offset; /** Offset of this image from the start of mipmap tree buffer, in bytes */
+};
+
+struct _r300_mipmap_level {
+	GLuint width;
+	GLuint height;
+	GLuint depth;
+	GLuint size; /** Size of each image, in bytes */
+	GLuint rowstride; /** in bytes */
+	r300_mipmap_image faces[6];
+};
+
+
+/**
+ * A mipmap tree contains texture images in the layout that the hardware
+ * expects.
+ *
+ * The meta-data of mipmap trees is immutable, i.e. you cannot change the
+ * layout on-the-fly; however, the texture contents (i.e. texels) can be
+ * changed.
+ */
+struct _r300_mipmap_tree {
+	r300ContextPtr r300;
+	r300TexObj *t;
+	dri_bo *bo;
+	GLuint refcount;
+
+	GLuint totalsize; /** total size of the miptree, in bytes */
+
+	GLenum target; /** GL_TEXTURE_xxx */
+	GLuint faces; /** # of faces: 6 for cubemaps, 1 otherwise */
+	GLuint firstLevel; /** First mip level stored in this mipmap tree */
+	GLuint lastLevel; /** Last mip level stored in this mipmap tree */
+
+	GLuint width0; /** Width of firstLevel image */
+	GLuint height0; /** Height of firstLevel image */
+	GLuint depth0; /** Depth of firstLevel image */
+
+	GLuint bpp; /** Bytes per texel */
+	GLuint tilebits; /** R300_TXO_xxx_TILE */
+	GLuint compressed; /** MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
+
+	r300_mipmap_level levels[RADEON_MAX_TEXTURE_LEVELS];
+};
+
+r300_mipmap_tree* r300_miptree_create(r300ContextPtr rmesa, r300TexObj *t,
+		GLenum target, GLuint firstLevel, GLuint lastLevel,
+		GLuint width0, GLuint height0, GLuint depth0,
+		GLuint bpp, GLuint tilebits, GLuint compressed);
+void r300_miptree_reference(r300_mipmap_tree *mt);
+void r300_miptree_unreference(r300_mipmap_tree *mt);
+
+GLboolean r300_miptree_matches_image(r300_mipmap_tree *mt,
+		struct gl_texture_image *texImage, GLuint face, GLuint level);
+GLboolean r300_miptree_matches_texture(r300_mipmap_tree *mt, struct gl_texture_object *texObj);
+void r300_try_alloc_miptree(r300ContextPtr rmesa, r300TexObj *t,
+		struct gl_texture_image *texImage, GLuint face, GLuint level);
+
+
+#endif /* __R300_MIPMAP_TREE_H_ */
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index 292f87a..4cf11cf 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -175,89 +175,79 @@ int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
 static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_dma_region *rvb = &rmesa->state.elt_dma;
 	void *out;
 
-	if (r300IsGartMemory(rmesa, elts, n_elts * 4)) {
-		rvb->address = rmesa->radeon.radeonScreen->gartTextures.map;
-		rvb->start = ((char *)elts) - rvb->address;
-		rvb->aos_offset =
-		    rmesa->radeon.radeonScreen->gart_texture_offset +
-		    rvb->start;
-		return;
-	} else if (r300IsGartMemory(rmesa, elts, 1)) {
-		WARN_ONCE("Pointer not within GART memory!\n");
-		_mesa_exit(-1);
-	}
-
-	r300AllocDmaRegion(rmesa, rvb, n_elts * 4, 4);
-	rvb->aos_offset = GET_START(rvb);
+	r300AllocDmaRegion(rmesa, &rmesa->state.elt_dma_bo, &rmesa->state.elt_dma_offset,
+			   n_elts * 4, 4);
 
-	out = rvb->address + rvb->start;
+	out = rmesa->state.elt_dma_bo->virtual + rmesa->state.elt_dma_offset;
 	memcpy(out, elts, n_elts * 4);
 }
 
-static void r300FireEB(r300ContextPtr rmesa, unsigned long addr,
-		       int vertex_count, int type)
+static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(rmesa);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (vertex_count << 16) | type | R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+	BEGIN_BATCH(8);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (vertex_count << 16) | type | R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_INDX_BUFFER, 2), 2);
-	e32(R300_EB_UNK1 | (0 << 16) | R300_EB_UNK2);
-	e32(addr);
-	e32(vertex_count);
+	OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+	OUT_BATCH(R300_EB_UNK1 | (0 << 16) | R300_EB_UNK2);
+	OUT_BATCH_RELOC(0, rmesa->state.elt_dma_bo, rmesa->state.elt_dma_offset, 0);
+	OUT_BATCH(vertex_count);
+	END_BATCH();
 }
 
 static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 {
+	BATCH_LOCALS(rmesa);
 	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
 
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
 			offset);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1), sz - 1);
-	e32(nr);
+	BEGIN_BATCH(sz+2);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+	OUT_BATCH(nr);
 
 	for (i = 0; i + 1 < nr; i += 2) {
-		e32((rmesa->state.aos[i].aos_size << 0) |
-		    (rmesa->state.aos[i].aos_stride << 8) |
-		    (rmesa->state.aos[i + 1].aos_size << 16) |
-		    (rmesa->state.aos[i + 1].aos_stride << 24));
-
-		e32(rmesa->state.aos[i].aos_offset + offset * 4 * rmesa->state.aos[i].aos_stride);
-		e32(rmesa->state.aos[i + 1].aos_offset + offset * 4 * rmesa->state.aos[i + 1].aos_stride);
+		OUT_BATCH((rmesa->state.aos[i].components << 0) |
+			  (rmesa->state.aos[i].stride << 8) |
+			  (rmesa->state.aos[i + 1].components << 16) |
+			  (rmesa->state.aos[i + 1].stride << 24));
+
+		OUT_BATCH_RELOC(0, rmesa->state.aos[i].bo,
+			rmesa->state.aos[i].offset + offset * 4 * rmesa->state.aos[i].stride, 0);
+		OUT_BATCH_RELOC(0, rmesa->state.aos[i+1].bo,
+			rmesa->state.aos[i+1].offset + offset * 4 * rmesa->state.aos[i + 1].stride, 0);
 	}
 
 	if (nr & 1) {
-		e32((rmesa->state.aos[nr - 1].aos_size << 0) |
-		    (rmesa->state.aos[nr - 1].aos_stride << 8));
-		e32(rmesa->state.aos[nr - 1].aos_offset + offset * 4 * rmesa->state.aos[nr - 1].aos_stride);
+		OUT_BATCH((rmesa->state.aos[nr - 1].components << 0) |
+			  (rmesa->state.aos[nr - 1].stride << 8));
+		OUT_BATCH_RELOC(0, rmesa->state.aos[nr - 1].bo,
+			rmesa->state.aos[nr - 1].offset + offset * 4 * rmesa->state.aos[nr - 1].stride, 0);
 	}
+	END_BATCH();
 }
 
 static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(rmesa);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+	END_BATCH();
 }
 
 static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 				   int start, int end, int prim)
 {
+	BATCH_LOCALS(rmesa);
 	int type, num_verts;
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct vertex_buffer *vb = &tnl->vb;
@@ -268,6 +258,12 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 	if (type < 0 || num_verts <= 0)
 		return;
 
+	/* Make space for at least 64 dwords.
+	 * This is supposed to ensure that we can get all rendering
+	 * commands into a single command buffer.
+	 */
+	r300EnsureCmdBufSpace(rmesa, 64, __FUNCTION__);
+
 	if (vb->Elts) {
 		if (num_verts > 65535) {
 			/* not implemented yet */
@@ -287,11 +283,12 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 		 */
 		r300EmitElts(ctx, vb->Elts, num_verts);
 		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
-		r300FireEB(rmesa, rmesa->state.elt_dma.aos_offset, num_verts, type);
+		r300FireEB(rmesa, num_verts, type);
 	} else {
 		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
 		r300FireAOS(rmesa, num_verts, type);
 	}
+	COMMIT_BATCH();
 }
 
 static GLboolean r300RunRender(GLcontext * ctx,
@@ -310,6 +307,10 @@ static GLboolean r300RunRender(GLcontext * ctx,
 	if (r300EmitArrays(ctx))
 		return GL_TRUE;
 
+
+	if (r300ValidateTextures(ctx))
+		return GL_TRUE;
+
 	r300UpdateShaderStates(rmesa);
 
 	r300EmitCacheFlush(rmesa);
@@ -324,10 +325,6 @@ static GLboolean r300RunRender(GLcontext * ctx,
 
 	r300EmitCacheFlush(rmesa);
 
-#ifdef USER_BUFFERS
-	r300UseArrays(ctx);
-#endif
-
 	r300ReleaseArrays(ctx);
 
 	return GL_FALSE;
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index 6a5c363..ed399b2 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -55,6 +55,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
+#include "radeon_buffer.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
@@ -1146,39 +1147,25 @@ void r300UpdateDrawBuffer(GLcontext * ctx)
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	r300ContextPtr r300 = rmesa;
 	struct gl_framebuffer *fb = ctx->DrawBuffer;
-	driRenderbuffer *drb;
+	struct radeon_renderbuffer *rrb;
 
 	if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
 		/* draw to front */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].
-		    Renderbuffer;
+		rrb =
+		    (void *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
 	} else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
 		/* draw to back */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].
-		    Renderbuffer;
+		rrb = (void *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
 	} else {
 		/* drawing to multiple buffers, or none */
 		return;
 	}
 
-	assert(drb);
-	assert(drb->flippedPitch);
+	assert(rrb);
+	assert(rrb->pitch);
 
 	R300_STATECHANGE(rmesa, cb);
 
-	r300->hw.cb.cmd[R300_CB_OFFSET] = drb->flippedOffset +	//r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = drb->flippedPitch;	//r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
 #if 0
 	R200_STATECHANGE(rmesa, ctx);
 
@@ -1497,14 +1484,9 @@ static void r300SetupTextures(GLcontext * ctx)
 	/* We cannot let disabled tmu offsets pass DRM */
 	for (i = 0; i < mtu; i++) {
 		if (ctx->Texture.Unit[i]._ReallyEnabled) {
-
-#if 0				/* Enables old behaviour */
-			hw_tmu = i;
-#endif
 			tmu_mappings[i] = hw_tmu;
 
-			t = r300->state.texture.unit[i].texobj;
-			/* XXX questionable fix for bug 9170: */
+			t = r300_tex_obj(ctx->Texture.Unit[i]._Current);
 			if (!t)
 				continue;
 
@@ -1530,21 +1512,20 @@ static void r300SetupTextures(GLcontext * ctx)
 			 */
 			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
 				t->filter_1 |
-				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
+				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.LodBias);
 			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
 			    t->size;
 			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
 						hw_tmu] = t->format;
 			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
 			    t->pitch_reg;
-			r300->hw.tex.offset.cmd[R300_TEX_VALUE_0 +
-						hw_tmu] = t->offset;
+			r300->hw.textures[hw_tmu] = t;
 
-			if (t->offset & R300_TXO_MACRO_TILE) {
+			if (t->tile_bits & R300_TXO_MACRO_TILE) {
 				WARN_ONCE("macro tiling enabled!\n");
 			}
 
-			if (t->offset & R300_TXO_MICRO_TILE) {
+			if (t->tile_bits & R300_TXO_MICRO_TILE) {
 				WARN_ONCE("micro tiling enabled!\n");
 			}
 
@@ -2223,8 +2204,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300UpdateCulling(ctx);
 
-	r300UpdateTextureState(ctx);
-
 	r300SetBlendState(ctx);
 	r300SetLogicOpState(ctx);
 
@@ -2371,20 +2350,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300BlendColor(ctx, ctx->Color.BlendColor);
 
-	/* Again, r300ClearBuffer uses this */
-	r300->hw.cb.cmd[R300_CB_OFFSET] =
-	    r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-
 	r300->hw.rb3d_dither_ctl.cmd[1] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[2] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[3] = 0;
@@ -2400,10 +2365,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
 	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
 
-	r300->hw.zb.cmd[R300_ZB_OFFSET] =
-	    r300->radeon.radeonScreen->depthOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.zb.cmd[R300_ZB_PITCH] = r300->radeon.radeonScreen->depthPitch;
 
 	if (r300->radeon.sarea->tiling_enabled) {
 		/* XXX: Turn off when clearing buffers ? */
@@ -2675,7 +2636,6 @@ void r300UpdateShaderStates(r300ContextPtr rmesa)
 	GLcontext *ctx;
 	ctx = rmesa->radeon.glCtx;
 
-	r300UpdateTextureState(ctx);
 	r300SetEarlyZState(ctx);
 
 	GLuint fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
diff --git a/src/mesa/drivers/dri/r300/r300_state.h b/src/mesa/drivers/dri/r300/r300_state.h
index 0589ab7..96177ba 100644
--- a/src/mesa/drivers/dri/r300/r300_state.h
+++ b/src/mesa/drivers/dri/r300/r300_state.h
@@ -59,7 +59,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R300_FIREVERTICES( r300 )			\
 do {							\
     \
-   if ( (r300)->cmdbuf.count_used || (r300)->dma.flush ) {	\
+   if ( (r300)->cmdbuf.committed || (r300)->dma.flush ) {	\
       r300Flush( (r300)->radeon.glCtx );		\
    }							\
     \
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
index b6e7ce1..c4e88e2 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
@@ -57,11 +57,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_ioctl.h"
 #include "r300_emit.h"
 #include "r300_mem.h"
+#include "r300_tex.h"
 
 static void flush_last_swtcl_prim( r300ContextPtr rmesa  );
 
 
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset);
+void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, dri_bo *bo, GLuint offset);
 void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr);
 #define EMIT_ATTR( ATTR, STYLE )					\
 do {									\
@@ -175,7 +176,7 @@ static void r300SetVertexFormat( GLcontext *ctx )
 			inputs[i] = -1;
 		}
 	}
-	
+
 	/* Fixed, apply to vir0 only */
 	if (InputsRead & (1 << VERT_ATTRIB_POS))
 		inputs[VERT_ATTRIB_POS] = 0;
@@ -186,16 +187,16 @@ static void r300SetVertexFormat( GLcontext *ctx )
 	for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
 		if (InputsRead & (1 << i))
 			inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-	
+
 	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
 		if (InputsRead & (1 << i)) {
 			tab[nr++] = i;
 		}
 	}
-	
+
 	for (i = 0; i < nr; i++) {
 		int ci;
-		
+
 		swizzle[i][0] = SWIZZLE_ZERO;
 		swizzle[i][1] = SWIZZLE_ZERO;
 		swizzle[i][2] = SWIZZLE_ZERO;
@@ -215,21 +216,21 @@ static void r300SetVertexFormat( GLcontext *ctx )
 	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
 		r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
 				   nr);
-   
+
 	R300_STATECHANGE(rmesa, vic);
 	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
 	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-   
+
 	R300_STATECHANGE(rmesa, vof);
 	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
 	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_fmt_1;
-   
+
 	rmesa->swtcl.vertex_size =
 		_tnl_install_attrs( ctx,
-				    rmesa->swtcl.vertex_attrs, 
+				    rmesa->swtcl.vertex_attrs,
 				    rmesa->swtcl.vertex_attr_count,
 				    NULL, 0 );
-	
+
 	rmesa->swtcl.vertex_size /= 4;
 
 	RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
@@ -245,38 +246,40 @@ static void r300SetVertexFormat( GLcontext *ctx )
  */
 static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
 {
+	BATCH_LOCALS(rmesa);
+
 	if (RADEON_DEBUG & DEBUG_IOCTL)
 		fprintf(stderr, "%s\n", __FUNCTION__);
-	
+
 	rmesa->dma.flush = NULL;
 
-	if (rmesa->dma.current.buf) {
-		struct r300_dma_region *current = &rmesa->dma.current;
-		GLuint current_offset = GET_START(current);
+	if (rmesa->dma.current) {
+		GLuint current_offset = rmesa->dma.current_used;
 
-		assert (current->start + 
+		assert (rmesa->dma.current_used +
 			rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-			current->ptr);
+			rmesa->dma.current_vertexptr);
 
-		if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
+		if (rmesa->dma.current_used != rmesa->dma.current_vertexptr) {
+			rmesa->dma.current_used = rmesa->dma.current_vertexptr;
 
 			r300EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size + (12*sizeof(int)), __FUNCTION__);
-			
+
 			r300EmitState(rmesa);
-			
+
 			r300EmitVertexAOS( rmesa,
 					   rmesa->swtcl.vertex_size,
-					   current_offset);
-			
+					   rmesa->dma.current, current_offset);
+
 			r300EmitVbufPrim( rmesa,
 					  rmesa->swtcl.hw_primitive,
 					  rmesa->swtcl.numverts);
-			
+
 			r300EmitCacheFlush(rmesa);
+			COMMIT_BATCH();
 		}
-		
+
 		rmesa->swtcl.numverts = 0;
-		current->start = current->ptr;
 	}
 }
 
@@ -287,7 +290,7 @@ r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
 {
 	GLuint bytes = vsize * nverts;
 
-	if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
+	if (!rmesa->dma.current || rmesa->dma.current_vertexptr + bytes > rmesa->dma.current->size)
 		r300RefillCurrentDmaRegion( rmesa, bytes);
 
 	if (!rmesa->dma.flush) {
@@ -297,13 +300,13 @@ r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
 
 	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
 	ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
-	ASSERT( rmesa->dma.current.start + 
+	ASSERT( rmesa->dma.current_used +
 		rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-		rmesa->dma.current.ptr );
+		rmesa->dma.current_vertexptr );
 
 	{
-		GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
-		rmesa->dma.current.ptr += bytes;
+		GLubyte *head = (GLubyte *) (rmesa->dma.current->virtual + rmesa->dma.current_vertexptr);
+		rmesa->dma.current_vertexptr += bytes;
 		rmesa->swtcl.numverts += nverts;
 		return head;
 	}
@@ -352,7 +355,7 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
    const char *r300verts = (char *)rmesa->swtcl.verts;
 #define VERT(x) (r300Vertex *)(r300verts + ((x) * vertsize * sizeof(int)))
-#define VERTEX r300Vertex 
+#define VERTEX r300Vertex
 #define DO_DEBUG_VERTS (1 && (RADEON_DEBUG & DEBUG_VERTS))
 #define PRINT_VERTEX(x)
 #undef TAG
@@ -572,15 +575,17 @@ static void r300RenderStart(GLcontext *ctx)
         r300ContextPtr rmesa = R300_CONTEXT( ctx );
 	//	fprintf(stderr, "%s\n", __FUNCTION__);
 
-	r300ChooseRenderState(ctx);	
+	r300ChooseRenderState(ctx);
 	r300SetVertexFormat(ctx);
 
 	r300UpdateShaders(rmesa);
+	
+	r300ValidateTextures(ctx);
 	r300UpdateShaderStates(rmesa);
 
 	r300EmitCacheFlush(rmesa);
-	
-	if (rmesa->dma.flush != 0 && 
+
+	if (rmesa->dma.flush != 0 &&
 	    rmesa->dma.flush != flush_last_swtcl_prim)
 		rmesa->dma.flush( rmesa );
 
@@ -593,7 +598,7 @@ static void r300RenderFinish(GLcontext *ctx)
 static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	
+
 	if (rmesa->swtcl.hw_primitive != hwprim) {
 	        R300_NEWPRIM( rmesa );
 		rmesa->swtcl.hw_primitive = hwprim;
@@ -611,7 +616,7 @@ static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 
 	r300RasterPrimitive( ctx, reduced_prim[prim] );
 	//	fprintf(stderr, "%s\n", __FUNCTION__);
-	
+
 }
 
 static void r300ResetLineStipple(GLcontext *ctx)
@@ -625,12 +630,12 @@ void r300InitSwtcl(GLcontext *ctx)
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	static int firsttime = 1;
-	
+
 	if (firsttime) {
 		init_rast_tab();
 		firsttime = 0;
 	}
-	
+
 	tnl->Driver.Render.Start = r300RenderStart;
 	tnl->Driver.Render.Finish = r300RenderFinish;
 	tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
@@ -638,15 +643,15 @@ void r300InitSwtcl(GLcontext *ctx)
 	tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
 	tnl->Driver.Render.CopyPV = _tnl_copy_pv;
 	tnl->Driver.Render.Interp = _tnl_interp;
-	
+
 	/* FIXME: what are these numbers? */
-	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
 			    48 * sizeof(GLfloat) );
-	
+
 	rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
 	rmesa->swtcl.RenderIndex = ~0;
 	rmesa->swtcl.render_primitive = GL_TRIANGLES;
-	rmesa->swtcl.hw_primitive = 0;	
+	rmesa->swtcl.hw_primitive = 0;
 
 	_tnl_invalidate_vertex_state( ctx, ~0 );
 	_tnl_invalidate_vertices( ctx, ~0 );
@@ -655,9 +660,9 @@ void r300InitSwtcl(GLcontext *ctx)
 	_tnl_need_projected_coords( ctx, GL_FALSE );
 	r300ChooseRenderState(ctx);
 
-	_mesa_validate_all_lighting_tables( ctx ); 
+	_mesa_validate_all_lighting_tables( ctx );
 
-	tnl->Driver.NotifyMaterialChange = 
+	tnl->Driver.NotifyMaterialChange =
 	  _mesa_validate_all_lighting_tables;
 }
 
@@ -665,33 +670,32 @@ void r300DestroySwtcl(GLcontext *ctx)
 {
 }
 
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset)
+void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, dri_bo *bo, GLuint offset)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
+	BATCH_LOCALS(rmesa);
 
-	drm_radeon_cmd_header_t *cmd = NULL;
 	if (RADEON_DEBUG & DEBUG_VERTS)
-	  fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
-		  __FUNCTION__, vertex_size, offset);
-
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2), 2);
-	e32(1);
-	e32(vertex_size | (vertex_size << 8));
-	e32(offset);
+		fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
+			__FUNCTION__, vertex_size, offset);
+
+	BEGIN_BATCH(5);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2);
+	OUT_BATCH(1);
+	OUT_BATCH(vertex_size | (vertex_size << 8));
+	OUT_BATCH_RELOC(0, bo, offset, 0);
+	END_BATCH();
 }
 
 void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
 {
-
-	int cmd_reserved = 0;
-	int cmd_written = 0;
+	BATCH_LOCALS(rmesa);
 	int type, num_verts;
-	drm_radeon_cmd_header_t *cmd = NULL;
 
 	type = r300PrimitiveType(rmesa, primitive);
 	num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
-	
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+	END_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 8ab382c..bdb971a 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/image.h"
+#include "main/mipmap.h"
 #include "main/simple_list.h"
 #include "main/texformat.h"
 #include "main/texstore.h"
@@ -49,6 +50,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
+#include "r300_mipmap_tree.h"
 #include "r300_tex.h"
 
 #include "xmlpool.h"
@@ -79,7 +81,7 @@ static unsigned int translate_wrap_mode(GLenum wrapmode)
  */
 static void r300UpdateTexWrap(r300TexObjPtr t)
 {
-	struct gl_texture_object *tObj = t->base.tObj;
+	struct gl_texture_object *tObj = &t->base;
 
 	t->filter &=
 	    ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_R_MASK);
@@ -119,6 +121,9 @@ static GLuint aniso_filter(GLfloat anisotropy)
  */
 static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
 {
+	/* Force revalidation to account for switches from/to mipmapping. */
+	t->validated = GL_FALSE;
+
 	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
 	t->filter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
 
@@ -176,39 +181,6 @@ static void r300SetTexBorderColor(r300TexObjPtr t, GLubyte c[4])
 	t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
 }
 
-/**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
-{
-	r300TexObjPtr t;
-
-	t = CALLOC_STRUCT(r300_tex_obj);
-	texObj->DriverData = t;
-	if (t != NULL) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE) {
-			fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-				(void *)texObj, (void *)t);
-		}
-
-		/* Initialize non-image-dependent parts of the state:
-		 */
-		t->base.tObj = texObj;
-		t->border_fallback = GL_FALSE;
-
-		make_empty_list(&t->base);
-
-		r300UpdateTexWrap(t);
-		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
-		r300SetTexBorderColor(t, texObj->_BorderChan);
-	}
-
-	return t;
-}
-
 /* try to find a format which will only need a memcopy */
 static const struct gl_texture_format *r300Choose8888TexFormat(GLenum srcFormat,
 							       GLenum srcType)
@@ -434,277 +406,204 @@ static const struct gl_texture_format *r300ChooseTextureFormat(GLcontext * ctx,
 	return NULL;		/* never get here */
 }
 
-static GLboolean
-r300ValidateClientStorage(GLcontext * ctx, GLenum target,
-			  GLint internalFormat,
-			  GLint srcWidth, GLint srcHeight,
-			  GLenum format, GLenum type, const void *pixels,
-			  const struct gl_pixelstore_attrib *packing,
-			  struct gl_texture_object *texObj,
-			  struct gl_texture_image *texImage)
+
+/**
+ * Allocate an empty texture image object.
+ */
+static struct gl_texture_image *r300NewTextureImage(GLcontext *ctx)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	return CALLOC(sizeof(r300_texture_image));
+}
 
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "intformat %s format %s type %s\n",
-			_mesa_lookup_enum_by_nr(internalFormat),
-			_mesa_lookup_enum_by_nr(format),
-			_mesa_lookup_enum_by_nr(type));
+/**
+ * Free memory associated with this texture image.
+ */
+static void r300FreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage)
+{
+	r300_texture_image* image = get_r300_texture_image(timage);
 
-	if (!ctx->Unpack.ClientStorage)
-		return 0;
+	if (image->mt) {
+		r300_miptree_unreference(image->mt);
+		image->mt = 0;
+		assert(!image->base.Data);
+	} else {
+		_mesa_free_texture_image_data(ctx, timage);
+	}
+}
 
-	if (ctx->_ImageTransferState ||
-	    texImage->IsCompressed || texObj->GenerateMipmap)
-		return 0;
 
-	/* This list is incomplete, may be different on ppc???
-	 */
-	switch (internalFormat) {
-	case GL_RGBA:
-		if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV) {
-			texImage->TexFormat = _dri_texformat_argb8888;
-		} else
-			return 0;
-		break;
+/* Set Data pointer and additional data for mapped texture image */
+static void teximage_set_map_data(r300_texture_image *image)
+{
+	r300_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+	image->base.Data = image->mt->bo->virtual + lvl->faces[image->mtface].offset;
+	image->base.RowStride = lvl->rowstride / image->mt->bpp;
+}
 
-	case GL_RGB:
-		if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
-			texImage->TexFormat = _dri_texformat_rgb565;
-		} else
-			return 0;
-		break;
 
-	case GL_YCBCR_MESA:
-		if (format == GL_YCBCR_MESA &&
-		    type == GL_UNSIGNED_SHORT_8_8_REV_APPLE) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
-		} else if (format == GL_YCBCR_MESA &&
-			   (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-			    type == GL_UNSIGNED_BYTE)) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr;
-		} else
-			return 0;
-		break;
+/**
+ * Map a single texture image for glTexImage and friends.
+ */
+static void r300_teximage_map(r300_texture_image *image, GLboolean write_enable)
+{
+	if (image->mt) {
+		assert(!image->base.Data);
 
-	default:
-		return 0;
+		dri_bo_map(image->mt->bo, write_enable);
+		teximage_set_map_data(image);
 	}
+}
 
-	/* Could deal with these packing issues, but currently don't:
-	 */
-	if (packing->SkipPixels ||
-	    packing->SkipRows || packing->SwapBytes || packing->LsbFirst) {
-		return 0;
-	}
 
-	GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
-						    format, type);
+static void r300_teximage_unmap(r300_texture_image *image)
+{
+	if (image->mt) {
+		assert(image->base.Data);
 
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: srcRowStride %d/%x\n",
-			__FUNCTION__, srcRowStride, srcRowStride);
+		image->base.Data = 0;
+		dri_bo_unmap(image->mt->bo);
+	}
+}
 
-	/* Could check this later in upload, pitch restrictions could be
-	 * relaxed, but would need to store the image pitch somewhere,
-	 * as packing details might change before image is uploaded:
-	 */
-	if (!r300IsGartMemory(rmesa, pixels, srcHeight * srcRowStride)
-	    || (srcRowStride & 63))
-		return 0;
+/**
+ * Map a validated texture for reading during software rendering.
+ */
+static void r300MapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
+{
+	r300TexObj* t = r300_tex_obj(texObj);
+	int face, level;
 
-	/* Have validated that _mesa_transfer_teximage would be a straight
-	 * memcpy at this point.  NOTE: future calls to TexSubImage will
-	 * overwrite the client data.  This is explicitly mentioned in the
-	 * extension spec.
-	 */
-	texImage->Data = (void *)pixels;
-	texImage->IsClientData = GL_TRUE;
-	texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
+	assert(texObj->_Complete);
+	assert(t->mt);
 
-	return 1;
+	dri_bo_map(t->mt->bo, GL_FALSE);
+	for(face = 0; face < t->mt->faces; ++face) {
+		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+			teximage_set_map_data(get_r300_texture_image(texObj->Image[face][level]));
+	}
 }
 
-static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
+static void r300UnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
 {
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	r300TexObj* t = r300_tex_obj(texObj);
+	int face, level;
 
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-			return;
-		}
-	}
+	assert(texObj->_Complete);
+	assert(t->mt);
 
-	/* Note, this will call ChooseTextureFormat */
-	_mesa_store_teximage1d(ctx, target, level, internalFormat,
-			       width, border, format, type, pixels,
-			       &ctx->Unpack, texObj, texImage);
-
-	t->dirty_images[0] |= (1 << level);
+	for(face = 0; face < t->mt->faces; ++face) {
+		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+			texObj->Image[face][level]->Data = 0;
+	}
+	dri_bo_unmap(t->mt->bo);
 }
 
-static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset,
-			      GLsizei width,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
+/**
+ * All glTexImage calls go through this function.
+ */
+static void r300_teximage(
+	GLcontext *ctx, int dims,
+	GLint face, GLint level,
+	GLint internalFormat,
+	GLint width, GLint height, GLint depth,
+	GLsizei imageSize,
+	GLenum format, GLenum type, const GLvoid * pixels,
+	const struct gl_pixelstore_attrib *packing,
+	struct gl_texture_object *texObj,
+	struct gl_texture_image *texImage,
+	int compressed)
 {
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	r300TexObj* t = r300_tex_obj(texObj);
+	r300_texture_image* image = get_r300_texture_image(texImage);
 
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-			return;
-		}
-	}
+	R300_FIREVERTICES(rmesa);
 
-	_mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-				  format, type, pixels, packing, texObj,
-				  texImage);
+	t->validated = GL_FALSE;
 
-	t->dirty_images[0] |= (1 << level);
-}
+	/* Choose and fill in the texture format for this image */
+	texImage->TexFormat = r300ChooseTextureFormat(ctx, internalFormat, format, type);
+	_mesa_set_fetch_functions(texImage, dims);
 
-static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint height, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
+	if (texImage->TexFormat->TexelBytes == 0) {
+		texImage->IsCompressed = GL_TRUE;
+		texImage->CompressedSize =
+			ctx->Driver.CompressedTextureSize(ctx, texImage->Width,
+					   texImage->Height, texImage->Depth,
+					   texImage->TexFormat->MesaFormat);
+	} else {
+		texImage->IsCompressed = GL_FALSE;
+		texImage->CompressedSize = 0;
 	}
 
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
+	/* Allocate memory for image */
+	r300FreeTexImageData(ctx, texImage); /* Mesa core only clears texImage->Data but not image->mt */
+
+	if (!t->mt)
+		r300_try_alloc_miptree(rmesa, t, texImage, face, level);
+	if (t->mt && r300_miptree_matches_image(t->mt, texImage, face, level)) {
+		image->mt = t->mt;
+		image->mtlevel = level - t->mt->firstLevel;
+		image->mtface = face;
+		r300_miptree_reference(t->mt);
 	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-			return;
+		int size;
+		if (texImage->IsCompressed) {
+			size = texImage->CompressedSize;
+		} else {
+			size = texImage->Width * texImage->Height * texImage->Depth * texImage->TexFormat->TexelBytes;
 		}
+		texImage->Data = _mesa_alloc_texmemory(size);
 	}
 
-	texImage->IsClientData = GL_FALSE;
-
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
+	/* Upload texture image; note that the spec allows pixels to be NULL */
+	if (compressed) {
+		pixels = _mesa_validate_pbo_compressed_teximage(
+			ctx, imageSize, pixels, packing, "glCompressedTexImage");
 	} else {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage2d(ctx, target, level, internalFormat,
-				       width, height, border, format, type,
-				       pixels, &ctx->Unpack, texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
+		pixels = _mesa_validate_pbo_teximage(
+			ctx, dims, width, height, depth,
+			format, type, pixels, packing, "glTexImage");
 	}
-}
 
-static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset, GLint yoffset,
-			      GLsizei width, GLsizei height,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
+	if (pixels) {
+		r300_teximage_map(image, GL_TRUE);
 
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-			return;
+		if (compressed) {
+			memcpy(texImage->Data, pixels, imageSize);
+		} else {
+			GLuint dstRowStride;
+			if (image->mt) {
+				r300_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+				dstRowStride = lvl->rowstride;
+			} else {
+				dstRowStride = texImage->Width * texImage->TexFormat->TexelBytes;
+			}
+			if (!texImage->TexFormat->StoreImage(ctx, dims,
+						texImage->_BaseFormat,
+						texImage->TexFormat,
+						texImage->Data, 0, 0, 0, /* dstX/Y/Zoffset */
+						dstRowStride,
+						texImage->ImageOffsets,
+						width, height, depth,
+						format, type, pixels, packing))
+				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
 		}
+
+		r300_teximage_unmap(image);
 	}
 
-	_mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-				  height, format, type, pixels, packing, texObj,
-				  texImage);
+	_mesa_unmap_teximage_pbo(ctx, packing);
 
-	t->dirty_images[face] |= (1 << level);
+	/* SGIS_generate_mipmap */
+	if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+		ctx->Driver.GenerateMipmap(ctx, texObj->Target, texObj);
+	}
 }
 
-static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
-				     GLint level, GLint internalFormat,
-				     GLint width, GLint height, GLint border,
-				     GLsizei imageSize, const GLvoid * data,
-				     struct gl_texture_object *texObj,
-				     struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
 
-	/* which cube face or ordinary 2D image */
+static GLuint face_for_target(GLenum target)
+{
 	switch (target) {
 	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
 	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
@@ -712,103 +611,50 @@ static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
 	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
 	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
 	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
+		return (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
 	default:
-		face = 0;
-	}
-
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexImage2D");
-			return;
-		}
+		return 0;
 	}
+}
 
-	texImage->IsClientData = GL_FALSE;
 
-	/* can't call this, different parameters. Would never evaluate to true anyway currently */
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_compressed_teximage2d(ctx, target, level,
-						  internalFormat, width, height,
-						  border, imageSize, data,
-						  texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
-	}
+static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+{
+	r300_teximage(ctx, 1, 0, level, internalFormat, width, 1, 1,
+		0, format, type, pixels, packing, texObj, texImage, 0);
 }
 
-static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
-					GLint level, GLint xoffset,
-					GLint yoffset, GLsizei width,
-					GLsizei height, GLenum format,
-					GLsizei imageSize, const GLvoid * data,
-					struct gl_texture_object *texObj,
-					struct gl_texture_image *texImage)
+static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint height, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
 {
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
+	GLuint face = face_for_target(target);
 
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexSubImage3D");
-			return;
-		}
-	}
+	r300_teximage(ctx, 2, face, level, internalFormat, width, height, 1,
+		0, format, type, pixels, packing, texObj, texImage, 0);
+}
 
-	_mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset,
-					     yoffset, width, height, format,
-					     imageSize, data, texObj, texImage);
+static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
+				     GLint level, GLint internalFormat,
+				     GLint width, GLint height, GLint border,
+				     GLsizei imageSize, const GLvoid * data,
+				     struct gl_texture_object *texObj,
+				     struct gl_texture_image *texImage)
+{
+	GLuint face = face_for_target(target);
 
-	t->dirty_images[face] |= (1 << level);
+	r300_teximage(ctx, 2, face, level, internalFormat, width, height, 1,
+		imageSize, 0, 0, data, 0, texObj, texImage, 1);
 }
 
 static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
@@ -820,51 +666,100 @@ static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
 			   struct gl_texture_object *texObj,
 			   struct gl_texture_image *texImage)
 {
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	r300_teximage(ctx, 3, 0, level, internalFormat, width, height, depth,
+		0, format, type, pixels, packing, texObj, texImage, 0);
+}
 
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
-			return;
+/**
+ * Update a subregion of the given texture image.
+ */
+static void r300_texsubimage(GLcontext* ctx, int dims, int level,
+		GLint xoffset, GLint yoffset, GLint zoffset,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLenum format, GLenum type,
+		const GLvoid * pixels,
+		const struct gl_pixelstore_attrib *packing,
+		struct gl_texture_object *texObj,
+		struct gl_texture_image *texImage,
+		int compressed)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	r300_texture_image* image = get_r300_texture_image(texImage);
+
+	R300_FIREVERTICES(rmesa);
+
+	pixels = _mesa_validate_pbo_teximage(ctx, dims,
+		width, height, depth, format, type, pixels, packing, "glTexSubImage1D");
+
+	if (pixels) {
+		GLint dstRowStride;
+		r300_teximage_map(image, GL_TRUE);
+
+		if (image->mt) {
+			r300_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+			dstRowStride = lvl->rowstride;
+		} else {
+			dstRowStride = texImage->Width * texImage->TexFormat->TexelBytes;
 		}
-	}
 
-	texImage->IsClientData = GL_FALSE;
+		if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
+				texImage->TexFormat, texImage->Data,
+				xoffset, yoffset, zoffset,
+				dstRowStride,
+				texImage->ImageOffsets,
+				width, height, depth,
+				format, type, pixels, packing))
+			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
 
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage3d(ctx, target, level, internalFormat,
-				       width, height, depth, border,
-				       format, type, pixels,
-				       &ctx->Unpack, texObj, texImage);
+		r300_teximage_unmap(image);
+	}
 
-		t->dirty_images[0] |= (1 << level);
+	_mesa_unmap_teximage_pbo(ctx, packing);
+
+	/* GL_SGIS_generate_mipmap */
+	if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+		ctx->Driver.GenerateMipmap(ctx, texObj->Target, texObj);
 	}
 }
 
+static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+			      GLint xoffset,
+			      GLsizei width,
+			      GLenum format, GLenum type,
+			      const GLvoid * pixels,
+			      const struct gl_pixelstore_attrib *packing,
+			      struct gl_texture_object *texObj,
+			      struct gl_texture_image *texImage)
+{
+	r300_texsubimage(ctx, 1, level, xoffset, 0, 0, width, 1, 1,
+		format, type, pixels, packing, texObj, texImage, 0);
+}
+
+static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+			      GLint xoffset, GLint yoffset,
+			      GLsizei width, GLsizei height,
+			      GLenum format, GLenum type,
+			      const GLvoid * pixels,
+			      const struct gl_pixelstore_attrib *packing,
+			      struct gl_texture_object *texObj,
+			      struct gl_texture_image *texImage)
+{
+	r300_texsubimage(ctx, 2, level, xoffset, yoffset, 0, width, height, 1,
+		format, type, pixels, packing, texObj, texImage, 0);
+}
+
+static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+					GLint level, GLint xoffset,
+					GLint yoffset, GLsizei width,
+					GLsizei height, GLenum format,
+					GLsizei imageSize, const GLvoid * data,
+					struct gl_texture_object *texObj,
+					struct gl_texture_image *texImage)
+{
+	r300_texsubimage(ctx, 2, level, xoffset, yoffset, 0, width, height, 1,
+		format, 0, data, 0, texObj, texImage, 1);
+}
+
 static void
 r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
 		  GLint xoffset, GLint yoffset, GLint zoffset,
@@ -875,30 +770,29 @@ r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
 		  struct gl_texture_object *texObj,
 		  struct gl_texture_image *texImage)
 {
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
+	r300_texsubimage(ctx, 3, level, xoffset, yoffset, zoffset, width, height, depth,
+		format, type, pixels, packing, texObj, texImage, 0);
+}
 
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
-			return;
-		}
-		texObj->DriverData = t;
-	}
 
-	_mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
-				  width, height, depth,
-				  format, type, pixels, packing, texObj,
-				  texImage);
+/**
+ * Wraps Mesa's implementation to ensure that the base level image is mapped.
+ *
+ * This relies on internal details of _mesa_generate_mipmap, in particular
+ * the fact that the memory for recreated texture images is always freed.
+ */
+static void r300_generate_mipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj)
+{
+	GLuint face = face_for_target(target);
+	r300_texture_image *baseimage = get_r300_texture_image(texObj->Image[face][texObj->BaseLevel]);
 
-	t->dirty_images[0] |= (1 << level);
+	r300_teximage_map(baseimage, GL_FALSE);
+	_mesa_generate_mipmap(ctx, target, texObj);
+	r300_teximage_unmap(baseimage);
 }
 
+
+
 /**
  * Changes variables and flags for a state update, which will happen at the
  * next UpdateTextureState
@@ -908,7 +802,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 			     struct gl_texture_object *texObj,
 			     GLenum pname, const GLfloat * params)
 {
-	r300TexObjPtr t = (r300TexObjPtr) texObj->DriverData;
+	r300TexObj* t = r300_tex_obj(texObj);
 
 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
 		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
@@ -941,7 +835,11 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 		 * we just have to rely on loading the right subset of mipmap levels
 		 * to simulate a clamped LOD.
 		 */
-		driSwapOutTextureObject((driTextureObject *) t);
+		if (t->mt) {
+			r300_miptree_unreference(t->mt);
+			t->mt = 0;
+			t->validated = GL_FALSE;
+		}
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
@@ -964,27 +862,10 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 	}
 }
 
-static void r300BindTexture(GLcontext * ctx, GLenum target,
-			    struct gl_texture_object *texObj)
-{
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
-		fprintf(stderr, "%s( %p ) unit=%d\n", __FUNCTION__,
-			(void *)texObj, ctx->Texture.CurrentUnit);
-	}
-
-	if ((target == GL_TEXTURE_1D)
-	    || (target == GL_TEXTURE_2D)
-	    || (target == GL_TEXTURE_3D)
-	    || (target == GL_TEXTURE_CUBE_MAP)
-	    || (target == GL_TEXTURE_RECTANGLE_NV)) {
-		assert(texObj->DriverData != NULL);
-	}
-}
-
 static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	r300TexObj* t = r300_tex_obj(texObj);
 
 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
 		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
@@ -992,14 +873,19 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 			_mesa_lookup_enum_by_nr(texObj->Target));
 	}
 
-	if (t != NULL) {
-		if (rmesa) {
-			R300_FIREVERTICES(rmesa);
-		}
+	if (rmesa) {
+		int i;
+		R300_FIREVERTICES(rmesa);
+
+		for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
+			if (rmesa->hw.textures[i] == t)
+				rmesa->hw.textures[i] = 0;
+	}
 
-		driDestroyTextureObject(t);
+	if (t->mt) {
+		r300_miptree_unreference(t->mt);
+		t->mt = 0;
 	}
-	/* Free mipmap images and the texture object itself */
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -1008,8 +894,6 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
  * Called via ctx->Driver.NewTextureObject.
  * Note: this function will be called during context creation to
  * allocate the default texture objects.
- * Note: we could use containment here to 'derive' the driver-specific
- * texture object from the core mesa gl_texture_object.  Not done at this time.
  * Fixup MaxAnisotropy according to user preference.
  */
 static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
@@ -1017,14 +901,23 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
 						      GLenum target)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_object *obj;
-	obj = _mesa_new_texture_object(ctx, name, target);
-	if (!obj)
-		return NULL;
-	obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+	r300TexObj* t = CALLOC_STRUCT(r300_tex_obj);
 
-	r300AllocTexObj(obj);
-	return obj;
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			t, _mesa_lookup_enum_by_nr(target));
+	}
+
+	_mesa_initialize_texture_object(&t->base, name, target);
+	t->base.MaxAnisotropy = rmesa->initialMaxAnisotropy;
+
+	/* Initialize hardware state */
+	r300UpdateTexWrap(t);
+	r300SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
+	r300SetTexBorderColor(t, t->base._BorderChan);
+
+	return &t->base;
 }
 
 void r300InitTextureFuncs(struct dd_function_table *functions)
@@ -1032,6 +925,11 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
 	/* Note: we only plug in the functions we implement in the driver
 	 * since _mesa_init_driver_functions() was already called.
 	 */
+	functions->NewTextureImage = r300NewTextureImage;
+	functions->FreeTexImageData = r300FreeTexImageData;
+	functions->MapTexture = r300MapTexture;
+	functions->UnmapTexture = r300UnmapTexture;
+
 	functions->ChooseTextureFormat = r300ChooseTextureFormat;
 	functions->TexImage1D = r300TexImage1D;
 	functions->TexImage2D = r300TexImage2D;
@@ -1040,7 +938,6 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
 	functions->TexSubImage2D = r300TexSubImage2D;
 	functions->TexSubImage3D = r300TexSubImage3D;
 	functions->NewTextureObject = r300NewTextureObject;
-	functions->BindTexture = r300BindTexture;
 	functions->DeleteTexture = r300DeleteTexture;
 	functions->IsTextureResident = driIsTextureResident;
 
@@ -1049,5 +946,7 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
 	functions->CompressedTexImage2D = r300CompressedTexImage2D;
 	functions->CompressedTexSubImage2D = r300CompressedTexSubImage2D;
 
+	functions->GenerateMipmap = r300_generate_mipmap;
+
 	driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_tex.h b/src/mesa/drivers/dri/r300/r300_tex.h
index b86d45b..a293ccf 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.h
+++ b/src/mesa/drivers/dri/r300/r300_tex.h
@@ -41,12 +41,7 @@ extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 			     unsigned long long offset, GLint depth,
 			     GLuint pitch);
 
-extern void r300UpdateTextureState(GLcontext * ctx);
-
-extern int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLuint face);
-
-extern void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t);
+extern GLboolean r300ValidateTextures(GLcontext * ctx);
 
 extern void r300InitTextureFuncs(struct dd_function_table *functions);
 
diff --git a/src/mesa/drivers/dri/r300/r300_texmem.c b/src/mesa/drivers/dri/r300/r300_texmem.c
index b03eefa..bacf12f 100644
--- a/src/mesa/drivers/dri/r300/r300_texmem.c
+++ b/src/mesa/drivers/dri/r300/r300_texmem.c
@@ -48,520 +48,12 @@ SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_cmdbuf.h"
+#include "r300_emit.h"
+#include "r300_mipmap_tree.h"
 #include "radeon_ioctl.h"
 #include "r300_tex.h"
 #include "r300_ioctl.h"
 #include <unistd.h>		/* for usleep() */
 
-#ifdef USER_BUFFERS
 #include "r300_mem.h"
-#endif
 
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t)
-{
-	int i;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-			(void *)t, (void *)t->base.tObj);
-	}
-
-	for (i = 0; i < rmesa->radeon.glCtx->Const.MaxTextureUnits; i++) {
-		if (rmesa->state.texture.unit[i].texobj == t) {
-			rmesa->state.texture.unit[i].texobj = NULL;
-		}
-	}
-}
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-static void r300UploadGARTClientSubImage(r300ContextPtr rmesa,
-					 r300TexObjPtr t,
-					 struct gl_texture_image *texImage,
-					 GLint hwlevel,
-					 GLint x, GLint y,
-					 GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	GLuint srcPitch, dstPitch;
-	int blit_format;
-	int srcOffset;
-
-	/*
-	 * XXX it appears that we always upload the full image, not a subimage.
-	 * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
-	 * changed, the src pitch will have to change.
-	 */
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][hwlevel].data = texImage->Data;
-	srcOffset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-
-	assert(srcOffset != ~0);
-
-	/* Don't currently need to cope with small pitches?
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	r300EmitWait(rmesa, R300_WAIT_3D);
-
-	r300EmitBlit(rmesa, blit_format,
-		     srcPitch,
-		     srcOffset,
-		     dstPitch,
-		     t->bufAddr,
-		     x,
-		     y,
-		     t->image[0][hwlevel].x + x,
-		     t->image[0][hwlevel].y + y, width, height);
-
-	r300EmitWait(rmesa, R300_WAIT_2D);
-}
-
-static void r300UploadRectSubImage(r300ContextPtr rmesa,
-				   r300TexObjPtr t,
-				   struct gl_texture_image *texImage,
-				   GLint x, GLint y, GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	int blit_format, dstPitch, done;
-
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][0].data = texImage->Data;
-
-	/* Currently don't need to cope with small pitches.
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-	dstPitch = t->pitch;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
-		/* In this case, could also use GART texturing.  This is
-		 * currently disabled, but has been tested & works.
-		 */
-		t->offset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-		t->pitch = texImage->RowStride * texFormat->TexelBytes - 32;
-
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"Using GART texturing for rectangular client texture\n");
-
-		/* Release FB memory allocated for this image:
-		 */
-		/* FIXME This may not be correct as driSwapOutTextureObject sets
-		 * FIXME dirty_images.  It may be fine, though.
-		 */
-		if (t->base.memBlock) {
-			driSwapOutTextureObject((driTextureObject *) t);
-		}
-	} else if (texImage->IsClientData) {
-		/* Data already in GART memory, with usable pitch.
-		 */
-		GLuint srcPitch;
-		srcPitch = texImage->RowStride * texFormat->TexelBytes;
-		r300EmitBlit(rmesa,
-			     blit_format,
-			     srcPitch,
-			     r300GartOffsetFromVirtual(rmesa, texImage->Data),
-			     dstPitch, t->bufAddr, 0, 0, 0, 0, width, height);
-	} else {
-		/* Data not in GART memory, or bad pitch.
-		 */
-		for (done = 0; done < height;) {
-			struct r300_dma_region region;
-			int lines =
-			    MIN2(height - done, RADEON_BUFFER_SIZE / dstPitch);
-			int src_pitch;
-			char *tex;
-
-			src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-			tex = (char *)texImage->Data + done * src_pitch;
-
-			memset(&region, 0, sizeof(region));
-			r300AllocDmaRegion(rmesa, &region, lines * dstPitch,
-					   1024);
-
-			/* Copy texdata to dma:
-			 */
-			if (RADEON_DEBUG & DEBUG_TEXTURE)
-				fprintf(stderr,
-					"%s: src_pitch %d dst_pitch %d\n",
-					__FUNCTION__, src_pitch, dstPitch);
-
-			if (src_pitch == dstPitch) {
-				memcpy(region.address + region.start, tex,
-				       lines * src_pitch);
-			} else {
-				char *buf = region.address + region.start;
-				int i;
-				for (i = 0; i < lines; i++) {
-					memcpy(buf, tex, src_pitch);
-					buf += dstPitch;
-					tex += src_pitch;
-				}
-			}
-
-			r300EmitWait(rmesa, R300_WAIT_3D);
-
-			/* Blit to framebuffer
-			 */
-			r300EmitBlit(rmesa,
-				     blit_format,
-				     dstPitch, GET_START(&region),
-				     dstPitch | (t->tile_bits >> 16),
-				     t->bufAddr, 0, 0, 0, done, width, lines);
-
-			r300EmitWait(rmesa, R300_WAIT_2D);
-#ifdef USER_BUFFERS
-			r300_mem_use(rmesa, region.buf->id);
-#endif
-
-			r300ReleaseDmaRegion(rmesa, &region, __FUNCTION__);
-			done += lines;
-		}
-	}
-}
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLint hwlevel,
-			       GLint x, GLint y, GLint width, GLint height,
-			       GLuint face)
-{
-	struct gl_texture_image *texImage = NULL;
-	GLuint offset;
-	GLint imageWidth, imageHeight;
-	GLint ret;
-	drm_radeon_texture_t tex;
-	drm_radeon_tex_image_t tmp;
-	const int level = hwlevel + t->base.firstLevel;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr,
-			"%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n",
-			__FUNCTION__, (void *)t, (void *)t->base.tObj, level,
-			width, height, face);
-	}
-
-	ASSERT(face < 6);
-
-	/* Ensure we have a valid texture to upload */
-	if ((hwlevel < 0) || (hwlevel >= RADEON_MAX_TEXTURE_LEVELS)) {
-		_mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-		return;
-	}
-
-	texImage = t->base.tObj->Image[face][level];
-
-	if (!texImage) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: texImage %d is NULL!\n",
-				__FUNCTION__, level);
-		return;
-	}
-	if (!texImage->Data) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is NULL!\n",
-				__FUNCTION__);
-		return;
-	}
-
-	if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		assert(level == 0);
-		assert(hwlevel == 0);
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is rectangular\n",
-				__FUNCTION__);
-		r300UploadRectSubImage(rmesa, t, texImage, x, y, width, height);
-		return;
-	} else if (texImage->IsClientData) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"%s: image data is in GART client storage\n",
-				__FUNCTION__);
-		r300UploadGARTClientSubImage(rmesa, t, texImage, hwlevel, x, y,
-					     width, height);
-		return;
-	} else if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: image data is in normal memory\n",
-			__FUNCTION__);
-
-	imageWidth = texImage->Width;
-	imageHeight = texImage->Height;
-
-	offset = t->bufAddr;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		GLint imageX = 0;
-		GLint imageY = 0;
-		GLint blitX = t->image[face][hwlevel].x;
-		GLint blitY = t->image[face][hwlevel].y;
-		GLint blitWidth = t->image[face][hwlevel].width;
-		GLint blitHeight = t->image[face][hwlevel].height;
-		fprintf(stderr, "   upload image: %d,%d at %d,%d\n",
-			imageWidth, imageHeight, imageX, imageY);
-		fprintf(stderr, "   upload  blit: %d,%d at %d,%d\n",
-			blitWidth, blitHeight, blitX, blitY);
-		fprintf(stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-			(GLuint) offset, hwlevel, level);
-	}
-
-	t->image[face][hwlevel].data = texImage->Data;
-
-	/* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-	 * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-	 * We used to use 1, 2 and 4-byte texels and used to use the texture
-	 * width to dictate the blit width - but that won't work for compressed
-	 * textures. (Brian)
-	 * NOTE: can't do that with texture tiling. (sroland)
-	 */
-	tex.offset = offset;
-	tex.image = &tmp;
-	/* copy (x,y,width,height,data) */
-	memcpy(&tmp, &t->image[face][hwlevel], sizeof(tmp));
-
-	if (texImage->TexFormat->TexelBytes > 4) {
-		const int log2TexelBytes =
-		    (3 + (texImage->TexFormat->TexelBytes >> 4));
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.height = imageHeight;
-		tex.width = imageWidth << log2TexelBytes;
-		tex.offset += (tmp.x << log2TexelBytes) & ~1023;
-		tmp.x = tmp.x % (1024 >> log2TexelBytes);
-		tmp.width = tmp.width << log2TexelBytes;
-	} else if (texImage->TexFormat->TexelBytes) {
-		/* use multi-byte upload scheme */
-		tex.height = imageHeight;
-		tex.width = imageWidth;
-		switch (texImage->TexFormat->TexelBytes) {
-		case 1:
-			tex.format = RADEON_TXFORMAT_I8;
-			break;
-		case 2:
-			tex.format = RADEON_TXFORMAT_AI88;
-			break;
-		case 4:
-			tex.format = RADEON_TXFORMAT_ARGB8888;
-			break;
-		}
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.offset += tmp.x & ~1023;
-		tmp.x = tmp.x % 1024;
-
-		if (t->tile_bits & R300_TXO_MICRO_TILE) {
-			/* need something like "tiled coordinates" ? */
-			tmp.y = tmp.x / (tex.pitch * 128) * 2;
-			tmp.x =
-			    tmp.x % (tex.pitch * 128) / 2 /
-			    texImage->TexFormat->TexelBytes;
-			tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-		} else {
-			tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-		}
-#if 1
-		if ((t->tile_bits & R300_TXO_MACRO_TILE) &&
-		    (texImage->Width * texImage->TexFormat->TexelBytes >= 256)
-		    && ((!(t->tile_bits & R300_TXO_MICRO_TILE)
-			 && (texImage->Height >= 8))
-			|| (texImage->Height >= 16))) {
-			/* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
-			   OR if height is smaller than 8 automatically, but if micro tiling is active
-			   the limit is height 16 instead ? */
-			tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-		}
-#endif
-	} else {
-		/* In case of for instance 8x8 texture (2x2 dxt blocks),
-		   padding after the first two blocks is needed (only
-		   with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-		/* set tex.height to 1/4 since 1 "macropixel" (dxt-block)
-		   has 4 real pixels. Needed so the kernel module reads
-		   the right amount of data. */
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch = (R300_BLIT_WIDTH_BYTES / 64);
-		tex.height = (imageHeight + 3) / 4;
-		tex.width = (imageWidth + 3) / 4;
-		if ((t->format & R300_TX_FORMAT_DXT1) == R300_TX_FORMAT_DXT1) {
-			tex.width *= 8;
-		} else {
-			tex.width *= 16;
-		}
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-	do {
-		ret =
-		    drmCommandWriteRead(rmesa->radeon.dri.fd,
-					DRM_RADEON_TEXTURE, &tex,
-					sizeof(drm_radeon_texture_t));
-		if (ret) {
-			if (RADEON_DEBUG & DEBUG_IOCTL)
-				fprintf(stderr,
-					"DRM_RADEON_TEXTURE:  again!\n");
-			usleep(1);
-		}
-	} while (ret == -EAGAIN);
-
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (ret) {
-		fprintf(stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret);
-		fprintf(stderr, "   offset=0x%08x\n", offset);
-		fprintf(stderr, "   image width=%d height=%d\n",
-			imageWidth, imageHeight);
-		fprintf(stderr, "    blit width=%d height=%d data=%p\n",
-			t->image[face][hwlevel].width,
-			t->image[face][hwlevel].height,
-			t->image[face][hwlevel].data);
-		_mesa_exit(-1);
-	}
-}
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- *
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
-{
-	const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-	if (t->image_override)
-		return 0;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		fprintf(stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-			(void *)rmesa->radeon.glCtx, (void *)t->base.tObj,
-			t->base.totalSize, t->base.firstLevel,
-			t->base.lastLevel);
-	}
-
-	if (t->base.totalSize == 0)
-		return 0;
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-
-	if (t->base.memBlock == NULL) {
-		int heap;
-
-		heap = driAllocateTexture(rmesa->texture_heaps, rmesa->nr_heaps,
-					  (driTextureObject *) t);
-		if (heap == -1) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			return -1;
-		}
-
-		/* Set the base offset of the texture image */
-		t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap]
-		    + t->base.memBlock->ofs;
-		t->offset = t->bufAddr;
-
-		if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-			/* hope it's safe to add that here... */
-			t->offset |= t->tile_bits;
-		}
-	}
-
-	/* Let the world know we've used this memory recently.
-	 */
-	driUpdateTextureLRU((driTextureObject *) t);
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	/* Upload any images that are new */
-	if (t->base.dirty_images[face]) {
-		int i;
-		for (i = 0; i < numLevels; i++) {
-			if ((t->base.
-			     dirty_images[face] & (1 <<
-						   (i + t->base.firstLevel))) !=
-			    0) {
-				r300UploadSubImage(rmesa, t, i, 0, 0,
-						   t->image[face][i].width,
-						   t->image[face][i].height,
-						   face);
-			}
-		}
-		t->base.dirty_images[face] = 0;
-	}
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	return 0;
-}
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index e2329f0..f42f020 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -48,6 +48,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_state.h"
 #include "r300_ioctl.h"
 #include "radeon_ioctl.h"
+#include "r300_mipmap_tree.h"
 #include "r300_tex.h"
 #include "r300_reg.h"
 
@@ -148,8 +149,7 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
+	t = r300_tex_obj(tObj);
 
 	switch (tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat) {
 	case MESA_FORMAT_Z16:
@@ -190,399 +190,241 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 
 /**
- * Compute sizes and fill in offset and blit information for the given
- * image (determined by \p face and \p level).
- *
- * \param curOffset points to the offset at which the image is to be stored
- * and is updated by this function according to the size of the image.
- */
-static void compute_tex_image_offset(
-	struct gl_texture_object *tObj,
-	GLuint face,
-	GLint level,
-	GLint* curOffset)
-{
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image* texImage;
-	GLuint blitWidth = R300_BLIT_WIDTH_BYTES;
-	GLuint texelBytes;
-	GLuint size;
-
-	texImage = tObj->Image[0][level + t->base.firstLevel];
-	if (!texImage)
-		return;
-
-	texelBytes = texImage->TexFormat->TexelBytes;
-
-	/* find image size in bytes */
-	if (texImage->IsCompressed) {
-		if ((t->format & R300_TX_FORMAT_DXT1) ==
-			R300_TX_FORMAT_DXT1) {
-			// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
-			if ((texImage->Width + 3) < 8)	/* width one block */
-				size = texImage->CompressedSize * 4;
-			else if ((texImage->Width + 3) < 16)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		} else {
-			/* DXT3/5, 16 bytes per block */
-			WARN_ONCE
-				("DXT 3/5 suffers from multitexturing problems!\n");
-			// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
-			if ((texImage->Width + 3) < 8)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		}
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		size =
-			((texImage->Width * texelBytes +
-			63) & ~63) * texImage->Height;
-		blitWidth = 64 / texelBytes;
-	} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
-		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-			though the actual offset may be different (if texture is less than
-			32 bytes width) to the untiled case */
-		int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-		size =
-			(w * ((texImage->Height + 1) / 2)) *
-			texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	} else {
-		int w = (texImage->Width * texelBytes + 31) & ~31;
-		size = w * texImage->Height * texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	}
-	assert(size > 0);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
-			texImage->Width, texImage->Height,
-			texImage->Depth,
-			texImage->TexFormat->TexelBytes,
-			texImage->InternalFormat);
-
-	/* All images are aligned to a 32-byte offset */
-	*curOffset = (*curOffset + 0x1f) & ~0x1f;
-
-	if (texelBytes) {
-		/* fix x and y coords up later together with offset */
-		t->image[face][level].x = *curOffset;
-		t->image[face][level].y = 0;
-		t->image[face][level].width =
-			MIN2(size / texelBytes, blitWidth);
-		t->image[face][level].height =
-			(size / texelBytes) / t->image[face][level].width;
-	} else {
-		t->image[face][level].x = *curOffset % R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].y = *curOffset / R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].width =
-			MIN2(size, R300_BLIT_WIDTH_BYTES);
-		t->image[face][level].height = size / t->image[face][level].width;
-	}
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr,
-			"level %d, face %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-			level, face, texImage->Width, texImage->Height,
-			t->image[face][level].x, t->image[face][level].y,
-			t->image[face][level].width, t->image[face][level].height,
-			size, *curOffset);
-
-	*curOffset += size;
-}
-
-
-
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c filter, \c format, etc. will be set here
- * too.
+ * Compute the cached hardware register values for the given texture object.
  *
  * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
+ * \param t the r300 texture object
  */
-static void r300SetTexImages(r300ContextPtr rmesa,
-			     struct gl_texture_object *tObj)
+static void setup_hardware_state(r300ContextPtr rmesa, r300TexObj *t)
 {
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image *baseImage =
-	    tObj->Image[0][tObj->BaseLevel];
-	GLint curOffset;
-	GLint i, texelBytes;
-	GLint numLevels;
-	GLint log2Width, log2Height, log2Depth;
-
-	/* Set the hardware texture format
-	 */
+	const struct gl_texture_image *firstImage =
+	    t->base.Image[0][t->mt->firstLevel];
+
 	if (!t->image_override
-	    && VALID_FORMAT(baseImage->TexFormat->MesaFormat)) {
-		if (baseImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
-			r300SetDepthTexMode(tObj);
+	    && VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+		if (firstImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
+			r300SetDepthTexMode(&t->base);
 		} else {
-			t->format = tx_table[baseImage->TexFormat->MesaFormat].format;
+			t->format = tx_table[firstImage->TexFormat->MesaFormat].format;
 		}
 
-		t->filter |= tx_table[baseImage->TexFormat->MesaFormat].filter;
+		t->filter |= tx_table[firstImage->TexFormat->MesaFormat].filter;
 	} else if (!t->image_override) {
 		_mesa_problem(NULL, "unexpected texture format in %s",
 			      __FUNCTION__);
 		return;
 	}
 
-	texelBytes = baseImage->TexFormat->TexelBytes;
-
-	/* Compute which mipmap levels we really want to send to the hardware.
-	 */
-	driCalculateTextureFirstLastLevel((driTextureObject *) t);
-	log2Width = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-	log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-	log2Depth = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-	assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
-
-	/* Calculate mipmap offsets and dimensions for blitting (uploading)
-	 * The idea is that we lay out the mipmap levels within a block of
-	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-	 */
 	t->tile_bits = 0;
 
-	/* figure out if this texture is suitable for tiling. */
-#if 0				/* Disabled for now */
-	if (texelBytes) {
-		if ((tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
-		    /* texrect might be able to use micro tiling too in theory? */
-		    (baseImage->Height > 1)) {
-
-			/* allow 32 (bytes) x 1 mip (which will use two times the space
-			   the non-tiled version would use) max if base texture is large enough */
-			if ((numLevels == 1) ||
-			    (((baseImage->Width * texelBytes /
-			       baseImage->Height) <= 32)
-			     && (baseImage->Width * texelBytes > 64))
-			    ||
-			    ((baseImage->Width * texelBytes /
-			      baseImage->Height) <= 16)) {
-				t->tile_bits |= R300_TXO_MICRO_TILE;
-			}
-		}
-
-		if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
-			/* we can set macro tiling even for small textures, they will be untiled anyway */
-			t->tile_bits |= R300_TXO_MACRO_TILE;
-		}
-	}
-#endif
-
-	curOffset = 0;
-
-	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-		ASSERT(log2Width == log2Height);
+	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
 		t->format |= R300_TX_FORMAT_CUBIC_MAP;
+	if (t->base.Target == GL_TEXTURE_3D)
+		t->format |= R300_TX_FORMAT_3D;
 
-		for(i = 0; i < numLevels; i++) {
-			GLuint face;
-			for(face = 0; face < 6; face++)
-				compute_tex_image_offset(tObj, face, i, &curOffset);
-		}
-	} else {
-		if (tObj->Target == GL_TEXTURE_3D)
-                	t->format |= R300_TX_FORMAT_3D;
+	t->size = (((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)
+		| ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT))
+		| ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT);
 
-		for (i = 0; i < numLevels; i++)
-			compute_tex_image_offset(tObj, 0, i, &curOffset);
-	}
-
-	/* Align the total size of texture memory block.
-	 */
-	t->base.totalSize =
-	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-	t->size =
-	    (((tObj->Image[0][t->base.firstLevel]->Width -
-	       1) << R300_TX_WIDTHMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->Height - 1) <<
-		R300_TX_HEIGHTMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->DepthLog2) <<
-		R300_TX_DEPTHMASK_SHIFT))
-	    | ((numLevels - 1) << R300_TX_MAX_MIP_LEVEL_SHIFT);
-
-	t->pitch = 0;
-
-	/* Only need to round to nearest 32 for textures, but the blitter
-	 * requires 64-byte aligned pitches, and we may/may not need the
-	 * blitter.   NPOT only!
-	 */
-	if (baseImage->IsCompressed) {
-		t->pitch |=
-		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		unsigned int align = (64 / texelBytes) - 1;
-		t->pitch |= ((tObj->Image[0][t->base.firstLevel]->Width *
-			     texelBytes) + 63) & ~(63);
+	if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+		unsigned int align = (64 / t->mt->bpp) - 1;
 		t->size |= R300_TX_SIZE_TXPITCH_EN;
 		if (!t->image_override)
-			t->pitch_reg =
-			    (((tObj->Image[0][t->base.firstLevel]->Width) +
-			      align) & ~align) - 1;
-	} else {
-		t->pitch |=
-		    ((tObj->Image[0][t->base.firstLevel]->Width *
-		      texelBytes) + 63) & ~(63);
+			t->pitch_reg = ((firstImage->Width + align) & ~align) - 1;
 	}
 
 	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-	    if (tObj->Image[0][t->base.firstLevel]->Width > 2048)
+	    if (firstImage->Width > 2048)
 		t->pitch_reg |= R500_TXWIDTH_BIT11;
-	    if (tObj->Image[0][t->base.firstLevel]->Height > 2048)
+	    if (firstImage->Height > 2048)
 		t->pitch_reg |= R500_TXHEIGHT_BIT11;
 	}
 }
 
-/* ================================================================
- * Texture unit state management
- */
 
-static GLboolean r300EnableTexture2D(GLcontext * ctx, int unit)
+static void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
+	GLuint numrows, GLuint rowsize)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	assert(rowsize <= dststride);
+	assert(rowsize <= srcstride);
 
-	ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
-
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override)
-			return GL_FALSE;
+	if (rowsize == srcstride && rowsize == dststride) {
+		memcpy(dst, src, numrows*rowsize);
+	} else {
+		GLuint i;
+		for(i = 0; i < numrows; ++i) {
+			memcpy(dst, src, rowsize);
+			dst += dststride;
+			src += srcstride;
+		}
 	}
-
-	return GL_TRUE;
 }
 
-static GLboolean r300EnableTexture3D(GLcontext * ctx, int unit)
+
+/**
+ * Ensure that the given image is stored in the given miptree from now on.
+ */
+static void migrate_image_to_miptree(r300_mipmap_tree *mt, r300_texture_image *image, int face, int level)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	r300_mipmap_level *dstlvl = &mt->levels[level - mt->firstLevel];
+	unsigned char *dest;
 
-	ASSERT(tObj->Target == GL_TEXTURE_3D);
+	assert(image->mt != mt);
+	assert(dstlvl->width == image->base.Width);
+	assert(dstlvl->height == image->base.Height);
+	assert(dstlvl->depth == image->base.Depth);
 
-	/* r300 does not support mipmaps for 3D textures. */
-	if ((tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR)) {
-		return GL_FALSE;
-	}
+	dri_bo_map(mt->bo, GL_TRUE);
+	dest = mt->bo->virtual + dstlvl->faces[face].offset;
 
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock)
-			return GL_FALSE;
-	}
+	if (image->mt) {
+		/* Format etc. should match, so we really just need a memcpy().
+		 * In fact, that memcpy() could be done by the hardware in many
+		 * cases, provided that we have a proper memory manager.
+		 */
+		r300_mipmap_level *srclvl = &image->mt->levels[image->mtlevel];
 
-	return GL_TRUE;
-}
+		assert(srclvl->size == dstlvl->size);
+		assert(srclvl->rowstride == dstlvl->rowstride);
 
-static GLboolean r300EnableTextureCube(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	GLuint face;
-
-	ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
-
-	if (t->base.dirty_images[0] || t->base.dirty_images[1] ||
-	    t->base.dirty_images[2] || t->base.dirty_images[3] ||
-	    t->base.dirty_images[4] || t->base.dirty_images[5]) {
-		/* flush */
-		R300_FIREVERTICES(rmesa);
-		/* layout memory space, once for all faces */
-		r300SetTexImages(rmesa, tObj);
-	}
+		dri_bo_map(image->mt->bo, GL_FALSE);
+		memcpy(dest,
+			image->mt->bo->virtual + srclvl->faces[face].offset,
+			dstlvl->size);
+		dri_bo_unmap(image->mt->bo);
 
-	/* upload (per face) */
-	for (face = 0; face < 6; face++) {
-		if (t->base.dirty_images[face]) {
-			r300UploadTexImages(rmesa,
-					    (r300TexObjPtr) tObj->DriverData,
-					    face);
-		}
-	}
+		r300_miptree_unreference(image->mt);
+	} else {
+		uint srcrowstride = image->base.Width * image->base.TexFormat->TexelBytes;
 
-	if (!t->base.memBlock) {
-		/* texmem alloc failed, use s/w fallback */
-		return GL_FALSE;
+		if (mt->tilebits)
+			WARN_ONCE("%s: tiling not supported yet", __FUNCTION__);
+
+		copy_rows(dest, dstlvl->rowstride, image->base.Data, srcrowstride,
+			image->base.Height * image->base.Depth, srcrowstride);
+
+		_mesa_free_texmemory(image->base.Data);
+		image->base.Data = 0;
 	}
 
-	return GL_TRUE;
+	dri_bo_unmap(mt->bo);
+
+	image->mt = mt;
+	image->mtface = face;
+	image->mtlevel = level;
+	r300_miptree_reference(image->mt);
 }
 
-static GLboolean r300EnableTextureRect(GLcontext * ctx, int unit)
+
+/**
+ * Ensure the given texture is ready for rendering.
+ *
+ * Mostly this means populating the texture object's mipmap tree.
+ */
+static GLboolean r300_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	r300TexObj *t = r300_tex_obj(texObj);
+	r300_texture_image *baseimage = get_r300_texture_image(texObj->Image[0][texObj->BaseLevel]);
+	int face, level;
+
+	if (t->validated)
+		return GL_TRUE;
 
-	ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "%s: Validating texture %p now\n", __FUNCTION__, texObj);
 
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
+	if (baseimage->base.Border > 0)
+		return GL_FALSE;
 
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override &&
-		    !rmesa->prefer_gart_client_texturing)
+	/* Ensure a matching miptree exists.
+	 *
+	 * Differing mipmap trees can result when the app uses TexImage to
+	 * change texture dimensions.
+	 *
+	 * Prefer to use base image's miptree if it
+	 * exists, since that most likely contains more valid data (remember
+	 * that the base level is usually significantly larger than the rest
+	 * of the miptree, so cubemaps are the only possible exception).
+	 */
+	if (baseimage->mt &&
+	    baseimage->mt != t->mt &&
+	    r300_miptree_matches_texture(baseimage->mt, &t->base)) {
+		r300_miptree_unreference(t->mt);
+		t->mt = baseimage->mt;
+		r300_miptree_reference(t->mt);
+	} else if (t->mt && !r300_miptree_matches_texture(t->mt, &t->base)) {
+		r300_miptree_unreference(t->mt);
+		t->mt = 0;
+	}
+
+	if (!t->mt) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, " Allocate new miptree\n");
+		r300_try_alloc_miptree(rmesa, t, &baseimage->base, 0, texObj->BaseLevel);
+		if (!t->mt) {
+			_mesa_problem(ctx, "r300_validate_texture failed to alloc miptree");
 			return GL_FALSE;
+		}
+	}
+
+	/* Ensure all images are stored in the single main miptree */
+	for(face = 0; face < t->mt->faces; ++face) {
+		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level) {
+			r300_texture_image *image = get_r300_texture_image(texObj->Image[face][level]);
+			if (RADEON_DEBUG & DEBUG_TEXTURE)
+				fprintf(stderr, " face %i, level %i... ", face, level);
+			if (t->mt == image->mt) {
+				if (RADEON_DEBUG & DEBUG_TEXTURE)
+					fprintf(stderr, "OK\n");
+				continue;
+			}
+
+			if (RADEON_DEBUG & DEBUG_TEXTURE)
+				fprintf(stderr, "migrating\n");
+			migrate_image_to_miptree(t->mt, image, face, level);
+		}
 	}
 
+	/* Configure the hardware registers (more precisely, the cached version
+	 * of the hardware registers). */
+	setup_hardware_state(rmesa, t);
+
+	t->validated = GL_TRUE;
 	return GL_TRUE;
 }
 
-static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
+
+/**
+ * Ensure all enabled and complete textures are uploaded.
+ */
+GLboolean r300ValidateTextures(GLcontext * ctx)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	int i;
+	int flushed = 0;
 
-	/* Fallback if there's a texture border */
-	if (tObj->Image[0][tObj->BaseLevel]->Border > 0)
-		return GL_FALSE;
+ again:
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+		r300TexObj *t;
+		if (!ctx->Texture.Unit[i]._ReallyEnabled)
+			continue;
 
-	/* Update state if this is a different texture object to last
-	 * time.
-	 */
-	if (rmesa->state.texture.unit[unit].texobj != t) {
-		if (rmesa->state.texture.unit[unit].texobj != NULL) {
-			/* The old texture is no longer bound to this texture unit.
-			 * Mark it as such.
-			 */
-
-			rmesa->state.texture.unit[unit].texobj->base.bound &=
-			    ~(1 << unit);
+		t = r300_tex_obj(ctx->Texture.Unit[i]._Current);
+		if (!r300_validate_texture(ctx, t)) {
+			_mesa_warning(ctx,
+				      "failed to validate texture for unit %d.\n",
+				      i);
 		}
 
-		rmesa->state.texture.unit[unit].texobj = t;
-		t->base.bound |= (1 << unit);
-		driUpdateTextureLRU((driTextureObject *) t);	/* XXX: should be locked! */
+		if (dri_bufmgr_check_aperture_space(t->mt->bo)) {
+			r300Flush(ctx);
+			if (flushed)
+				return GL_TRUE;
+		   	flushed = 1;
+		  	goto again;
+		}
 	}
-
-	return !t->border_fallback;
+	return GL_FALSE;
 }
 
 void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
@@ -591,20 +433,18 @@ void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	r300ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
-	r300TexObjPtr t;
+	r300TexObjPtr t = r300_tex_obj(tObj);
 	uint32_t pitch_val;
 
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
 
-	t->offset = offset;
+	t->override_offset = offset;
 	t->pitch_reg &= (1 << 13) -1;
 	pitch_val = pitch;
 
@@ -630,39 +470,3 @@ void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 
 	t->pitch_reg |= pitch_val;
 }
-
-static GLboolean r300UpdateTextureUnit(GLcontext * ctx, int unit)
-{
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-
-	if (texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT)) {
-		return (r300EnableTextureRect(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT)) {
-		return (r300EnableTexture2D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_3D_BIT)) {
-		return (r300EnableTexture3D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) {
-		return (r300EnableTextureCube(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled) {
-		return GL_FALSE;
-	} else {
-		return GL_TRUE;
-	}
-}
-
-void r300UpdateTextureState(GLcontext * ctx)
-{
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		if (!r300UpdateTextureUnit(ctx, i)) {
-			_mesa_warning(ctx,
-				      "failed to update texture state for unit %d.\n",
-				      i);
-		}
-	}
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_context.c b/src/mesa/drivers/dri/r300/radeon_context.c
index 5267fe9..c1c2168 100644
--- a/src/mesa/drivers/dri/r300/radeon_context.c
+++ b/src/mesa/drivers/dri/r300/radeon_context.c
@@ -46,6 +46,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drivers/common/driverfuncs.h"
 #include "swrast/swrast.h"
 
+#include "radeon_buffer.h"
 #include "radeon_screen.h"
 #include "radeon_ioctl.h"
 #include "radeon_macros.h"
@@ -57,6 +58,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "utils.h"
 #include "vblank.h"
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
+#include "drirenderbuffer.h"
 
 #define DRIVER_DATE "20060815"
 
@@ -258,6 +260,56 @@ void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
     }
 }
 
+static void
+radeon_make_renderbuffer_current(radeonContextPtr radeon,
+				 GLframebuffer *draw)
+{
+	int size = radeon->radeonScreen->driScreen->fbSize;
+	void *map = 0;
+	/* if radeon->fake */
+	struct radeon_renderbuffer *rb;
+	uint32_t offset;
+
+	if (!radeon->bufmgr)
+		return;
+
+	if ((rb = (void *)draw->Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
+
+		if (radeon->radeonScreen->kernel_mm)
+			map = radeon->radeonScreen->front.map;
+
+ 		offset = radeon->radeonScreen->kernel_mm ? radeon->radeonScreen->front.offset : radeon->radeonScreen->frontOffset;
+			rb->bo = dri_bo_alloc_static(radeon->bufmgr, "front buffer",
+					       offset, size, map, 0);
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->frontPitch;
+	}
+	if ((rb = (void *)draw->Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
+
+		if (radeon->radeonScreen->kernel_mm)
+			map = radeon->radeonScreen->back.map;
+
+ 		offset = radeon->radeonScreen->kernel_mm ? radeon->radeonScreen->back.offset : radeon->radeonScreen->backOffset;
+		if (!rb->bo)
+			rb->bo = dri_bo_alloc_static(radeon->bufmgr, "back buffer",
+						     offset, size, map, 0);
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->backPitch;
+	}
+	if ((rb = (void *)draw->Attachment[BUFFER_DEPTH].Renderbuffer)) {
+		offset = radeon->radeonScreen->kernel_mm ? radeon->radeonScreen->depth.offset : radeon->radeonScreen->depthOffset;
+
+		if (radeon->radeonScreen->kernel_mm)
+			map = radeon->radeonScreen->depth.map;
+
+		if (!rb->bo)
+			rb->bo = dri_bo_alloc_static(radeon->bufmgr, "depth buffer",
+								       offset, size, map, 0);
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch;
+	}
+}
+
 /* Force the context `c' to be the current context and associate with it
  * buffer `b'.
  */
@@ -265,51 +317,57 @@ GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
 			    __DRIdrawablePrivate * driDrawPriv,
 			    __DRIdrawablePrivate * driReadPriv)
 {
-	if (driContextPriv) {
-		radeonContextPtr radeon =
-			(radeonContextPtr) driContextPriv->driverPrivate;
+	radeonContextPtr radeon;
+	GLframebuffer *dfb, *rfb;
 
+	if (!driContextPriv) {
 		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
-				radeon->glCtx);
-
-		if (radeon->dri.drawable != driDrawPriv) {
-			if (driDrawPriv->swap_interval == (unsigned)-1) {
-				driDrawPriv->vblFlags =
-					(radeon->radeonScreen->irq != 0)
-					? driGetDefaultVBlankFlags(&radeon->
-								   optionCache)
-					: VBLANK_FLAG_NO_IRQ;
+			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+		_mesa_make_current(NULL, NULL, NULL);
+		return GL_TRUE;
+	}
 
-				driDrawableInitVBlank(driDrawPriv);
-			}
-		}
+	radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+	dfb = driDrawPriv->driverPrivate;
+	rfb = driReadPriv->driverPrivate;
 
-		radeon->dri.readable = driReadPriv;
+	if (RADEON_DEBUG & DEBUG_DRI)
+		fprintf(stderr, "%s ctx %p\n", __FUNCTION__, radeon->glCtx);
+
+	driUpdateFramebufferSize(radeon->glCtx, driDrawPriv);
+	if (driReadPriv != driDrawPriv)
+		driUpdateFramebufferSize(radeon->glCtx, driReadPriv);
 
-		if (radeon->dri.drawable != driDrawPriv ||
-		    radeon->lastStamp != driDrawPriv->lastStamp) {
-			radeon->dri.drawable = driDrawPriv;
+	radeon_make_renderbuffer_current(radeon, dfb);
 
-			radeonSetCliprects(radeon);
-			r300UpdateViewportOffset(radeon->glCtx);
+	_mesa_make_current(radeon->glCtx, dfb, rfb);
+
+	if (radeon->dri.drawable != driDrawPriv) {
+		if (driDrawPriv->swap_interval == (unsigned)-1) {
+			driDrawPriv->vblFlags =
+				(radeon->radeonScreen->irq != 0)
+				? driGetDefaultVBlankFlags(&radeon->
+							   optionCache)
+					: VBLANK_FLAG_NO_IRQ;
+
+			driDrawableInitVBlank(driDrawPriv);
 		}
+	}
 
-		_mesa_make_current(radeon->glCtx,
-				    (GLframebuffer *) driDrawPriv->
-				    driverPrivate,
-				    (GLframebuffer *) driReadPriv->
-				    driverPrivate);
+	radeon->dri.readable = driReadPriv;
 
-		_mesa_update_state(radeon->glCtx);		
+	if (radeon->dri.drawable != driDrawPriv ||
+	    radeon->lastStamp != driDrawPriv->lastStamp) {
+		radeon->dri.drawable = driDrawPriv;
 
-		radeonUpdatePageFlipping(radeon);
-	} else {
-		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-		_mesa_make_current(0, 0, 0);
+		radeonSetCliprects(radeon);
+		r300UpdateViewportOffset(radeon->glCtx);
 	}
 
+	_mesa_update_state(radeon->glCtx);
+
+	radeonUpdatePageFlipping(radeon);
+
 	if (RADEON_DEBUG & DEBUG_DRI)
 		fprintf(stderr, "End %s\n", __FUNCTION__);
 	return GL_TRUE;
diff --git a/src/mesa/drivers/dri/r300/radeon_context.h b/src/mesa/drivers/dri/r300/radeon_context.h
index 47cbc22..a1f0225 100644
--- a/src/mesa/drivers/dri/r300/radeon_context.h
+++ b/src/mesa/drivers/dri/r300/radeon_context.h
@@ -45,6 +45,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/mtypes.h"
 #include "main/colormac.h"
+#include "radeon_dri_bufmgr.h"
 #include "radeon_screen.h"
 #include "drm.h"
 #include "dri_util.h"
@@ -132,12 +133,13 @@ struct radeon_scissor_state {
 
 struct radeon_colorbuffer_state {
 	GLuint clear;
-	GLint drawOffset, drawPitch;
+	struct radeon_renderbuffer *rrb;
 };
 
 struct radeon_state {
 	struct radeon_colorbuffer_state color;
 	struct radeon_scissor_state scissor;
+	struct radeon_renderbuffer *depth_buffer;
 };
 
 /**
@@ -185,6 +187,8 @@ struct radeon_context {
 	/* Configuration cache
 	 */
 	driOptionCache optionCache;
+
+	dri_bufmgr *bufmgr;
 };
 
 #define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.c b/src/mesa/drivers/dri/r300/radeon_ioctl.c
index 36502eb..6186d1f 100644
--- a/src/mesa/drivers/dri/r300/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/r300/radeon_ioctl.c
@@ -42,6 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "r300_context.h"
 #include "radeon_ioctl.h"
+#include "radeon_buffer.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "radeon_reg.h"
@@ -171,7 +172,7 @@ void radeonCopyBuffer(__DRIdrawablePrivate * dPriv,
 	assert(dPriv->driContextPriv->driverPrivate);
 
 	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
+	
 	if (RADEON_DEBUG & DEBUG_IOCTL) {
 		fprintf(stderr, "\n%s( %p )\n\n", __FUNCTION__,
 			(void *)radeon->glCtx);
@@ -261,6 +262,8 @@ void radeonPageFlip(__DRIdrawablePrivate * dPriv)
 	GLint ret;
 	GLboolean missed_target;
 	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
+	GLframebuffer *fb = dPriv->driverPrivate;
+	struct radeon_renderbuffer *rrb;
 
 	assert(dPriv);
 	assert(dPriv->driContextPriv);
@@ -268,6 +271,8 @@ void radeonPageFlip(__DRIdrawablePrivate * dPriv)
 
 	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
 
+	rrb = (void *)fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+
 	if (RADEON_DEBUG & DEBUG_IOCTL) {
 		fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
 			radeon->sarea->pfCurrentPage);
@@ -315,32 +320,10 @@ void radeonPageFlip(__DRIdrawablePrivate * dPriv)
 	radeon->swap_count++;
 	(void)(*psp->systemTime->getUST) (&radeon->swap_ust);
 
-        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer, 
+        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer,
                              radeon->sarea->pfCurrentPage);
 
-	if (radeon->sarea->pfCurrentPage == 1) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	}
-
-	if (IS_R300_CLASS(radeon->radeonScreen)) {
-		r300ContextPtr r300 = (r300ContextPtr)radeon;
-		R300_STATECHANGE(r300, cb);
-		r300->hw.cb.cmd[R300_CB_OFFSET] = r300->radeon.state.color.drawOffset + 
-						r300->radeon.radeonScreen->fbLocation;
-		r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-		
-		if (r300->radeon.radeonScreen->cpp == 4)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-		else
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-	
-		if (r300->radeon.sarea->tiling_enabled)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-	}
+	radeon->state.color.rrb = rrb;
 }
 
 void radeonWaitForIdleLocked(radeonContextPtr radeon)
diff --git a/src/mesa/drivers/dri/r300/radeon_lock.c b/src/mesa/drivers/dri/r300/radeon_lock.c
index 4f47afd..9e59c76 100644
--- a/src/mesa/drivers/dri/r300/radeon_lock.c
+++ b/src/mesa/drivers/dri/r300/radeon_lock.c
@@ -44,6 +44,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_state.h"
 #include "r300_context.h"
 #include "r300_state.h"
+#include "r300_mem.h"
 
 #include "main/framebuffer.h"
 
@@ -59,11 +60,11 @@ int prevLockLine = 0;
 void radeonUpdatePageFlipping(radeonContextPtr rmesa)
 {
 	int use_back;
+	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
+	GLframebuffer *fb = drawable->driverPrivate;
 
 	rmesa->doPageFlip = rmesa->sarea->pfState;
 	if (rmesa->glCtx->WinSysDrawBuffer) {
-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-				     rmesa->sarea->pfCurrentPage);
 		r300UpdateDrawBuffer(rmesa->glCtx);
 	}
 
@@ -72,16 +73,12 @@ void radeonUpdatePageFlipping(radeonContextPtr rmesa)
 	     BUFFER_BACK_LEFT) : 1;
 	use_back ^= (rmesa->sarea->pfCurrentPage == 1);
 
-	if (use_back) {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->backOffset;
-		rmesa->state.color.drawPitch = rmesa->radeonScreen->backPitch;
-	} else {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->frontOffset;
-		rmesa->state.color.drawPitch =
-		    rmesa->radeonScreen->frontPitch;
-	}
+	if (use_back)
+		rmesa->state.color.rrb = (void *)fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	else
+		rmesa->state.color.rrb = (void *)fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+
+	rmesa->state.depth_buffer = (void *)fb->Attachment[BUFFER_DEPTH].Renderbuffer;
 }
 
 /* Update the hardware state.  This is called if another context has
@@ -125,12 +122,8 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 	}
 
 	if (sarea->ctx_owner != rmesa->dri.hwContext) {
-		int i;
-
 		sarea->ctx_owner = rmesa->dri.hwContext;
-		for (i = 0; i < r300->nr_heaps; i++) {
-			DRI_AGE_TEXTURES(r300->texture_heaps[i]);
-		}
+		radeonBufmgrContendedLockTake(r300->radeon.bufmgr);
 	}
 
 	rmesa->lost_context = GL_TRUE;
diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
index 16f9fb9..de94888 100644
--- a/src/mesa/drivers/dri/r300/radeon_span.c
+++ b/src/mesa/drivers/dri/r300/radeon_span.c
@@ -48,7 +48,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_ioctl.h"
 #include "radeon_span.h"
 
-#include "drirenderbuffer.h"
+#include "radeon_buffer.h"
 
 #define DBG 0
 
@@ -58,21 +58,21 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * information.
  */
 #define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
+   struct radeon_renderbuffer *rrb = (void *) rb;		\
+   const __DRIdrawablePrivate *dPriv = rrb->dPriv;		\
    const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
+   GLubyte *buf = (GLubyte *) rrb->bo->virtual			\
+      + (dPriv->y * rrb->pitch + dPriv->x) * rrb->cpp;	\
    GLuint p;							\
    (void) p;
 
 #define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
+   struct radeon_renderbuffer *rrb = (void *) rb;	\
+   const __DRIdrawablePrivate *dPriv = rrb->dPriv;	\
    const GLuint bottom = dPriv->h - 1;			\
    GLuint xo = dPriv->x;				\
    GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
+   GLubyte *buf = (GLubyte *) rrb->base.Data;
 
 #define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
 
@@ -93,7 +93,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_RGB565
 #define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+#define GET_PTR(X,Y) (buf + ((Y) * rrb->pitch + (X)) * 2)
 #include "spantmp2.h"
 
 /* 32 bit, ARGB8888 color spanline and pixel functions
@@ -103,7 +103,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_ARGB8888
 #define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+#define GET_PTR(X,Y) (buf + ((Y) * rrb->pitch + (X)) * 4)
 #include "spantmp2.h"
 
 /* ================================================================
@@ -120,10 +120,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * too...
  */
 
-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
+static GLuint radeon_mba_z32(const struct radeon_renderbuffer * rrb,
+			     GLint x, GLint y)
 {
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
+	GLuint pitch = rrb->pitch;
+	if (rrb->depthHasSurface) {
 		return 4 * (x + y * pitch);
 	} else {
 		GLuint ba, address = 0;	/* a[0..1] = 0           */
@@ -148,10 +149,10 @@ static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
 }
 
 static INLINE GLuint
-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
+radeon_mba_z16(const struct radeon_renderbuffer *rrb, GLint x, GLint y)
 {
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
+	GLuint pitch = rrb->pitch;
+	if (rrb->depthHasSurface) {
 		return 2 * (x + y * pitch);
 	} else {
 		GLuint ba, address = 0;	/* a[0]    = 0           */
@@ -175,10 +176,10 @@ radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
 #define VALUE_TYPE GLushort
 
 #define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+   *(GLushort *)(buf + radeon_mba_z16( rrb, _x + xo, _y + yo )) = d;
 
 #define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+   d = *(GLushort *)(buf + radeon_mba_z16( rrb, _x + xo, _y + yo ));
 
 #define TAG(x) radeon##x##_z16
 #include "depthtmp.h"
@@ -193,7 +194,7 @@ radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
 #ifdef COMPILE_R300
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint offset = radeon_mba_z32( rrb, _x + xo, _y + yo );		\
    GLuint tmp = *(GLuint *)(buf + offset);				\
    tmp &= 0x000000ff;							\
    tmp |= ((d << 8) & 0xffffff00);					\
@@ -202,7 +203,7 @@ do {									\
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint offset = radeon_mba_z32( rrb, _x + xo, _y + yo );		\
    GLuint tmp = *(GLuint *)(buf + offset);				\
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
@@ -213,12 +214,12 @@ do {									\
 #ifdef COMPILE_R300
 #define READ_DEPTH( d, _x, _y )						\
   do { \
-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
+    d = (*(GLuint *)(buf + radeon_mba_z32( rrb, _x + xo,		\
 					 _y + yo )) & 0xffffff00) >> 8; \
   }while(0)
 #else
 #define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
+   d = *(GLuint *)(buf + radeon_mba_z32( rrb, _x + xo,			\
 					 _y + yo )) & 0x00ffffff;
 #endif
 
@@ -234,7 +235,7 @@ do {									\
 #ifdef COMPILE_R300
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint offset = radeon_mba_z32( rrb, _x + xo, _y + yo );		\
    GLuint tmp = *(GLuint *)(buf + offset);				\
    tmp &= 0xffffff00;							\
    tmp |= (d) & 0xff;							\
@@ -243,7 +244,7 @@ do {									\
 #else
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint offset = radeon_mba_z32( rrb, _x + xo, _y + yo );		\
    GLuint tmp = *(GLuint *)(buf + offset);				\
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
@@ -254,14 +255,14 @@ do {									\
 #ifdef COMPILE_R300
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint offset = radeon_mba_z32( rrb, _x + xo, _y + yo );		\
    GLuint tmp = *(GLuint *)(buf + offset);				\
    d = tmp & 0x000000ff;						\
 } while (0)
 #else
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
+   GLuint offset = radeon_mba_z32( rrb, _x + xo, _y + yo );		\
    GLuint tmp = *(GLuint *)(buf + offset);				\
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
@@ -270,6 +271,22 @@ do {									\
 #define TAG(x) radeon##x##_z24_s8
 #include "stenciltmp.h"
 
+static void map_buffer(struct gl_renderbuffer *rb, GLboolean write)
+{
+	struct radeon_renderbuffer *rrb = (void*)rb;
+
+	if (rrb->bo)
+		dri_bo_map(rrb->bo, write);
+}
+
+static void unmap_buffer(struct gl_renderbuffer *rb)
+{
+	struct radeon_renderbuffer *rrb = (void*)rb;
+
+	if (rrb->bo)
+		dri_bo_unmap(rrb->bo);
+}
+
 /* Move locking out to get reasonable span performance (10x better
  * than doing this in HW_LOCK above).  WaitForIdle() is the main
  * culprit.
@@ -278,45 +295,61 @@ do {									\
 static void radeonSpanRenderStart(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	int i;
 #ifdef COMPILE_R300
 	r300ContextPtr r300 = (r300ContextPtr) rmesa;
 	R300_FIREVERTICES(r300);
 #else
 	RADEON_FIREVERTICES(rmesa);
 #endif
-	LOCK_HARDWARE(rmesa);
-	radeonWaitForIdleLocked(rmesa);
 
-	/* Read the first pixel in the frame buffer.  This should
-	 * be a noop, right?  In fact without this conform fails as reading
-	 * from the framebuffer sometimes produces old results -- the
-	 * on-card read cache gets mixed up and doesn't notice that the
-	 * framebuffer has been updated.
-	 *
-	 * Note that we should probably be reading some otherwise unused
-	 * region of VRAM, otherwise we might get incorrect results when
-	 * reading pixels from the top left of the screen.
-	 *
-	 * I found this problem on an R420 with glean's texCube test.
-	 * Note that the R200 span code also *writes* the first pixel in the
-	 * framebuffer, but I've found this to be unnecessary.
-	 *  -- Nicolai Hähnle, June 2008
-	 */
-	{
-		int p;
-		driRenderbuffer *drb =
-			(driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
-		volatile int *buf =
-			(volatile int *)(rmesa->dri.screen->pFB + drb->offset);
-		p = *buf;
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.MapTexture(ctx, ctx->Texture.Unit[i]._Current);
 	}
+
+	/* color draw buffers */
+	for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++)
+		map_buffer(ctx->DrawBuffer->_ColorDrawBuffers[i], GL_TRUE);
+
+	map_buffer(ctx->ReadBuffer->_ColorReadBuffer, GL_FALSE);
+
+	if (ctx->DrawBuffer->_DepthBuffer)
+		map_buffer(ctx->DrawBuffer->_DepthBuffer->Wrapped, GL_TRUE);
+	if (ctx->DrawBuffer->_StencilBuffer)
+		map_buffer(ctx->DrawBuffer->_StencilBuffer->Wrapped, GL_TRUE);
+
+	/* The locking and wait for idle should really only be needed in classic mode.
+	 * In a future memory manager based implementation, this should become
+	 * unnecessary due to the fact that mapping our buffers, textures, etc.
+	 * should implicitly wait for any previous rendering commands that must
+	 * be waited on. */
+	LOCK_HARDWARE(rmesa);
+	radeonWaitForIdleLocked(rmesa);
 }
 
 static void radeonSpanRenderFinish(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	int i;
 	_swrast_flush(ctx);
 	UNLOCK_HARDWARE(rmesa);
+
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.UnmapTexture(ctx, ctx->Texture.Unit[i]._Current);
+	}
+
+	/* color draw buffers */
+	for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++)
+		unmap_buffer(ctx->DrawBuffer->_ColorDrawBuffers[i]);
+
+	unmap_buffer(ctx->ReadBuffer->_ColorReadBuffer);
+
+	if (ctx->DrawBuffer->_DepthBuffer)
+		unmap_buffer(ctx->DrawBuffer->_DepthBuffer->Wrapped);
+	if (ctx->DrawBuffer->_StencilBuffer)
+		unmap_buffer(ctx->DrawBuffer->_StencilBuffer->Wrapped);
 }
 
 void radeonInitSpanFuncs(GLcontext * ctx)
@@ -330,20 +363,17 @@ void radeonInitSpanFuncs(GLcontext * ctx)
 /**
  * Plug in the Get/Put routines for the given driRenderbuffer.
  */
-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
 {
-	if (drb->Base.InternalFormat == GL_RGBA) {
-		if (vis->redBits == 5 && vis->greenBits == 6
-		    && vis->blueBits == 5) {
-			radeonInitPointers_RGB565(&drb->Base);
-		} else {
-			radeonInitPointers_ARGB8888(&drb->Base);
-		}
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-		radeonInitDepthPointers_z16(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-		radeonInitDepthPointers_z24_s8(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&drb->Base);
+	if (rrb->base.InternalFormat == GL_RGB5) {
+		radeonInitPointers_RGB565(&rrb->base);
+	} else if (rrb->base.InternalFormat == GL_RGBA8) {
+		radeonInitPointers_ARGB8888(&rrb->base);
+	} else if (rrb->base.InternalFormat == GL_DEPTH_COMPONENT16) {
+		radeonInitDepthPointers_z16(&rrb->base);
+	} else if (rrb->base.InternalFormat == GL_DEPTH_COMPONENT24) {
+		radeonInitDepthPointers_z24_s8(&rrb->base);
+	} else if (rrb->base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
+		radeonInitStencilPointers_z24_s8(&rrb->base);
 	}
 }
diff --git a/src/mesa/drivers/dri/r300/radeon_state.c b/src/mesa/drivers/dri/r300/radeon_state.c
index c401da6..0241903 100644
--- a/src/mesa/drivers/dri/r300/radeon_state.c
+++ b/src/mesa/drivers/dri/r300/radeon_state.c
@@ -223,14 +223,6 @@ void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state)
 void radeonInitState(radeonContextPtr radeon)
 {
 	radeon->Fallback = 0;
-
-	if (radeon->glCtx->Visual.doubleBufferMode && radeon->sarea->pfCurrentPage == 0) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	}
 }
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_buffer.h b/src/mesa/drivers/dri/radeon/radeon_buffer.h
new file mode 100644
index 0000000..1f9fdff
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_buffer.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2008 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software")
+ * to deal in the software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * them Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTIBILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *	Adam Jackson <ajax@redhat.com>
+ */
+
+#ifndef RADEON_BUFFER_H
+#define RADEON_BUFFER_H
+
+#include "radeon_dri_bufmgr.h"
+#include "dri_util.h"
+
+struct radeon_renderbuffer
+{
+    struct gl_renderbuffer base;
+    dri_bo *bo;
+    unsigned int cpp;
+    /* unsigned int offset; */
+    unsigned int pitch;
+    unsigned int height;
+
+    /* boo Xorg 6.8.2 compat */
+    int depthHasSurface;
+
+    __DRIdrawablePrivate *dPriv;
+};
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_dri_bufmgr.c b/src/mesa/drivers/dri/radeon/radeon_dri_bufmgr.c
new file mode 100644
index 0000000..0bae2b5
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_dri_bufmgr.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2007 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+#include "main/mtypes.h"
+#include "radeon_dri_bufmgr.h"
+
+/** @file dri_bufmgr.c
+ *
+ * Convenience functions for buffer management methods.
+ */
+
+dri_bo *
+dri_bo_alloc(dri_bufmgr *bufmgr, const char *name, unsigned long size,
+	     unsigned int alignment, uint64_t location_mask)
+{
+   return bufmgr->bo_alloc(bufmgr, name, size, alignment, location_mask);
+}
+
+dri_bo *
+dri_bo_alloc_static(dri_bufmgr *bufmgr, const char *name, unsigned long offset,
+		    unsigned long size, void *virtual,
+		    uint64_t location_mask)
+{
+   return bufmgr->bo_alloc_static(bufmgr, name, offset, size, virtual,
+				  location_mask);
+}
+
+void
+dri_bo_reference(dri_bo *bo)
+{
+   bo->bufmgr->bo_reference(bo);
+}
+
+void
+dri_bo_unreference(dri_bo *bo)
+{
+   if (bo == NULL)
+      return;
+
+   bo->bufmgr->bo_unreference(bo);
+}
+
+int
+dri_bo_map(dri_bo *buf, GLboolean write_enable)
+{
+   return buf->bufmgr->bo_map(buf, write_enable);
+}
+
+int
+dri_bo_unmap(dri_bo *buf)
+{
+   return buf->bufmgr->bo_unmap(buf);
+}
+
+void
+dri_fence_wait(dri_fence *fence)
+{
+   fence->bufmgr->fence_wait(fence);
+}
+
+void
+dri_fence_reference(dri_fence *fence)
+{
+   fence->bufmgr->fence_reference(fence);
+}
+
+void
+dri_fence_unreference(dri_fence *fence)
+{
+   if (fence == NULL)
+      return;
+
+   fence->bufmgr->fence_unreference(fence);
+}
+
+void
+dri_bo_subdata(dri_bo *bo, unsigned long offset,
+	       unsigned long size, const void *data)
+{
+   if (size == 0 || data == NULL)
+      return;
+
+   dri_bo_map(bo, GL_TRUE);
+   memcpy((unsigned char *)bo->virtual + offset, data, size);
+   dri_bo_unmap(bo);
+}
+
+void
+dri_bo_get_subdata(dri_bo *bo, unsigned long offset,
+		   unsigned long size, void *data)
+{
+   if (size == 0 || data == NULL)
+      return;
+
+   dri_bo_map(bo, GL_FALSE);
+   memcpy(data, (unsigned char *)bo->virtual + offset, size);
+   dri_bo_unmap(bo);
+}
+
+void
+dri_bufmgr_destroy(dri_bufmgr *bufmgr)
+{
+   bufmgr->destroy(bufmgr);
+}
+
+
+int dri_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
+		    GLuint offset, dri_bo *target_buf)
+{
+   return reloc_buf->bufmgr->emit_reloc(reloc_buf, flags, delta, offset, target_buf);
+}
+
+void *dri_process_relocs(dri_bo *batch_buf, GLuint *count)
+{
+   return batch_buf->bufmgr->process_relocs(batch_buf, count);
+}
+
+void dri_post_submit(dri_bo *batch_buf, dri_fence **last_fence)
+{
+   batch_buf->bufmgr->post_submit(batch_buf, last_fence);
+}
+
+void
+dri_bufmgr_set_debug(dri_bufmgr *bufmgr, GLboolean enable_debug)
+{
+   bufmgr->debug = enable_debug;
+}
+
+int
+dri_bufmgr_check_aperture_space(dri_bo *bo)
+{
+    return bo->bufmgr->check_aperture_space(bo);
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_dri_bufmgr.h b/src/mesa/drivers/dri/radeon/radeon_dri_bufmgr.h
new file mode 100644
index 0000000..0a726dc
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_dri_bufmgr.h
@@ -0,0 +1,260 @@
+/**************************************************************************
+ * 
+ * Copyright <20> 2007 Intel Corporation
+ * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+/*
+ * Authors: Thomas Hellstr<74>m <thomas-at-tungstengraphics-dot-com>
+ *          Keith Whitwell <keithw-at-tungstengraphics-dot-com>
+ *	    Eric Anholt <eric@anholt.net>
+ */
+
+#ifndef _DRI_BUFMGR_H_
+#define _DRI_BUFMGR_H_
+#include <xf86drm.h>
+
+typedef struct _dri_bufmgr dri_bufmgr;
+typedef struct _dri_bo dri_bo;
+typedef struct _dri_fence dri_fence;
+
+struct _dri_bo {
+   /** Size in bytes of the buffer object. */
+   unsigned long size;
+   /**
+    * Card virtual address (offset from the beginning of the aperture) for the
+    * object.  Only valid while validated.
+    */
+   unsigned long offset;
+   /**
+    * Virtual address for accessing the buffer data.  Only valid while mapped.
+    */
+   void *virtual;
+   /** Buffer manager context associated with this buffer object */
+   dri_bufmgr *bufmgr;
+};
+
+struct _dri_fence {
+   /**
+    * This is an ORed mask of DRM_BO_FLAG_READ, DRM_BO_FLAG_WRITE, and
+    * DRM_FLAG_EXE indicating the operations associated with this fence.
+    *
+    * It is constant for the life of the fence object.
+    */
+   unsigned int type;
+   /** Buffer manager context associated with this fence */
+   dri_bufmgr *bufmgr;
+};
+
+/**
+ * Context for a buffer manager instance.
+ *
+ * Contains public methods followed by private storage for the buffer manager.
+ */
+struct _dri_bufmgr {
+   /**
+    * Allocate a buffer object.
+    *
+    * Buffer objects are not necessarily initially mapped into CPU virtual
+    * address space or graphics device aperture.  They must be mapped using
+    * bo_map() to be used by the CPU, and validated for use using bo_validate()
+    * to be used from the graphics device.
+    */
+   dri_bo *(*bo_alloc)(dri_bufmgr *bufmgr_ctx, const char *name,
+		       unsigned long size, unsigned int alignment,
+		       uint64_t location_mask);
+
+   /**
+    * Allocates a buffer object for a static allocation.
+    *
+    * Static allocations are ones such as the front buffer that are offered by
+    * the X Server, which are never evicted and never moved.
+    */
+   dri_bo *(*bo_alloc_static)(dri_bufmgr *bufmgr_ctx, const char *name,
+			      unsigned long offset, unsigned long size,
+			      void *virtual, uint64_t location_mask);
+
+   /** Takes a reference on a buffer object */
+   void (*bo_reference)(dri_bo *bo);
+
+   /**
+    * Releases a reference on a buffer object, freeing the data if
+    * rerefences remain.
+    */
+   void (*bo_unreference)(dri_bo *bo);
+
+   /**
+    * Maps the buffer into userspace.
+    *
+    * This function will block waiting for any existing fence on the buffer to
+    * clear, first.  The resulting mapping is available at buf->virtual.
+\    */
+   int (*bo_map)(dri_bo *buf, GLboolean write_enable);
+
+   /** Reduces the refcount on the userspace mapping of the buffer object. */
+   int (*bo_unmap)(dri_bo *buf);
+
+   /** Takes a reference on a fence object */
+   void (*fence_reference)(dri_fence *fence);
+
+   /**
+    * Releases a reference on a fence object, freeing the data if
+    * rerefences remain.
+    */
+   void (*fence_unreference)(dri_fence *fence);
+
+   /**
+    * Blocks until the given fence is signaled.
+    */
+   void (*fence_wait)(dri_fence *fence);
+
+   /**
+    * Tears down the buffer manager instance.
+    */
+   void (*destroy)(dri_bufmgr *bufmgr);
+
+   /**
+    * Add relocation entry in reloc_buf, which will be updated with the
+    * target buffer's real offset on on command submission.
+    *
+    * Relocations remain in place for the lifetime of the buffer object.
+    *
+    * \param reloc_buf Buffer to write the relocation into.
+    * \param flags BO flags to be used in validating the target buffer.
+    *	     Applicable flags include:
+    *	     - DRM_BO_FLAG_READ: The buffer will be read in the process of
+    *	       command execution.
+    *	     - DRM_BO_FLAG_WRITE: The buffer will be written in the process of
+    *	       command execution.
+    *	     - DRM_BO_FLAG_MEM_TT: The buffer should be validated in TT memory.
+    *	     - DRM_BO_FLAG_MEM_VRAM: The buffer should be validated in video
+    *	       memory.
+    * \param delta Constant value to be added to the relocation target's offset.
+    * \param offset Byte offset within batch_buf of the relocated pointer.
+    * \param target Buffer whose offset should be written into the relocation
+    *	     entry.
+    */
+   int (*emit_reloc)(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
+		      GLuint offset, dri_bo *target);
+
+   /**
+    * Processes the relocations, either in userland or by converting the list
+    * for use in batchbuffer submission.
+    *
+    * Kernel-based implementations will return a pointer to the arguments
+    * to be handed with batchbuffer submission to the kernel.  The userland
+    * implementation performs the buffer validation and emits relocations
+    * into them the appopriate order.
+    *
+    * \param batch_buf buffer at the root of the tree of relocations
+    * \param count returns the number of buffers validated.
+    * \return relocation record for use in command submission.
+    * */
+   void *(*process_relocs)(dri_bo *batch_buf, GLuint *count);
+
+   void (*post_submit)(dri_bo *batch_buf, dri_fence **fence);
+
+   int (*check_aperture_space)(dri_bo *bo);
+   GLboolean debug; /**< Enables verbose debugging printouts */
+};
+
+dri_bo *dri_bo_alloc(dri_bufmgr *bufmgr, const char *name, unsigned long size,
+		     unsigned int alignment, uint64_t location_mask);
+dri_bo *dri_bo_alloc_static(dri_bufmgr *bufmgr, const char *name,
+			    unsigned long offset, unsigned long size,
+			    void *virtual, uint64_t location_mask);
+void dri_bo_reference(dri_bo *bo);
+void dri_bo_unreference(dri_bo *bo);
+int dri_bo_map(dri_bo *buf, GLboolean write_enable);
+int dri_bo_unmap(dri_bo *buf);
+void dri_fence_wait(dri_fence *fence);
+void dri_fence_reference(dri_fence *fence);
+void dri_fence_unreference(dri_fence *fence);
+
+void dri_bo_subdata(dri_bo *bo, unsigned long offset,
+		    unsigned long size, const void *data);
+void dri_bo_get_subdata(dri_bo *bo, unsigned long offset,
+			unsigned long size, void *data);
+
+void dri_bufmgr_fake_contended_lock_take(dri_bufmgr *bufmgr);
+dri_bufmgr *dri_bufmgr_fake_init(unsigned long low_offset, void *low_virtual,
+				 unsigned long size,
+				 unsigned int (*fence_emit)(void *private),
+				 int (*fence_wait)(void *private,
+						   unsigned int cookie),
+				 void *driver_priv);
+void dri_bufmgr_set_debug(dri_bufmgr *bufmgr, GLboolean enable_debug);
+void dri_bo_fake_disable_backing_store(dri_bo *bo,
+				       void (*invalidate_cb)(dri_bo *bo,
+							     void *ptr),
+				       void *ptr);
+void dri_bufmgr_destroy(dri_bufmgr *bufmgr);
+
+int dri_emit_reloc(dri_bo *reloc_buf, uint64_t flags, GLuint delta,
+		   GLuint offset, dri_bo *target_buf);
+void *dri_process_relocs(dri_bo *batch_buf, uint32_t *count);
+void dri_post_process_relocs(dri_bo *batch_buf);
+void dri_post_submit(dri_bo *batch_buf, dri_fence **last_fence);
+int dri_bufmgr_check_aperture_space(dri_bo *bo);
+
+#ifndef TTM_API
+/* reuse some TTM API */
+
+#define DRM_BO_MEM_LOCAL 0
+#define DRM_BO_MEM_TT 1
+#define DRM_BO_MEM_VRAM 2
+#define DRM_BO_MEM_PRIV0 3
+#define DRM_BO_MEM_PRIV1 4
+#define DRM_BO_MEM_PRIV2 5
+#define DRM_BO_MEM_PRIV3 6
+#define DRM_BO_MEM_PRIV4 7
+
+#define DRM_BO_FLAG_READ        (1ULL << 0)
+#define DRM_BO_FLAG_WRITE       (1ULL << 1)
+#define DRM_BO_FLAG_EXE         (1ULL << 2)
+#define DRM_BO_MASK_ACCESS	(DRM_BO_FLAG_READ | DRM_BO_FLAG_WRITE | DRM_BO_FLAG_EXE)
+#define DRM_BO_FLAG_NO_EVICT    (1ULL << 4)
+
+#define DRM_BO_FLAG_MAPPABLE    (1ULL << 5)
+#define DRM_BO_FLAG_SHAREABLE   (1ULL << 6)
+
+#define DRM_BO_FLAG_CACHED      (1ULL << 7)
+
+#define DRM_BO_FLAG_NO_MOVE     (1ULL << 8)
+#define DRM_BO_FLAG_CACHED_MAPPED    (1ULL << 19)
+#define DRM_BO_FLAG_FORCE_CACHING  (1ULL << 13)
+#define DRM_BO_FLAG_FORCE_MAPPABLE (1ULL << 14)
+#define DRM_BO_FLAG_TILE           (1ULL << 15)
+
+#define DRM_BO_FLAG_MEM_LOCAL  (1ULL << 24)
+#define DRM_BO_FLAG_MEM_TT     (1ULL << 25)
+#define DRM_BO_FLAG_MEM_VRAM   (1ULL << 26)
+
+#define DRM_BO_MASK_MEM         0x00000000FF000000ULL
+
+#define DRM_FENCE_TYPE_EXE                 0x00000001
+#endif
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index 05107dd..1810ded 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -45,6 +45,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_chipset.h"
 #include "radeon_macros.h"
 #include "radeon_screen.h"
+#include "radeon_buffer.h"
 #if !RADEON_COMMON
 #include "radeon_context.h"
 #include "radeon_span.h"
@@ -67,6 +68,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "GL/internal/dri_interface.h"
 
+#include <errno.h>
+#include <sys/ioctl.h>
+
 /* Radeon configuration
  */
 #include "xmlpool.h"
@@ -348,11 +352,115 @@ static const __DRItexOffsetExtension r300texOffsetExtension = {
 };
 #endif
 
+
+static int
+radeon_gem_update_handle(radeonScreenPtr screen, __DRIscreenPrivate *sPriv,
+			 struct radeon_gem_object *gem_obj)
+{
+     struct drm_gem_close close_args;
+     struct drm_gem_open args;
+     struct drm_radeon_gem_mmap mmap_args;
+     struct drm_radeon_gem_pin pin_args;
+     int ret;
+     
+     if (gem_obj->gem_handle) {
+	     close_args.handle = gem_obj->gem_handle;
+
+	     ioctl(sPriv->fd, DRM_IOCTL_GEM_CLOSE, &close_args);
+	     gem_obj->gem_handle = 0;
+     }
+
+     /* do open */
+     args.name = gem_obj->gem_name;
+     ret = ioctl(sPriv->fd, DRM_IOCTL_GEM_OPEN, &args);
+     if (ret) {
+	     fprintf(stderr,"failed to open\n");
+	     return ret;
+     }
+     
+     gem_obj->gem_handle = args.handle;
+     gem_obj->size = args.size;
+     
+     mmap_args.handle = gem_obj->gem_handle;
+     mmap_args.size = gem_obj->size;
+     mmap_args.offset = 0;
+     
+     ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_GEM_MMAP, &mmap_args,
+			       sizeof(mmap_args));
+     
+     if (ret) {
+	     fprintf(stderr,"failed to mmap\n");
+	     return ret;
+     }
+     
+     gem_obj->map = (void *)(unsigned long)(mmap_args.addr_ptr);
+     
+     pin_args.handle = gem_obj->gem_handle;
+     pin_args.pin_domain = 0;
+     pin_args.alignment = 0;
+     
+     ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_GEM_PIN, &pin_args,
+			       sizeof(pin_args));
+     
+     if (ret) {
+	     fprintf(stderr,"failed to get offset\n");
+	     return ret;
+     }
+     
+     gem_obj->offset = pin_args.offset;
+     return 0;
+}
+
+static int
+radeon_init_mm_buffers(radeonScreenPtr screen, __DRIscreenPrivate *sPriv,
+		       RADEONDRIPtr dri_priv)
+{
+	int ret;
+	/* STOP GAP HERE */
+
+	screen->front.gem_name = dri_priv->frontOffset;
+	ret = radeon_gem_update_handle(screen, sPriv, &screen->front);
+	if (ret)	
+		return ret;
+
+	screen->frontOffset = screen->front.offset;
+
+	screen->back.gem_name = dri_priv->backOffset;
+	ret = radeon_gem_update_handle(screen, sPriv, &screen->back);
+	if (ret)	
+		return ret;
+
+	screen->backOffset = screen->back.offset;
+
+	screen->depth.gem_name = dri_priv->depthOffset;
+	ret = radeon_gem_update_handle(screen, sPriv, &screen->depth);
+	if (ret)	
+		return ret;
+
+	screen->depthOffset = screen->depth.offset;
+
+	screen->vram_texture.gem_name = dri_priv->textureOffset;
+	ret = radeon_gem_update_handle(screen, sPriv, &screen->vram_texture);
+	if (ret)	
+		return ret;
+
+	screen->texOffset[RADEON_LOCAL_TEX_HEAP] = screen->vram_texture.offset + screen->fbLocation;
+	screen->texSize[RADEON_LOCAL_TEX_HEAP] = screen->vram_texture.size;
+
+	screen->gart_texture.gem_name = dri_priv->gartTexHandle;
+	ret = radeon_gem_update_handle(screen, sPriv, &screen->gart_texture);
+	if (ret)	
+		return ret;
+
+	screen->gartTextures.map = screen->gart_texture.map;
+	screen->gart_texture_offset = screen->gart_texture.offset + screen->gart_base;
+	return 0;
+}
+
 /* Create the device specific screen private data struct.
  */
 static radeonScreenPtr
-radeonCreateScreen( __DRIscreenPrivate *sPriv )
-{
+radeonCreateScreen( __DRIscreenPrivate *sPriv ){
    radeonScreenPtr screen;
    RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
    unsigned char *RADEONMMIO;
@@ -387,6 +495,21 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
    {
       int ret;
+
+#ifdef RADEON_PARAM_KERNEL_MM
+     ret = radeonGetParam( sPriv->fd, RADEON_PARAM_KERNEL_MM,
+                            &screen->kernel_mm);
+
+      if (ret && ret != -EINVAL) {
+         FREE( screen );
+         fprintf(stderr, "drm_radeon_getparam_t (RADEON_OFFSET): %d\n", ret);
+         return NULL;
+      }
+
+      if (ret == -EINVAL)
+          screen->kernel_mm = 0;
+#endif
+
       ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BUFFER_OFFSET,
 			    &screen->gart_buffer_offset);
 
@@ -420,32 +543,34 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       screen->drmSupportsVertexProgram = (sPriv->drm_version.minor >= 25);
    }
 
-   screen->mmio.handle = dri_priv->registerHandle;
-   screen->mmio.size   = dri_priv->registerSize;
-   if ( drmMap( sPriv->fd,
-		screen->mmio.handle,
-		screen->mmio.size,
-		&screen->mmio.map ) ) {
-      FREE( screen );
-      __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
-      return NULL;
-   }
+   if (!screen->kernel_mm) {
+      screen->mmio.handle = dri_priv->registerHandle;
+      screen->mmio.size   = dri_priv->registerSize;
+      if ( drmMap( sPriv->fd,
+		   screen->mmio.handle,
+		   screen->mmio.size,
+		   &screen->mmio.map ) ) {
+	 FREE( screen );
+	 __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
+	 return NULL;
+      }
 
-   RADEONMMIO = screen->mmio.map;
+      RADEONMMIO = screen->mmio.map;
 
-   screen->status.handle = dri_priv->statusHandle;
-   screen->status.size   = dri_priv->statusSize;
-   if ( drmMap( sPriv->fd,
-		screen->status.handle,
-		screen->status.size,
-		&screen->status.map ) ) {
-      drmUnmap( screen->mmio.map, screen->mmio.size );
-      FREE( screen );
-      __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
-      return NULL;
+      screen->status.handle = dri_priv->statusHandle;
+      screen->status.size   = dri_priv->statusSize;
+      if ( drmMap( sPriv->fd,
+		   screen->status.handle,
+		   screen->status.size,
+		   &screen->status.map ) ) {
+	 drmUnmap( screen->mmio.map, screen->mmio.size );
+	 FREE( screen );
+	 __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
+	 return NULL;
+      }
+      screen->scratch = (__volatile__ u_int32_t *)
+	 ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
    }
-   screen->scratch = (__volatile__ uint32_t *)
-      ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
 
    screen->buffers = drmMapBufs( sPriv->fd );
    if ( !screen->buffers ) {
@@ -456,22 +581,24 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       return NULL;
    }
 
-   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
-      screen->gartTextures.handle = dri_priv->gartTexHandle;
-      screen->gartTextures.size   = dri_priv->gartTexMapSize;
-      if ( drmMap( sPriv->fd,
-		   screen->gartTextures.handle,
-		   screen->gartTextures.size,
-		   (drmAddressPtr)&screen->gartTextures.map ) ) {
-	 drmUnmapBufs( screen->buffers );
-	 drmUnmap( screen->status.map, screen->status.size );
-	 drmUnmap( screen->mmio.map, screen->mmio.size );
-	 FREE( screen );
-	 __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
-	 return NULL;
+   if (!screen->kernel_mm) {
+      if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
+	 screen->gartTextures.handle = dri_priv->gartTexHandle;
+	 screen->gartTextures.size   = dri_priv->gartTexMapSize;
+	 if ( drmMap( sPriv->fd,
+		      screen->gartTextures.handle,
+		      screen->gartTextures.size,
+		      (drmAddressPtr)&screen->gartTextures.map ) ) {
+	    drmUnmapBufs( screen->buffers );
+	    drmUnmap( screen->status.map, screen->status.size );
+	    drmUnmap( screen->mmio.map, screen->mmio.size );
+	    FREE( screen );
+	    __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
+	    return NULL;
+	 }
+	 
+	 screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
       }
-
-      screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
    }
 
    screen->chip_flags = 0;
@@ -838,7 +965,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    ret = radeonGetParam( sPriv->fd, RADEON_PARAM_FB_LOCATION,
                          &temp);
    if (ret) {
-       if (screen->chip_family < CHIP_FAMILY_RS690)
+       if (screen->chip_family < CHIP_FAMILY_RS690 && !screen->kernel_mm)
 	   screen->fbLocation      = ( INREG( RADEON_MC_FB_LOCATION ) & 0xffff) << 16;
        else {
            FREE( screen );
@@ -879,55 +1006,65 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
        }
    }
 
-   if ( sPriv->drm_version.minor >= 10 ) {
-      drm_radeon_setparam_t sp;
-
-      sp.param = RADEON_SETPARAM_FB_LOCATION;
-      sp.value = screen->fbLocation;
-
-      drmCommandWrite( sPriv->fd, DRM_RADEON_SETPARAM,
-		       &sp, sizeof( sp ) );
-   }
-
-   screen->frontOffset	= dri_priv->frontOffset;
    screen->frontPitch	= dri_priv->frontPitch;
-   screen->backOffset	= dri_priv->backOffset;
    screen->backPitch	= dri_priv->backPitch;
-   screen->depthOffset	= dri_priv->depthOffset;
    screen->depthPitch	= dri_priv->depthPitch;
 
-   /* Check if ddx has set up a surface reg to cover depth buffer */
-   screen->depthHasSurface = (sPriv->ddx_version.major > 4) ||
-      /* these chips don't use tiled z without hyperz. So always pretend
-         we have set up a surface which will cause linear reads/writes */
-      ((screen->chip_family & RADEON_CLASS_R100) &&
-      !(screen->chip_flags & RADEON_CHIPSET_TCL));
-
-   if ( dri_priv->textureSize == 0 ) {
-      screen->texOffset[RADEON_LOCAL_TEX_HEAP] = screen->gart_texture_offset;
-      screen->texSize[RADEON_LOCAL_TEX_HEAP] = dri_priv->gartTexMapSize;
-      screen->logTexGranularity[RADEON_LOCAL_TEX_HEAP] =
-	 dri_priv->log2GARTTexGran;
-   } else {
-      screen->texOffset[RADEON_LOCAL_TEX_HEAP] = dri_priv->textureOffset
-				               + screen->fbLocation;
-      screen->texSize[RADEON_LOCAL_TEX_HEAP] = dri_priv->textureSize;
-      screen->logTexGranularity[RADEON_LOCAL_TEX_HEAP] =
-	 dri_priv->log2TexGran;
-   }
+   if (!screen->kernel_mm) {
+      if ( sPriv->drm_version.minor >= 10 ) {
+	 drm_radeon_setparam_t sp;
+
+	 sp.param = RADEON_SETPARAM_FB_LOCATION;
+	 sp.value = screen->fbLocation;
+
+	 drmCommandWrite( sPriv->fd, DRM_RADEON_SETPARAM,
+			  &sp, sizeof( sp ) );
+      }
 
-   if ( !screen->gartTextures.map || dri_priv->textureSize == 0
-	|| getenv( "RADEON_GARTTEXTURING_FORCE_DISABLE" ) ) {
-      screen->numTexHeaps = RADEON_NR_TEX_HEAPS - 1;
-      screen->texOffset[RADEON_GART_TEX_HEAP] = 0;
-      screen->texSize[RADEON_GART_TEX_HEAP] = 0;
-      screen->logTexGranularity[RADEON_GART_TEX_HEAP] = 0;
+      screen->frontOffset	= dri_priv->frontOffset;
+      screen->backOffset	= dri_priv->backOffset;
+      screen->depthOffset	= dri_priv->depthOffset;
+
+   
+      /* Check if ddx has set up a surface reg to cover depth buffer */
+      screen->depthHasSurface = (sPriv->ddx_version.major > 4) ||
+	 /* these chips don't use tiled z without hyperz. So always pretend
+	    we have set up a surface which will cause linear reads/writes */
+	 ((screen->chip_family & RADEON_CLASS_R100) &&
+	  !(screen->chip_flags & RADEON_CHIPSET_TCL));
+      
+      if ( dri_priv->textureSize == 0 ) {
+	 screen->texOffset[RADEON_LOCAL_TEX_HEAP] = screen->gart_texture_offset;
+	 screen->texSize[RADEON_LOCAL_TEX_HEAP] = dri_priv->gartTexMapSize;
+	 screen->logTexGranularity[RADEON_LOCAL_TEX_HEAP] =
+	    dri_priv->log2GARTTexGran;
+      } else {
+	 screen->texOffset[RADEON_LOCAL_TEX_HEAP] = dri_priv->textureOffset
+	    + screen->fbLocation;
+	 screen->texSize[RADEON_LOCAL_TEX_HEAP] = dri_priv->textureSize;
+	 screen->logTexGranularity[RADEON_LOCAL_TEX_HEAP] =
+	    dri_priv->log2TexGran;
+      }
+      
+      if ( !screen->gartTextures.map || dri_priv->textureSize == 0
+	   || getenv( "RADEON_GARTTEXTURING_FORCE_DISABLE" ) ) {
+	 screen->numTexHeaps = RADEON_NR_TEX_HEAPS - 1;
+	 screen->texOffset[RADEON_GART_TEX_HEAP] = 0;
+	 screen->texSize[RADEON_GART_TEX_HEAP] = 0;
+	 screen->logTexGranularity[RADEON_GART_TEX_HEAP] = 0;
+      } else {
+	 screen->numTexHeaps = RADEON_NR_TEX_HEAPS;
+	 screen->texOffset[RADEON_GART_TEX_HEAP] = screen->gart_texture_offset;
+	 screen->texSize[RADEON_GART_TEX_HEAP] = dri_priv->gartTexMapSize;
+	 screen->logTexGranularity[RADEON_GART_TEX_HEAP] = dri_priv->log2GARTTexGran;
+      }
    } else {
-      screen->numTexHeaps = RADEON_NR_TEX_HEAPS;
-      screen->texOffset[RADEON_GART_TEX_HEAP] = screen->gart_texture_offset;
-      screen->texSize[RADEON_GART_TEX_HEAP] = dri_priv->gartTexMapSize;
-      screen->logTexGranularity[RADEON_GART_TEX_HEAP] =
-	 dri_priv->log2GARTTexGran;
+      ret = radeon_init_mm_buffers(screen, sPriv, dri_priv);
+      if (ret) {
+           FREE( screen );
+           fprintf(stderr, "Unable to get mm buffers inited\n");
+           return NULL;
+      }
    }
 
    i = 0;
@@ -952,7 +1089,8 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
-   screen->extensions[i++] = &r300texOffsetExtension.base;
+   if (!screen->kernel_mm) 
+      screen->extensions[i++] = &r300texOffsetExtension.base;
 #endif
 
    screen->extensions[i++] = NULL;
@@ -973,12 +1111,14 @@ radeonDestroyScreen( __DRIscreenPrivate *sPriv )
    if (!screen)
       return;
 
-   if ( screen->gartTextures.map ) {
-      drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
-   }
    drmUnmapBufs( screen->buffers );
-   drmUnmap( screen->status.map, screen->status.size );
-   drmUnmap( screen->mmio.map, screen->mmio.size );
+   if (!screen->kernel_mm) {
+      if ( screen->gartTextures.map ) {
+	 drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
+      }
+      drmUnmap( screen->status.map, screen->status.size );
+      drmUnmap( screen->mmio.map, screen->mmio.size );
+   }
 
    /* free all option information */
    driDestroyOptionInfo (&screen->optionCache);
@@ -1002,6 +1142,160 @@ radeonInitDriver( __DRIscreenPrivate *sPriv )
    return GL_TRUE;
 }
 
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+static GLboolean
+radeon_alloc_window_storage(GLcontext *ctx, struct gl_renderbuffer *rb,
+			    GLenum intFormat, GLuint w, GLuint h)
+{
+    rb->Width = w;
+    rb->Height = h;
+    rb->_ActualFormat = intFormat;
+
+    return GL_TRUE;
+}
+
+
+static struct radeon_renderbuffer *
+radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv)
+{
+    struct radeon_renderbuffer *ret;
+
+    ret = CALLOC_STRUCT(radeon_renderbuffer);
+    if (!ret)
+	return NULL;
+
+    _mesa_init_renderbuffer(&ret->base, 0);
+
+    /* XXX format junk */
+    switch (format) {
+	case GL_RGB5:
+	    ret->base._ActualFormat = GL_RGB5;
+	    ret->base._BaseFormat = GL_RGBA;
+	    ret->base.RedBits = 5;
+	    ret->base.GreenBits = 6;
+	    ret->base.BlueBits = 5;
+	    ret->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_RGBA8:
+	    ret->base._ActualFormat = GL_RGBA8;
+	    ret->base._BaseFormat = GL_RGBA;
+	    ret->base.RedBits = 8;
+	    ret->base.GreenBits = 8;
+	    ret->base.BlueBits = 8;
+	    ret->base.AlphaBits = 8;
+	    ret->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_STENCIL_INDEX8_EXT:
+	    ret->base._ActualFormat = GL_STENCIL_INDEX8_EXT;
+	    ret->base._BaseFormat = GL_STENCIL_INDEX;
+	    ret->base.StencilBits = 8;
+	    ret->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_DEPTH_COMPONENT16:
+	    ret->base._ActualFormat = GL_DEPTH_COMPONENT16;
+	    ret->base._BaseFormat = GL_DEPTH_COMPONENT;
+	    ret->base.DepthBits = 16;
+	    ret->base.DataType = GL_UNSIGNED_SHORT;
+	    break;
+	case GL_DEPTH_COMPONENT24:
+	    ret->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+	    ret->base._BaseFormat = GL_DEPTH_COMPONENT;
+	    ret->base.DepthBits = 24;
+	    ret->base.DataType = GL_UNSIGNED_INT;
+	    break;
+	case GL_DEPTH24_STENCIL8_EXT:
+	    ret->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+	    ret->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
+	    ret->base.DepthBits = 24;
+	    ret->base.StencilBits = 8;
+	    ret->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+	    break;
+	default:
+	    fprintf(stderr, "%s: Unknown format 0x%04x\n", __FUNCTION__, format);
+	    _mesa_delete_renderbuffer(&ret->base);
+	    return NULL;
+    }
+
+    ret->dPriv = driDrawPriv;
+    ret->base.InternalFormat = format;
+
+    ret->base.AllocStorage = radeon_alloc_window_storage;
+
+    radeonSetSpanFunctions(ret);
+
+    return ret;
+}
+
+/**
+ * Create the Mesa framebuffer and renderbuffers for a given window/drawable.
+ *
+ * \todo This function (and its interface) will need to be updated to support
+ * pbuffers.
+ */
+static GLboolean
+radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
+                    __DRIdrawablePrivate *driDrawPriv,
+                    const __GLcontextModes *mesaVis,
+                    GLboolean isPixmap )
+{
+   radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
+
+    const GLboolean swDepth = GL_FALSE;
+    const GLboolean swAlpha = GL_FALSE;
+    const GLboolean swAccum = mesaVis->accumRedBits > 0;
+    const GLboolean swStencil = mesaVis->stencilBits > 0 &&
+	mesaVis->depthBits != 24;
+    GLenum rgbFormat = (mesaVis->redBits == 5 ? GL_RGB5 : GL_RGBA8);
+    GLenum depthFormat = GL_NONE;
+    struct gl_framebuffer *fb = _mesa_create_framebuffer(mesaVis);
+
+    if (mesaVis->depthBits == 16)
+	depthFormat = GL_DEPTH_COMPONENT16;
+    else if (mesaVis->depthBits == 24)
+	depthFormat = GL_DEPTH_COMPONENT24;
+
+    /* front color renderbuffer */
+    {
+	struct radeon_renderbuffer *front =
+	    radeon_create_renderbuffer(rgbFormat, driDrawPriv);
+	_mesa_add_renderbuffer(fb, BUFFER_FRONT_LEFT, &front->base);
+    }
+
+    /* back color renderbuffer */
+    if (mesaVis->doubleBufferMode) {
+	struct radeon_renderbuffer *back =
+	    radeon_create_renderbuffer(rgbFormat, driDrawPriv);
+	_mesa_add_renderbuffer(fb, BUFFER_BACK_LEFT, &back->base);
+    }
+
+    /* depth renderbuffer */
+    if (depthFormat != GL_NONE) {
+	struct radeon_renderbuffer *depth =
+	    radeon_create_renderbuffer(depthFormat, driDrawPriv);
+	_mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depth->base);
+	depth->depthHasSurface = screen->depthHasSurface;
+    }
+
+    /* stencil renderbuffer */
+    if (mesaVis->stencilBits > 0 && !swStencil) {
+	struct radeon_renderbuffer *stencil =
+	    radeon_create_renderbuffer(GL_STENCIL_INDEX8_EXT, driDrawPriv);
+	_mesa_add_renderbuffer(fb, BUFFER_STENCIL, &stencil->base);
+	stencil->depthHasSurface = screen->depthHasSurface;
+    }
+
+    _mesa_add_soft_renderbuffers(fb,
+	    GL_FALSE, /* color */
+	    swDepth,
+	    swStencil,
+	    swAccum,
+	    swAlpha,
+	    GL_FALSE /* aux */);
+    driDrawPriv->driverPrivate = (void *) fb;
+
+    return (driDrawPriv->driverPrivate != NULL);
+}
+#else
 
 /**
  * Create the Mesa framebuffer and renderbuffers for a given window/drawable.
@@ -1101,7 +1395,7 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
       return (driDrawPriv->driverPrivate != NULL);
    }
 }
-
+#endif
 
 static void
 radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
@@ -1197,11 +1491,11 @@ radeonInitScreen(__DRIscreenPrivate *psp)
    if (!radeonInitDriver(psp))
        return NULL;
 
+   /* for now fill in all modes */
    return radeonFillInModes( psp,
 			     dri_priv->bpp,
 			     (dri_priv->bpp == 16) ? 16 : 24,
-			     (dri_priv->bpp == 16) ? 0  : 8,
-			     (dri_priv->backOffset != dri_priv->depthOffset) );
+			     (dri_priv->bpp == 16) ? 0  : 8, 1);
 }
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index b84c70b..59f1e86 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -54,6 +54,14 @@ typedef struct {
    drmAddress map;			/* Mapping of the DRM region */
 } radeonRegionRec, *radeonRegionPtr;
 
+struct radeon_gem_object {
+   uint32_t gem_name;
+   uint32_t gem_handle;
+   uint64_t size;
+   void *map;
+   uint64_t offset;
+};
+
 typedef struct {
    int chip_family;
    int chip_flags;
@@ -106,6 +114,13 @@ typedef struct {
    const __DRIextension *extensions[8];
 
    int num_gb_pipes;
+
+   int kernel_mm;
+   struct radeon_gem_object front;
+   struct radeon_gem_object back;
+   struct radeon_gem_object depth;
+   struct radeon_gem_object vram_texture;
+   struct radeon_gem_object gart_texture;
 } radeonScreenRec, *radeonScreenPtr;
 
 #define IS_R100_CLASS(screen) \
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.h b/src/mesa/drivers/dri/radeon/radeon_span.h
index 9abe086..1650a9b 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.h
+++ b/src/mesa/drivers/dri/radeon/radeon_span.h
@@ -44,7 +44,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "drirenderbuffer.h"
 
+#include "radeon_buffer.h"
+
 extern void radeonInitSpanFuncs(GLcontext * ctx);
-extern void radeonSetSpanFunctions(driRenderbuffer * rb, const GLvisual * vis);
 
+#if COMPILE_R300
+extern void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
+#else
+extern void radeonSetSpanFunctions(driRenderbuffer * rb, const GLvisual * vis);
+#endif
 #endif
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 13d90e7..fe49faf 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -1073,8 +1073,7 @@ _mesa_generate_mipmap(GLcontext *ctx, GLenum target,
          _mesa_free(dstImage->ImageOffsets);
 
       /* Free old image data */
-      if (dstImage->Data)
-         ctx->Driver.FreeTexImageData(ctx, dstImage);
+      ctx->Driver.FreeTexImageData(ctx, dstImage);
 
       /* initialize new image */
       _mesa_init_teximage_fields(ctx, target, dstImage, dstWidth, dstHeight,