Re: [PATCH] 2.6.19 m68k Atari: EtherNEC support

Roman Zippel <zippel@xxxxxxxxxxxxxx> · Tue, 12 Dec 2006 12:59:19 +0100 (CET)

Hi,

On Tue, 12 Dec 2006, Michael Schmitz wrote:

vger refused the atafb patch again, that's why I spammed Geert by CC: on
all this, sigh.

BTW I pretty much rewrote the bit plane stuff, so it doesn't constantly
corrupts the output.

??? When did you get to see output corruption? Can you send me your
rewrite for testing? (Though Aranym should suffice for testing, really)

I was testing with vga256 and pretty much any line edit operation which 
calls copyarea or fillrect causes a corrupt output (i.e. anything besides 
special cases like full lines). Am I the only one seeing that?
I attached the 8 plane version, it still needs some more cleanups. The 2 
and 4 plane version will look similiar and I propably going to merge them 
at some point to use one template to produce the 3 versions.
I killed the movep stuff, I'm not sure it still is really a win (maybe at 
68000 times it was, but on a 68060 it certainly isn't), some operations 
should be even considerably faster now (e.g. odd<->even copy).

bye, Roman
/*
 *  linux/drivers/video/iplan2p8.c -- Low level frame buffer operations for
 *				      interleaved bitplanes à la Atari (8
 *				      planes, 2 bytes interleave)
 *
 *	Created 5 Apr 1997 by Geert Uytterhoeven
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of this archive for
 *  more details.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/fb.h>

#include <asm/setup.h>

#include "console/fbcon.h"
#include "atafb_iplan2p8.h"

#define BPL	8
#include "atafb_utils.h"

/* Copies a 8 plane column from 's', height 'h', to 'd'. */

/* This expands a 8 bit color into two longs for two movepl (8 plane)
 * operations.
 */

void atafb_iplan2p8_copyarea(struct fb_info *info, u_long next_line,
                             int sy, int sx, int dy, int dx,
			     int height, int width)
{
	/*  bmove() has to distinguish two major cases: If both, source and
	 *  destination, start at even addresses or both are at odd
	 *  addresses, just the first odd and last even column (if present)
	 *  require special treatment (memmove_col()). The rest between
	 *  then can be copied by normal operations, because all adjacent
	 *  bytes are affected and are to be stored in the same order.
	 *    The pathological case is when the move should go from an odd
	 *  address to an even or vice versa. Since the bytes in the plane
	 *  words must be assembled in new order, it seems wisest to make
	 *  all movements by memmove_col().
	 */

	u8 *src, *dst;
	u32 *s, *d;
	int w, l , i, j;
	u_int colsize;
	u_int upwards = (dy < sy) || (dy == sy && dx < sx);

	colsize = height;
	if (!((sx ^ dx) & 15)) {
		/* odd->odd or even->even */

		if (upwards) {
			src = (u8 *)info->screen_base + sy * next_line + (sx & ~15);
			dst = (u8 *)info->screen_base + dy * next_line + (dx & ~15);
			if (sx & 15) {
				memmove32_col(dst, src, 0xff00ff, height, next_line - 16);
				src += 16;
				dst += 16;
				width -= 8;
			}
			w = width >> 4;
			if (w) {
				s = (u32 *)src;
				d = (u32 *)dst;
				w *= 4;
				l = next_line - w * 4;
				for (j = height; j > 0; j--) {
					for (i = w; i > 0; i--)
						*d++ = *s++;
					s = (u32 *)((u8 *)s + l);
					d = (u32 *)((u8 *)d + l);
				}
			}
			if (width & 15)
				memmove32_col(dst + width, src + width,
					      0xff00ff00, height, next_line - 16);
		} else {
			src = (u8 *)info->screen_base + (sy - 1) * next_line + ((sx + width + 8) & ~15);
			dst = (u8 *)info->screen_base + (dy - 1) * next_line + ((dx + width + 8) & ~15);

			if ((sx + width) & 15) {
				src -= 16;
				dst -= 16;
				memmove32_col(dst, src, 0xff00ff00, colsize, -next_line - 16);
				width -= 8;
			}
			w = width >> 4;
			if (w) {
				s = (u32 *)src;
				d = (u32 *)dst;
				w *= 4;
				l = next_line - w * 4;
				for (j = height; j > 0; j--) {
					for (i = w; i > 0; i--)
						*--d = *--s;
					s = (u32 *)((u8 *)s - l);
					d = (u32 *)((u8 *)d - l);
				}
			}
			if (sx & 15)
				memmove32_col(dst - width - 16, src - width - 16,
					      0xff00ff, colsize, -next_line - 16);
		}
	} else {
		/* odd->even or even->odd */
		if (upwards) {
			u32 *src32, *dst32;
			u32 pval[4], v, v1, mask;
			int i, j, w, f;

			src = (u8 *)info->screen_base + sy * next_line + (sx & ~15);
			dst = (u8 *)info->screen_base + dy * next_line + (dx & ~15);

			mask = 0xff00ff00;
			f = 0;
			w = width;
			if (sx & 15) {
				f = 1;
				w += 8;
			}
			if ((sx + width) & 15)
				f |= 2;
			w >>= 4;
			for (i = height; i; i--) {
				src32 = (u32 *)src;
				dst32 = (u32 *)dst;

				if (f & 1) {
					pval[0] = (*src32++ << 8) & mask;
					pval[1] = (*src32++ << 8) & mask;
					pval[2] = (*src32++ << 8) & mask;
					pval[3] = (*src32++ << 8) & mask;
				} else {
					pval[0] = dst32[0] & mask;
					pval[1] = dst32[1] & mask;
					pval[2] = dst32[2] & mask;
					pval[3] = dst32[3] & mask;
				}

				for (j = w; j > 0; j--) {
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[0] | (v1 >> 8);
					pval[0] = (v ^ v1) << 8;
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[1] | (v1 >> 8);
					pval[1] = (v ^ v1) << 8;
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[2] | (v1 >> 8);
					pval[2] = (v ^ v1) << 8;
					v = *src32++;
					v1 = v & mask;
					*dst32++ = pval[3] | (v1 >> 8);
					pval[3] = (v ^ v1) << 8;
				}

				if (f & 2) {
					dst32[0] = (dst32[0] & mask) | pval[0];
					dst32[1] = (dst32[1] & mask) | pval[1];
					dst32[2] = (dst32[2] & mask) | pval[2];
					dst32[3] = (dst32[3] & mask) | pval[3];
				}

				src += next_line;
				dst += next_line;
			}
		} else {
			u32 *src32, *dst32;
			u32 pval[4], v, v1, mask;
			int i, j, w, f;

			src = (u8 *)(info->screen_base + (sy - 1) * next_line + ((sx + width + 8) & ~15));
			dst = (u8 *)(info->screen_base + (dy - 1) * next_line + ((dx + width + 8) & ~15));

			mask = 0xff00ff;
			f = 0;
			w = width;
			if ((dx + width) & 15)
				f = 1;
			if (sx & 15) {
				f |= 2;
				w += 8;
			}
			w >>= 4;
			for (i = height; i; i--) {
				src32 = (u32 *)src;
				dst32 = (u32 *)dst;

				if (f & 1) {
					pval[0] = dst32[-1] & mask;
					pval[1] = dst32[-2] & mask;
					pval[2] = dst32[-3] & mask;
					pval[3] = dst32[-4] & mask;
				} else {
					pval[0] = (*--src32 >> 8) & mask;
					pval[1] = (*--src32 >> 8) & mask;
					pval[2] = (*--src32 >> 8) & mask;
					pval[3] = (*--src32 >> 8) & mask;
				}

				for (j = w; j > 0; j--) {
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[0] | (v1 << 8);
					pval[0] = (v ^ v1) >> 8;
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[1] | (v1 << 8);
					pval[1] = (v ^ v1) >> 8;
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[2] | (v1 << 8);
					pval[2] = (v ^ v1) >> 8;
					v = *--src32;
					v1 = v & mask;
					*--dst32 = pval[3] | (v1 << 8);
					pval[3] = (v ^ v1) >> 8;
				}

				if (!(f & 2)) {
					dst32[-1] = (dst32[-1] & mask) | pval[0];
					dst32[-2] = (dst32[-2] & mask) | pval[1];
					dst32[-3] = (dst32[-3] & mask) | pval[2];
					dst32[-4] = (dst32[-4] & mask) | pval[3];
				}

				src -= next_line;
				dst -= next_line;
			}
		}
	}
}

void atafb_iplan2p8_fillrect(struct fb_info *info, u_long next_line, u32 color,
                             int sy, int sx, int height, int width)
{
	u32 *dest;
	int rows, i, j;
	u32 cval[4], tmp;

	dest = (u32 *)(info->screen_base + sy * next_line + (sx & ~15) / (8 / BPL));
	if (sx & 15) {
		u8 *dest8 = (u8 *)dest + 1;

		expand8_col2mask(color, cval);

		for (i = height; i; i--) {
			fill8_col(dest8, cval);
			dest8 += next_line;
		}
		dest += BPL / 2;
		width -= 8;
	}

	expand16_col2mask(color, cval);
	rows = width >> 4;
	if (rows) {
		u32 *d = dest;
		u32 off = next_line - rows * BPL * 2;
		for (i = height; i; i--) {
			d = fill16_col(d, rows, cval);
			d = (u32 *)((long)d + off);
		}
		dest += rows << 2;
		width &= 15;
	}

	if (width) {
		u8 *dest8 = (u8 *)dest;

		expand8_col2mask(color, cval);

		for (i = height; i; i--) {
			fill8_col(dest8, cval);
			dest8 += next_line;
		}
	}
}

void atafb_iplan2p8_linefill(struct fb_info *info, u_long next_line,
			     int dy, int dx, u32 width,
			     const u8 *data, u32 bgcolor, u32 fgcolor)
{
	u32 *dest;
	const u16 *data16;
	int rows;
	u32 m, tmp;
	u32 fgm[4], bgm[4];

	dest = (u32 *)(info->screen_base + dy * next_line + (dx & ~15) / (8 / BPL));
	if (dx & 15) {
		fill8_2col((u8 *)dest + 1, fgcolor, bgcolor, *data++);
		dest += BPL / 2;
		width -= 8;
	}

	if (width >= 16) {
		data16 = (const u16 *)data;
		expand16_2col2mask(fgcolor, bgcolor, fgm, bgm);

		for (rows = width / 16; rows; rows--) {
			u16 d = *data16++;
			m = d | ((u32)d << 16);
			*dest++ = (m & fgm[0]) ^ bgm[0];
			*dest++ = (m & fgm[1]) ^ bgm[1];
			*dest++ = (m & fgm[2]) ^ bgm[2];
			*dest++ = (m & fgm[3]) ^ bgm[3];
		}

		data = (const u8 *)data16;
		width &= 15;
	}

	if (width)
		fill8_2col((u8 *)dest, fgcolor, bgcolor, *data);
}

#ifdef MODULE
MODULE_LICENSE("GPL");

int init_module(void)
{
	return 0;
}

void cleanup_module(void)
{}
#endif /* MODULE */

    /*
     *  Visible symbols for modules
     */

EXPORT_SYMBOL(atafb_iplan2p8_copyarea);
EXPORT_SYMBOL(atafb_iplan2p8_fillrect);
EXPORT_SYMBOL(atafb_iplan2p8_linefill);
#ifndef _VIDEO_ATAFB_UTILS_H
#define _VIDEO_ATAFB_UTILS_H

/* ================================================================= */
/*                      Utility Assembler Functions                  */
/* ================================================================= */

/* ====================================================================== */

/* Those of a delicate disposition might like to skip the next couple of
 * pages.
 *
 * These functions are drop in replacements for memmove and
 * memset(_, 0, _). However their five instances add at least a kilobyte
 * to the object file. You have been warned.
 *
 * Not a great fan of assembler for the sake of it, but I think
 * that these routines are at least 10 times faster than their C
 * equivalents for large blits, and that's important to the lowest level of
 * a graphics driver. Question is whether some scheme with the blitter
 * would be faster. I suspect not for simple text system - not much
 * asynchrony.
 *
 * Code is very simple, just gruesome expansion. Basic strategy is to
 * increase data moved/cleared at each step to 16 bytes to reduce
 * instruction per data move overhead. movem might be faster still
 * For more than 15 bytes, we try to align the write direction on a
 * longword boundary to get maximum speed. This is even more gruesome.
 * Unaligned read/write used requires 68020+ - think this is a problem?
 *
 * Sorry!
 */

/* ++roman: I've optimized Robert's original versions in some minor
 * aspects, e.g. moveq instead of movel, let gcc choose the registers,
 * use movem in some places...
 * For other modes than 1 plane, lots of more such assembler functions
 * were needed (e.g. the ones using movep or expanding color values).
 */

/* ++andreas: more optimizations:
   subl #65536,d0 replaced by clrw d0; subql #1,d0 for dbcc
   addal is faster than addaw
   movep is rather expensive compared to ordinary move's
   some functions rewritten in C for clarity, no speed loss */

static __inline__ void *fb_memclear_small(void *s, size_t count)
{
   if (!count)
      return(0);

   __asm__ __volatile__(
         "lsrl   #1,%1 ; jcc 1f ; moveb %2,%0@-\n\t"
      "1: lsrl   #1,%1 ; jcc 1f ; movew %2,%0@-\n\t"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,%0@-\n\t"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,%0@- ; movel %2,%0@-\n\t"
      "1:"
         : "=a" (s), "=d" (count)
         : "d" (0), "0" ((char *)s+count), "1" (count)
   );
   __asm__ __volatile__(
         "subql  #1,%1 ; jcs 3f\n\t"
	 "movel %2,%%d4; movel %2,%%d5; movel %2,%%d6\n\t"
      "2: moveml %2/%%d4/%%d5/%%d6,%0@-\n\t"
         "dbra %1,2b\n\t"
      "3:"
         : "=a" (s), "=d" (count)
         : "d" (0), "0" (s), "1" (count)
	 : "d4", "d5", "d6"
  );

   return(0);
}

static __inline__ void *fb_memclear(void *s, size_t count)
{
   if (!count)
      return(0);

   if (count < 16) {
      __asm__ __volatile__(
            "lsrl   #1,%1 ; jcc 1f ; clrb %0@+\n\t"
         "1: lsrl   #1,%1 ; jcc 1f ; clrw %0@+\n\t"
         "1: lsrl   #1,%1 ; jcc 1f ; clrl %0@+\n\t"
         "1: lsrl   #1,%1 ; jcc 1f ; clrl %0@+ ; clrl %0@+\n\t"
         "1:"
            : "=a" (s), "=d" (count)
            : "0" (s), "1" (count)
     );
   } else {
      long tmp;
      __asm__ __volatile__(
            "movel %1,%2\n\t"
            "lsrl   #1,%2 ; jcc 1f ; clrb %0@+ ; subqw #1,%1\n\t"
            "lsrl   #1,%2 ; jcs 2f\n\t"  /* %0 increased=>bit 2 switched*/
            "clrw   %0@+  ; subqw  #2,%1 ; jra 2f\n\t"
         "1: lsrl   #1,%2 ; jcc 2f\n\t"
            "clrw   %0@+  ; subqw  #2,%1\n\t"
         "2: movew %1,%2; lsrl #2,%1 ; jeq 6f\n\t"
            "lsrl   #1,%1 ; jcc 3f ; clrl %0@+\n\t"
         "3: lsrl   #1,%1 ; jcc 4f ; clrl %0@+ ; clrl %0@+\n\t"
         "4: subql  #1,%1 ; jcs 6f\n\t"
         "5: clrl %0@+; clrl %0@+ ; clrl %0@+ ; clrl %0@+\n\t"
            "dbra %1,5b   ; clrw %1; subql #1,%1; jcc 5b\n\t"
         "6: movew %2,%1; btst #1,%1 ; jeq 7f ; clrw %0@+\n\t"
         "7:            ; btst #0,%1 ; jeq 8f ; clrb %0@+\n\t"
         "8:"
            : "=a" (s), "=d" (count), "=d" (tmp)
            : "0" (s), "1" (count)
     );
   }

   return(0);
}

static __inline__ void *fb_memset255(void *s, size_t count)
{
   if (!count)
      return(0);

   __asm__ __volatile__(
         "lsrl   #1,%1 ; jcc 1f ; moveb %2,%0@-\n\t"
      "1: lsrl   #1,%1 ; jcc 1f ; movew %2,%0@-\n\t"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,%0@-\n\t"
      "1: lsrl   #1,%1 ; jcc 1f ; movel %2,%0@- ; movel %2,%0@-\n\t"
      "1:"
         : "=a" (s), "=d" (count)
         : "d" (-1), "0" ((char *)s+count), "1" (count)
   );
   __asm__ __volatile__(
         "subql  #1,%1 ; jcs 3f\n\t"
	 "movel %2,%%d4; movel %2,%%d5; movel %2,%%d6\n\t"
      "2: moveml %2/%%d4/%%d5/%%d6,%0@-\n\t"
         "dbra %1,2b\n\t"
      "3:"
         : "=a" (s), "=d" (count)
         : "d" (-1), "0" (s), "1" (count)
	 : "d4", "d5", "d6"
  );

   return(0);
}

static __inline__ void *fb_memmove(void *d, const void *s, size_t count)
{
   if (d < s) {
      if (count < 16) {
         __asm__ __volatile__(
               "lsrl   #1,%2 ; jcc 1f ; moveb %1@+,%0@+\n\t"
            "1: lsrl   #1,%2 ; jcc 1f ; movew %1@+,%0@+\n\t"
            "1: lsrl   #1,%2 ; jcc 1f ; movel %1@+,%0@+\n\t"
            "1: lsrl   #1,%2 ; jcc 1f ; movel %1@+,%0@+ ; movel %1@+,%0@+\n\t"
            "1:"
               : "=a" (d), "=a" (s), "=d" (count)
               : "0" (d), "1" (s), "2" (count)
        );
      } else {
         long tmp;
         __asm__ __volatile__(
               "movel  %0,%3\n\t"
               "lsrl   #1,%3 ; jcc 1f ; moveb %1@+,%0@+ ; subqw #1,%2\n\t"
               "lsrl   #1,%3 ; jcs 2f\n\t"  /* %0 increased=>bit 2 switched*/
               "movew  %1@+,%0@+  ; subqw  #2,%2 ; jra 2f\n\t"
            "1: lsrl   #1,%3 ; jcc 2f\n\t"
               "movew  %1@+,%0@+  ; subqw  #2,%2\n\t"
            "2: movew  %2,%-; lsrl #2,%2 ; jeq 6f\n\t"
               "lsrl   #1,%2 ; jcc 3f ; movel %1@+,%0@+\n\t"
            "3: lsrl   #1,%2 ; jcc 4f ; movel %1@+,%0@+ ; movel %1@+,%0@+\n\t"
            "4: subql  #1,%2 ; jcs 6f\n\t"
            "5: movel  %1@+,%0@+;movel %1@+,%0@+\n\t"
               "movel  %1@+,%0@+;movel %1@+,%0@+\n\t"
               "dbra   %2,5b ; clrw %2; subql #1,%2; jcc 5b\n\t"
            "6: movew  %+,%2; btst #1,%2 ; jeq 7f ; movew %1@+,%0@+\n\t"
            "7:              ; btst #0,%2 ; jeq 8f ; moveb %1@+,%0@+\n\t"
            "8:"
               : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
               : "0" (d), "1" (s), "2" (count)
        );
      }
   } else {
      if (count < 16) {
         __asm__ __volatile__(
               "lsrl   #1,%2 ; jcc 1f ; moveb %1@-,%0@-\n\t"
            "1: lsrl   #1,%2 ; jcc 1f ; movew %1@-,%0@-\n\t"
            "1: lsrl   #1,%2 ; jcc 1f ; movel %1@-,%0@-\n\t"
            "1: lsrl   #1,%2 ; jcc 1f ; movel %1@-,%0@- ; movel %1@-,%0@-\n\t"
            "1:"
               : "=a" (d), "=a" (s), "=d" (count)
               : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)
        );
      } else {
         long tmp;
         __asm__ __volatile__(
               "movel %0,%3\n\t"
               "lsrl   #1,%3 ; jcc 1f ; moveb %1@-,%0@- ; subqw #1,%2\n\t"
               "lsrl   #1,%3 ; jcs 2f\n\t"  /* %0 increased=>bit 2 switched*/
               "movew  %1@-,%0@-  ; subqw  #2,%2 ; jra 2f\n\t"
            "1: lsrl   #1,%3 ; jcc 2f\n\t"
               "movew  %1@-,%0@-  ; subqw  #2,%2\n\t"
            "2: movew %2,%-; lsrl #2,%2 ; jeq 6f\n\t"
               "lsrl   #1,%2 ; jcc 3f ; movel %1@-,%0@-\n\t"
            "3: lsrl   #1,%2 ; jcc 4f ; movel %1@-,%0@- ; movel %1@-,%0@-\n\t"
            "4: subql  #1,%2 ; jcs 6f\n\t"
            "5: movel %1@-,%0@-;movel %1@-,%0@-\n\t"
               "movel %1@-,%0@-;movel %1@-,%0@-\n\t"
               "dbra %2,5b ; clrw %2; subql #1,%2; jcc 5b\n\t"
            "6: movew %+,%2; btst #1,%2 ; jeq 7f ; movew %1@-,%0@-\n\t"
            "7:              ; btst #0,%2 ; jeq 8f ; moveb %1@-,%0@-\n\t"
            "8:"
               : "=a" (d), "=a" (s), "=d" (count), "=d" (tmp)
               : "0" ((char *) d + count), "1" ((char *) s + count), "2" (count)
        );
      }
   }

   return(0);
}

/* ++andreas: Simple and fast version of memmove, assumes size is
   divisible by 16, suitable for moving the whole screen bitplane */
static __inline__ void fast_memmove(char *dst, const char *src, size_t size)
{
  if (!size)
    return;
  if (dst < src)
    __asm__ __volatile__
      ("1:"
       "  moveml %0@+,%/d0/%/d1/%/a0/%/a1\n"
       "  moveml %/d0/%/d1/%/a0/%/a1,%1@\n"
       "  addql #8,%1; addql #8,%1\n"
       "  dbra %2,1b\n"
       "  clrw %2; subql #1,%2\n"
       "  jcc 1b"
       : "=a" (src), "=a" (dst), "=d" (size)
       : "0" (src), "1" (dst), "2" (size / 16 - 1)
       : "d0", "d1", "a0", "a1", "memory");
  else
    __asm__ __volatile__
      ("1:"
       "  subql #8,%0; subql #8,%0\n"
       "  moveml %0@,%/d0/%/d1/%/a0/%/a1\n"
       "  moveml %/d0/%/d1/%/a0/%/a1,%1@-\n"
       "  dbra %2,1b\n"
       "  clrw %2; subql #1,%2\n"
       "  jcc 1b"
       : "=a" (src), "=a" (dst), "=d" (size)
       : "0" (src + size), "1" (dst + size), "2" (size / 16 - 1)
       : "d0", "d1", "a0", "a1", "memory");
}

#ifdef BPL

/*
 * This expands a up to 8 bit color into two longs
 * for movel operations.
 */
static const u32 four2long[] =
{
	0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff,
	0x00ff0000, 0x00ff00ff, 0x00ffff00, 0x00ffffff,
	0xff000000, 0xff0000ff, 0xff00ff00, 0xff00ffff,
	0xffff0000, 0xffff00ff, 0xffffff00, 0xffffffff,
};

static inline void expand8_col2mask(u8 c, u32 m[])
{
	m[0] = four2long[c & 15];
#if BPL > 4
	m[1] = four2long[c >> 4];
#endif
}

static inline void expand8_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
{
	fgm[0] = four2long[fg & 15] ^ (bgm[0] = four2long[bg & 15]);
#if BPL > 4
	fgm[1] = four2long[fg >> 4] ^ (bgm[1] = four2long[bg >> 4]);
#endif
}

/*
 * set an 8bit value to a color
 */
static inline void fill8_col(u8 *dst, u32 m[])
{
	u32 tmp = m[0];
	dst[0] = tmp;
	dst[2] = (tmp >>= 8);
#if BPL > 2
	dst[4] = (tmp >>= 8);
	dst[6] = tmp >> 8;
#endif
#if BPL > 4
	tmp = m[1];
	dst[8] = tmp;
	dst[10] = (tmp >>= 8);
	dst[12] = (tmp >>= 8);
	dst[14] = tmp >> 8;
#endif
}

/*
 * set an 8bit value according to foreground/background color
 */
static inline void fill8_2col(u8 *dst, u8 fg, u8 bg, u32 mask)
{
	u32 fgm[2], bgm[2], tmp;

	expand8_2col2mask(fg, bg, fgm, bgm);

	mask |= mask << 8;
#if BPL > 2
	mask |= mask << 16;
#endif
	tmp = (mask & fgm[0]) ^ bgm[0];
	dst[0] = tmp;
	dst[2] = (tmp >>= 8);
#if BPL > 2
	dst[4] = (tmp >>= 8);
	dst[6] = tmp >> 8;
#endif
#if BPL > 4
	tmp = (mask & fgm[1]) ^ bgm[1];
	dst[8] = tmp;
	dst[10] = (tmp >>= 8);
	dst[12] = (tmp >>= 8);
	dst[14] = tmp >> 8;
#endif
}

static const u32 two2word[] =
{
	0x00000000, 0xffff0000, 0x0000ffff, 0xffffffff
};

static inline void expand16_col2mask(u8 c, u32 m[])
{
	m[0] = two2word[c & 3];
#if BPL > 2
	m[1] = two2word[(c >> 2) & 3];
#endif
#if BPL > 4
	m[2] = two2word[(c >> 4) & 3];
	m[3] = two2word[c >> 6];
#endif
}

static inline void expand16_2col2mask(u8 fg, u8 bg, u32 fgm[], u32 bgm[])
{
	bgm[0] = two2word[bg & 3];
	fgm[0] = two2word[fg & 3] ^ bgm[0];
#if BPL > 2
	bgm[1] = two2word[(bg >> 2) & 3];
	fgm[1] = two2word[(fg >> 2) & 3] ^ bgm[1];
#endif
#if BPL > 4
	bgm[2] = two2word[(bg >> 4) & 3];
	fgm[2] = two2word[(fg >> 4) & 3] ^ bgm[2];
	bgm[3] = two2word[bg >> 6];
	fgm[3] = two2word[fg >> 6] ^ bgm[3];
#endif
}

static inline u32 *fill16_col(u32 *dst, int rows, u32 m[])
{
	while (rows) {
		*dst++ = m[0];
#if BPL > 2
		*dst++ = m[1];
#endif
#if BPL > 4
		*dst++ = m[2];
		*dst++ = m[3];
#endif
		rows--;
	}
	return dst;
}

static inline void memmove32_col(void *dst, void *src, u32 mask, u32 h, u32 bytes)
{
	u32 *s, *d, v;

        s = src;
        d = dst;
        do {
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
#if BPL > 2
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
#endif
#if BPL > 4
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
                v = (*s++ & mask) | (*d  & ~mask);
                *d++ = v;
#endif
                d = (u32 *)((u8 *)d + bytes);
                s = (u32 *)((u8 *)s + bytes);
        } while (--h);
}

#endif

#endif /* _VIDEO_ATAFB_UTILS_H */