#ifndef _TSENG_ACCEL_H
#define _TSENG_ACCEL_H
/*
 * Shortcuts to Tseng memory-mapped accelerator-control registers
 */

/*
 * MMU and accelerator control/status registers.  Each function-like macro
 * performs one 8-bit MMIO write of `x' at the given byte offset from
 * pTseng->MMioBase; a variable named pTseng must be in scope at the point
 * of use.
 */
#define MMU_CONTROL(x)            MMIO_OUT8(pTseng->MMioBase, 0x13, x)
#define ACL_SUSPEND_TERMINATE(x)  MMIO_OUT8(pTseng->MMioBase, 0x30, x)
#define ACL_OPERATION_STATE(x)    MMIO_OUT8(pTseng->MMioBase, 0x31, x)

/* On the ET6000 this register is called ACL_6K_CONFIG. */
#define ACL_SYNC_ENABLE(x)        MMIO_OUT8(pTseng->MMioBase, 0x32, x)

#define ACL_INTERRUPT_STATUS(x)   MMIO_OUT8(pTseng->MMioBase, 0x35, x)
#define ACL_INTERRUPT_MASK(x)     MMIO_OUT8(pTseng->MMioBase, 0x34, x)

/*
 * ACL_ACCELERATOR_STATUS and ACL_WRITE_INTERFACE_VALID are plain register
 * offsets (not write macros); the WAIT_* macros pass them to tseng_wait().
 */
#define ACL_ACCELERATOR_STATUS    (0x36)
#define ACL_ACCELERATOR_STATUS_SET(x) \
    MMIO_OUT8(pTseng->MMioBase, ACL_ACCELERATOR_STATUS, x)
#define ACL_WRITE_INTERFACE_VALID (0x33)

/* ET6000 only. */
#define ACL_POWER_CONTROL(x)      MMIO_OUT8(pTseng->MMioBase, 0x37, x)

    /* X/Y position, non-queued variant: w32p family and ET6000 (16-bit writes) */
#define ACL_NQ_X_POSITION(x)        MMIO_OUT16(pTseng->MMioBase, 0x38, x)
#define ACL_NQ_Y_POSITION(x)        MMIO_OUT16(pTseng->MMioBase, 0x3A, x)
    /* X/Y position, queued variant: w32 and w32i (16-bit writes) */
#define ACL_X_POSITION(x)           MMIO_OUT16(pTseng->MMioBase, 0x94, x)
#define ACL_Y_POSITION(x)           MMIO_OUT16(pTseng->MMioBase, 0x96, x)

    /* pattern and source map addresses (32-bit writes) */
#define ACL_PATTERN_ADDRESS(x)      MMIO_OUT32(pTseng->MMioBase, 0x80, x)
#define ACL_SOURCE_ADDRESS(x)       MMIO_OUT32(pTseng->MMioBase, 0x84, x)

    /*
     * Y offsets (16-bit writes).  ACL_PATTERN_Y_OFFSET32 is a single 32-bit
     * write at 0x88, so it also covers the source Y offset at 0x8A.
     */
#define ACL_PATTERN_Y_OFFSET(x)     MMIO_OUT16(pTseng->MMioBase, 0x88, x)
#define ACL_PATTERN_Y_OFFSET32(x)   MMIO_OUT32(pTseng->MMioBase, 0x88, x)
#define ACL_SOURCE_Y_OFFSET(x)      MMIO_OUT16(pTseng->MMioBase, 0x8A, x)
#define ACL_DESTINATION_Y_OFFSET(x) MMIO_OUT16(pTseng->MMioBase, 0x8C, x)

    /* offset 0x8E is "virtual bus size" on the W32i ... */
#define ACL_VIRTUAL_BUS_SIZE(x) MMIO_OUT8(pTseng->MMioBase, 0x8E, x)
    /* ... and "pixel depth" on the w32p */
#define ACL_PIXEL_DEPTH(x)      MMIO_OUT8(pTseng->MMioBase, 0x8E, x)

    /* w32 and w32i */
#define ACL_XY_DIRECTION(x)     MMIO_OUT8(pTseng->MMioBase, 0x8F, x)

    /*
     * Wrap registers.  ACL_PATTERN_WRAP32 is one 32-bit write at 0x90 and
     * therefore spans offsets 0x90..0x93 in a single access.
     */
#define ACL_PATTERN_WRAP(x)     MMIO_OUT8(pTseng->MMioBase, 0x90, x)
#define ACL_PATTERN_WRAP32(x)   MMIO_OUT32(pTseng->MMioBase, 0x90, x)
#define ACL_TRANSFER_DISABLE(x) MMIO_OUT8(pTseng->MMioBase, 0x91, x) /* ET6000 only */
#define ACL_SOURCE_WRAP(x)      MMIO_OUT8(pTseng->MMioBase, 0x92, x)

    /* X and Y counts (16-bit writes) */
#define ACL_X_COUNT(x)          MMIO_OUT16(pTseng->MMioBase, 0x98, x)
#define ACL_Y_COUNT(x)          MMIO_OUT16(pTseng->MMioBase, 0x9A, x)
/* shortcut, not a real register: X and Y count in one 32-bit write at 0x98 */
#define ACL_XY_COUNT(x)         MMIO_OUT32(pTseng->MMioBase, 0x98, x)

/* On the ET6000, ACL_ROUTING_CONTROL is known as ACL_MIX_CONTROL ... */
#define ACL_ROUTING_CONTROL(x)  MMIO_OUT8(pTseng->MMioBase, 0x9C, x)
/* ... and ACL_RELOAD_CONTROL is known as ACL_STEPPING_INHIBIT. */
#define ACL_RELOAD_CONTROL(x)   MMIO_OUT8(pTseng->MMioBase, 0x9D, x)

/* raster operation selectors (8-bit writes) */
#define ACL_BACKGROUND_RASTER_OPERATION(x) \
    MMIO_OUT8(pTseng->MMioBase, 0x9E, x)
#define ACL_FOREGROUND_RASTER_OPERATION(x) \
    MMIO_OUT8(pTseng->MMioBase, 0x9F, x)

#define ACL_DESTINATION_ADDRESS(x) MMIO_OUT32(pTseng->MMioBase, 0xA0, x)

    /* the following is for the w32p's only */
#define ACL_MIX_ADDRESS(x)      MMIO_OUT32(pTseng->MMioBase, 0xA4, x)

/*
 * Mix offset and the error-term / delta registers.  ACL_DELTA_MINOR32 is
 * one 32-bit write at 0xAC, so it covers DELTA_MINOR (0xAC) and
 * DELTA_MAJOR (0xAE) together.
 */
#define ACL_MIX_Y_OFFSET(x)     MMIO_OUT16(pTseng->MMioBase, 0xA8, x)
#define ACL_ERROR_TERM(x)       MMIO_OUT16(pTseng->MMioBase, 0xAA, x)
#define ACL_DELTA_MINOR(x)      MMIO_OUT16(pTseng->MMioBase, 0xAC, x)
#define ACL_DELTA_MINOR32(x)    MMIO_OUT32(pTseng->MMioBase, 0xAC, x)
#define ACL_DELTA_MAJOR(x)      MMIO_OUT16(pTseng->MMioBase, 0xAE, x)

    /*
     * ET6000 only (trapezoids).  ACL_SECONDARY_DELTA_MINOR32 is one 32-bit
     * write at 0xB4, covering the secondary minor (0xB4) and major (0xB6)
     * deltas together.
     */
#define ACL_SECONDARY_EDGE(x)   MMIO_OUT8(pTseng->MMioBase, 0x93, x)
#define ACL_SECONDARY_ERROR_TERM(x) \
    MMIO_OUT16(pTseng->MMioBase, 0xB2, x)
#define ACL_SECONDARY_DELTA_MINOR(x) \
    MMIO_OUT16(pTseng->MMioBase, 0xB4, x)
#define ACL_SECONDARY_DELTA_MINOR32(x) \
    MMIO_OUT32(pTseng->MMioBase, 0xB4, x)
#define ACL_SECONDARY_DELTA_MAJOR(x) \
    MMIO_OUT16(pTseng->MMioBase, 0xB6, x)

/*
 * ET6000 name for the register that the other chips call ACL_SYNC_ENABLE.
 * This is a pure alias: ACL_6K_CONFIG(x) expands to ACL_SYNC_ENABLE(x).
 */
#define ACL_6K_CONFIG ACL_SYNC_ENABLE

/*
 * ET6000 names for ACL_ROUTING_CONTROL and ACL_RELOAD_CONTROL.  Both are
 * pure aliases and write to the same offsets as the macros they name.
 */
#define ACL_MIX_CONTROL ACL_ROUTING_CONTROL
#define ACL_STEPPING_INHIBIT ACL_RELOAD_CONTROL

/*
 * Some shortcuts. 
 */

#define MAX_WAIT_CNT 500000	       /* poll budget of tseng_wait() before it reports a timeout */
#undef WAIT_VERBOSE		       /* if defined: tseng_wait() prints how many polls each wait took */

/* Invoked by tseng_wait() when a wait times out; defined outside this header. */
void tseng_recover_timeout(TsengPtr pTseng);

/*
 * Busy-wait until all bits selected by `mask' read back as zero from the
 * accelerator register at byte offset `reg' (relative to pTseng->MMioBase).
 *
 *   pTseng  driver state; supplies the MMIO base address
 *   reg     register offset to poll
 *   name    short label used only in diagnostics ("WAIT_<name>: timeout.")
 *   mask    bits that must clear before the wait ends
 *
 * The poll counter starts at MAX_WAIT_CNT; once it drops below zero a
 * timeout is logged through ErrorF(), tseng_recover_timeout() is called and
 * the wait is abandoned.  With WAIT_VERBOSE defined, the number of polls
 * used is printed after every wait.
 *
 * `name' is const-qualified because every caller in this header passes a
 * string literal.
 */
static __inline__ void
tseng_wait(TsengPtr pTseng, int reg, const char *name, unsigned char mask)
{
    int cnt = MAX_WAIT_CNT;

    /*
     * NOTE(review): this is a 32-bit read at offsets (0x33, 0x36) that are
     * not 4-byte aligned, while the same registers are written with 8-bit
     * accesses elsewhere in this header.  Only the low 8 bits survive the
     * mask.  Confirm whether an 8-bit read is the intended access width.
     */
    while (MMIO_IN32(pTseng->MMioBase, reg) & mask) {
	if (--cnt < 0) {
	    ErrorF("WAIT_%s: timeout.\n", name);
	    tseng_recover_timeout(pTseng);
	    break;
	}
    }
#ifdef WAIT_VERBOSE
    ErrorF("%s%d ", name, MAX_WAIT_CNT - cnt);
#endif
}

/*
 * Convenience waits built on tseng_wait().  Each one polls a status register
 * until the given mask bits clear; a variable named pTseng must be in scope.
 */

/* bit 0 of the accelerator status register */
#define WAIT_QUEUE \
    tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "QUEUE", 0x01)

/* This is only for W32p rev b...d: low four bits of "write interface valid" */
#define WAIT_INTERFACE \
    tseng_wait(pTseng, ACL_WRITE_INTERFACE_VALID, "INTERFACE", 0x0F)

/* bit 1 of the accelerator status register */
#define WAIT_ACL \
    tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "ACL", 0x02)

/* bit 2 of the accelerator status register */
#define WAIT_XY \
    tseng_wait(pTseng, ACL_ACCELERATOR_STATUS, "XY", 0x04)

/*
 * Select the plain blit function: writes 0x33 to the mix-control register
 * on an ET6000 and 0x00 to the routing-control register on anything else
 * (both names refer to the same register offset).  Expands to a complete
 * if/else statement and reads pTseng->ChipType.
 */
#define SET_FUNCTION_BLT \
    if (pTseng->ChipType == ET6000) { \
        ACL_MIX_CONTROL(0x33); \
    } else { \
        ACL_ROUTING_CONTROL(0x00); \
    }

/* Writes 0x13 to the mix-control register (no chip check is made here). */
#define SET_FUNCTION_BLT_TR \
    ACL_MIX_CONTROL(0x13);

/*
 * Byte offset of pixel (x, y): y * line_width plus x scaled by MULBPP().
 * The pTseng argument is parenthesized so that any pointer-valued
 * expression (for example `&rec') can be passed, not just an identifier.
 */
#define FBADDR(pTseng, x,y) ( (y) * (pTseng)->line_width + MULBPP((pTseng), (x)) )

/*
 * Raster-operation helpers.  `rop' indexes one of the lookup tables
 * (W32OpTable, W32OpTable_planemask, W32PatternOpTable, all defined
 * elsewhere) and the table entry is written to the foreground or
 * background raster-operation register.  Every macro already ends in a
 * semicolon.
 */
#define SET_FG_ROP(rop) \
    ACL_FOREGROUND_RASTER_OPERATION(W32OpTable[rop]);

#define SET_FG_ROP_PLANEMASK(rop) \
    ACL_FOREGROUND_RASTER_OPERATION(W32OpTable_planemask[rop]);

#define SET_BG_ROP(rop) \
    ACL_BACKGROUND_RASTER_OPERATION(W32PatternOpTable[rop]);

/*
 * Like SET_BG_ROP, except that a bg_color of -1 (transparent color
 * expansion) selects the fixed value 0xaa instead of the table entry.
 */
#define SET_BG_ROP_TR(rop, bg_color) \
    ACL_BACKGROUND_RASTER_OPERATION( \
        (((bg_color) == -1) ? 0xaa : W32PatternOpTable[rop]));

/*
 * Program the minor and major line deltas with a single 32-bit write:
 * `Min' goes in the low 16 bits and `Maj' in the high 16 bits, matching
 * the register layout (DELTA_MINOR at 0xAC, DELTA_MAJOR at 0xAE).
 */
#define SET_DELTA(Min, Maj) \
    ACL_DELTA_MINOR32(((Maj) << 16) + (Min))

/*
 * Same for the ET6000 secondary edge (minor at 0xB4, major at 0xB6).
 * This must use the 32-bit write: the 16-bit ACL_SECONDARY_DELTA_MINOR
 * would discard the upper half of the packed value, so `Maj' would never
 * reach the hardware.
 */
#define SET_SECONDARY_DELTA(Min, Maj) \
    ACL_SECONDARY_DELTA_MINOR32(((Maj) << 16) + (Min))

#ifdef NO_OPTIMIZE
/* Unoptimized variant: always write the direction register. */
#define SET_XYDIR(dir) \
      ACL_XY_DIRECTION(dir);
#else
/*
 * only changing ACL_XY_DIRECTION when it needs to be changed avoids
 * unnecessary PCI bus writes, which are slow. This shows up very well
 * on consecutive small fills.
 *
 * Both the cache update and the register write sit inside the braces, so
 * the write is skipped when `dir' equals pTseng->tseng_old_dir.  Without
 * the braces only the assignment was conditional and the register was
 * written on every call.  `dir' is evaluated up to twice.
 *
 * NOTE(review): this relies on tseng_old_dir being set to a value that no
 * caller passes (or to the real hardware state) before first use, and on
 * nobody writing ACL_XY_DIRECTION behind the cache's back -- confirm in
 * the accelerator init code.
 */
#define SET_XYDIR(dir) \
    if ((dir) != pTseng->tseng_old_dir) { \
      pTseng->tseng_old_dir = (dir); \
      ACL_XY_DIRECTION(pTseng->tseng_old_dir); \
    }
#endif

/* Write the direction of the secondary edge (ET6000 trapezoid register). */
#define SET_SECONDARY_XYDIR(dir) \
      ACL_SECONDARY_EDGE(dir);

/*
 * Start an accelerator operation.  All three variants expand to the same
 * thing: a write of `dst' to the destination address register (ending in
 * a semicolon).  The pTseng macro argument of START_ACL and START_ACL_CPU
 * is not used by the expansion itself; the register macro picks up the
 * pTseng that is in scope at the call site.
 */

/* Must do 0x09 (in one operation) for the W32 */
#define START_ACL(pTseng, dst)     ACL_DESTINATION_ADDRESS(dst);

/* START_ACL for the ET6000 */
#define START_ACL_6(dst)           ACL_DESTINATION_ADDRESS(dst);

/* START_ACL for CPU-driven transfers */
#define START_ACL_CPU(pTseng, dst) ACL_DESTINATION_ADDRESS(dst);

/*    ACL_DESTINATION_ADDRESS(dst);    should be enough for START_ACL_CPU */

/*
 * Some commonly used inline functions and utility functions.
 */

/*
 * Replicate a pixel value across a 32-bit word according to
 * pTseng->Bytesperpixel:
 *   1 byte/pixel  -> low 8 bits repeated four times
 *   2 bytes/pixel -> low 16 bits repeated twice
 *   anything else -> `color' returned unchanged
 *
 * The replication is done on an unsigned copy: shifting a signed int so
 * that a one lands in the sign bit (for example 0xFFFF << 16) is undefined
 * behaviour in C.  The result is converted back to int to keep the
 * original return type.
 */
static __inline__ int
COLOR_REPLICATE_DWORD(TsengPtr pTseng, int color)
{
    unsigned int pixel = (unsigned int) color;

    switch (pTseng->Bytesperpixel) {
    case 1:
	pixel &= 0xFFU;
	pixel |= pixel << 8;
	pixel |= pixel << 16;
	break;
    case 2:
	pixel &= 0xFFFFU;
	pixel |= pixel << 16;
	break;
    default:
	/* 3 and 4 bytes per pixel: nothing to replicate */
	break;
    }
    return (int) pixel;
}

/*
 * Optimizing note: increasing the wrap size for fixed-color source/pattern
 * tiles from 4x1 (as below) to anything bigger doesn't seem to affect
 * performance (it might have been better for larger wraps, but it isn't).
 */

/*
 * Set up a solid foreground color as the accelerator's source map.
 *
 * The source address is pointed at AccelColorBufferOffset + tsengFg, the
 * source Y offset is set to 3, the replicated color is stored at offset
 * tsengFg of the scratch memory, and the source wrap is set to 0x02
 * (per the optimizing note in this file, a 4x1 wrap).  The order of the
 * register and memory writes is the same as before.
 */
static __inline__ void
SET_FG_COLOR(TsengPtr pTseng, int color)
{
    int fill = COLOR_REPLICATE_DWORD(pTseng, color);

    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
    ACL_SOURCE_Y_OFFSET(3);
    MMIO_OUT32(pTseng->scratchMemBase, pTseng->tsengFg, fill);
    ACL_SOURCE_WRAP(0x02);
}

/*
 * Set up a solid background color as the accelerator's pattern map.
 *
 * The pattern address is pointed at AccelColorBufferOffset + tsengPat, the
 * pattern Y offset is set to 3, the replicated color is stored at offset
 * tsengPat of the scratch memory, and the pattern wrap is set to 0x02
 * (per the optimizing note in this file, a 4x1 wrap).  The order of the
 * register and memory writes is the same as before.
 */
static __inline__ void
SET_BG_COLOR(TsengPtr pTseng, int color)
{
    int fill = COLOR_REPLICATE_DWORD(pTseng, color);

    ACL_PATTERN_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengPat);
    ACL_PATTERN_Y_OFFSET(3);
    MMIO_OUT32(pTseng->scratchMemBase, pTseng->tsengPat, fill);
    ACL_PATTERN_WRAP(0x02);
}

/*
 * This does the same as SET_FG_COLOR and SET_BG_COLOR together, but is
 * faster, because it allows the PCI chipset to chain the requests into a
 * burst sequence. The order of the commands is partly linear.
 * So much for the theory...
 */
/*
 * Combined foreground + background setup.
 *
 * Pattern and source addresses are written first, then one 32-bit write at
 * the pattern Y offset (0x88) sets both the pattern and the source Y offset
 * to 3.  The replicated colors are stored in scratch memory (foreground at
 * tsengFg, background at tsengPat), and finally one 32-bit write at the
 * pattern wrap register (0x90) stores 0x00020002, which spans offsets
 * 0x90..0x93.  The order of the writes is the same as before.
 */
static __inline__ void
SET_FG_BG_COLOR(TsengPtr pTseng, int fgcolor, int bgcolor)
{
    int fg_fill = COLOR_REPLICATE_DWORD(pTseng, fgcolor);
    int bg_fill = COLOR_REPLICATE_DWORD(pTseng, bgcolor);

    ACL_PATTERN_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengPat);
    ACL_SOURCE_ADDRESS(pTseng->AccelColorBufferOffset + pTseng->tsengFg);
    ACL_PATTERN_Y_OFFSET32(0x00030003);

    MMIO_OUT32(pTseng->scratchMemBase, pTseng->tsengFg, fg_fill);
    MMIO_OUT32(pTseng->scratchMemBase, pTseng->tsengPat, bg_fill);

    ACL_PATTERN_WRAP32(0x00020002);
}

/*
 * Real 32-bit multiplications are horribly slow compared to 16-bit (on i386).
 */
#ifdef NO_OPTIMIZE
/* Scale a pixel count to bytes with a plain multiplication. */
static __inline__ int
MULBPP(TsengPtr pTseng, int x)
{
    return (pTseng->Bytesperpixel * x);
}
#else
/*
 * Scale a pixel count to bytes without multiplying: x is shifted left by
 * pTseng->powerPerPixel, and for 3 bytes per pixel x is added once more.
 * NOTE(review): this yields 3*x only if powerPerPixel is 1 whenever
 * Bytesperpixel is 3 -- confirm where powerPerPixel is set up.
 */
static __inline__ int
MULBPP(TsengPtr pTseng, int x)
{
    const int shifted = x << pTseng->powerPerPixel;

    return (pTseng->Bytesperpixel == 3) ? (shifted + x) : shifted;
}
#endif

/*
 * Compute the packed X/Y count value for a width of x pixels and a height
 * of y lines, without writing it to the hardware.
 *
 * Returns -1 when (x, y) equals the cached old_x/old_y pair.  Otherwise
 * returns ((y - 1) << 16) plus the X count, and stores x and y in the
 * cache.  The X count is MULBPP(x - 1) on the ET4000 and MULBPP(x) - 1 on
 * other chips.
 */
static __inline__ int
CALC_XY(TsengPtr pTseng, int x, int y)
{
    int x_count;

    if ((pTseng->old_x == x) && (pTseng->old_y == y))
	return -1;

    x_count = (pTseng->ChipType == ET4000)
	? MULBPP(pTseng, x - 1)
	: MULBPP(pTseng, x) - 1;

    pTseng->old_x = x;
    pTseng->old_y = y;
    return ((y - 1) << 16) + x_count;
}

/* generic SET_XY */
/*
 * Generic SET_XY: always writes the packed X/Y count for a width of x
 * pixels and a height of y lines, then records x and y in old_x/old_y.
 * The X count is MULBPP(x - 1) on the ET4000 and MULBPP(x) - 1 on other
 * chips; the Y count is y - 1 in the upper 16 bits.
 */
static __inline__ void
SET_XY(TsengPtr pTseng, int x, int y)
{
    int x_count = (pTseng->ChipType == ET4000)
	? MULBPP(pTseng, x - 1)
	: MULBPP(pTseng, x) - 1;

    ACL_XY_COUNT(((y - 1) << 16) + x_count);
    pTseng->old_x = x;
    pTseng->old_y = y;
}

/*
 * Like SET_XY, but the Y value is written raw (y, not y - 1) while the X
 * count is still converted (MULBPP(x - 1) on the ET4000, MULBPP(x) - 1
 * otherwise).  Afterwards old_x is set to x and old_y to y - 1.
 */
static __inline__ void
SET_X_YRAW(TsengPtr pTseng, int x, int y)
{
    int x_count = (pTseng->ChipType == ET4000)
	? MULBPP(pTseng, x - 1)
	: MULBPP(pTseng, x) - 1;

    ACL_XY_COUNT((y << 16) + x_count);
    pTseng->old_x = x;
    /*
     * old_y is invalid (raw transfer).
     * NOTE(review): y - 1 is still a value a later caller could pass to
     * SET_XY_4()/SET_XY_6(); confirm that this cannot cause a needed
     * count write to be skipped.
     */
    pTseng->old_y = y - 1;
}

/*
 * This is plain and simple "benchmark rigging".
 * (no real application does lots of subsequent same-size blits)
 *
 * The effect of this is amazingly good on e.g. large blits: a 400x400
 * rectangle fill in 24 and 32 bpp on ET6000 jumps from 276 MB/sec to up to
 * 490 MB/sec... But not always. There must be a good reason why this gives
 * such a boost, but I don't know it.
 */

/*
 * Cached X/Y count write using the ET4000-style X count, MULBPP(x - 1).
 * Nothing is written when (x, y) matches old_x/old_y; otherwise the packed
 * count is written and the cache updated.
 */
static __inline__ void
SET_XY_4(TsengPtr pTseng, int x, int y)
{
    int packed;

    if ((pTseng->old_y == y) && (pTseng->old_x == x))
	return;

    packed = ((y - 1) << 16) + MULBPP(pTseng, x - 1);
    ACL_XY_COUNT(packed);
    pTseng->old_x = x;
    pTseng->old_y = y;
}

/*
 * Cached X/Y count write using the X count MULBPP(x) - 1 (the form the
 * generic SET_XY uses for non-ET4000 chips).  Nothing is written when
 * (x, y) matches old_x/old_y; otherwise the packed count is written and
 * the cache updated.
 */
static __inline__ void
SET_XY_6(TsengPtr pTseng, int x, int y)
{
    int packed;			       /* using this intermediate variable is faster */

    if ((pTseng->old_y == y) && (pTseng->old_x == x))
	return;

    packed = ((y - 1) << 16) + MULBPP(pTseng, x) - 1;
    ACL_XY_COUNT(packed);
    pTseng->old_x = x;
    pTseng->old_y = y;
}

/* generic SET_XY_RAW */
/*
 * Generic SET_XY_RAW: writes x and y to the X/Y count register exactly as
 * given (y in the upper 16 bits, no bytes-per-pixel scaling and no -1
 * adjustment), then sets old_x and old_y to -1 so the cached values are
 * invalidated.
 */
static __inline__ void
SET_XY_RAW(TsengPtr pTseng,int x, int y)
{
    ACL_XY_COUNT((y << 16) + x);

    /* invalidate old_x/old_y (raw transfers) */
    pTseng->old_y = -1;
    pTseng->old_x = pTseng->old_y;
}

/*
 * Toggle between two sets of color-buffer offsets.  When tsengFg is 0 the
 * set becomes Fg/Bg/Pat = 8/24/40; otherwise it becomes 0/16/32.
 */
static __inline__ void
PINGPONG(TsengPtr pTseng)
{
    const int base = (pTseng->tsengFg == 0) ? 8 : 0;

    pTseng->tsengFg = base;
    pTseng->tsengBg = base + 16;
    pTseng->tsengPat = base + 32;
}

/*
 * This is called in each ACL function just before the first ACL register is
 * written to. It waits for the accelerator to finish on cards that don't
 * support hardware-wait-state locking, and waits for a free queue entry on
 * others, if hardware-wait-states are not enabled.
 */
/*
 * Pre-write synchronisation: performs WAIT_QUEUE when pTseng->UsePCIRetry
 * is set, then WAIT_ACL when pTseng->need_wait_acl is set.
 *
 * NOTE(review): the file's description of this function says the queue
 * wait is for the case where hardware wait states are NOT enabled, yet the
 * code waits when UsePCIRetry is true -- confirm which one is intended.
 */
static __inline__ void
wait_acl_queue(TsengPtr pTseng)
{
    if (pTseng->UsePCIRetry) {
	WAIT_QUEUE;
    }
    if (pTseng->need_wait_acl) {
	WAIT_ACL;
    }
}
#endif /* _TSENG_ACCEL_H */